diff --git a/.gitignore b/.gitignore
index c304f0c6..ab70c188 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
+*~
 *.o
 *.elf
 *.bin
diff --git a/test/uti/CT01.c b/test/uti/CT01.c
new file mode 100644
index 00000000..5a209b47
--- /dev/null
+++ b/test/uti/CT01.c
@@ -0,0 +1,137 @@
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <pthread.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <signal.h>
+
+pthread_mutex_t mutex1;
+pthread_cond_t cond1;
+pthread_mutex_t mutex2;
+pthread_cond_t cond2;
+char *m;
+int flag1, flag2;
+
+int sigst;
+pthread_t thr;
+
+void
+sigsegv(int s)
+{
+	if (sigst != 1) { /* main has not armed the check yet: unexpected fault */
+		printf("BAD SIGSEGV\n");
+		exit(1);
+	}
+	fprintf(stderr, "CT01007 munmap OK (SIGSEGV)\n"); /* the expected fault */
+	pthread_join(thr, NULL);
+	fprintf(stderr, "CT01008 exit(pthread_join) OK\n");
+	fprintf(stderr, "CT01009 futex (pthread_mutex/pthread_cond) OK\n");
+	fprintf(stderr, "CT01010 END\n");
+	exit(0);
+}
+
+void *
+util_thread(void *arg)
+{
+	int rc;
+	/* Helper thread: maps a page, publishes it to main, then unmaps it on request. */
+	rc = syscall(732);
+	if (rc == -1)
+		fprintf(stderr, "CT01003 running on Linux OK\n");
+	else {
+		fprintf(stderr, "CT01003 running on McKernel NG\n"); /* format has no conversion, so no argument */
+		exit(1);
+	}
+	errno = 0;
+	m = mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+	if (m != MAP_FAILED) {
+		fprintf(stderr, "CT01004 mmap OK\n");
+	}
+	else {
+		fprintf(stderr, "CT01004 mmap NG errno=%d\n", errno);
+		exit(1);
+	}
+	strcpy(m, "mmap OK");
+	pthread_mutex_lock(&mutex1); /* tell main the page is mapped and filled */
+	flag1 = 1;
+	pthread_cond_signal(&cond1);
+	pthread_mutex_unlock(&mutex1);
+	/* Wait until main has printed the page contents. */
+	pthread_mutex_lock(&mutex2);
+	while(!flag2) {
+		pthread_cond_wait(&cond2, &mutex2);
+	}
+	flag2 = 0;
+	pthread_mutex_unlock(&mutex2);
+	rc = munmap(m, 4096);
+	if (rc == 0) {
+		fprintf(stderr, "CT01006 munmap OK\n");
+	}
+	else {
+		fprintf(stderr, "CT01006 munmap NG errno=%d\n", errno);
+		exit(1);
+	}
+	/* Tell main the page is gone so it can probe for the expected SIGSEGV. */
+	pthread_mutex_lock(&mutex1);
+	flag1 = 1;
+	pthread_cond_signal(&cond1);
+	pthread_mutex_unlock(&mutex1);
+	return NULL;
+}
+
+int
+main(int argc, char **argv)
+{
+	int rc;
+	/* Drives CT01: start the helper thread, read its mapping, then touch it again after munmap. */
+	signal(SIGSEGV, sigsegv);
+	pthread_mutex_init(&mutex1, NULL);
+	pthread_cond_init(&cond1, NULL);
+	pthread_mutex_init(&mutex2, NULL);
+	pthread_cond_init(&cond2, NULL);
+	/* Syscall 731 is the call the message below names util_indicate_clone; a failure is only reported. */
+	fprintf(stderr, "CT01001 mmap/munmap/futex/exit START\n");
+	rc = syscall(731, 1, NULL);
+	if (rc) {
+		fprintf(stderr, "util_indicate_clone rc=%d, errno=%d\n", rc, errno);
+		fflush(stderr);
+	}
+	rc = pthread_create(&thr, NULL, util_thread, NULL);
+	if(rc){
+		fprintf(stderr, "pthread_create: %d\n", rc);
+		exit(1);
+	}
+	fprintf(stderr, "CT01002 pthread_create OK\n");
+	pthread_mutex_lock(&mutex1); /* wait for the helper to map and fill m */
+	while(!flag1) {
+		pthread_cond_wait(&cond1, &mutex1);
+	}
+	flag1 = 0;
+	pthread_mutex_unlock(&mutex1);
+	/* The mapping created by the helper thread must be readable from this thread. */
+	fprintf(stderr, "CT01005 %s\n", m);
+	pthread_mutex_lock(&mutex2); /* let the helper go on to munmap */
+	flag2 = 1;
+	pthread_cond_signal(&cond2);
+	pthread_mutex_unlock(&mutex2);
+	/* Wait for the helper to report that munmap succeeded. */
+	pthread_mutex_lock(&mutex1);
+	while(!flag1) {
+		pthread_cond_wait(&cond1, &mutex1);
+	}
+	flag1 = 0;	/* consume the notification */
+	pthread_mutex_unlock(&mutex1);
+	/* Arm the handler, then read the unmapped page; getting past the read means munmap had no effect here. */
+	sigst = 1;
+	fprintf(stderr, "%s\n", m);
+	fprintf(stderr, "CT01007 munmap NG\n");
+	pthread_join(thr, NULL);
+	fprintf(stderr, "CT01008 exit(pthread_join) OK\n");
+	fprintf(stderr, "CT01009 futex (pthread_mutex/pthread_cond) OK\n");
+	fprintf(stderr, "CT01010 END\n");
+	exit(0);
+}
diff --git a/test/uti/CT01.sh b/test/uti/CT01.sh
new file mode 100755
index 00000000..7756e8cb
--- /dev/null
+++ b/test/uti/CT01.sh
@@ -0,0 +1,86 @@
+#!/usr/bin/bash
+
+#!/usr/bin/bash -x
+
+MYHOME=$HOME
+UTI_TOP=${MYHOME}/project/os/mckernel/test/uti
+
+MCK=${MYHOME}/project/os/install
+unset DISABLE_UTI
+
+cmdline="./CT01"
+
+stop=0
+reboot=0
+go=0
+
+mck=0
+nloops=1
+
+while getopts srgac:n:mdl: OPT
+do
+        case ${OPT} in
+            s) stop=1
+                ;;
+            r) reboot=1
+                ;;
+	    g) go=1
+		;;
+	    a) async=1
+		;;
+	    c) cmdline=$OPTARG
+		;;
+	    n) ndoubles=$OPTARG
+		;;
+            m) 
+		mck=1
+                ;;
+            d) export DISABLE_UTI=1
+                ;;
+	    l) nloops=$OPTARG
+		;;
+            *) echo "invalid option -${OPT}" >&2
+                exit 1
+        esac
+done
+
+if [ ${mck} -eq 1 ]; then
+    MCEXEC="${MCK}/bin/mcexec"
+else
+    MCEXEC=
+fi
+
+if [ ${stop} -eq 1 ]; then
+    if [ ${mck} -eq 1 ]; then
+	    sudo ${MCK}/sbin/mcstop+release.sh
+    else
+	:
+    fi
+fi
+
+if [ ${reboot} -eq 1 ]; then
+    if [ ${mck} -eq 1 ]; then
+	if hostname  | grep ofp &>/dev/null; then
+	    sudo ${MCK}/sbin/mcreboot.sh -c 2-17,70-85,138-153,206-221,20-35,88-103,156-171,224-239,36-51,104-119,172-187,240-255,52-67,120-135,188-203,256-271 -r 2-5,70-73,138-141,206-209:0+6-9,74-77,142-145,210-213:1+10-13,78-81,146-149,214-217:68+14-17,82-85,150-153,218-221:69+20-23,88-91,156-159,224-227:136+24-27,92-95,160-163,228-231:137+28-31,96-99,164-167,232-235:204+32-35,100-103,168-171,236-239:205+36-39,104-107,172-175,240-243:18+40-43,108-111,176-179,244-247:19+44-47,112-115,180-183,248-251:86+48-51,116-119,184-187,252-255:87+52-55,120-123,188-191,256-259:154+56-59,124-127,192-195,260-263:155+60-63,128-131,196-199,264-267:222+64-67,132-135,200-203,268-271:223 -m 32G@0,12G@1
+	else
+	    sudo ${MCK}/sbin/mcreboot.sh -c 1-15,65-79,129-143,193-207,17-31,81-95,145-159,209-223,33-47,97-111,161-175,225-239,49-63,113-127,177-191,241-255 -r 1-15:0+65-79:64+129-143:128+193-207:192+17-31:16+81-95:80+145-159:144+209-223:208+33-47:32+97-111:96+161-175:160+225-239:224+49-63:48+113-127:112+177-191:176+241-255:240 -m 12G@0,12G@1,12G@2,12G@3,3920M@4,3920M@5,3920M@6,3920M@7
+	fi
+    else
+	:
+    fi
+fi
+
+# -g: build $cmdline in UTI_TOP and run it nloops times; stop if cd or make fails.
+if [ ${go} -eq 1 ]; then
+    cd "${UTI_TOP}" || exit 1
+    make $cmdline || exit 1
+    for i in $(seq 1 "${nloops}"); do
+        ${MCK}/bin/mcexec --enable-uti $cmdline
+        echo =====
+        echo $i
+        echo =====
+    done
+fi
+
+
+
diff --git a/test/uti/CT02.c b/test/uti/CT02.c
new file mode 100644
index 00000000..fdfe51d2
--- /dev/null
+++ b/test/uti/CT02.c
@@ -0,0 +1,162 @@
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <pthread.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <signal.h>
+
+int flag1;
+pthread_mutex_t mutex1;
+pthread_cond_t cond1;
+
+int flag2;
+pthread_mutex_t mutex2;
+pthread_cond_t cond2;
+char *m;
+
+int sigst;
+pthread_t thr;
+
+void
+sigsegv(int s)
+{
+	if (sigst != 1) { /* main has not armed the check yet: unexpected fault */
+		printf("BAD SIGSEGV\n");
+		exit(1);
+	}
+	fprintf(stderr, "CT02007 mremap OK (SIGSEGV)\n"); /* the expected fault */
+	pthread_mutex_lock(&mutex2); /* release the helper thread for its final munmap */
+	flag2 = 1;
+	pthread_cond_signal(&cond2);
+	pthread_mutex_unlock(&mutex2);
+	pthread_join(thr, NULL);
+	fprintf(stderr, "CT02009 pthread_join OK\n");
+	fprintf(stderr, "CT02010 END\n");
+	exit(0);
+}
+
+void *
+util_thread(void *arg)
+{
+	int rc;
+	char *n;
+	/* Helper thread: maps 8192 bytes, shrinks the mapping to 4096 with mremap, then unmaps it. */
+	rc = syscall(732);
+	if (rc == -1)
+		fprintf(stderr, "CT02003 get_system OK\n");
+	else {
+		fprintf(stderr, "CT02003 get_system NG get_system=%d\n", rc);
+		exit(1);
+	}
+	errno = 0;
+	m = mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+	if (m != MAP_FAILED) {
+		fprintf(stderr, "CT02004 mmap OK\n");
+	}
+	else {
+		fprintf(stderr, "CT02004 mmap NG errno=%d\n", errno);
+		exit(1);
+	}
+	strcpy(m + 4096, "mmap OK"); /* the string sits in the upper 4096 bytes */
+	pthread_mutex_lock(&mutex1); /* tell main the mapping is ready */
+	flag1 = 1;
+	pthread_cond_signal(&cond1);
+	pthread_mutex_unlock(&mutex1);
+	pthread_mutex_lock(&mutex2); /* wait until main has printed the string */
+	while (!flag2) {
+		pthread_cond_wait(&cond2, &mutex2);
+	}
+	flag2 = 0;
+	pthread_mutex_unlock(&mutex2);
+	n = mremap(m, 8192, 4096, 0); /* flags 0: no MREMAP_MAYMOVE requested */
+	if (n == m) {
+		fprintf(stderr, "CT02006 mremap OK\n");
+	}
+	else if (n != MAP_FAILED){
+		fprintf(stderr, "CT02006 mremap remapped, test stop\n");
+		exit(1);
+	}
+	else {
+		fprintf(stderr, "CT02006 mremap NG errno=%d\n", errno);
+		exit(1);
+	}
+	pthread_mutex_lock(&mutex1); /* tell main the upper half is gone */
+	flag1 = 1;
+	pthread_cond_signal(&cond1);
+	pthread_mutex_unlock(&mutex1);
+	/* Wait for main (or its SIGSEGV handler) before dropping the remaining 4096 bytes. */
+	pthread_mutex_lock(&mutex2);
+	while (!flag2) {
+		pthread_cond_wait(&cond2, &mutex2);
+	}
+	flag2 = 0;
+	pthread_mutex_unlock(&mutex2);
+	rc = munmap(m, 4096);
+	if (rc == 0) {
+		fprintf(stderr, "CT02008 munmap OK\n");
+	}
+	else {
+		fprintf(stderr, "CT02008 munmap NG errno=%d\n", errno);
+		exit(1);
+	}
+	return NULL;
+}
+
+int
+main(int argc, char **argv)
+{
+	int rc;
+	/* Drives CT02: read the helper's mapping, then touch the part mremap removed. */
+	signal(SIGSEGV, sigsegv);
+	pthread_mutex_init(&mutex1, NULL);
+	pthread_cond_init(&cond1, NULL);
+	pthread_mutex_init(&mutex2, NULL);
+	pthread_cond_init(&cond2, NULL);
+	/* Syscall 731 is the call the message below names util_indicate_clone; a failure is only reported. */
+	fprintf(stderr, "CT02001 mremap START\n");
+	rc = syscall(731, 1, NULL);
+	if (rc) {
+		fprintf(stderr, "util_indicate_clone rc=%d, errno=%d\n", rc, errno);
+		fflush(stderr);
+	}
+	rc = pthread_create(&thr, NULL, util_thread, NULL);
+	if(rc){
+		fprintf(stderr, "pthread_create: %d\n", rc);
+		exit(1);
+	}
+	fprintf(stderr, "CT02002 pthread_create OK\n");
+	pthread_mutex_lock(&mutex1); /* wait for the helper to map m and write at m + 4096 */
+	while (!flag1) {
+		pthread_cond_wait(&cond1, &mutex1);
+	}
+	flag1 = 0;
+	pthread_mutex_unlock(&mutex1);
+	fprintf(stderr, "CT02005 %s\n", m + 4096);
+	/* Let the helper shrink the mapping with mremap. */
+	pthread_mutex_lock(&mutex2);
+	flag2 = 1;
+	pthread_cond_signal(&cond2);
+	pthread_mutex_unlock(&mutex2);
+	/* Wait for the helper to report that mremap succeeded. */
+	pthread_mutex_lock(&mutex1);
+	while (!flag1) {
+		pthread_cond_wait(&cond1, &mutex1);
+	}
+	flag1 = 0;
+	pthread_mutex_unlock(&mutex1);
+	sigst = 1; /* from here on a SIGSEGV is the expected outcome */
+	fprintf(stderr, "%s\n", m + 4096); /* reads the range cut off by mremap */
+	fprintf(stderr, "CT02007 mremap NG\n"); /* reached only if the read did not fault */
+	pthread_mutex_lock(&mutex2);
+	flag2 = 1;
+	pthread_cond_signal(&cond2);
+	pthread_mutex_unlock(&mutex2);
+	pthread_join(thr, NULL);
+	fprintf(stderr, "CT02009 pthread_join OK\n");
+	fprintf(stderr, "CT02010 END\n");
+	exit(0);
+}
diff --git a/test/uti/CT03.c b/test/uti/CT03.c
new file mode 100644
index 00000000..6f79b2d5
--- /dev/null
+++ b/test/uti/CT03.c
@@ -0,0 +1,171 @@
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <pthread.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <signal.h>
+
+int flag1;
+pthread_mutex_t mutex1;
+pthread_cond_t cond1;
+
+int flag2;
+pthread_mutex_t mutex2;
+pthread_cond_t cond2;
+
+char *m;
+
+int sigst;
+pthread_t thr;
+
+void
+sigsegv(int s)
+{
+	if (sigst != 1) { /* main has not armed the check yet: unexpected fault */
+		printf("BAD SIGSEGV\n");
+		exit(1);
+	}
+	/* Expected fault: main wrote to the page after the helper made it read-only. */
+	fprintf(stderr, "CT03007 mprotect OK (SIGSEGV)\n");
+	/* Release the helper thread for its final munmap, then wait for it. */
+	pthread_mutex_lock(&mutex2);
+	flag2 = 1;
+	pthread_cond_signal(&cond2);
+	pthread_mutex_unlock(&mutex2);
+	pthread_join(thr, NULL);
+	fprintf(stderr, "CT03009 pthread_join OK\n");
+	fprintf(stderr, "CT03010 END\n");
+	exit(0);
+}
+
+void *
+util_thread(void *arg)
+{
+	int rc;
+	/* Helper thread: maps a page, makes it read-only on request, then unmaps it. */
+	rc = syscall(732);
+	if (rc == -1)
+		fprintf(stderr, "CT03003 get_system OK\n");
+	else {
+		fprintf(stderr, "CT03003 get_system NG get_system=%d\n", rc);
+		exit(1);
+	}
+	errno = 0;
+	m = mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+	if (m != MAP_FAILED) {
+		fprintf(stderr, "CT03004 mmap OK\n");
+	}
+	else {
+		fprintf(stderr, "CT03004 mmap NG errno=%d\n", errno);
+		exit(1);
+	}
+	strcpy(m, "mmap OK");
+	/* Tell main the page is mapped and filled. */
+	pthread_mutex_lock(&mutex1);
+	flag1 = 1;
+	pthread_cond_signal(&cond1);
+	pthread_mutex_unlock(&mutex1);
+	/* Wait until main has printed the page contents. */
+	pthread_mutex_lock(&mutex2);
+	while (!flag2) {
+		pthread_cond_wait(&cond2, &mutex2);
+	}
+	flag2 = 0;
+	pthread_mutex_unlock(&mutex2);
+	/* Drop write permission; main's next write is expected to fault. */
+	rc = mprotect(m, 4096, PROT_READ);
+	if (rc == 0) {
+		fprintf(stderr, "CT03006 mprotect OK\n");
+	}
+	else {
+		fprintf(stderr, "CT03006 mprotect NG errno=%d\n", errno);
+		exit(1);
+	}
+	pthread_mutex_lock(&mutex1); /* tell main the page is now read-only */
+	flag1 = 1;
+	pthread_cond_signal(&cond1);
+	pthread_mutex_unlock(&mutex1);
+	/* Wait for main (or its SIGSEGV handler) before unmapping. */
+	pthread_mutex_lock(&mutex2);
+	while (!flag2) {
+		pthread_cond_wait(&cond2, &mutex2);
+	}
+	flag2 = 0;
+	pthread_mutex_unlock(&mutex2);
+
+	rc = munmap(m, 4096);
+	if (rc == 0) {
+		fprintf(stderr, "CT03008 munmap OK\n");
+	}
+	else {
+		fprintf(stderr, "CT03008 munmap NG errno=%d\n", errno);
+		exit(1);
+	}
+	return NULL;
+}
+
+int
+main(int argc, char **argv)
+{
+	int rc;
+	/* Drives CT03: read the helper's page, then write to it after mprotect(PROT_READ). */
+	signal(SIGSEGV, sigsegv);
+	pthread_mutex_init(&mutex1, NULL);
+	pthread_cond_init(&cond1, NULL);
+	pthread_mutex_init(&mutex2, NULL);
+	pthread_cond_init(&cond2, NULL);
+	/* Syscall 731 is the call the message below names util_indicate_clone; a failure is only reported. */
+	fprintf(stderr, "CT03001 mprotect START\n");
+	rc = syscall(731, 1, NULL);
+	if (rc) {
+		fprintf(stderr, "util_indicate_clone rc=%d, errno=%d\n", rc, errno);
+		fflush(stderr);
+	}
+	rc = pthread_create(&thr, NULL, util_thread, NULL);
+	if(rc){
+		fprintf(stderr, "pthread_create: %d\n", rc);
+		exit(1);
+	}
+	fprintf(stderr, "CT03002 pthread_create OK\n");
+	/* Wait for the helper to map and fill m. */
+	pthread_mutex_lock(&mutex1);
+	while (!flag1) {
+		pthread_cond_wait(&cond1, &mutex1);
+	}
+	flag1 = 0;
+	pthread_mutex_unlock(&mutex1);
+
+	fprintf(stderr, "CT03005 %s\n", m);
+	/* Let the helper call mprotect. */
+	pthread_mutex_lock(&mutex2);
+	flag2 = 1;
+	pthread_cond_signal(&cond2);
+	pthread_mutex_unlock(&mutex2);
+	/* Wait for the helper to report that mprotect succeeded. */
+
+	pthread_mutex_lock(&mutex1);
+	while (!flag1) {
+		pthread_cond_wait(&cond1, &mutex1);
+	}
+	flag1 = 0;
+	pthread_mutex_unlock(&mutex1);
+	/* Arm the handler, then write to the read-only page; getting past the write means mprotect had no effect here. */
+	sigst = 1;
+	strcpy(m, "mprotect NG");
+	fprintf(stderr, "%s\n", m);
+	fprintf(stderr, "CT03007 mprotect NG\n");
+	/* Failure path only: still release the helper so it can unmap and exit. */
+	pthread_mutex_lock(&mutex2);
+	flag2 = 1;
+	pthread_cond_signal(&cond2);
+	pthread_mutex_unlock(&mutex2);
+
+	pthread_join(thr, NULL);
+	fprintf(stderr, "CT03009 pthread_join OK\n");
+	fprintf(stderr, "CT03010 END\n");
+	exit(0);
+}
diff --git a/test/uti/CT04.c b/test/uti/CT04.c
new file mode 100644
index 00000000..7ecd17b0
--- /dev/null
+++ b/test/uti/CT04.c
@@ -0,0 +1,106 @@
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <pthread.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <signal.h>
+
+int flag1;
+pthread_mutex_t mutex1;
+pthread_cond_t cond1;
+
+int flag2;
+pthread_mutex_t mutex2;
+pthread_cond_t cond2;
+
+char *a;
+char *b;
+char *c;
+
+
+void *
+util_thread(void *arg)
+{
+	int rc;
+	/* Helper thread: grows the heap with sbrk, then checks it sees the same break as main. */
+	rc = syscall(732);
+	if (rc == -1)
+		fprintf(stderr, "CT04003 get_system OK\n");
+	else {
+		fprintf(stderr, "CT04003 get_system NG get_system=%d\n", rc);
+		exit(1);
+	}
+	errno = 0;
+	a = sbrk(0);
+	b = sbrk(4096);
+	if (a == (void *)-1 || b == (void *)-1) { /* sbrk failed: a must not be written */
+		fprintf(stderr, "CT04004 sbrk NG errno=%d\n", errno);
+		exit(1);
+	}
+	fprintf(stderr, "CT04004 sbrk OK\n");
+	strcpy(a, "sbrk OK");
+	pthread_mutex_lock(&mutex1); /* let main read the string at a */
+	flag1 = 1;
+	pthread_cond_signal(&cond1);
+	pthread_mutex_unlock(&mutex1);
+	pthread_mutex_lock(&mutex2); /* wait until main has stored its sbrk(0) in c */
+	while(!flag2) {
+		pthread_cond_wait(&cond2, &mutex2);
+	}
+	flag2 = 0;
+	pthread_mutex_unlock(&mutex2);
+	b = sbrk(0);
+	if (c == b) {
+		fprintf(stderr, "CT04006 sbrk OK\n");
+	} else {
+		fprintf(stderr, "CT04006 sbrk NG %p != %p\n", (void *)c, (void *)b);
+	}
+	return NULL;
+}
+
+int
+main(int argc, char **argv)
+{
+	pthread_t thr;
+	int rc;
+	/* Drives CT04: read what the helper wrote into the grown heap, then publish this thread's break in c. */
+	pthread_mutex_init(&mutex1, NULL);
+	pthread_cond_init(&cond1, NULL);
+	pthread_mutex_init(&mutex2, NULL);
+	pthread_cond_init(&cond2, NULL);
+	/* Syscall 731 is the call the message below names util_indicate_clone; a failure is only reported. */
+	fprintf(stderr, "CT04001 brk START\n");
+	rc = syscall(731, 1, NULL);
+	if (rc) {
+		fprintf(stderr, "util_indicate_clone rc=%d, errno=%d\n", rc, errno);
+		fflush(stderr);
+	}
+	rc = pthread_create(&thr, NULL, util_thread, NULL);
+	if(rc){
+		fprintf(stderr, "pthread_create: %d\n", rc);
+		exit(1);
+	}
+	fprintf(stderr, "CT04002 pthread_create OK\n");
+	/* Wait for the helper to grow the heap and write its string at a. */
+	pthread_mutex_lock(&mutex1);
+	while(!flag1) {
+		pthread_cond_wait(&cond1, &mutex1);
+	}
+	flag1 = 0;
+	pthread_mutex_unlock(&mutex1);
+	fprintf(stderr, "CT04005 %s\n", a);
+	/* Record the break as seen from this thread; the helper compares it with its own sbrk(0). */
+	c = sbrk(0);
+	pthread_mutex_lock(&mutex2);
+	flag2 = 1;
+	pthread_cond_signal(&cond2);
+	pthread_mutex_unlock(&mutex2);
+	pthread_join(thr, NULL);
+	fprintf(stderr, "CT04007 pthread_join OK\n");
+	fprintf(stderr, "CT04008 END\n");
+	exit(0);
+}
diff --git a/test/uti/CT05.c b/test/uti/CT05.c
new file mode 100644
index 00000000..ad5d4918
--- /dev/null
+++ b/test/uti/CT05.c
@@ -0,0 +1,67 @@
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <pthread.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <signal.h>
+
+void *
+util_thread(void *arg)
+{
+	int rc, tid;
+
+	/* First probe: get_system (syscall 732) must return 0 here. */
+	rc = syscall(732);
+	if (rc != 0) {
+		fprintf(stderr, "CT05003 get_system NG get_system=%d\n", rc);
+		exit(1);
+	}
+	fprintf(stderr, "CT05003 get_system OK\n");
+	tid = syscall(SYS_gettid);
+	fprintf(stderr, "CT05004 gettid OK %d\n", tid);
+	/* Syscall 730 is the call the messages name util_migrate_inter_kernel. */
+	rc = syscall(730);
+	if (rc != 0)
+		fprintf(stderr, "CT05005 util_migrate_inter_kernel NG rc=%d errno=%d\n", rc, errno);
+	else
+		fprintf(stderr, "CT05005 util_migrate_inter_kernel OK\n");
+
+	/* Second probe: after the call above, get_system must return -1. */
+	rc = syscall(732);
+	if (rc != -1) {
+		fprintf(stderr, "CT05006 get_system NG get_system=%d\n", rc);
+		exit(1);
+	}
+	fprintf(stderr, "CT05006 get_system OK\n");
+
+	/* The thread id must be the one recorded before the call. */
+	rc = syscall(SYS_gettid);
+	if (rc != tid)
+		fprintf(stderr, "CT05007 gettid NG %d\n", rc);
+	else
+		fprintf(stderr, "CT05007 gettid OK %d\n", tid);
+	return NULL;
+}
+
+int
+main(int argc, char **argv)
+{
+	pthread_t helper;
+	int err;
+	/* Run util_thread in one helper thread and wait for it to finish. */
+	fprintf(stderr, "CT05001 gettid START\n");
+	err = pthread_create(&helper, NULL, util_thread, NULL);
+	if (err != 0) {
+		fprintf(stderr, "pthread_create: %d\n", err);
+		exit(1);
+	}
+	fprintf(stderr, "CT05002 pthread_create OK\n");
+	pthread_join(helper, NULL);
+	fprintf(stderr, "CT05008 pthread_join OK\n");
+	fprintf(stderr, "CT05009 END\n");
+	exit(0);
+}
diff --git a/test/uti/CT06.c b/test/uti/CT06.c
new file mode 100644
index 00000000..61d1d238
--- /dev/null
+++ b/test/uti/CT06.c
@@ -0,0 +1,80 @@
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <pthread.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <sys/wait.h>
+#include <signal.h>
+
+void *
+util_thread(void *arg)
+{
+	long rc;
+	/* Helper thread: check get_system (syscall 732), then end the whole process with status 99. */
+	rc = syscall(732);
+	if (rc == -1)
+		fprintf(stderr, "CT06003 get_system OK\n");
+	else {
+		fprintf(stderr, "CT06003 get_system NG get_system=%ld\n", rc); /* rc is long */
+		exit(1);
+	}
+	/* main treats a return from pthread_join as failure, so this call must not come back. */
+	syscall(SYS_exit_group, 99);
+	/* Only reached if exit_group did not terminate the process. */
+	return NULL;
+}
+
+int
+main(int argc, char **argv)
+{
+	pthread_t helper;
+	pid_t child;
+	int status;
+	int rc;
+
+	fprintf(stderr, "CT06001 syscall error START\n");
+	child = fork();
+	if (child == -1) {
+		perror("fork");
+		exit(1);
+	}
+	if (child != 0) { /* parent: reap the child and expect exit status 99 */
+		do {
+			rc = waitpid(child, &status, 0);
+		} while (rc == -1 && errno == EINTR);
+		if (rc == -1) {
+			fprintf(stderr, "CT06004 exit_group NG rc=%d errno=%d\n", rc, errno);
+			exit(1);
+		}
+		if (!WIFEXITED(status)) {
+			fprintf(stderr, "CT06004 exit_group NG st=%08x\n", status);
+			exit(1);
+		}
+		if (WEXITSTATUS(status) != 99) {
+			fprintf(stderr, "CT06004 exit_group NG st=%d\n", WEXITSTATUS(status));
+			exit(1);
+		}
+		fprintf(stderr, "CT06004 exit_group OK\n");
+		exit(0);
+	}
+	/* Child: start the helper thread, which is expected to end the process via exit_group. */
+	rc = syscall(731, 1, NULL);
+	if (rc != 0) {
+		fprintf(stderr, "util_indicate_clone rc=%d, errno=%d\n", rc, errno);
+		fflush(stderr);
+	}
+	rc = pthread_create(&helper, NULL, util_thread, NULL);
+	if (rc != 0) {
+		fprintf(stderr, "pthread_create: %d\n", rc);
+		exit(1);
+	}
+	fprintf(stderr, "CT06002 pthread_create OK\n");
+	pthread_join(helper, NULL); /* returning from here means exit_group did not work */
+	fprintf(stderr, "CT06004 pthread_join NG\n");
+	fprintf(stderr, "CT06004 END\n");
+	exit(0);
+}
diff --git a/test/uti/CT07.c b/test/uti/CT07.c
new file mode 100644
index 00000000..9eff04ca
--- /dev/null
+++ b/test/uti/CT07.c
@@ -0,0 +1,86 @@
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <pthread.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <signal.h>
+#include <sys/types.h>
+
+void *
+util_thread(void *arg)
+{
+	long rc; /* syscall() returns long; every format below uses %ld for it */
+	rc = syscall(732);
+	if (rc == -1)
+		fprintf(stderr, "CT07003 get_system OK\n");
+	else {
+		fprintf(stderr, "CT07003 get_system NG get_system=%ld\n", rc);
+		exit(1);
+	}
+	/* Each call below passes only when it fails with ENOSYS in this thread. */
+	rc = syscall(SYS_clone);
+	if (rc == -1 && errno == ENOSYS) {
+		fprintf(stderr, "CT07004 clone OK\n");
+	}
+	else {
+		fprintf(stderr, "CT07004 clone NG rc=%ld errno=%d\n", rc, errno);
+	}
+
+	rc = syscall(SYS_fork);
+	if (rc == -1 && errno == ENOSYS) {
+		fprintf(stderr, "CT07005 fork OK\n");
+	}
+	else {
+		fprintf(stderr, "CT07005 fork NG rc=%ld errno=%d\n", rc, errno);
+	}
+
+#if 0 /* It looks like syscall_intercept can't hook vfork */
+	rc = syscall(SYS_vfork);
+	//rc = vfork();
+	fprintf(stderr, "CT07006 vfork rc=%ld,errno=%d\n", rc, errno);
+	if (rc == -1 && errno == ENOSYS) {
+		fprintf(stderr, "CT07006 vfork OK\n");
+	}
+	else {
+		fprintf(stderr, "CT07006 vfork NG rc=%ld errno=%d\n", rc, errno);
+	}
+#endif
+
+	rc = syscall(SYS_execve);
+	if (rc == -1 && errno == ENOSYS) {
+		fprintf(stderr, "CT07007 execve OK\n");
+	}
+	else {
+		fprintf(stderr, "CT07007 execve NG rc=%ld errno=%d\n", rc, errno);
+	}
+	return NULL;
+}
+
+int
+main(int argc, char **argv)
+{
+	pthread_t helper;
+	int err;
+	/* Issue syscall 731, then run util_thread in one helper thread and wait for it. */
+	fprintf(stderr, "CT07001 syscall error START\n");
+	err = syscall(731, 1, NULL);
+	if (err != 0) {
+		fprintf(stderr, "util_indicate_clone rc=%d, errno=%d\n", err, errno);
+		fflush(stderr);
+	}
+	err = pthread_create(&helper, NULL, util_thread, NULL);
+	if (err != 0) {
+		fprintf(stderr, "pthread_create: %d\n", err);
+		exit(1);
+	}
+	fprintf(stderr, "CT07002 pthread_create OK\n");
+
+	pthread_join(helper, NULL);
+	fprintf(stderr, "CT07008 pthread_join OK\n");
+	fprintf(stderr, "CT07010 END\n");
+	exit(0);
+}
diff --git a/test/uti/CT08.c b/test/uti/CT08.c
new file mode 100644
index 00000000..8ce75e2e
--- /dev/null
+++ b/test/uti/CT08.c
@@ -0,0 +1,165 @@
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <pthread.h>
+#include <errno.h>
+#include <unistd.h>
+#include <string.h>
+#include <stdint.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <sched.h>
+
+#define UTI_FLAG_NUMA_SET (1ULL<<1) /* Indicates NUMA_SET is specified */
+
+#define UTI_FLAG_SAME_NUMA_DOMAIN (1ULL<<2)
+#define UTI_FLAG_DIFFERENT_NUMA_DOMAIN (1ULL<<3)
+
+#define UTI_FLAG_SAME_L1 (1ULL<<4)
+#define UTI_FLAG_SAME_L2 (1ULL<<5)
+#define UTI_FLAG_SAME_L3 (1ULL<<6)
+
+#define UTI_FLAG_DIFFERENT_L1 (1ULL<<7)
+#define UTI_FLAG_DIFFERENT_L2 (1ULL<<8)
+#define UTI_FLAG_DIFFERENT_L3 (1ULL<<9)
+
+#define UTI_FLAG_EXCLUSIVE_CPU (1ULL<<10)
+#define UTI_FLAG_CPU_INTENSIVE (1ULL<<11)
+#define UTI_FLAG_HIGH_PRIORITY (1ULL<<12)
+#define UTI_FLAG_NON_COOPERATIVE (1ULL<<13)
+
+/* Linux default value is used */
+#define UTI_MAX_NUMA_DOMAINS (1024)
+
+typedef struct uti_attr {
+        /* The UTI_CPU_SET environment variable is used to denote the preferred
+           location of the utility thread */
+        uint64_t numa_set[(UTI_MAX_NUMA_DOMAINS + sizeof(uint64_t) * 8 - 1) /
+                          (sizeof(uint64_t) * 8)]; /* UTI_MAX_NUMA_DOMAINS bits, rounded up to whole 64-bit words */
+        uint64_t flags; /* Bitmap of UTI_FLAG_* location and behavior hints */
+} uti_attr_t;
+
+/* Print the first word of the calling thread's CPU affinity mask and its scheduling policy. */
+void
+print_sched(void)
+{
+	cpu_set_t cpuset;
+	unsigned long mask = 0; /* stays 0 if the affinity cannot be read */
+	if (sched_getaffinity(0, sizeof cpuset, &cpuset) == 0)
+		memcpy(&mask, &cpuset, sizeof mask); /* copy instead of casting the cpu_set_t pointer */
+	fprintf(stderr, "\tsched cpu=%16lx sched=%d\n", mask, sched_getscheduler(0));
+}
+
+void *
+util_thread(void *arg) /* thread body started by thread_test(); arg is not used */
+{
+	print_sched(); /* report the affinity and policy this thread ended up with */
+	return NULL;
+}
+
+void
+thread_test(uti_attr_t *attr, char *msg)
+{
+	pthread_t helper;
+	int err;
+	/* Print the case label, pass attr to syscall 731, then run one helper thread to completion. */
+	fprintf(stderr, "%s\n", msg);
+	err = syscall(731, 1, attr);
+	if (err != 0) {
+		fprintf(stderr, "util_indicate_clone rc=%d, errno=%d\n", err, errno);
+		fflush(stderr);
+	}
+	err = pthread_create(&helper, NULL, util_thread, NULL);
+	if (err != 0) {
+		fprintf(stderr, "pthread_create: %d\n", err);
+		exit(1);
+	}
+	pthread_join(helper, NULL);
+}
+
+int
+main(int argc, char **argv)
+{
+	uti_attr_t attr;
+	/* Every case rebuilds attr from a compound literal, so all unnamed members are zero. */
+	attr = (uti_attr_t){ .flags = UTI_FLAG_NUMA_SET,
+			     .numa_set = { 2 } }; // NUMA domain == 1
+	thread_test(&attr,
+		    "CT08001 UTI_FLAG_NUMA_SET");
+
+	attr = (uti_attr_t){ .flags = UTI_FLAG_NUMA_SET | UTI_FLAG_EXCLUSIVE_CPU,
+			     .numa_set = { 2 } };
+	thread_test(&attr,
+		    "CT08002 UTI_FLAG_NUMA_SET|UTI_FLAG_EXCLUSIVE_CPU");
+
+	attr = (uti_attr_t){ .flags = UTI_FLAG_NUMA_SET | UTI_FLAG_EXCLUSIVE_CPU,
+			     .numa_set = { 2 } };
+	thread_test(&attr,
+		    "CT08003 UTI_FLAG_NUMA_SET|UTI_FLAG_EXCLUSIVE_CPU(2)");
+
+	attr = (uti_attr_t){ .flags = UTI_FLAG_SAME_NUMA_DOMAIN };
+	thread_test(&attr,
+		    "CT08004 UTI_FLAG_SAME_NUMA_DOMAIN");
+
+	attr = (uti_attr_t){ .flags = UTI_FLAG_SAME_NUMA_DOMAIN | UTI_FLAG_CPU_INTENSIVE };
+	thread_test(&attr,
+		    "CT08005 UTI_FLAG_SAME_NUMA_DOMAIN|UTI_FLAG_CPU_INTENSIVE");
+
+	attr = (uti_attr_t){ .flags = UTI_FLAG_DIFFERENT_NUMA_DOMAIN };
+	thread_test(&attr,
+		    "CT08006 UTI_FLAG_DIFFERENT_NUMA_DOMAIN");
+
+	attr = (uti_attr_t){ .flags = UTI_FLAG_DIFFERENT_NUMA_DOMAIN | UTI_FLAG_HIGH_PRIORITY };
+	thread_test(&attr,
+		    "CT08007 UTI_FLAG_DIFFERENT_NUMA_DOMAIN|UTI_FLAG_HIGH_PRIORITY");
+
+	attr = (uti_attr_t){ .flags = UTI_FLAG_SAME_L1 };
+	thread_test(&attr,
+		    "CT08008 UTI_FLAG_SAME_L1");
+
+	attr = (uti_attr_t){ .flags = UTI_FLAG_SAME_L1 | UTI_FLAG_NON_COOPERATIVE };
+	thread_test(&attr,
+		    "CT08009 UTI_FLAG_SAME_L1|UTI_FLAG_NON_COOPERATIVE");
+
+	attr = (uti_attr_t){ .flags = UTI_FLAG_SAME_L2 };
+	thread_test(&attr,
+		    "CT08010 UTI_FLAG_SAME_L2");
+
+	attr = (uti_attr_t){ .flags = UTI_FLAG_SAME_L2 | UTI_FLAG_CPU_INTENSIVE };
+	thread_test(&attr,
+		    "CT08011 UTI_FLAG_SAME_L2|UTI_FLAG_CPU_INTENSIVE");
+
+	attr = (uti_attr_t){ .flags = UTI_FLAG_SAME_L3 };
+	thread_test(&attr,
+		    "CT08012 UTI_FLAG_SAME_L3");
+
+	attr = (uti_attr_t){ .flags = UTI_FLAG_SAME_L3 | UTI_FLAG_CPU_INTENSIVE };
+	thread_test(&attr,
+		    "CT08013 UTI_FLAG_SAME_L3|UTI_FLAG_CPU_INTENSIVE");
+
+	attr = (uti_attr_t){ .flags = UTI_FLAG_DIFFERENT_L1 };
+	thread_test(&attr,
+		    "CT08014 UTI_FLAG_DIFFERENT_L1");
+
+	attr = (uti_attr_t){ .flags = UTI_FLAG_DIFFERENT_L1 | UTI_FLAG_CPU_INTENSIVE };
+	thread_test(&attr,
+		    "CT08015 UTI_FLAG_DIFFERENT_L1|UTI_FLAG_CPU_INTENSIVE");
+
+	attr = (uti_attr_t){ .flags = UTI_FLAG_DIFFERENT_L2 };
+	thread_test(&attr,
+		    "CT08016 UTI_FLAG_DIFFERENT_L2");
+
+	attr = (uti_attr_t){ .flags = UTI_FLAG_DIFFERENT_L2 | UTI_FLAG_CPU_INTENSIVE };
+	thread_test(&attr,
+		    "CT08017 UTI_FLAG_DIFFERENT_L2|UTI_FLAG_CPU_INTENSIVE");
+
+	attr = (uti_attr_t){ .flags = UTI_FLAG_DIFFERENT_L3 };
+	thread_test(&attr,
+		    "CT08018 UTI_FLAG_DIFFERENT_L3");
+
+	attr = (uti_attr_t){ .flags = UTI_FLAG_DIFFERENT_L3 | UTI_FLAG_CPU_INTENSIVE };
+	thread_test(&attr,
+		    "CT08019 UTI_FLAG_DIFFERENT_L3|UTI_FLAG_CPU_INTENSIVE");
+
+	exit(0);
+}
diff --git a/test/uti/CT09.c b/test/uti/CT09.c
new file mode 100644
index 00000000..b8bed45b
--- /dev/null
+++ b/test/uti/CT09.c
@@ -0,0 +1,278 @@
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <pthread.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <signal.h>
+
+#define DEBUG
+
+#ifdef DEBUG
+/*
+ * Trace helpers: print "<function>,<formatted message>" on stdout.
+ * snprintf bounds the message to msg[], and there is no trailing ';' so a
+ * call behaves as one statement in if/else.  eprintf also writes to stdout.
+ */
+#define	dprintf(...) \
+	do { \
+		char msg[1024]; \
+		snprintf(msg, sizeof msg, __VA_ARGS__); \
+		fprintf(stdout, "%s,%s", __FUNCTION__, msg); \
+	} while (0)
+#define	eprintf(...) dprintf(__VA_ARGS__)
+#else
+#define dprintf(...) do {  } while (0)
+#define eprintf(...) do {  } while (0)
+#endif
+
+#define TS2NS(sec, nsec) ((unsigned long)(sec) * 1000000000ULL + (unsigned long)(nsec)) /* seconds + nanoseconds -> total nanoseconds */
+#define CALC_DELAY (98600)  /* 98.6 usec */
+#define RTS_DELAY   (1000)  /*  1   usec, CPU time for sending Request-to-Send packet */
+#define NIC_DELAY   (3000)  /*  3   usec, RTS packet propagation time + RDMA-read on the responder side + CPU time for sending DONE packet + DONE packet network propagation time */
+#define POLL_DELAY  ( 200) /*  0.2 usec, CPU time for checking DRAM event queue */
+#define COMPL_DELAY ( 200) /*  0.2 usec, CPU time for updating MPI_Request */
+#define NSPIN 1 /* polls progress_fn performs before it releases ep_lock and yields */
+static inline void FIXED_SIZE_WORK(unsigned long *ptr) { /* one fixed unit of busy work; ptr is unused by the active code */
+#if 0 /* disabled variant: increments *ptr through rax */
+	asm volatile("movq %0, %%rax\n\t" 
+				 "addq $1, %%rax\n\t"
+				 "movq %%rax, %0\n\t"
+				 : "+rm" (*ptr)
+				 :
+				 : "rax", "cc", "memory");
+#endif
+	asm volatile( /* x86-64: count rcx from 0 while rcx <= 99, i.e. 100 loop iterations */
+	    "movq $0, %%rcx\n\t"
+		"1:\t"
+		"addq $1, %%rcx\n\t"
+		"cmpq $99, %%rcx\n\t"
+		"jle 1b\n\t"
+		:
+		: 
+		: "rcx", "cc"); /* no operands; only rcx and the flags are clobbered */
+}
+
+static inline void BULK_FSW(unsigned long n, unsigned long *ptr) { /* run FIXED_SIZE_WORK n times */
+	unsigned long j; /* same type as n: no signed/unsigned compare, no int overflow for large n */
+	for (j = 0; j < n; j++) {
+		FIXED_SIZE_WORK(ptr);
+	}
+}
+
+pthread_mutex_t ep_lock; /* Ownership of channel instance */
+
+struct thr_arg {
+	int bar_count; /* Start barrier: main and progress_fn each increment it and wait for 2 */
+	pthread_mutex_t bar_lock; /* held while bar_count is incremented and waited on */
+	pthread_cond_t bar_cond; /* broadcast by whichever thread brings bar_count to 2 */
+	pthread_t pthread; /* filled by pthread_create in main, joined at the end */
+	unsigned long mem; /* Per-thread storage, passed to fwq() by progress_fn */
+};
+
+struct thr_arg thr_args;
+
+unsigned long mem; /* Per-thread storage */
+volatile int nevents;
+volatile int terminate;
+int wps = 1; /* work per sec */
+double nspw; /* nsec per work */
+
+#define N_INIT 10000000
+
+void fwq_init(unsigned long *mem) {
+	/* Calibrate nspw: thread CPU nanoseconds per FIXED_SIZE_WORK, averaged over N_INIT calls. */
+	struct timespec start, end;
+	unsigned long nsec;
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
+	BULK_FSW(N_INIT, mem);
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
+	nsec = (TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec));
+	nspw = nsec / (double)N_INIT;
+	printf("nsec=%lu, nspw=%f\n", nsec, nspw); /* nsec is unsigned long */
+}
+
+void fwq(long delay_nsec, unsigned long* mem) {
+	/* Burn about delay_nsec of CPU time: delay_nsec / nspw units of FIXED_SIZE_WORK. */
+	if (delay_nsec < 0) {
+		printf("%s: delay_nsec<0\n", __FUNCTION__);
+		return; /* a negative count must not be converted to unsigned long */
+	}
+	BULK_FSW(delay_nsec / nspw, mem);
+}
+
+void mydelay(long delay_nsec, long *mem) { /* spin until more than delay_nsec of thread CPU time has passed */
+	struct timespec start, end;
+	unsigned long long limit = delay_nsec > 0 ? (unsigned long long)delay_nsec : 0; /* negative means "no delay" */
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
+	while (1) {
+		clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
+		if (TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec) > limit) {
+			break;
+		}
+		FIXED_SIZE_WORK((unsigned long *)mem); /* callee takes unsigned long * and never dereferences it */
+	}
+}
+
+void *progress_fn(void *_arg) {
+	struct thr_arg *arg = (struct thr_arg *)_arg;
+	int rc;
+	int spin_count = 0;
+	/* Progress thread: polls for events under ep_lock until main sets terminate. */
+
+	rc = syscall(732);
+	if (rc == -1)
+		fprintf(stdout, "CT09100 progress_fn running on Linux OK\n");
+	else {
+		fprintf(stdout, "CT09100 progress_fn running on McKernel NG\n");
+	}
+
+	/* syscall() returns long; narrow it explicitly to match %d. */
+	printf("tid=%d,bar_count=%d\n", (int)syscall(__NR_gettid), arg->bar_count);
+	/* Two-party barrier with main: the second arrival broadcasts, the first waits. */
+	pthread_mutex_lock(&arg->bar_lock);
+	arg->bar_count++;
+	if (arg->bar_count == 2) {
+		if ((rc = pthread_cond_broadcast(&arg->bar_cond))) {
+			printf("pthread_cond_broadcast failed,rc=%d\n", rc);
+		}
+	}
+	while (arg->bar_count != 2) {
+		if ((rc = pthread_cond_wait(&arg->bar_cond, &arg->bar_lock))) {
+			printf("pthread_cond_wait failed,rc=%d\n", rc);
+		}
+	}
+	pthread_mutex_unlock(&arg->bar_lock);
+
+	printf("after barrier\n");
+
+	/* Start progress */
+	pthread_mutex_lock(&ep_lock);
+	while (!terminate) {
+		fwq(POLL_DELAY, &arg->mem);
+
+		/* Event found */
+		if (nevents > 0) {
+			fwq(COMPL_DELAY, &arg->mem); /* Simulate MPI protocol response */
+			nevents = 0;
+		}
+
+		/* Every NSPIN polls, drop the lock and yield so main can take ep_lock. */
+		spin_count++;
+		if (spin_count >= NSPIN) {
+			spin_count = 0;
+			pthread_mutex_unlock(&ep_lock);
+			sched_yield();
+			pthread_mutex_lock(&ep_lock);
+		}
+	}
+	/* The loop always exits holding ep_lock; release it before returning. */
+	pthread_mutex_unlock(&ep_lock);
+	return NULL;
+}
+
+int main(int argc, char **argv) {
+	int rc;
+	int i;
+	char *uti_str;
+	int uti_val;
+	struct timespec start, end;
+	int disable_progress;
+	/* Times 10000 simulated send+compute iterations, with or without a progress thread. */
+	fprintf(stdout, "CT09001 MPI progress thread skelton START\n");
+
+	rc = syscall(732);
+	if (rc == -1)
+		fprintf(stdout, "CT09002 main running on Linux INFO\n");
+	else {
+		fprintf(stdout, "CT09002 main running on McKernel INFO\n");
+	}
+
+	fwq_init(&mem);
+	pthread_mutex_init(&ep_lock, NULL);
+
+	thr_args.bar_count = 0;
+	pthread_cond_init(&thr_args.bar_cond, NULL);
+	pthread_mutex_init(&thr_args.bar_lock, NULL);
+	/* "-d" as the first argument runs the no-overlap variant without a progress thread. */
+	disable_progress = (argc > 1 && strcmp(argv[1], "-d") == 0) ? 1 : 0;
+
+	if (disable_progress) {
+		goto skip1;
+	}
+	/* Syscall 731 is skipped when DISABLE_UTI is set to a non-zero number. */
+	uti_str = getenv("DISABLE_UTI");
+	uti_val = uti_str ? atoi(uti_str) : 0;
+	if (!uti_val) {
+		rc = syscall(731, 1, NULL);
+		if (rc) {
+			fprintf(stdout, "CT09003 INFO: uti not available (rc=%d)\n", rc);
+		} else {
+			fprintf(stdout, "CT09003 INFO: uti available\n");
+		}
+	} else {
+		fprintf(stdout, "CT09003 INFO: uti disabled\n"); /* format has no conversion, so no argument */
+	}
+
+	rc = pthread_create(&thr_args.pthread, NULL, progress_fn, &thr_args);
+	if (rc){
+		fprintf(stdout, "pthread_create: %d\n", rc);
+		exit(1);
+	}
+	pthread_mutex_lock(&thr_args.bar_lock); /* two-party start barrier with progress_fn */
+	thr_args.bar_count++;
+	if (thr_args.bar_count == 2) {
+		if ((rc = pthread_cond_broadcast(&thr_args.bar_cond))) {
+			printf("pthread_cond_broadcast failed,rc=%d\n", rc);
+		}
+	}
+	while (thr_args.bar_count != 2) {
+		if ((rc = pthread_cond_wait(&thr_args.bar_cond, &thr_args.bar_lock))) {
+			printf("pthread_cond_wait failed,rc=%d\n", rc);
+		}
+	}
+	pthread_mutex_unlock(&thr_args.bar_lock);
+
+	fprintf(stdout, "CT09004 pthread_create OK\n");
+ skip1:
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
+	for (i = 0; i < 10000; i++) { /* It takes 1 sec */
+		if(!disable_progress) {
+
+			/* Acquire endpoint and send request-to-send packet */
+			pthread_mutex_lock(&ep_lock);
+			fwq(RTS_DELAY, &mem);
+			pthread_mutex_unlock(&ep_lock);
+
+			/* Start calculation */
+
+			/* Generate event on behalf of responder */
+			fwq(NIC_DELAY, &mem);
+			nevents++;
+
+			fwq(CALC_DELAY - NIC_DELAY, &mem); /* Overlap remainder */
+
+			/* Wait until async thread consumes the event */
+			while (nevents > 0) {
+				FIXED_SIZE_WORK(&mem);
+			}
+		} else {
+			/* No overlap case */
+			fwq(RTS_DELAY + CALC_DELAY + POLL_DELAY + COMPL_DELAY, &mem);
+		}
+	}
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
+
+	if(!disable_progress) {
+		terminate = 1;
+
+		pthread_join(thr_args.pthread, NULL);
+	}
+	/* TS2NS arithmetic is unsigned long long; convert explicitly for %ld. */
+	fprintf(stderr, "total %ld nsec\n", (long)(TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec)));
+	fprintf(stdout, "CT09006 END\n");
+
+
+	exit(0);
+}
diff --git a/test/uti/CT09.sh b/test/uti/CT09.sh
new file mode 100755
index 00000000..b28b6d5e
--- /dev/null
+++ b/test/uti/CT09.sh
@@ -0,0 +1,51 @@
+#!/usr/bin/bash
+MYHOME="/work/gg10/e29005"
+MCK="${MYHOME}/project/os/install"
+MCEXEC=
+export DISABLE_UTI=0
+
+stop=0
+reset=0
+go=0
+nodes="c[8194]"
+
+while getopts srgmd OPT
+do
+        case ${OPT} in
+	    s) stop=1
+		;;
+            r) reset=1
+                ;;
+            g) go=1
+                ;;
+            m) MCEXEC="${MCK}/bin/mcexec --enable-uti" # option travels with mcexec; left unquoted below so it splits
+                ;;
+            d) export DISABLE_UTI=1
+                ;;
+            *) echo "invalid option -${OPT}" >&2
+                exit 1
+        esac
+done
+
+if [ ${stop} -eq 1 ]; then
+    PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w ${nodes} \
+    sudo mount /work
+
+    PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w ${nodes} \
+	sudo ${MCK}/sbin/mcstop+release.sh
+fi
+
+if [ ${reset} -eq 1 ]; then
+    PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w ${nodes} \
+    sudo mount /work
+
+    PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w ${nodes} \
+	sudo ${MCK}/sbin/mcreboot.sh -c 2-17,20-35,36-51,52-67 -r 2-5:0+6-9:1+10-13:68+14-17:69+20-23:136+24-27:137+28-31:204+32-35:205+36-39:18+40-43:19+44-47:86+48-51:87+52-55:154+56-59:155+60-63:222+64-67:223 -m 32G@0,12G@1
+fi
+
+if [ ${go} -eq 1 ]; then
+    > ./log
+    for i in {1..10}; do (${MCEXEC} ./CT09 1>/dev/null 2>> ./log); done # without -m this runs ./CT09 natively
+    #${MCEXEC} ./CT09
+    perl CT11.pl < ./log
+fi
diff --git a/test/uti/CT10.c b/test/uti/CT10.c
new file mode 100644
index 00000000..763e8a53
--- /dev/null
+++ b/test/uti/CT10.c
@@ -0,0 +1,103 @@
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <pthread.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <signal.h>
+
+pthread_mutex_t mutex1;
+pthread_cond_t cond1;
+pthread_mutex_t mutex2;
+pthread_cond_t cond2;
+char *m;
+int flag1, flag2;
+
+int sigst;
+pthread_t thr;
+
+void *
+util_thread(void *arg)
+{
+	int rc;
+	/* Helper thread: signal main (cond1), wait for main's reply (cond2), signal main again, exit. */
+	rc = syscall(732); /* -1 is reported as "running on Linux"; presumably a McKernel-only syscall number -- confirm */
+	if (rc == -1)
+		fprintf(stderr, "CT10100 running on Linux OK\n");
+	else {
+		fprintf(stderr, "CT10100 running on Linux NG (%d)\n", rc);
+	}
+	errno = 0;
+
+	pthread_mutex_lock(&mutex1);
+	flag1 = 1;
+	pthread_cond_signal(&cond1);
+	pthread_mutex_unlock(&mutex1);
+	/* Block until main sets flag2; the loop re-checks the flag after every wakeup. */
+	pthread_mutex_lock(&mutex2);
+	while(!flag2) {
+		pthread_cond_wait(&cond2, &mutex2);
+	}
+	flag2 = 0;
+	pthread_mutex_unlock(&mutex2);
+	/* Second signal: main consumes it before calling pthread_join(). */
+	pthread_mutex_lock(&mutex1);
+	flag1 = 1;
+	pthread_cond_signal(&cond1);
+	pthread_mutex_unlock(&mutex1);
+	return NULL;
+}
+
+int
+main(int argc, char **argv)
+{
+	int rc;
+	/* CT10: run a three-step mutex/condvar handshake with util_thread, then join it. */
+	pthread_mutex_init(&mutex1, NULL);
+	pthread_cond_init(&cond1, NULL);
+	pthread_mutex_init(&mutex2, NULL);
+	pthread_cond_init(&cond2, NULL);
+
+	fprintf(stderr, "CT10001 futex START\n");
+#if 1
+	rc = syscall(731, 1, NULL); /* the message below calls this util_indicate_clone; failure is only logged */
+	if (rc) {
+		fprintf(stderr, "util_indicate_clone rc=%d, errno=%d\n", rc, errno);
+		fflush(stderr);
+	}
+#endif
+	rc = pthread_create(&thr, NULL, util_thread, NULL);
+	if(rc){
+		fprintf(stderr, "pthread_create: %d\n", rc);
+		exit(1);
+	}
+	fprintf(stderr, "CT10002 pthread_create OK\n");
+	/* Step 1: wait for the helper's first signal and consume it. */
+	pthread_mutex_lock(&mutex1);
+	while(!flag1) {
+		pthread_cond_wait(&cond1, &mutex1);
+	}
+	flag1 = 0;
+	pthread_mutex_unlock(&mutex1);
+	/* Step 2: release the helper, which is waiting on cond2. */
+	pthread_mutex_lock(&mutex2);
+	flag2 = 1;
+	pthread_cond_signal(&cond2);
+	pthread_mutex_unlock(&mutex2);
+	/* Step 3: wait for the helper's second signal. */
+	pthread_mutex_lock(&mutex1);
+	while(!flag1) {
+		pthread_cond_wait(&cond1, &mutex1);
+	}
+	flag1 = 0;
+	pthread_mutex_unlock(&mutex1);
+
+	pthread_join(thr, NULL);
+	fprintf(stderr, "CT10003 pthread_join OK\n");
+
+	fprintf(stderr, "CT10004 END\n");
+	exit(0);
+}
diff --git a/test/uti/CT11.c b/test/uti/CT11.c
new file mode 100644
index 00000000..04d77e3e
--- /dev/null
+++ b/test/uti/CT11.c
@@ -0,0 +1,275 @@
+#define _GNU_SOURCE
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <string.h>
+#include <pthread.h>
+#include <sys/syscall.h>   /* For SYS_xxx definitions */
+#include <sys/mman.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <linux/futex.h>
+
+#define NLOOP 10
+#define TS2NS(sec, nsec) ((unsigned long)(sec) * 1000000000ULL + (unsigned long)(nsec))
+#define SZCHUNK 4096
+#define IHK_DEVICE_CREATE_OS          0x112900
+#define IHK_DEVICE_DESTROY_OS         0x112901
+
+pthread_mutex_t mutex;
+pthread_cond_t cond;
+int sem;
+int futex_flag;
+pthread_t thr;
+
+struct syscall { /* One benchmarked system call */
+	int number; /* syscall number; also the switch key in util_thread() */
+	const char *name; /* label printed in front of the measured time */
+};
+/* Walked in order by util_thread(); each entry is set up, timed over NLOOP calls, and cleaned up. */
+struct syscall syscalls[] = {
+	{ .number = __NR_getuid, .name = "getuid" },
+	{ .number = __NR_ioctl, .name = "ioctl" },
+	{ .number = __NR_futex, .name = "futex" },
+	{ .number = __NR_mmap, .name = "mmap" },
+	{ .number = __NR_munmap, .name = "munmap" },
+	{ .number = __NR_brk, .name = "brk" },
+	{ .number = __NR_gettid, .name = "gettid" },
+	{ .number = __NR_mprotect, .name = "mprotect" },
+	{ .number = __NR_mremap, .name = "mremap" },
+	{ .number = __NR_open, .name = "open" },
+	{ .number = __NR_read, .name = "read" },
+	{ .number = __NR_write, .name = "write" }
+};
+
+void *util_thread(void *arg) { /* Helper thread: prints the mean CPU time per call for each syscalls[] entry, then waits for main */
+	int i, j, rc;
+	int fds[NLOOP];
+	void *mems[NLOOP], *memremaps[NLOOP];
+	void *brk_cur;
+	char *buf = malloc(SZCHUNK * NLOOP); /* one SZCHUNK slice per read()/write() iteration */
+	struct timespec start, end;
+	long nsec;
+
+	if (buf == NULL) {
+		fprintf(stderr, "malloc failed: %s\n", strerror(errno));
+		exit(1);
+	}
+
+	rc = syscall(732); /* -1 is reported as "running on Linux"; presumably a McKernel-only syscall number -- confirm */
+	if (rc == -1)
+		fprintf(stdout, "[INFO] Child is running on Linux\n");
+	else {
+		fprintf(stdout, "[INFO] Child is running on McKernel\n");
+	}
+	errno = 0;
+
+	for (i = 0; i < sizeof(syscalls) / sizeof(syscalls[0]); i++) {
+		/* Setup, kept outside the timed window: create whatever the timed calls operate on. */
+		switch (syscalls[i].number) {
+		case __NR_brk:
+			brk_cur = sbrk(0);
+			break;
+		case __NR_mprotect:
+			if((mems[0] = mmap(0, SZCHUNK, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)) == (void*)-1) {
+				fprintf(stderr, "mmap failed: %s\n", strerror(errno));
+			}
+			break; /* mprotect uses only mems[0]; falling through would remap and leak it */
+		case __NR_munmap:
+		case __NR_mremap:
+			for (j = 0; j < NLOOP; j++) {
+				if((mems[j] = mmap(0, SZCHUNK, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)) == (void*)-1) {
+					fprintf(stderr, "mmap failed: %s\n", strerror(errno));
+				}
+			}
+			break;
+		case __NR_ioctl:
+			if((fds[0] = open("/dev/hello", O_RDWR)) < 0) {
+				fprintf(stderr, "ioctl, open failed: %s\n", strerror(errno));
+				exit(1);
+			}
+			break;
+		case __NR_read:
+		case __NR_write:
+			if((fds[0] = open("./file", O_RDWR)) < 0) {
+				fprintf(stderr, "write, open failed: %s\n", strerror(errno));
+				exit(1);
+			}
+			break;
+		default:
+			break;
+		}
+
+		/* Timed window: NLOOP back-to-back calls of the syscall under test. */
+		clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
+		for (j = 0; j < NLOOP; j++) {
+			switch (syscalls[i].number) {
+			case __NR_gettid:
+			case __NR_getuid: /* both take no arguments; syscall() reports failure as -1 */
+				if (syscall(syscalls[i].number) == -1) {
+					fprintf(stderr, "%s failed: %s\n", syscalls[i].name, strerror(errno));
+				}
+				break;
+			case __NR_futex: 
+				futex_flag = 1; /* main loops on this flag around FUTEX_WAIT */
+				if((rc = syscall(__NR_futex, &futex_flag, FUTEX_WAKE, 1, NULL, NULL, 0)) < 0) {
+					fprintf(stderr, "%s failed: %s\n", syscalls[i].name, strerror(errno));
+				}
+				break;
+			case __NR_brk:
+				if((rc = brk(brk_cur)) < 0) {
+					fprintf(stderr, "%s failed: %s\n", syscalls[i].name, strerror(errno));
+				}
+				break;
+			case __NR_mmap:
+				if((mems[j] = mmap(0, SZCHUNK, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)) == (void*)-1) {
+					fprintf(stderr, "%s failed: %s\n", syscalls[i].name, strerror(errno));
+				}
+				break;
+			case __NR_munmap:
+				if((rc = munmap(mems[j], SZCHUNK)) < 0) {
+					fprintf(stderr, "%s failed: %s\n", syscalls[i].name, strerror(errno));
+				}
+				break;
+			case __NR_mprotect:
+				if((rc = mprotect(mems[0], SZCHUNK, PROT_READ)) < 0) {
+					fprintf(stderr, "%s failed: %s\n", syscalls[i].name, strerror(errno));
+				}
+				break;
+			case __NR_mremap:
+				if((memremaps[j] = mremap(mems[j], SZCHUNK, 2 * SZCHUNK, MREMAP_MAYMOVE)) == (void*)-1) {
+					fprintf(stderr, "%s failed: %s\n", syscalls[i].name, strerror(errno));
+				}
+				break;
+			case __NR_open:
+				if((fds[j] = open("./file", O_RDONLY)) < 0) {
+					fprintf(stderr, "%s ./file failed: %s\n", syscalls[i].name, strerror(errno));
+				}
+				break;
+			case __NR_ioctl:
+				if((rc = syscall(syscalls[i].number, fds[0], 0, 0)) < 0) {
+					fprintf(stderr, "%s failed: %s\n", syscalls[i].name, strerror(errno));
+				}
+				break;
+			case __NR_read:
+				if((rc = read(fds[0], buf + j * SZCHUNK, SZCHUNK)) < 0) {
+					fprintf(stderr, "%s failed: %s\n", syscalls[i].name, strerror(errno));
+				}
+				break;
+			case __NR_write:
+				if((rc = write(fds[0], buf + j * SZCHUNK, SZCHUNK)) < 0) {
+					fprintf(stderr, "%s failed: rc=%d,%s\n", syscalls[i].name, rc, strerror(errno));
+				}
+				break;
+			}
+		}
+		clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
+		nsec = (TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec));
+		fprintf(stderr, "%s %ld nsec\n", syscalls[i].name, nsec / NLOOP);
+
+		switch (syscalls[i].number) {
+		case __NR_mmap:
+			for (j = 0; j < NLOOP; j++) {
+				if((rc = munmap(mems[j], SZCHUNK)) < 0) {
+					fprintf(stderr, "munmap failed: %s\n", strerror(errno));
+				}
+			}
+			break;
+		case __NR_mprotect:
+			if((rc = munmap(mems[0], SZCHUNK)) < 0) {
+				fprintf(stderr, "munmap failed: %s\n", strerror(errno));
+			}
+			break;
+		case __NR_mremap:
+			for (j = 0; j < NLOOP; j++) {
+				if((rc = munmap(memremaps[j], 2 * SZCHUNK)) < 0) { /* release the full remapped size */
+					fprintf(stderr, "munmap failed: %s\n", strerror(errno));
+				}
+			}
+			break;
+		case __NR_open:
+			for (j = 0; j < NLOOP; j++) {
+				if((rc = close(fds[j])) < 0) {
+					fprintf(stderr, "close failed: %s\n", strerror(errno));
+				}
+			}
+			break;
+		case __NR_ioctl:
+		case __NR_read:
+		case __NR_write:
+			if((rc = close(fds[0])) < 0) {
+				fprintf(stderr, "close failed: %s\n", strerror(errno));
+			}
+			break;
+		default:
+			break;
+		}
+	}
+	free(buf);
+
+	pthread_mutex_lock(&mutex);
+	while (!sem) {
+		pthread_cond_wait(&cond, &mutex);
+	}
+	sem = 0;
+	pthread_mutex_unlock(&mutex);
+
+	return NULL;
+}
+
+int
+main(int argc, char **argv) /* CT11: optionally request uti, start the measuring thread, then release and join it */
+{
+	int rc, opt;
+	int disable_syscall_intercept = 0;
+
+	pthread_mutex_init(&mutex, NULL); /* explicit init, as the other CT tests do, instead of relying on zeroed globals */
+	pthread_cond_init(&cond, NULL);
+	while ((opt = getopt(argc, argv, "+I:")) != -1) {
+		switch (opt) {
+		case 'I':
+			disable_syscall_intercept = atoi(optarg);
+			break;
+		default: /* '?' */
+			printf("unknown option %c\n", optopt);
+			exit(1);
+		}
+	}
+
+	if (disable_syscall_intercept == 0) {
+		rc = syscall(731, 1, NULL);
+		if (rc) {
+			fprintf(stdout, "CT11002 INFO: uti not available (rc=%d)\n", rc);
+		} else {
+			fprintf(stdout, "CT11002 INFO: uti available\n");
+		}
+	} else {
+		fprintf(stdout, "CT11002 INFO: uti disabled\n"); /* rc is not set on this path and the format takes no argument */
+	}
+
+	rc = pthread_create(&thr, NULL, util_thread, NULL);
+	if (rc) {
+		fprintf(stderr, "pthread_create: %d\n", rc);
+		exit(1);
+	}
+	fprintf(stdout, "CT11003 pthread_create OK\n");
+	/* Sleep until the helper's futex case has set futex_flag and issued FUTEX_WAKE. */
+	while (!futex_flag) {
+		rc = syscall(__NR_futex, &futex_flag, FUTEX_WAIT, 0, NULL, NULL, 0);
+		if (rc == -1 && errno != EAGAIN && errno != EINTR) { /* flag already changed / interrupted: just re-check */
+			fprintf(stderr, "CT11101 FUTEX_WAIT ERROR: %s\n", strerror(errno));
+		}
+	}
+	/* Let the helper leave its final wait, then reap it. */
+	pthread_mutex_lock(&mutex);
+	sem = 1;
+	pthread_cond_signal(&cond);
+	pthread_mutex_unlock(&mutex);
+	pthread_join(thr, NULL);
+
+	fprintf(stdout, "CT10005 END\n"); /* NOTE(review): ID prefix is CT10 in a CT11 test -- confirm whether intended */
+	exit(0);
+}
diff --git a/test/uti/CT11.pl b/test/uti/CT11.pl
new file mode 100755
index 00000000..8e64f98e
--- /dev/null
+++ b/test/uti/CT11.pl
@@ -0,0 +1,17 @@
+#!/usr/bin/perl
+# Reads "<name> <value> ..." lines and prints "<name>,<mean of value>" per name, in first-seen order.
+while(<>) {
+#    print $_;
+    @row = split(/\s+/, $_);
+#    print $row[0]."\n";
+    $nsec{$row[0]} += $row[1];
+    $count{$row[0]}++;
+    if (!exists $bitmap{$row[0]}) { # first time this name is seen
+	push @names, ($row[0]);
+    }
+    $bitmap{$row[0]} = 1;
+}
+
+foreach $name (@names) {
+    print $name . ',' . $nsec{$name} / $count{$name} . "\n";
+}
diff --git a/test/uti/CT11.sh b/test/uti/CT11.sh
new file mode 100755
index 00000000..80a48625
--- /dev/null
+++ b/test/uti/CT11.sh
@@ -0,0 +1,110 @@
+#!/usr/bin/bash
+
+MYHOME=$HOME
+
+MCK="${MYHOME}/project/os/install"
+
+stop=0
+reset=0
+go=0
+measure=0
+
+mck=0
+disable_syscall_intercept=0
+nloops=1
+
+while getopts srgmI:l:M OPT
+do
+        case ${OPT} in
+	    s) stop=1
+		;;
+            r) reset=1
+                ;;
+            g) go=1
+                ;;
+            m) mck=1
+                ;;
+            I) disable_syscall_intercept=$OPTARG
+                ;;
+	    l) nloops=$OPTARG
+		;;
+	    M) measure=1
+		;;
+            *) echo "invalid option -${OPT}" >&2
+                exit 1
+        esac
+done
+
+if [ ${mck} -eq 1 ]; then
+    MCEXEC="${MCK}/bin/mcexec"
+    if [ ${disable_syscall_intercept} -eq 0 ]; then
+	mcexecopt="--enable-uti"
+    else
+	mcexecopt=
+    fi
+else
+    MCEXEC=
+    mcexecopt=
+fi
+
+if [ ${stop} -eq 1 ]; then
+#    sudo mount /work
+
+    sudo ${MCK}/sbin/mcstop+release.sh
+fi
+
+if [ ${reset} -eq 1 ]; then
+    if hostname  | grep ofp &>/dev/null; then
+	#sudo mount /work
+	:
+    fi
+  
+    if hostname  | grep ofp &>/dev/null; then
+	sudo ${MCK}/sbin/mcreboot.sh -c 2-17,70-85,138-153,206-221,20-35,88-103,156-171,224-239,36-51,104-119,172-187,240-255,52-67,120-135,188-203,256-271 -r 2-5,70-73,138-141,206-209:0+6-9,74-77,142-145,210-213:1+10-13,78-81,146-149,214-217:68+14-17,82-85,150-153,218-221:69+20-23,88-91,156-159,224-227:136+24-27,92-95,160-163,228-231:137+28-31,96-99,164-167,232-235:204+32-35,100-103,168-171,236-239:205+36-39,104-107,172-175,240-243:18+40-43,108-111,176-179,244-247:19+44-47,112-115,180-183,248-251:86+48-51,116-119,184-187,252-255:87+52-55,120-123,188-191,256-259:154+56-59,124-127,192-195,260-263:155+60-63,128-131,196-199,264-267:222+64-67,132-135,200-203,268-271:223 -m 32G@0,12G@1
+    elif hostname | grep koala &>/dev/null; then
+	sudo ${MCK}/sbin/mcreboot.sh -c 1-15,65-79,129-143,193-207,17-31,81-95,145-159,209-223,33-47,97-111,161-175,225-239,49-63,113-127,177-191,241-255 -r 1-15:0+65-79:64+129-143:128+193-207:192+17-31:16+81-95:80+145-159:144+209-223:208+33-47:32+97-111:96+161-175:160+225-239:224+49-63:48+113-127:112+177-191:176+241-255:240 -m 12G@0,12G@1,12G@2,12G@3,3920M@4,3920M@5,3920M@6,3920M@7
+    else
+	sudo ${MCK}/sbin/mcreboot.sh -c 1,2 -m 512M
+    fi
+fi
+
+function init_mod() { # rebuild driver/hello.ko, load it, and (re)create /dev/hello
+    if grep hello /proc/devices > /dev/null; then
+	sudo rm -f /dev/hello
+	sudo rmmod "driver/hello.ko"
+    fi
+    # Build in a subshell so the caller's working directory is unchanged.
+    (cd driver; make)
+    # The device node takes its major number from the "hello" line in /proc/devices.
+    if ! grep hello /proc/devices > /dev/null; then
+	sudo insmod "driver/hello.ko"
+	major=`grep hello /proc/devices | cut -d' ' -f 1`
+	sudo mknod /dev/hello c $major 0
+	sudo chmod og+rw /dev/hello
+    fi
+}
+
+if [ ${measure} -eq 1 ]; then
+    init_mod
+
+    rm -f ./CT11
+    make ./CT11
+
+    > ./log
+    for i in {1..10}; do (${MCEXEC} $mcexecopt ./CT11 -I $disable_syscall_intercept 1>/dev/null 2>> ./log); done
+    perl CT11.pl < ./log
+fi
+
+if [ ${go} -eq 1 ]; then
+    init_mod
+
+    rm -f ./CT11
+    make ./CT11
+
+    for i in `seq 1 ${nloops}`; do
+	${MCEXEC} $mcexecopt ./CT11 -I $disable_syscall_intercept
+	echo =====;
+	echo $i;
+	echo =====; i=$((i+1));
+    done
+fi
diff --git a/test/uti/CT12.c b/test/uti/CT12.c
new file mode 100644
index 00000000..b630b902
--- /dev/null
+++ b/test/uti/CT12.c
@@ -0,0 +1,118 @@
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <pthread.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <sys/syscall.h>   /* For SYS_xxx definitions */
+#include <linux/futex.h>
+#include <sys/time.h>
+#include <string.h>
+
+int passed = 0, sem = 0;
+pthread_t thr;
+
+unsigned long mem; /* delay functions issue ld/st instructions on this address */
+double nspw; /* nsec per work */
+
+/* Timer related macros */
+#define TS2NS(sec, nsec) ((unsigned long)(sec) * 1000000000ULL + (unsigned long)(nsec))
+#define N_INIT 10000000
+
+static inline void fixed_size_work(unsigned long *ptr) { /* one work unit: load *ptr, add 1, store back (x86-64 asm) */
+    asm volatile("movq %0, %%rax\n\t"
+                 "addq $1, %%rax\n\t"
+                 "movq %%rax, %0\n\t"
+                 : "+rm" (*ptr)
+                 : /* no separate inputs */
+                 : "rax", "cc", "memory");
+}
+
+static inline void delay_loop(unsigned long n, unsigned long *ptr) { /* run n work units on *ptr */
+    unsigned long j; /* same type as n: an int counter overflows once n exceeds INT_MAX */
+    for (j = 0; j < n; j++) {
+        fixed_size_work(ptr);
+    }
+}
+
+void delay_init(unsigned long *mem) {
+	struct timespec start, end;
+	unsigned long nsec;
+	/* Calibrate nspw: thread CPU time of N_INIT work units divided by N_INIT. */
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
+	delay_loop(N_INIT, mem);
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
+	nsec = (TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec));
+	nspw = nsec / (double)N_INIT;
+	printf("nsec=%lu, nspw=%f\n", nsec, nspw);
+}
+
+void delay_nsec(unsigned long delay_nsec, unsigned long* mem) {
+	/* Busy-loop for delay_nsec / nspw work units; nspw is written by delay_init(), so call that first. */
+	delay_loop(delay_nsec / nspw, mem);
+}
+
+void *util_thread(void *arg) {
+	int rc;
+	/* Helper thread: set `passed`, then block in FUTEX_WAIT on `sem` until main issues FUTEX_WAKE. */
+	rc = syscall(732);
+	if (rc == -1)
+		fprintf(stderr, "CT12100 running on Linux CPU OK\n");
+	else {
+		fprintf(stderr, "CT12100 running on Linux CPU NG (%d)\n", rc);
+	}
+
+	passed = 1;
+	/* Expected value 0 matches sem's initial value; main never changes sem, it only wakes. */
+	rc = syscall(__NR_futex, &sem, FUTEX_WAIT, 0, NULL, NULL, 0);
+	if (rc != 0) {
+		fprintf(stderr, "CT12101 FUTEX_WAIT NG (%s)\n", strerror(errno));
+	} else {
+		fprintf(stderr, "CT12101 FUTEX_WAIT OK\n");
+	}
+
+	return NULL;
+}
+
+int
+main(int argc, char **argv) /* CT12: the helper sleeps in FUTEX_WAIT and main wakes it */
+{
+	int rc;
+
+	fprintf(stderr, "CT12001 futex START\n");
+	rc = syscall(731, 1, NULL);
+	if (rc) {
+		fprintf(stderr, "util_indicate_clone rc=%d, errno=%d\n", rc, errno);
+		fflush(stderr);
+	}
+
+	rc = pthread_create(&thr, NULL, util_thread, NULL);
+	if (rc){
+		fprintf(stderr, "pthread_create: %d\n", rc);
+		exit(1);
+	}
+	fprintf(stderr, "CT12002 pthread_create OK\n");
+
+	/* Spin until the helper has set `passed`, then give it 0.1 s to reach FUTEX_WAIT. */
+	while (!passed) {
+		asm volatile("pause" ::: "memory");
+	}
+	usleep(100000);
+
+	rc = syscall(__NR_futex, &sem, FUTEX_WAKE, 1, NULL, NULL, 0);
+	if (rc != 1) { /* exactly one waiter is expected to be woken */
+		fprintf(stderr, "CT12003 FUTEX_WAKE NG (%d,%s)\n", rc, strerror(errno));
+	} else {
+		fprintf(stderr, "CT12003 FUTEX_WAKE OK\n");
+	}
+
+	pthread_join(thr, NULL);
+	fprintf(stderr, "CT12004 pthread_join OK\n");
+
+	fprintf(stderr, "CT12005 END\n");
+	exit(0);
+}
diff --git a/test/uti/CT13.c b/test/uti/CT13.c
new file mode 100644
index 00000000..1f6105c1
--- /dev/null
+++ b/test/uti/CT13.c
@@ -0,0 +1,74 @@
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <pthread.h>
+#include <errno.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <sys/syscall.h>   /* For SYS_xxx definitions */
+#include <linux/futex.h>
+#include <sys/time.h>
+
+int passed = 0, sem = 0;
+pthread_t thr;
+
+void *util_thread(void *arg) { /* Helper thread: wakes main, which sleeps in FUTEX_WAIT on `sem` */
+	int rc;
+
+	rc = syscall(732);
+	if (rc == -1)
+		fprintf(stderr, "CT13100 running on Linux CPU OK\n");
+	else {
+		fprintf(stderr, "CT13100 running on Linux CPU NG (%d)\n", rc);
+	}
+
+	/* Spin until main has set `passed`, just before it enters FUTEX_WAIT. */
+	while (!passed) {
+		asm volatile("pause" ::: "memory");
+	}
+	usleep(100000); /* debug messages via serial takes 0.05 sec */
+
+	rc = syscall(__NR_futex, &sem, FUTEX_WAKE, 1, NULL, NULL, 0);
+	if (rc != 1) { /* exactly one waiter (main) is expected to be woken */
+		fprintf(stderr, "CT13101 FUTEX_WAKE NG (%d,%s)\n", rc, strerror(errno));
+	} else {
+		fprintf(stderr, "CT13101 FUTEX_WAKE OK\n");
+	}
+
+	return NULL;
+}
+
+int main(int argc, char **argv)
+{
+	int rc;
+	/* CT13: mirror of CT12 -- main sleeps in FUTEX_WAIT and the helper thread wakes it. */
+	fprintf(stderr, "CT13001 futex START\n");
+	rc = syscall(731, 1, NULL);
+	if (rc) {
+		fprintf(stderr, "util_indicate_clone rc=%d, errno=%d\n", rc, errno);
+		fflush(stderr);
+	}
+
+	rc = pthread_create(&thr, NULL, util_thread, NULL);
+	if (rc){
+		fprintf(stderr, "pthread_create: %d\n", rc);
+		exit(1);
+	}
+	fprintf(stderr, "CT13002 pthread_create OK\n");
+
+	passed = 1;
+	/* The helper waits 0.1 s after seeing `passed` before it issues FUTEX_WAKE. */
+	rc = syscall(__NR_futex, &sem, FUTEX_WAIT, 0, NULL, NULL, 0);
+	if (rc != 0) {
+		fprintf(stderr, "CT13003 FUTEX_WAIT NG (%s)\n", strerror(errno));
+	} else {
+		fprintf(stderr, "CT13003 FUTEX_WAIT OK\n");
+	}
+
+	pthread_join(thr, NULL);
+	fprintf(stderr, "CT13004 pthread_join OK\n");
+
+	fprintf(stderr, "CT13005 END\n");
+	exit(0);
+}
diff --git a/test/uti/CT14.c b/test/uti/CT14.c
new file mode 100644
index 00000000..279613fe
--- /dev/null
+++ b/test/uti/CT14.c
@@ -0,0 +1,121 @@
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <pthread.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <signal.h>
+
+pthread_mutex_t mutex;
+int owned;
+pthread_t thr;
+
+#define TS2NS(sec, nsec) ((unsigned long)(sec) * 1000000000ULL + (unsigned long)(nsec))
+static inline void FIXED_SIZE_WORK(unsigned long *ptr) { /* one work unit: load *ptr, add 1, store back (x86-64 asm) */
+	asm volatile("movq %0, %%rax\n\t"
+		     "addq $1, %%rax\n\t"
+		     "movq %%rax, %0\n\t"
+		     : "+rm" (*ptr)
+		     : /* no separate inputs */
+		     : "rax", "cc", "memory");
+}
+
+static inline void BULK_FSW(unsigned long n, unsigned long *ptr) { /* run n work units on *ptr */
+	unsigned long j; /* same type as n: an int counter overflows once n exceeds INT_MAX */
+	for (j = 0; j < n; j++) {
+		FIXED_SIZE_WORK(ptr);
+	}
+}
+
+double nspw; /* nsec per work */
+
+#define N_INIT 10000000
+
+void fwq_init(unsigned long *mem) {
+	struct timespec start, end;
+	unsigned long nsec;
+	/* Calibrate nspw: thread CPU time of N_INIT work units divided by N_INIT. */
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
+	BULK_FSW(N_INIT, mem);
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
+	nsec = (TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec));
+	nspw = nsec / (double)N_INIT;
+	printf("nsec=%lu, nspw=%f\n", nsec, nspw);
+}
+
+void fwq(unsigned long delay_nsec, unsigned long* mem) {
+	/* Busy-loop for delay_nsec / nspw work units; nspw is written by fwq_init(), so call that first. */
+	BULK_FSW(delay_nsec / nspw, mem);
+}
+
+void *
+util_thread(void *arg) /* Helper thread: delays 0.5 s, then expects to take the mutex after main has held it */
+{
+	int rc;
+	unsigned long mem = 0; /* scratch word for fwq(); the work loop reads it before writing */
+
+	rc = syscall(732);
+	if (rc == -1)
+		fprintf(stderr, "CT14100 running on Linux OK\n");
+	else {
+		fprintf(stderr, "CT14100 running on Linux NG (%d)\n", rc);
+	}
+	errno = 0;
+
+	fwq(500 * 1000 * 1000UL, &mem); /* Sending debug messages through serial takes 0.05 sec */
+
+	pthread_mutex_lock(&mutex);
+	if (owned) { /* main sets `owned` while holding the lock */
+		fprintf(stderr, "CT14101 lock second OK\n");
+	} else {
+		fprintf(stderr, "CT14101 lock second NG\n");
+	}
+	owned = 1;
+	pthread_mutex_unlock(&mutex);
+
+	return NULL;
+}
+
+int main(int argc, char **argv) { /* CT14: main takes the mutex first and holds it ~2 s so the helper must block on it */
+	int rc;
+	unsigned long mem = 0; /* scratch word for fwq_init()/fwq(); the work loop reads it before writing */
+
+	pthread_mutex_init(&mutex, NULL);
+	fwq_init(&mem);
+
+	fprintf(stderr, "CT14001 futex START\n");
+
+	rc = syscall(731, 1, NULL);
+	if (rc) {
+		fprintf(stderr, "CT14002 util_indicate_clone NG (rc=%d, errno=%d)\n", rc, errno);
+		fflush(stderr);
+	} else {
+		fprintf(stderr, "CT14002 util_indicate_clone OK\n");
+	}
+
+	rc = pthread_create(&thr, NULL, util_thread, NULL);
+	if(rc){
+		fprintf(stderr, "pthread_create: %d\n", rc);
+		exit(1);
+	}
+	fprintf(stderr, "CT14003 pthread_create OK\n");
+
+	pthread_mutex_lock(&mutex);
+	if (!owned) { /* the helper delays 0.5 s before locking, so main should get here first */
+		fprintf(stderr, "CT14004 lock first OK\n");
+	} else {
+		fprintf(stderr, "CT14004 lock first NG\n");
+	}
+	owned = 1;
+	fwq(2000 * 1000 * 1000UL, &mem); /* Need 2 sec to make child sleep */
+	pthread_mutex_unlock(&mutex);
+
+	pthread_join(thr, NULL);
+	fprintf(stderr, "CT14005 pthread_join OK\n");
+
+	fprintf(stderr, "CT14006 END\n");
+	exit(0);
+}
diff --git a/test/uti/CT15.c b/test/uti/CT15.c
new file mode 100644
index 00000000..3c6306b0
--- /dev/null
+++ b/test/uti/CT15.c
@@ -0,0 +1,121 @@
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <pthread.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <signal.h>
+
+pthread_mutex_t mutex;
+int owned;
+pthread_t thr;
+
+#define TS2NS(sec, nsec) ((unsigned long)(sec) * 1000000000ULL + (unsigned long)(nsec))
+static inline void FIXED_SIZE_WORK(unsigned long *ptr) { /* one work unit: load *ptr, add 1, store back (x86-64 asm) */
+	asm volatile("movq %0, %%rax\n\t"
+		     "addq $1, %%rax\n\t"
+		     "movq %%rax, %0\n\t"
+		     : "+rm" (*ptr)
+		     : /* no separate inputs */
+		     : "rax", "cc", "memory");
+}
+
+static inline void BULK_FSW(unsigned long n, unsigned long *ptr) { /* run n work units on *ptr */
+	unsigned long j; /* same type as n: an int counter overflows once n exceeds INT_MAX */
+	for (j = 0; j < n; j++) {
+		FIXED_SIZE_WORK(ptr);
+	}
+}
+
+double nspw; /* nsec per work */
+
+#define N_INIT 10000000
+
+void fwq_init(unsigned long *mem) {
+	struct timespec start, end;
+	unsigned long nsec;
+	/* Calibrate nspw: thread CPU time of N_INIT work units divided by N_INIT. */
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
+	BULK_FSW(N_INIT, mem);
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
+	nsec = (TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec));
+	nspw = nsec / (double)N_INIT;
+	printf("nsec=%lu, nspw=%f\n", nsec, nspw);
+}
+
+void fwq(unsigned long delay_nsec, unsigned long* mem) {
+	/* Busy-loop for delay_nsec / nspw work units; nspw is written by fwq_init(), so call that first. */
+	BULK_FSW(delay_nsec / nspw, mem);
+}
+
+void *
+util_thread(void *arg) /* Helper thread: takes the mutex first and holds it ~2 s so main must block on it */
+{
+	int rc;
+	unsigned long mem = 0; /* scratch word for fwq(); the work loop reads it before writing */
+
+	rc = syscall(732);
+	if (rc == -1)
+		fprintf(stderr, "CT14100 running on Linux OK\n"); /* NOTE(review): IDs say CT14 in CT15.c -- confirm whether intended */
+	else {
+		fprintf(stderr, "CT14100 running on Linux NG (%d)\n", rc);
+	}
+	errno = 0;
+
+	pthread_mutex_lock(&mutex);
+	if (!owned) { /* main delays 0.5 s before locking, so the helper should get here first */
+		fprintf(stderr, "CT14101 lock first OK\n");
+	} else {
+		fprintf(stderr, "CT14101 lock first NG\n");
+	}
+	owned = 1;
+	fwq(2000 * 1000 * 1000UL, &mem); /* Need 2 sec to make parent sleep */
+	pthread_mutex_unlock(&mutex);
+
+	return NULL;
+}
+
+int main(int argc, char **argv) { /* CT15: mirror of CT14 -- the helper locks first and main blocks on the mutex */
+	int rc;
+	unsigned long mem = 0; /* scratch word for fwq_init()/fwq(); the work loop reads it before writing */
+
+	pthread_mutex_init(&mutex, NULL);
+	fwq_init(&mem);
+
+	fprintf(stderr, "CT14001 futex START\n"); /* NOTE(review): IDs say CT14 in CT15.c -- confirm whether intended */
+
+	rc = syscall(731, 1, NULL);
+	if (rc) {
+		fprintf(stderr, "CT14002 util_indicate_clone NG (rc=%d, errno=%d)\n", rc, errno);
+		fflush(stderr);
+	} else {
+		fprintf(stderr, "CT14002 util_indicate_clone OK\n");
+	}
+
+	rc = pthread_create(&thr, NULL, util_thread, NULL);
+	if(rc){
+		fprintf(stderr, "pthread_create: %d\n", rc);
+		exit(1);
+	}
+	fprintf(stderr, "CT14003 pthread_create OK\n");
+
+	fwq(500 * 1000 * 1000UL, &mem); /* Sending debug messages through serial takes 0.05 sec */
+
+	pthread_mutex_lock(&mutex);
+	if (owned) { /* the helper sets `owned` while holding the lock */
+		fprintf(stderr, "CT14004 lock second OK\n");
+	} else {
+		fprintf(stderr, "CT14004 lock second NG\n");
+	}
+	owned = 1;
+	pthread_mutex_unlock(&mutex);
+
+	pthread_join(thr, NULL);
+	fprintf(stderr, "CT14005 pthread_join OK\n");
+
+	fprintf(stderr, "CT14006 END\n");
+	exit(0);
+}
diff --git a/test/uti/CT16.c b/test/uti/CT16.c
new file mode 100644
index 00000000..ec29ccda
--- /dev/null
+++ b/test/uti/CT16.c
@@ -0,0 +1,83 @@
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <pthread.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <signal.h>
+
+pthread_mutex_t mutex;
+pthread_cond_t cond;
+int passed, flag;
+pthread_t thr;
+
+void *
+util_thread(void *arg)
+{
+	int rc;
+	/* Helper thread: set `passed`, then sleep on the condition variable until main sets `flag`. */
+
+	rc = syscall(732);
+	if (rc == -1)
+		fprintf(stderr, "CT16101 running on Linux OK\n");
+	else {
+		fprintf(stderr, "CT16101 running on Linux NG (%d)\n", rc);
+	}
+	errno = 0;
+
+	passed = 1;
+	pthread_mutex_lock(&mutex);
+	while(!flag) {
+		pthread_cond_wait(&cond, &mutex);
+	}
+	flag = 0;
+	pthread_mutex_unlock(&mutex);
+
+	fprintf(stderr, "CT16102 return from pthread_cond_wait() OK\n");
+
+	return NULL;
+}
+
+int main(int argc, char **argv) {
+	int rc;
+	/* CT16: the helper waits on the condition variable; main signals it after a short delay. */
+
+	pthread_mutex_init(&mutex, NULL);
+	pthread_cond_init(&cond, NULL);
+
+	fprintf(stderr, "CT16001 futex START\n");
+
+	rc = syscall(731, 1, NULL);
+	if (rc) {
+		fprintf(stderr, "CT16002 util_indicate_clone NG (rc=%d, errno=%d)\n", rc, errno);
+		fflush(stderr);
+	} else {
+		fprintf(stderr, "CT16002 util_indicate_clone OK\n");
+	}
+
+	rc = pthread_create(&thr, NULL, util_thread, NULL);
+	if(rc){
+		fprintf(stderr, "pthread_create: %d\n", rc);
+		exit(1);
+	}
+	fprintf(stderr, "CT16003 pthread_create OK\n");
+
+	while (!passed) { /* spin until the helper is about to take the mutex */
+		asm volatile("pause" ::: "memory");
+	}
+	usleep(100 * 1000UL); /* Send debug message through serial takes 0.05 sec */
+
+	pthread_mutex_lock(&mutex);
+	flag = 1;
+	pthread_cond_signal(&cond);
+	pthread_mutex_unlock(&mutex);
+
+	pthread_join(thr, NULL);
+	fprintf(stderr, "CT16004 pthread_join OK\n");
+
+	fprintf(stderr, "CT16005 END\n");
+	exit(0);
+}
diff --git a/test/uti/CT17.c b/test/uti/CT17.c
new file mode 100644
index 00000000..795a002a
--- /dev/null
+++ b/test/uti/CT17.c
@@ -0,0 +1,81 @@
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <pthread.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <signal.h>
+
+pthread_mutex_t mutex;
+pthread_cond_t cond;
+int passed, flag;
+pthread_t thr;
+
+void *util_thread(void *arg) {
+	int rc;
+	/* Helper thread: once main has set `passed`, wait 0.1 s and signal the condition variable main waits on. */
+
+	rc = syscall(732);
+	if (rc == -1)
+		fprintf(stderr, "CT17100 running on Linux OK\n");
+	else {
+		fprintf(stderr, "CT17100 running on Linux NG (%d)\n", rc);
+	}
+
+	while (!passed) {
+		asm volatile("pause" ::: "memory");
+	}
+	usleep(100 * 1000UL); /* Send debug message through serial takes 0.05 sec */
+
+	pthread_mutex_lock(&mutex);
+	flag = 1;
+	pthread_cond_signal(&cond);
+	pthread_mutex_unlock(&mutex);
+
+	return NULL;
+}
+
+int main(int argc, char **argv) {
+	int rc;
+	/* CT17: mirror of CT16 -- main waits on the condition variable and the helper signals it. */
+
+	pthread_mutex_init(&mutex, NULL);
+	pthread_cond_init(&cond, NULL);
+
+	fprintf(stderr, "CT17001 futex START\n");
+
+	rc = syscall(731, 1, NULL);
+	if (rc) {
+		fprintf(stderr, "CT17002 util_indicate_clone NG (rc=%d, errno=%d)\n", rc, errno);
+		fflush(stderr);
+	} else {
+		fprintf(stderr, "CT17002 util_indicate_clone OK\n");
+	}
+
+	rc = pthread_create(&thr, NULL, util_thread, NULL);
+	if(rc){
+		fprintf(stderr, "pthread_create: %d\n", rc);
+		exit(1);
+	}
+	fprintf(stderr, "CT17003 pthread_create OK\n");
+
+	passed = 1;
+	pthread_mutex_lock(&mutex);
+	fprintf(stderr, "CT17004 lock on %p OK\n", (void *)&mutex); /* %p requires a void * argument */
+	while(!flag) {
+		pthread_cond_wait(&cond, &mutex);
+		fprintf(stderr, "CT17005 wake on %p OK\n", (void *)&cond);
+	}
+	flag = 0;
+
+	pthread_mutex_unlock(&mutex);
+
+	pthread_join(thr, NULL);
+	fprintf(stderr, "CT17006 pthread_join OK\n");
+
+	fprintf(stderr, "CT17007 END\n");
+	exit(0);
+}
diff --git a/test/uti/CT18.c b/test/uti/CT18.c
new file mode 100644
index 00000000..9ef11b78
--- /dev/null
+++ b/test/uti/CT18.c
@@ -0,0 +1,111 @@
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <pthread.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <sys/syscall.h>   /* For SYS_xxx definitions */
+#include <linux/futex.h>
+#include <sys/time.h>
+#include <string.h>
+
+int passed, sem, flag;
+pthread_t thr;
+#define TS2NS(sec, nsec) ((unsigned long)(sec) * 1000000000ULL + (unsigned long)(nsec))
+
+void *util_thread(void *arg) { /* Helper thread: FUTEX_WAIT_BITSET on `sem` with an absolute CLOCK_REALTIME deadline 0.8 s ahead */
+	int rc, futex_rc, futex_errno;
+	struct timespec start, timeout, end;
+	unsigned long elapsed;
+
+	rc = syscall(732);
+	if (rc == -1)
+		fprintf(stderr, "CT18101 running on Linux CPU OK\n");
+	else {
+		fprintf(stderr, "CT18101 running on Linux CPU NG (%d)\n", rc);
+	}
+
+	passed = 1;
+
+	rc = clock_gettime(CLOCK_REALTIME, &start);
+	if (rc != 0) {
+		fprintf(stderr, "clock_gettime failed\n");
+		return NULL;
+	}
+	fprintf(stderr, "start=%ld.%09ld\n", start.tv_sec, start.tv_nsec);
+
+	timeout.tv_sec = start.tv_sec;
+	timeout.tv_nsec = start.tv_nsec + 800UL * 1000 * 1000;
+	if (timeout.tv_nsec >= 1000UL * 1000 * 1000) { /* tv_nsec must stay below one second, so carry at exactly 1e9 too */
+		timeout.tv_sec += 1;
+		timeout.tv_nsec -= 1000UL * 1000* 1000;
+	}
+	futex_rc = syscall(__NR_futex, &sem, FUTEX_WAIT_BITSET | FUTEX_CLOCK_REALTIME, 0, &timeout, NULL, 0x12345678);
+	futex_errno = errno; /* capture before stdio or clock_gettime can overwrite it */
+	fprintf(stderr, "op=%x\n", FUTEX_WAIT_BITSET | FUTEX_CLOCK_REALTIME);
+	rc = clock_gettime(CLOCK_REALTIME, &end);
+	if (rc != 0) {
+		fprintf(stderr, "clock_gettime failed\n");
+		return NULL;
+	}
+	fprintf(stderr, "end=%ld.%09ld\n", end.tv_sec, end.tv_nsec);
+
+	if (futex_rc != 0 && futex_errno != ETIMEDOUT) { /* main wakes long after the deadline, so a timeout is the expected result */
+		fprintf(stderr, "CT18102 FUTEX_WAIT NG (%s)\n", strerror(futex_errno));
+	} else {
+		fprintf(stderr, "CT18102 FUTEX_WAIT OK\n");
+	}
+
+	elapsed = TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec);
+	if (flag == 0 || elapsed < 800UL * 1000 * 1000 + 80UL * 1000 * 1000) {
+		fprintf(stderr, "CT18103 timeout OK\n");
+	} else {
+		fprintf(stderr, "CT18103 timeout NG (%lx)\n", elapsed);
+	}
+
+	return NULL;
+}
+
+int
+main(int argc, char **argv) /* CT18: the helper's timed futex wait should expire before main's late wake arrives */
+{
+	int rc;
+
+	fprintf(stderr, "CT18001 futex START\n");
+	rc = syscall(731, 1, NULL);
+	if (rc) {
+		fprintf(stderr, "util_indicate_clone rc=%d, errno=%d\n", rc, errno);
+		fflush(stderr);
+	}
+
+	rc = pthread_create(&thr, NULL, util_thread, NULL);
+	if (rc){
+		fprintf(stderr, "pthread_create: %d\n", rc);
+		exit(1);
+	}
+	fprintf(stderr, "CT18002 pthread_create OK\n");
+
+	/* Spin until the helper has set `passed`, then sleep 8 s -- ten times its 0.8 s timeout. */
+	while (!passed) {
+		asm volatile("pause" ::: "memory");
+	}
+	usleep(800 * 1000UL * 10);
+
+	flag = 1;
+	rc = syscall(__NR_futex, &sem, FUTEX_WAKE_BITSET, 1, NULL, NULL, 0x12345678);
+	if (rc != 0) { /* zero woken waiters is the pass condition here */
+		fprintf(stderr, "CT18003 FUTEX_WAKE missing the waiter NG (%d,%s)\n", rc, strerror(errno));
+	} else {
+		fprintf(stderr, "CT18003 FUTEX_WAKE missing the waiter OK\n");
+	}
+
+	pthread_join(thr, NULL);
+	fprintf(stderr, "CT18004 pthread_join OK\n");
+
+	fprintf(stderr, "CT18005 END\n");
+	exit(0);
+}
diff --git a/test/uti/CT19.c b/test/uti/CT19.c
new file mode 100644
index 00000000..0d90168d
--- /dev/null
+++ b/test/uti/CT19.c
@@ -0,0 +1,112 @@
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <pthread.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <sys/syscall.h>   /* For SYS_xxx definitions */
+#include <linux/futex.h>
+#include <sys/time.h>
+#include <string.h>
+
+int passed, sem, flag;
+pthread_t thr;
+#define TS2NS(sec, nsec) ((unsigned long)(sec) * 1000000000ULL + (unsigned long)(nsec))
+
+void *util_thread(void *arg) {
+	int rc, futex_err; /* futex_err: errno of the futex wait, or 0 when it returned success */
+	struct timespec start, timeout, end;
+	unsigned long elapsed;
+
+	rc = syscall(732);
+	if (rc == -1)
+		fprintf(stderr, "CT19100 running on Linux CPU OK\n");
+	else {
+		fprintf(stderr, "CT19100 running on Linux CPU NG (%d)\n", rc);
+	}
+
+	passed = 1; /* releases the spin loop in main() */
+
+	rc = clock_gettime(CLOCK_MONOTONIC, &start);
+	if (rc != 0) {
+		fprintf(stderr, "clock_gettime failed\n");
+		return NULL;
+	}
+	fprintf(stderr, "start=%ld.%09ld\n", start.tv_sec, start.tv_nsec);
+
+	timeout.tv_sec = start.tv_sec;
+	timeout.tv_nsec = start.tv_nsec + 800UL * 1000 * 1000;
+	if (timeout.tv_nsec >= 1000UL * 1000 * 1000) { /* >=: tv_nsec must stay below one second */
+		timeout.tv_sec += 1;
+		timeout.tv_nsec -= 1000UL * 1000* 1000;
+	}
+	/* Absolute deadline = start + 800 ms; keep the result apart from rc, which is reused below. */
+	futex_err = (syscall(__NR_futex, &sem, FUTEX_WAIT_BITSET, 0, &timeout, NULL, 0x12345678) == -1) ? errno : 0;
+	fprintf(stderr, "op=%x\n", FUTEX_WAIT_BITSET);
+
+	rc = clock_gettime(CLOCK_MONOTONIC, &end);
+	if (rc != 0) {
+		fprintf(stderr, "clock_gettime failed\n");
+		return NULL;
+	}
+	fprintf(stderr, "end=%ld.%09ld\n", end.tv_sec, end.tv_nsec);
+
+	if (futex_err != 0 && futex_err != ETIMEDOUT) { /* expiry is expected: main() wakes only after 2 s */
+		fprintf(stderr, "CT19101 FUTEX_WAIT NG (%s)\n", strerror(futex_err));
+	} else {
+		fprintf(stderr, "CT19101 FUTEX_WAIT OK\n");
+	}
+
+	elapsed = TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec);
+	if (flag == 0 || elapsed < 800UL * 1000 * 1000 + 80UL * 1000 * 1000) {
+		fprintf(stderr, "CT19102 timeout OK\n");
+	} else {
+		fprintf(stderr, "CT19102 timeout NG\n");
+	}
+
+	return NULL;
+}
+
+int
+main(int argc, char **argv)
+{
+	int rc;
+
+	fprintf(stderr, "CT19001 futex START\n");
+	rc = syscall(731, 1, NULL); /* project syscall; the message below calls it util_indicate_clone */
+	if (rc) {
+		fprintf(stderr, "util_indicate_clone rc=%d, errno=%d\n", rc, errno);
+		fflush(stderr);
+	}
+
+	rc = pthread_create(&thr, NULL, util_thread, NULL);
+	if (rc){
+		fprintf(stderr, "pthread_create: %d\n", rc);
+		exit(1);
+	}
+	fprintf(stderr, "CT19002 pthread_create OK\n");
+
+	/* Spin until util_thread sets passed, then outlast its 800 ms futex timeout. */
+	while (!passed) {
+		asm volatile("pause" ::: "memory");
+	}
+	sleep(2); /* whole seconds: usleep() is only specified for intervals below one second */
+
+	flag = 1;
+	rc = syscall(__NR_futex, &sem, FUTEX_WAKE_BITSET, 1, NULL, NULL, 0x12345678);
+	if (rc != 0) { /* a non-zero result means a waiter was still there (or the call failed) */
+		fprintf(stderr, "CT19003 FUTEX_WAKE missing the waiter NG (%d,%s)\n", rc, strerror(errno));
+	} else {
+		fprintf(stderr, "CT19003 FUTEX_WAKE missing the waiter OK\n");
+	}
+
+	pthread_join(thr, NULL);
+	fprintf(stderr, "CT19004 pthread_join OK\n");
+
+	fprintf(stderr, "CT19005 END\n");
+	exit(0);
+}
diff --git a/test/uti/CT20.c b/test/uti/CT20.c
new file mode 100644
index 00000000..42bdc973
--- /dev/null
+++ b/test/uti/CT20.c
@@ -0,0 +1,106 @@
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <pthread.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <sys/syscall.h>   /* For SYS_xxx definitions */
+#include <linux/futex.h>
+#include <sys/time.h>
+#include <string.h>
+
+int passed, sem, flag;
+pthread_t thr;
+#define TS2NS(sec, nsec) ((unsigned long)(sec) * 1000000000ULL + (unsigned long)(nsec))
+
+void *util_thread(void *arg) {
+	int rc, futex_err; /* futex_err: errno of the futex wait, or 0 when it returned success */
+	struct timespec start, timeout, end;
+	unsigned long elapsed;
+
+	rc = syscall(732);
+	if (rc == -1)
+		fprintf(stderr, "CT20100 running on Linux CPU OK\n");
+	else {
+		fprintf(stderr, "CT20100 running on Linux CPU NG (%d)\n", rc);
+	}
+
+	passed = 1; /* releases the spin loop in main() */
+
+	rc = clock_gettime(CLOCK_REALTIME, &start);
+	if (rc != 0) {
+		fprintf(stderr, "clock_gettime failed\n");
+		return NULL;
+	}
+	fprintf(stderr, "start=%ld.%09ld\n", start.tv_sec, start.tv_nsec);
+
+	timeout.tv_sec = 0; /* 800 ms; kept apart from rc below because rc is reused */
+	timeout.tv_nsec = 800ULL * 1000 * 1000;
+	futex_err = (syscall(__NR_futex, &sem, FUTEX_WAIT, 0, &timeout, NULL, 0) == -1) ? errno : 0;
+
+	rc = clock_gettime(CLOCK_REALTIME, &end);
+	if (rc != 0) {
+		fprintf(stderr, "clock_gettime failed\n");
+		return NULL;
+	}
+	fprintf(stderr, "end=%ld.%09ld\n", end.tv_sec, end.tv_nsec);
+
+	if (futex_err != 0 && futex_err != ETIMEDOUT) { /* expiry is expected: main() wakes only after 2 s */
+		fprintf(stderr, "CT20101 FUTEX_WAIT NG (%s)\n", strerror(futex_err));
+	} else {
+		fprintf(stderr, "CT20101 FUTEX_WAIT OK\n");
+	}
+
+	elapsed = TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec);
+	if (flag == 0 || elapsed < 800UL * 1000 * 1000 + 80UL * 1000 * 1000) {
+		fprintf(stderr, "CT20102 timeout OK\n");
+	} else {
+		fprintf(stderr, "CT20102 timeout NG\n");
+	}
+
+	return NULL;
+}
+
+int
+main(int argc, char **argv)
+{
+	int rc;
+
+	fprintf(stderr, "CT20001 futex START\n");
+	rc = syscall(731, 1, NULL); /* project syscall; the message below calls it util_indicate_clone */
+	if (rc) {
+		fprintf(stderr, "util_indicate_clone rc=%d, errno=%d\n", rc, errno);
+		fflush(stderr);
+	}
+
+	rc = pthread_create(&thr, NULL, util_thread, NULL);
+	if (rc){
+		fprintf(stderr, "pthread_create: %d\n", rc);
+		exit(1);
+	}
+	fprintf(stderr, "CT20002 pthread_create OK\n");
+
+	/* Spin until util_thread sets passed, then outlast its 800 ms futex timeout. */
+	while (!passed) {
+		asm volatile("pause" ::: "memory");
+	}
+	sleep(2); /* whole seconds: usleep() is only specified for intervals below one second */
+
+	flag = 1;
+	rc = syscall(__NR_futex, &sem, FUTEX_WAKE, 1, NULL, NULL, 0);
+	if (rc != 0) { /* a non-zero result means a waiter was still there (or the call failed) */
+		fprintf(stderr, "CT20003 FUTEX_WAKE missing the waiter NG (%d,%s)\n", rc, strerror(errno));
+	} else {
+		fprintf(stderr, "CT20003 FUTEX_WAKE missing the waiter OK\n");
+	}
+
+	pthread_join(thr, NULL);
+	fprintf(stderr, "CT20004 pthread_join OK\n");
+
+	fprintf(stderr, "CT20005 END\n");
+	exit(0);
+}
diff --git a/test/uti/CT21.c b/test/uti/CT21.c
new file mode 100644
index 00000000..8c9552d4
--- /dev/null
+++ b/test/uti/CT21.c
@@ -0,0 +1,210 @@
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <pthread.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <signal.h>
+
+#define DEBUG
+
+#ifdef DEBUG /* debug tracing: prefix each message with the calling function's name */
+#define	dprintf(...)											\
+	do {														\
+		char msg[1024];											\
+		snprintf(msg, sizeof(msg), __VA_ARGS__); /* bounded */	\
+		fprintf(stdout, "%s,%s", __FUNCTION__, msg);			\
+	} while (0) /* no ';' here so the macro works in if/else */
+#define	eprintf(...)											\
+	do {														\
+		char msg[1024];											\
+		snprintf(msg, sizeof(msg), __VA_ARGS__); /* bounded */	\
+		fprintf(stdout, "%s,%s", __FUNCTION__, msg);			\
+	} while (0) /* no ';' here so the macro works in if/else */
+#else
+#define dprintf(...) do {  } while (0)
+#define eprintf(...) do {  } while (0)
+#endif
+
+#define NTHR 1
+#define TS2NS(sec, nsec) ((unsigned long)(sec) * 1000000000ULL + (unsigned long)(nsec))
+#define CALC_DELAY (93000) /* 93   usec */
+#define INIT_DELAY  (2000) /*  2   usec, CPU sends CTS packet */
+#define NIC_DELAY   (3000) /*  3   usec, NIC reads by RDMA-read  */
+#define POLL_DELAY  (200) /*    .2 usec, CPU fetches event queue entry from DRAM */
+#define RESP_DELAY  (2000) /*  2   usec, CPU sends DONE packet and updates MPI_Request */
+#define NSPIN 1
+static inline void FIXED_SIZE_WORK(unsigned long *ptr) {
+	/* One fixed unit of work: load *ptr into %rax, add 1, store it back (x86-64 only). */
+	asm volatile("movq %0, %%rax\n\t"
+		     "addq $1, %%rax\n\t"
+		     "movq %%rax, %0\n\t"
+		     : "+rm" (*ptr)
+		     : /* no inputs */ : "rax", "cc", "memory");
+}
+
+static inline void BULK_FSW(unsigned long n, unsigned long *ptr) {
+	unsigned long j; /* same type as n: an int counter overflows once n exceeds INT_MAX */
+	for (j = 0; j < n; j++) { /* n back-to-back units of fixed-size work on *ptr */
+		FIXED_SIZE_WORK(ptr);
+	}
+}
+
+
+pthread_mutex_t ep_lock; /* Ownership of channel instance */
+
+struct thr_arg { /* Per-thread state handed to progress_fn() */
+	int bar_count; /* Start barrier: progress_fn() waits until main() makes this non-zero */
+	pthread_mutex_t bar_lock; /* Held while bar_count is read or changed */
+	pthread_cond_t bar_cond; /* Signalled by main() after it bumps bar_count */
+	pthread_t pthread; /* Thread handle filled in by pthread_create() */
+	unsigned long mem; /* Per-thread storage */
+};
+
+struct thr_arg thr_args[NTHR];
+
+unsigned long mem; /* Per-thread storage */
+volatile int nevents;
+volatile int terminate;
+int wps = 1; /* work per sec */
+double nspw; /* nsec per work */
+
+#define N_INIT 10000000
+
+void fwq_init(unsigned long *mem) {
+	/* Calibrate nspw: time N_INIT units of work on this thread's CPU-time clock. */
+	struct timespec start, end;
+	unsigned long nsec;
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
+	BULK_FSW(N_INIT, mem);
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
+	nsec = (TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec));
+	nspw = nsec / (double)N_INIT;
+	printf("nsec=%lu, nspw=%f\n", nsec, nspw); /* nsec is unsigned, hence %lu */
+}
+
+void fwq(unsigned long delay_nsec, unsigned long* mem) {
+	unsigned long nwork = delay_nsec / nspw; /* requested delay expressed in calibrated work units */
+	BULK_FSW(nwork, mem);
+}
+
+void mydelay(long delay_nsec, long *mem) { /* busy-wait for more than delay_nsec ns of thread CPU time */
+	struct timespec start, end;
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
+
+	while (1) {
+		clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
+		if (delay_nsec < 0 || TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec) > (unsigned long)delay_nsec) {
+			break; /* a negative delay would otherwise compare as a huge unsigned value */
+		}
+		FIXED_SIZE_WORK((unsigned long *)mem); /* helper takes unsigned long *; same object representation */
+	}
+}
+
+void *progress_fn(void *_arg) {
+	struct thr_arg *arg = (struct thr_arg *)_arg;
+	int rc;
+
+	rc = syscall(732);
+	if (rc == -1)
+		fprintf(stdout, "CT09100 progress_fn running on Linux OK\n");
+	else {
+		fprintf(stdout, "CT09100 progress_fn running on McKernel NG\n");
+		exit(1); /* returning would leave main() spinning on nevents forever */
+	}
+
+	pthread_mutex_lock(&arg->bar_lock); /* start barrier: wait for main() to bump bar_count */
+	while(arg->bar_count == 0) {
+		pthread_cond_wait(&arg->bar_cond, &arg->bar_lock);
+	}
+	pthread_mutex_unlock(&arg->bar_lock);
+
+	/* Start progress */
+	pthread_mutex_lock(&ep_lock);
+	while(1) {
+		if (terminate) {
+			break;
+		}
+
+		/* Event found */
+		if (nevents > 0) {
+			nevents = 0;
+		}
+
+		pthread_mutex_unlock(&ep_lock);
+		fwq(random() % 1000000000, &mem); /* 0 - 1 sec */
+		pthread_mutex_lock(&ep_lock);
+	}
+	pthread_mutex_unlock(&ep_lock); /* the loop always exits holding ep_lock */
+	return NULL;
+}
+
+int main(int argc, char **argv) {
+	int rc;
+	int i;
+	struct timespec start, end;
+
+	fprintf(stdout, "CT09001 MPI progress thread skelton START\n");
+
+	rc = syscall(732);
+	if (rc == -1)
+		fprintf(stdout, "CT09002 main running on Linux INFO\n");
+	else {
+		fprintf(stdout, "CT09002 main running on McKernel INFO\n");
+	}
+
+	fwq_init(&mem); /* calibrate nspw before the first fwq() call */
+	pthread_mutex_init(&ep_lock, NULL);
+
+	for(i = 0; i < NTHR; i++) {
+		thr_args[i].bar_count = 0;
+		pthread_cond_init(&thr_args[i].bar_cond, NULL);
+		pthread_mutex_init(&thr_args[i].bar_lock, NULL);
+	}
+
+	rc = syscall(731, 1, NULL); /* project syscall; the message below calls it util_indicate_clone */
+	if (rc) {
+		fprintf(stdout, "util_indicate_clone rc=%d, errno=%d\n", rc, errno);
+		fflush(stdout);
+	}
+	for (i = 0; i < NTHR; i++) {
+		rc = pthread_create(&thr_args[i].pthread, NULL, progress_fn, &thr_args[i]);
+		if (rc){
+			fprintf(stdout, "pthread_create: %d\n", rc);
+			exit(1);
+		}
+	}
+	for (i = 0; i < NTHR; i++) { /* release each thread from its start barrier */
+		pthread_mutex_lock(&thr_args[i].bar_lock);
+		thr_args[i].bar_count++;
+		pthread_cond_signal(&thr_args[i].bar_cond);
+		pthread_mutex_unlock(&thr_args[i].bar_lock);
+	}
+
+	fprintf(stdout, "CT09004 pthread_create OK\n");
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
+	for (i = 0; i < 10; i++) {
+		pthread_mutex_lock(&ep_lock);
+		nevents++;
+		fwq(random() % 1000000000, &mem); /* 0 - 1 sec */
+		pthread_mutex_unlock(&ep_lock);
+		while (nevents > 0) { /* spin until progress_fn() consumes the event */
+			FIXED_SIZE_WORK(&mem);
+		}
+	}
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
+
+	terminate = 1;
+
+	for (i = 0; i < NTHR; i++) {
+		pthread_join(thr_args[i].pthread, NULL);
+	}
+	/* TS2NS() yields unsigned long long, hence %llu. */
+	fprintf(stdout, "CT09005 takes %llu nsec INFO\n", TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec));
+	fprintf(stdout, "CT09006 END\n");
+
+	exit(0);
+}
diff --git a/test/uti/CT22.c b/test/uti/CT22.c
new file mode 100644
index 00000000..627b1beb
--- /dev/null
+++ b/test/uti/CT22.c
@@ -0,0 +1,210 @@
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <pthread.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <signal.h>
+
+#define DEBUG
+
+#ifdef DEBUG /* debug tracing: prefix each message with the calling function's name */
+#define	dprintf(...)											\
+	do {														\
+		char msg[1024];											\
+		snprintf(msg, sizeof(msg), __VA_ARGS__); /* bounded */	\
+		fprintf(stdout, "%s,%s", __FUNCTION__, msg);			\
+	} while (0) /* no ';' here so the macro works in if/else */
+#define	eprintf(...)											\
+	do {														\
+		char msg[1024];											\
+		snprintf(msg, sizeof(msg), __VA_ARGS__); /* bounded */	\
+		fprintf(stdout, "%s,%s", __FUNCTION__, msg);			\
+	} while (0) /* no ';' here so the macro works in if/else */
+#else
+#define dprintf(...) do {  } while (0)
+#define eprintf(...) do {  } while (0)
+#endif
+
+#define NTHR 1
+#define TS2NS(sec, nsec) ((unsigned long)(sec) * 1000000000ULL + (unsigned long)(nsec))
+#define CALC_DELAY (93000) /* 93   usec */
+#define INIT_DELAY  (2000) /*  2   usec, CPU sends CTS packet */
+#define NIC_DELAY   (3000) /*  3   usec, NIC reads by RDMA-read  */
+#define POLL_DELAY  (200) /*    .2 usec, CPU fetches event queue entry from DRAM */
+#define RESP_DELAY  (2000) /*  2   usec, CPU sends DONE packet and updates MPI_Request */
+#define NSPIN 1
+static inline void FIXED_SIZE_WORK(unsigned long *ptr) {
+	/* One fixed unit of work: load *ptr into %rax, add 1, store it back (x86-64 only). */
+	asm volatile("movq %0, %%rax\n\t"
+		     "addq $1, %%rax\n\t"
+		     "movq %%rax, %0\n\t"
+		     : "+rm" (*ptr)
+		     : /* no inputs */ : "rax", "cc", "memory");
+}
+
+static inline void BULK_FSW(unsigned long n, unsigned long *ptr) {
+	unsigned long j; /* same type as n: an int counter overflows once n exceeds INT_MAX */
+	for (j = 0; j < n; j++) { /* n back-to-back units of fixed-size work on *ptr */
+		FIXED_SIZE_WORK(ptr);
+	}
+}
+
+
+pthread_mutex_t ep_lock; /* Ownership of channel instance */
+
+struct thr_arg { /* Per-thread state handed to progress_fn() */
+	int bar_count; /* Start barrier: progress_fn() waits until main() makes this non-zero */
+	pthread_mutex_t bar_lock; /* Held while bar_count is read or changed */
+	pthread_cond_t bar_cond; /* Signalled by main() after it bumps bar_count */
+	pthread_t pthread; /* Thread handle filled in by pthread_create() */
+	unsigned long mem; /* Per-thread storage */
+};
+
+struct thr_arg thr_args[NTHR];
+
+unsigned long mem; /* Per-thread storage */
+volatile int nevents;
+volatile int terminate;
+int wps = 1; /* work per sec */
+double nspw; /* nsec per work */
+
+#define N_INIT 10000000
+
+void fwq_init(unsigned long *mem) {
+	/* Calibrate nspw: time N_INIT units of work on this thread's CPU-time clock. */
+	struct timespec start, end;
+	unsigned long nsec;
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
+	BULK_FSW(N_INIT, mem);
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
+	nsec = (TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec));
+	nspw = nsec / (double)N_INIT;
+	printf("nsec=%lu, nspw=%f\n", nsec, nspw); /* nsec is unsigned, hence %lu */
+}
+
+void fwq(unsigned long delay_nsec, unsigned long* mem) {
+	unsigned long nwork = delay_nsec / nspw; /* requested delay expressed in calibrated work units */
+	BULK_FSW(nwork, mem);
+}
+
+void mydelay(long delay_nsec, long *mem) { /* busy-wait for more than delay_nsec ns of thread CPU time */
+	struct timespec start, end;
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
+
+	while (1) {
+		clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
+		if (delay_nsec < 0 || TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec) > (unsigned long)delay_nsec) {
+			break; /* a negative delay would otherwise compare as a huge unsigned value */
+		}
+		FIXED_SIZE_WORK((unsigned long *)mem); /* helper takes unsigned long *; same object representation */
+	}
+}
+
+void *progress_fn(void *_arg) {
+	struct thr_arg *arg = (struct thr_arg *)_arg;
+	int rc;
+	int i;
+
+	rc = syscall(732);
+	if (rc == -1)
+		fprintf(stdout, "CT09100 progress_fn running on Linux OK\n");
+	else {
+		fprintf(stdout, "CT09100 progress_fn running on McKernel NG\n");
+		exit(1); /* returning here would never set terminate, so main() would loop forever */
+	}
+
+	pthread_mutex_lock(&arg->bar_lock); /* start barrier: wait for main() to bump bar_count */
+	while(arg->bar_count == 0) {
+		pthread_cond_wait(&arg->bar_cond, &arg->bar_lock);
+	}
+	pthread_mutex_unlock(&arg->bar_lock);
+
+	for (i = 0; i < 100; i++) {
+		pthread_mutex_lock(&ep_lock);
+		nevents++;
+		fwq(random() % 100000000, &mem); /* 0 - 0.1 sec */
+		pthread_mutex_unlock(&ep_lock);
+		while (nevents > 0) { /* spin until main() consumes the event */
+			FIXED_SIZE_WORK(&mem);
+		}
+	}
+	terminate = 1; /* polled by the progress loop in main() */
+	return NULL;
+}
+
+int main(int argc, char **argv) {
+	int rc;
+	int i;
+	struct timespec start, end;
+
+	fprintf(stdout, "CT09001 MPI progress thread skelton START\n");
+
+	rc = syscall(732);
+	if (rc == -1)
+		fprintf(stdout, "CT09002 main running on Linux INFO\n");
+	else {
+		fprintf(stdout, "CT09002 main running on McKernel INFO\n");
+	}
+
+	fwq_init(&mem); /* calibrate nspw before the first fwq() call */
+	pthread_mutex_init(&ep_lock, NULL);
+
+	for(i = 0; i < NTHR; i++) {
+		thr_args[i].bar_count = 0;
+		pthread_cond_init(&thr_args[i].bar_cond, NULL);
+		pthread_mutex_init(&thr_args[i].bar_lock, NULL);
+	}
+
+	rc = syscall(731, 1, NULL); /* project syscall; the message below calls it util_indicate_clone */
+	if (rc) {
+		fprintf(stdout, "util_indicate_clone rc=%d, errno=%d\n", rc, errno);
+		fflush(stdout);
+	}
+	for (i = 0; i < NTHR; i++) {
+		rc = pthread_create(&thr_args[i].pthread, NULL, progress_fn, &thr_args[i]);
+		if (rc){
+			fprintf(stdout, "pthread_create: %d\n", rc);
+			exit(1);
+		}
+	}
+	for (i = 0; i < NTHR; i++) { /* release each thread from its start barrier */
+		pthread_mutex_lock(&thr_args[i].bar_lock);
+		thr_args[i].bar_count++;
+		pthread_cond_signal(&thr_args[i].bar_cond);
+		pthread_mutex_unlock(&thr_args[i].bar_lock);
+	}
+
+	fprintf(stdout, "CT09004 pthread_create OK\n");
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
+	/* Start progress */
+	pthread_mutex_lock(&ep_lock);
+	while(1) {
+		if (terminate) {
+			break;
+		}
+
+		/* Event found */
+		if (nevents > 0) {
+			nevents = 0;
+		}
+
+		pthread_mutex_unlock(&ep_lock);
+		fwq(random() % 100000000, &mem); /* 0 - 0.1 sec */
+		pthread_mutex_lock(&ep_lock);
+	}
+	pthread_mutex_unlock(&ep_lock);
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
+
+	for (i = 0; i < NTHR; i++) {
+		pthread_join(thr_args[i].pthread, NULL);
+	}
+	/* TS2NS() yields unsigned long long, hence %llu. */
+	fprintf(stdout, "CT09005 takes %llu nsec INFO\n", TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec));
+	fprintf(stdout, "CT09006 END\n");
+
+	exit(0);
+}
diff --git a/test/uti/CT23.c b/test/uti/CT23.c
new file mode 100644
index 00000000..69a19991
--- /dev/null
+++ b/test/uti/CT23.c
@@ -0,0 +1,212 @@
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <pthread.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <signal.h>
+
+#define DEBUG
+
+#ifdef DEBUG /* debug tracing: prefix each message with the calling function's name */
+#define	dprintf(...)											\
+	do {														\
+		char msg[1024];											\
+		snprintf(msg, sizeof(msg), __VA_ARGS__); /* bounded */	\
+		fprintf(stdout, "%s,%s", __FUNCTION__, msg);			\
+	} while (0) /* no ';' here so the macro works in if/else */
+#define	eprintf(...)											\
+	do {														\
+		char msg[1024];											\
+		snprintf(msg, sizeof(msg), __VA_ARGS__); /* bounded */	\
+		fprintf(stdout, "%s,%s", __FUNCTION__, msg);			\
+	} while (0) /* no ';' here so the macro works in if/else */
+#else
+#define dprintf(...) do {  } while (0)
+#define eprintf(...) do {  } while (0)
+#endif
+
+#define NTHR 1
+#define TS2NS(sec, nsec) ((unsigned long)(sec) * 1000000000ULL + (unsigned long)(nsec))
+#define CALC_DELAY (93000) /* 93   usec */
+#define INIT_DELAY  (2000) /*  2   usec, CPU sends CTS packet */
+#define NIC_DELAY   (3000) /*  3   usec, NIC reads by RDMA-read  */
+#define POLL_DELAY  (200) /*    .2 usec, CPU fetches event queue entry from DRAM */
+#define RESP_DELAY  (2000) /*  2   usec, CPU sends DONE packet and updates MPI_Request */
+#define NSPIN 1
+static inline void FIXED_SIZE_WORK(unsigned long *ptr) {
+	/* One fixed unit of work: load *ptr into %rax, add 1, store it back (x86-64 only). */
+	asm volatile("movq %0, %%rax\n\t"
+		     "addq $1, %%rax\n\t"
+		     "movq %%rax, %0\n\t"
+		     : "+rm" (*ptr)
+		     : /* no inputs */ : "rax", "cc", "memory");
+}
+
+static inline void BULK_FSW(unsigned long n, unsigned long *ptr) {
+	unsigned long j; /* same type as n: an int counter overflows once n exceeds INT_MAX */
+	for (j = 0; j < n; j++) { /* n back-to-back units of fixed-size work on *ptr */
+		FIXED_SIZE_WORK(ptr);
+	}
+}
+
+
+pthread_cond_t ep_cond;
+pthread_mutex_t ep_lock; /* Ownership of channel instance */
+
+struct thr_arg { /* Per-thread state handed to progress_fn() */
+	int bar_count; /* Start barrier: progress_fn() waits until main() makes this non-zero */
+	pthread_mutex_t bar_lock; /* Held while bar_count is read or changed */
+	pthread_cond_t bar_cond; /* Signalled by main() after it bumps bar_count */
+	pthread_t pthread; /* Thread handle filled in by pthread_create() */
+	unsigned long mem; /* Per-thread storage */
+};
+
+struct thr_arg thr_args[NTHR];
+
+unsigned long mem; /* Per-thread storage */
+volatile int nevents;
+volatile int terminate;
+int wps = 1; /* work per sec */
+double nspw; /* nsec per work */
+
+#define N_INIT 10000000
+
+void fwq_init(unsigned long *mem) {
+	/* Calibrate nspw: time N_INIT units of work on this thread's CPU-time clock. */
+	struct timespec start, end;
+	unsigned long nsec;
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
+	BULK_FSW(N_INIT, mem);
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
+	nsec = (TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec));
+	nspw = nsec / (double)N_INIT;
+	printf("nsec=%lu, nspw=%f\n", nsec, nspw); /* nsec is unsigned, hence %lu */
+}
+
+void fwq(unsigned long delay_nsec, unsigned long* mem) {
+	unsigned long nwork = delay_nsec / nspw; /* requested delay expressed in calibrated work units */
+	BULK_FSW(nwork, mem);
+}
+
+void mydelay(long delay_nsec, long *mem) { /* busy-wait for more than delay_nsec ns of thread CPU time */
+	struct timespec start, end;
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
+
+	while (1) {
+		clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
+		if (delay_nsec < 0 || TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec) > (unsigned long)delay_nsec) {
+			break; /* a negative delay would otherwise compare as a huge unsigned value */
+		}
+		FIXED_SIZE_WORK((unsigned long *)mem); /* helper takes unsigned long *; same object representation */
+	}
+}
+
+void *progress_fn(void *_arg) {
+	struct thr_arg *arg = (struct thr_arg *)_arg;
+	int rc;
+
+	rc = syscall(732);
+	if (rc == -1)
+		fprintf(stdout, "CT09100 progress_fn running on Linux OK\n");
+	else {
+		fprintf(stdout, "CT09100 progress_fn running on McKernel NG\n");
+		exit(1); /* returning would leave main() spinning on nevents forever */
+	}
+
+	pthread_mutex_lock(&arg->bar_lock); /* start barrier: wait for main() to bump bar_count */
+	while(arg->bar_count == 0) {
+		pthread_cond_wait(&arg->bar_cond, &arg->bar_lock);
+	}
+	pthread_mutex_unlock(&arg->bar_lock);
+
+	/* Start progress */
+	pthread_mutex_lock(&ep_lock);
+	while(1) {
+		if (terminate) {
+			break;
+		}
+		/* Sleep on ep_cond until main() posts an event. */
+		while(nevents == 0) {
+			pthread_cond_wait(&ep_cond, &ep_lock);
+		}
+		nevents = 0;
+		pthread_mutex_unlock(&ep_lock);
+		fwq(random() % 100000000, &mem); /* 0 - 0.1 sec */
+		pthread_mutex_lock(&ep_lock);
+	}
+	pthread_mutex_unlock(&ep_lock);
+	return NULL;
+}
+
+int main(int argc, char **argv) {
+	int rc;
+	int i;
+	struct timespec start, end;
+
+	fprintf(stdout, "CT09001 MPI progress thread skelton START\n");
+
+	rc = syscall(732);
+	if (rc == -1)
+		fprintf(stdout, "CT09002 main running on Linux INFO\n");
+	else {
+		fprintf(stdout, "CT09002 main running on McKernel INFO\n");
+	}
+
+	fwq_init(&mem); /* calibrate nspw before the first fwq() call */
+	pthread_cond_init(&ep_cond, NULL);
+	pthread_mutex_init(&ep_lock, NULL);
+
+	for(i = 0; i < NTHR; i++) {
+		thr_args[i].bar_count = 0;
+		pthread_cond_init(&thr_args[i].bar_cond, NULL);
+		pthread_mutex_init(&thr_args[i].bar_lock, NULL);
+	}
+
+	rc = syscall(731, 1, NULL); /* project syscall; the message below calls it util_indicate_clone */
+	if (rc) {
+		fprintf(stdout, "util_indicate_clone rc=%d, errno=%d\n", rc, errno);
+		fflush(stdout);
+	}
+	for (i = 0; i < NTHR; i++) {
+		rc = pthread_create(&thr_args[i].pthread, NULL, progress_fn, &thr_args[i]);
+		if (rc){
+			fprintf(stdout, "pthread_create: %d\n", rc);
+			exit(1);
+		}
+	}
+	for (i = 0; i < NTHR; i++) { /* release each thread from its start barrier */
+		pthread_mutex_lock(&thr_args[i].bar_lock);
+		thr_args[i].bar_count++;
+		pthread_cond_signal(&thr_args[i].bar_cond);
+		pthread_mutex_unlock(&thr_args[i].bar_lock);
+	}
+
+	fprintf(stdout, "CT09004 pthread_create OK\n");
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
+	for (i = 0; i < 100; i++) {
+		fwq(random() % 100000000, &mem); /* 0 - 0.1 sec */
+		pthread_mutex_lock(&ep_lock);
+		nevents++;
+		pthread_cond_signal(&ep_cond);
+		pthread_mutex_unlock(&ep_lock);
+		while (nevents > 0) { /* spin until progress_fn() consumes the event */
+			FIXED_SIZE_WORK(&mem);
+		}
+	}
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
+	pthread_mutex_lock(&ep_lock); /* stop request: must also wake a thread asleep on ep_cond */
+	terminate = 1;
+	nevents++; /* progress_fn() only leaves its wait loop when nevents is non-zero */
+	pthread_cond_signal(&ep_cond);
+	pthread_mutex_unlock(&ep_lock);
+	for (i = 0; i < NTHR; i++) {
+		pthread_join(thr_args[i].pthread, NULL);
+	}
+	fprintf(stdout, "CT09005 takes %llu nsec INFO\n", TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec));
+	fprintf(stdout, "CT09006 END\n");
+	exit(0);
+}
diff --git a/test/uti/CT24.c b/test/uti/CT24.c
new file mode 100644
index 00000000..fcde9496
--- /dev/null
+++ b/test/uti/CT24.c
@@ -0,0 +1,210 @@
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <pthread.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <signal.h>
+
+#define DEBUG
+
+#ifdef DEBUG /* debug tracing: prefix each message with the calling function's name */
+#define	dprintf(...)											\
+	do {														\
+		char msg[1024];											\
+		snprintf(msg, sizeof(msg), __VA_ARGS__); /* bounded */	\
+		fprintf(stdout, "%s,%s", __FUNCTION__, msg);			\
+	} while (0) /* no ';' here so the macro works in if/else */
+#define	eprintf(...)											\
+	do {														\
+		char msg[1024];											\
+		snprintf(msg, sizeof(msg), __VA_ARGS__); /* bounded */	\
+		fprintf(stdout, "%s,%s", __FUNCTION__, msg);			\
+	} while (0) /* no ';' here so the macro works in if/else */
+#else
+#define dprintf(...) do {  } while (0)
+#define eprintf(...) do {  } while (0)
+#endif
+
+#define NTHR 1
+#define TS2NS(sec, nsec) ((unsigned long)(sec) * 1000000000ULL + (unsigned long)(nsec))
+#define CALC_DELAY (93000) /* 93   usec */
+#define INIT_DELAY  (2000) /*  2   usec, CPU sends CTS packet */
+#define NIC_DELAY   (3000) /*  3   usec, NIC reads by RDMA-read  */
+#define POLL_DELAY  (200) /*    .2 usec, CPU fetches event queue entry from DRAM */
+#define RESP_DELAY  (2000) /*  2   usec, CPU sends DONE packet and updates MPI_Request */
+#define NSPIN 1
+static inline void FIXED_SIZE_WORK(unsigned long *ptr) {
+	/* One fixed unit of work: load *ptr into %rax, add 1, store it back (x86-64 only). */
+	asm volatile("movq %0, %%rax\n\t"
+		     "addq $1, %%rax\n\t"
+		     "movq %%rax, %0\n\t"
+		     : "+rm" (*ptr)
+		     : /* no inputs */ : "rax", "cc", "memory");
+}
+
+static inline void BULK_FSW(unsigned long n, unsigned long *ptr) {
+	unsigned long j; /* same type as n: an int counter overflows once n exceeds INT_MAX */
+	for (j = 0; j < n; j++) { /* n back-to-back units of fixed-size work on *ptr */
+		FIXED_SIZE_WORK(ptr);
+	}
+}
+
+
+pthread_cond_t ep_cond;
+pthread_mutex_t ep_lock; /* Ownership of channel instance */
+
+struct thr_arg { /* Per-thread state handed to progress_fn() */
+	int bar_count; /* Start barrier: progress_fn() waits until main() makes this non-zero */
+	pthread_mutex_t bar_lock; /* Held while bar_count is read or changed */
+	pthread_cond_t bar_cond; /* Signalled by main() after it bumps bar_count */
+	pthread_t pthread; /* Thread handle filled in by pthread_create() */
+	unsigned long mem; /* Per-thread storage */
+};
+
+struct thr_arg thr_args[NTHR];
+
+unsigned long mem; /* Per-thread storage */
+volatile int nevents;
+volatile int terminate;
+int wps = 1; /* work per sec */
+double nspw; /* nsec per work */
+
+#define N_INIT 10000000
+
+void fwq_init(unsigned long *mem) {
+	/* Calibrate nspw: time N_INIT units of work on this thread's CPU-time clock. */
+	struct timespec start, end;
+	unsigned long nsec;
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
+	BULK_FSW(N_INIT, mem);
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
+	nsec = (TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec));
+	nspw = nsec / (double)N_INIT;
+	printf("nsec=%lu, nspw=%f\n", nsec, nspw); /* nsec is unsigned, hence %lu */
+}
+
+void fwq(unsigned long delay_nsec, unsigned long* mem) {
+	unsigned long nwork = delay_nsec / nspw; /* requested delay expressed in calibrated work units */
+	BULK_FSW(nwork, mem);
+}
+
+void mydelay(long delay_nsec, long *mem) { /* busy-wait for more than delay_nsec ns of thread CPU time */
+	struct timespec start, end;
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
+
+	while (1) {
+		clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
+		if (delay_nsec < 0 || TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec) > (unsigned long)delay_nsec) {
+			break; /* a negative delay would otherwise compare as a huge unsigned value */
+		}
+		FIXED_SIZE_WORK((unsigned long *)mem); /* helper takes unsigned long *; same object representation */
+	}
+}
+
+void *progress_fn(void *_arg) {
+	struct thr_arg *arg = (struct thr_arg *)_arg;
+	int rc, i;
+
+	rc = syscall(732);
+	if (rc != -1) {
+		fprintf(stdout, "CT09100 progress_fn running on McKernel NG\n");
+		exit(1); /* returning would leave main() asleep on ep_cond forever */
+	}
+	fprintf(stdout, "CT09100 progress_fn running on Linux OK\n");
+	pthread_mutex_lock(&arg->bar_lock); /* start barrier: wait for main() to bump bar_count */
+	while(arg->bar_count == 0) {
+		pthread_cond_wait(&arg->bar_cond, &arg->bar_lock);
+	}
+	pthread_mutex_unlock(&arg->bar_lock);
+	for (i = 0; i < 100; i++) {
+		fwq(random() % 100000000, &mem); /* 0 - 0.1 sec */
+		pthread_mutex_lock(&ep_lock);
+		nevents++;
+		pthread_cond_signal(&ep_cond);
+		pthread_mutex_unlock(&ep_lock);
+		while (nevents > 0) { /* spin until main() consumes the event */
+			FIXED_SIZE_WORK(&mem);
+		}
+	}
+	pthread_mutex_lock(&ep_lock); /* stop request: must also wake main() if it sleeps on ep_cond */
+	terminate = 1;
+	nevents++; /* main() only leaves its wait loop when nevents is non-zero */
+	pthread_cond_signal(&ep_cond);
+	pthread_mutex_unlock(&ep_lock);
+	return NULL;
+}
+
+int main(int argc, char **argv) {
+	int rc;
+	int i;
+	struct timespec start, end;
+
+	fprintf(stdout, "CT09001 MPI progress thread skelton START\n");
+
+	rc = syscall(732);
+	if (rc == -1)
+		fprintf(stdout, "CT09002 main running on Linux INFO\n");
+	else {
+		fprintf(stdout, "CT09002 main running on McKernel INFO\n");
+	}
+
+	fwq_init(&mem); /* calibrate nspw before the first fwq() call */
+	pthread_cond_init(&ep_cond, NULL); /* was never initialised although it is waited on below */
+	pthread_mutex_init(&ep_lock, NULL);
+
+	for(i = 0; i < NTHR; i++) {
+		thr_args[i].bar_count = 0;
+		pthread_cond_init(&thr_args[i].bar_cond, NULL);
+		pthread_mutex_init(&thr_args[i].bar_lock, NULL);
+	}
+
+	rc = syscall(731, 1, NULL); /* project syscall; the message below calls it util_indicate_clone */
+	if (rc) {
+		fprintf(stdout, "util_indicate_clone rc=%d, errno=%d\n", rc, errno);
+		fflush(stdout);
+	}
+	for (i = 0; i < NTHR; i++) {
+		rc = pthread_create(&thr_args[i].pthread, NULL, progress_fn, &thr_args[i]);
+		if (rc){
+			fprintf(stdout, "pthread_create: %d\n", rc);
+			exit(1);
+		}
+	}
+	for (i = 0; i < NTHR; i++) { /* release each thread from its start barrier */
+		pthread_mutex_lock(&thr_args[i].bar_lock);
+		thr_args[i].bar_count++;
+		pthread_cond_signal(&thr_args[i].bar_cond);
+		pthread_mutex_unlock(&thr_args[i].bar_lock);
+	}
+
+	fprintf(stdout, "CT09004 pthread_create OK\n");
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
+	/* Start progress */
+	pthread_mutex_lock(&ep_lock);
+	while(1) {
+		if (terminate) {
+			break;
+		}
+		while(nevents == 0) {
+			pthread_cond_wait(&ep_cond, &ep_lock);
+		}
+		nevents = 0;
+		pthread_mutex_unlock(&ep_lock);
+		fwq(random() % 100000000, &mem); /* 0 - 0.1 sec */
+		pthread_mutex_lock(&ep_lock);
+	}
+	pthread_mutex_unlock(&ep_lock);
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
+
+	for (i = 0; i < NTHR; i++) {
+		pthread_join(thr_args[i].pthread, NULL);
+	}
+	fprintf(stdout, "CT09005 takes %llu nsec INFO\n", TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec)); /* TS2NS() is unsigned long long */
+	fprintf(stdout, "CT09006 END\n");
+
+	exit(0);
+}
diff --git a/test/uti/CT25.c b/test/uti/CT25.c
new file mode 100644
index 00000000..1aa5dd4d
--- /dev/null
+++ b/test/uti/CT25.c
@@ -0,0 +1,163 @@
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <pthread.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <sys/syscall.h>   /* For SYS_xxx definitions */
+#include <linux/futex.h>
+#include <sys/time.h>
+#include <string.h>
+#include <semaphore.h>
+
+pthread_t thr;
+
+unsigned long mem; /* delay functions issue ld/st instructions on this address */
+double nspw; /* nsec per work */
+
+sem_t sem_kick, sem_report;
+int nentry, szentry;
+char **sendv, **recvv;
+
+
+
+/* Timer related macros */
+#define TS2NS(sec, nsec) ((unsigned long)(sec) * 1000000000ULL + (unsigned long)(nsec))
+#define N_INIT 10000000
+
+static inline void fixed_size_work(unsigned long *ptr) {
+    /* One fixed unit of work: load *ptr into %rax, add 1, store it back (x86-64 only). */
+    asm volatile("movq %0, %%rax\n\t"
+                 "addq $1, %%rax\n\t"
+                 "movq %%rax, %0\n\t"
+                 : "+rm" (*ptr)
+                 : /* no inputs */ : "rax", "cc", "memory");
+}
+
+static inline void delay_loop(unsigned long n, unsigned long *ptr) {
+    unsigned long j; /* same type as n: an int counter overflows once n exceeds INT_MAX */
+    for (j = 0; j < n; j++) { /* n back-to-back units of fixed-size work on *ptr */
+        fixed_size_work(ptr);
+    }
+}
+
+void delay_init(unsigned long *mem) {
+	/* Calibrate nspw: time N_INIT units of work on this thread's CPU-time clock. */
+	struct timespec start, end;
+	unsigned long nsec;
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
+	delay_loop(N_INIT, mem);
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
+	nsec = (TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec));
+	nspw = nsec / (double)N_INIT;
+	printf("nsec=%lu, nspw=%f\n", nsec, nspw); /* nsec is unsigned, hence %lu */
+}
+
+void delay_nsec(unsigned long delay_nsec, unsigned long* mem) {
+	unsigned long nwork = delay_nsec / nspw; /* requested delay expressed in calibrated work units */
+	delay_loop(nwork, mem);
+}
+
+void *util_thread(void *arg) {
+	int where; /* result of the syscall-732 probe */
+	int idx;
+
+	where = syscall(732);
+	if (where != -1) {
+		fprintf(stderr, "CT25101 running on Linux CPU NG (%d)\n", where);
+	} else {
+		fprintf(stderr, "CT25101 running on Linux CPU OK\n");
+	}
+
+	sem_wait(&sem_kick); /* posted by main() once the thread has been created */
+
+	/* Cause remote page fault */
+	for (idx = 0; idx < nentry; idx++) {
+		memset(recvv[idx], 0, szentry);
+	}
+
+	sem_post(&sem_report); /* tell main() every receive buffer has been touched */
+
+	return NULL;
+}
+
+pid_t gettid(void)
+{
+	return (pid_t)syscall(SYS_gettid); /* raw syscall; its long result is narrowed to pid_t */
+}
+
+int
+main(int argc, char **argv)
+{
+	int ret = 0;
+	int rc;
+	int i;
+	pthread_attr_t attr;
+
+	if (argc == 3) {
+		int log2sz = atoi(argv[1]);
+		szentry = (log2sz >= 0 && log2sz < 31) ? (1 << log2sz) : 0; /* keep the shift inside int range */
+		nentry = atoi(argv[2]);
+	}
+
+	if (argc != 3 || szentry == 0 || nentry < 0) {
+		fprintf(stderr, "usage: CT25 <log-size of one buffer entry> <# of entries>\n");
+		goto fn_fail;
+	}
+
+    sem_init(&sem_kick, 0, 0);
+    sem_init(&sem_report, 0, 0);
+
+	fprintf(stderr, "CT25001 START\n");
+	fprintf(stderr, "CT25001 INFO (pid=%d,tid=%d)\n", getpid(), gettid());
+
+	sendv = malloc(sizeof(char *) * nentry);
+	if(!sendv) { printf("malloc failed"); goto fn_fail; }
+	for (i = 0; i < nentry; i++) {
+		sendv[i] = (char*)mmap(0, szentry, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+		if(sendv[i] == MAP_FAILED) { printf("mmap failed"); goto fn_fail; }
+		memset(sendv[i], 0xaa, szentry);
+	}
+
+	recvv = malloc(sizeof(char *) * nentry);
+	if(!recvv) { printf("malloc failed"); goto fn_fail; }
+	for (i = 0; i < nentry; i++) {
+		recvv[i] = (char*)mmap(0, szentry, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+		if(recvv[i] == MAP_FAILED) { printf("mmap failed"); goto fn_fail; }
+		memset(recvv[i], 0, szentry);
+	}
+
+	rc = syscall(731, 1, NULL); /* project syscall; the messages below call it util_indicate_clone */
+	if (rc) {
+		fprintf(stderr, "CT25002 util_indicate_clone INFO (rc=%d, errno=%d)\n", rc, errno);
+	} else {
+		fprintf(stderr, "CT25002 util_indicate_clone OK\n");
+	}
+
+	pthread_attr_init(&attr);
+	rc = pthread_create(&thr, &attr, util_thread, NULL);
+	if (rc){
+		fprintf(stderr, "pthread_create: %d\n", rc);
+		exit(1);
+	}
+	fprintf(stderr, "CT25002 pthread_create OK\n");
+
+	sem_post(&sem_kick); /* let util_thread touch the receive buffers */
+	sem_wait(&sem_report); /* ...and wait until it has finished */
+
+	pthread_join(thr, NULL);
+
+	fprintf(stderr, "CT25003 END\n");
+	ret = 0;
+
+ fn_exit:
+	exit(ret);
+
+ fn_fail:
+	ret = 1; /* every failure path must exit non-zero, not only the usage error */
+	goto fn_exit;
+}
diff --git a/test/uti/CT26.c b/test/uti/CT26.c
new file mode 100644
index 00000000..4ca3a8b8
--- /dev/null
+++ b/test/uti/CT26.c
@@ -0,0 +1,139 @@
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <pthread.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <sys/syscall.h>   /* For SYS_xxx definitions */
+#include <linux/futex.h>
+#include <sys/time.h>
+#include <string.h>
+#include <signal.h>
+
+int passed = 0;
+pthread_t thr;
+
+unsigned long mem; /* delay functions issue ld/st instructions on this address */
+double nspw; /* nsec per work */
+
+/* Timer related macros */
+#define TS2NS(sec, nsec) ((unsigned long)(sec) * 1000000000ULL + (unsigned long)(nsec))
+#define N_INIT 10000000
+
+/* One unit of delay work: load *ptr into rax, add 1, and store the result back. */
+static inline void fixed_size_work(unsigned long *ptr) {
+    asm volatile(
+        "movq %0, %%rax\n\t"
+        "addq $1, %%rax\n\t"
+        "movq %%rax, %0\n\t"
+        : "+rm" (*ptr) : : "rax", "cc", "memory");
+}
+
+/* Run fixed_size_work() on *ptr n times; the index has n's type because an int index overflows (UB) once n exceeds INT_MAX. */
+static inline void delay_loop(unsigned long n, unsigned long *ptr) {
+    for (unsigned long j = 0; j < n; j++) {
+        fixed_size_work(ptr);
+    }
+}
+
+/* Calibrate nspw: time N_INIT work units on the thread CPU-time clock and store nanoseconds per unit. */
+void delay_init(unsigned long *mem) {
+	struct timespec start, end;
+	unsigned long nsec;
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
+	delay_loop(N_INIT, mem);
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
+	nsec = (TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec));
+	nspw = nsec / (double)N_INIT;
+	printf("nsec=%lu, nspw=%f\n", nsec, nspw); /* nsec is unsigned long, so %lu rather than %ld */
+}
+
+void delay_nsec(unsigned long delay_nsec, unsigned long* mem) { /* busy-loop for roughly delay_nsec ns of work on *mem */
+	if (nspw <= 0.0) return; /* nspw not calibrated by delay_init(): the division gives inf/NaN, which cannot be converted to a loop count */
+	delay_loop(delay_nsec / nspw, mem);
+}
+
+void halt(int sig) { /* signal handler (main() installs it for SIGINT): never returns */
+	while(1) { } /* spin forever inside the handler */
+}
+
+/* Thread body: probe syscall 732, raise `passed` for main(), then probe syscall 888; always returns NULL. */
+void *util_thread(void *arg) {
+	int rc;
+	fprintf(stderr, "CT12101 enter OK\n");
+	rc = syscall(732);
+	/* syscall() returns long: cast the tid so it matches %d (passing a long for %d is undefined). */
+	if (rc == -1)
+		fprintf(stderr, "CT12102 running on Linux CPU OK (tid=%d)\n", (int)syscall(__NR_gettid));
+	else {
+		fprintf(stderr, "CT12102 running on Linux CPU NG (tid=%d,rc=%d)\n", (int)syscall(__NR_gettid), rc);
+	}
+
+	passed = 1; /* main() spins on this flag before it sleeps and joins */
+
+	rc = syscall(888);
+	if (rc != -1) {
+		fprintf(stderr, "CT12103 syscall(888) OK (%x)\n", rc);
+	} else {
+		fprintf(stderr, "CT12103 syscall(888) NG (%x)\n", rc);
+	}
+	return NULL;
+}
+
+/* CT26 driver: install the spinning SIGINT handler, start util_thread, wait for `passed`, sleep 200 ms, then join. */
+int
+main(int argc, char **argv)
+{
+	int rc;
+	pthread_attr_t attr;
+	struct sigaction act;
+	/* syscall() returns long; cast so the argument matches %d. */
+	fprintf(stderr, "CT12001 futex START (tid=%d)\n", (int)syscall(__NR_gettid));
+#if 0
+	rc = syscall(731, 1, NULL);
+	if (rc) {
+		fprintf(stderr, "util_indicate_clone rc=%d, errno=%d\n", rc, errno);
+		fflush(stderr);
+	}
+#endif
+	sigaction(SIGINT, NULL, &act); /* start from the current SIGINT disposition */
+	act.sa_handler = halt;
+	act.sa_flags &= ~(SA_RESTART);
+	sigaction(SIGINT, &act, NULL);
+
+	rc = pthread_attr_init(&attr);
+	if (rc){
+		fprintf(stderr, "pthread_attr_init: %d\n", rc);
+		exit(1);
+	}
+#if 0
+	rc = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
+	if (rc){
+		fprintf(stderr, "pthread_attr_setdetachstate: %d\n", rc);
+		exit(1);
+	}
+#endif
+	rc = pthread_create(&thr, &attr, util_thread, NULL);
+	if (rc){
+		fprintf(stderr, "pthread_create: %d\n", rc);
+		exit(1);
+	}
+	fprintf(stderr, "CT12002 pthread_create OK\n");
+
+#if 1
+	while (!passed) { /* busy-wait until util_thread has set the flag */
+		asm volatile("pause" ::: "memory"); 
+	}
+#endif
+	usleep(200000);
+
+#if 1
+	pthread_join(thr, NULL);
+	fprintf(stderr, "CT12004 pthread_join OK\n");
+#endif
+	exit(0);
+}
diff --git a/test/uti/CT27.c b/test/uti/CT27.c
new file mode 100644
index 00000000..65df96df
--- /dev/null
+++ b/test/uti/CT27.c
@@ -0,0 +1,497 @@
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <pthread.h>
+#include <errno.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/shm.h>
+#include <fcntl.h>
+#include <signal.h>
+
+#define DEBUG
+/* Debug helpers: format into a bounded buffer and print "<function>,<message>" to stdout; they expand to nothing useful without DEBUG. */
+#ifdef DEBUG
+#define	dprintf(...)											\
+	do {														\
+		char msg[1024];											\
+		snprintf(msg, sizeof(msg), __VA_ARGS__); /* bounded: an over-long message is truncated instead of overflowing msg */ \
+		fprintf(stdout, "%s,%s", __FUNCTION__, msg);			\
+	} while (0) /* no trailing ';': the caller supplies it, which keeps the macro usable inside if/else */
+#define	eprintf(...)											\
+	do {														\
+		char msg[1024];											\
+		snprintf(msg, sizeof(msg), __VA_ARGS__);				\
+		fprintf(stdout, "%s,%s", __FUNCTION__, msg);			\
+	} while (0)
+#else
+#define dprintf(...) do {  } while (0)
+#define eprintf(...) do {  } while (0)
+#endif
+
+#define NPROC 1
+#define MAX_NOPS 10
+int NOPS=1;/* RDMA:1, accumulate:10 */
+#define TS2NS(sec, nsec) ((unsigned long)(sec) * 1000000000ULL + (unsigned long)(nsec))
+#define CALC_CPU  (100000)  /* 100,000 nsec, CPU time for calculation */
+#define I2R_OCC     ( 200)  /*  200 nsec, occupation time for sending AM packet */
+#define I2R_NET     (1000)  /*  1,000   nsec, Network time for packet to arrive at responder  */
+int R2I_OCC=    (10200/*400*/);  /*  RDMA:10,200 nsec, accumulate:400ns, occupation time for performing accumulate or RDMA-RD and sending ACK packet. Note that 10GB/s means 100KB/10,000 ns */
+#define R2I_NET     (1000)  /*  1000   nsec, Network time for packet to arrive at initiator */
+#define POLL_CPU       ( 200) /*  200 nsec, CPU time for checking DRAM event queue */
+#define REQ_UPDATE_CPU ( 200) /*  200 nsec, CPU time for updates MPI_Request */
+#define NSPIN 1
+static inline void fixed_size_work() { /* one fixed unit of busy work; bulk_fsw() repeats it */
+	asm volatile(
+	    "movq $0, %%rcx\n\t" /* rcx = 0 */
+		"1:\t"
+		"addq $1, %%rcx\n\t" /* rcx++ */
+		"cmpq $99, %%rcx\n\t"
+		"jle 1b\n\t" /* repeat while rcx <= 99, i.e. 100 additions in total */
+		:
+		: 
+		: "rcx", "cc"); /* no operands; rcx and the flags are declared clobbered */
+}
+
+/* Call fixed_size_work() n times; the index has n's type because an int index overflows (UB) once n exceeds INT_MAX. */
+static inline void bulk_fsw(unsigned long n) {
+	for (unsigned long j = 0; j < n; j++) {
+		fixed_size_work();
+	}
+}
+
+struct thr_arg { /* state shared between parent_fn() and its progress_fn() thread */
+	int rank;
+	volatile int bar_count; /* Barrier before entering loop */
+	pthread_mutex_t bar_lock;
+	pthread_cond_t bar_cond;
+	pthread_t pthread;
+
+	pthread_mutex_t ep_lock; /* mutex for endpoint manipulation */
+	volatile long ini_ev[MAX_NOPS]; /* due time (ns) of events consumed by progress_initiator(); 0 = nothing pending */
+	volatile long res_ev[MAX_NOPS]; /* due time (ns) of events consumed by progress_responder(); 0 = nothing pending */
+	volatile int terminate; /* set by parent_fn() to end the progress_fn() loop */
+	long ini_busy; /* Initiator is busy sending AM packet or RTS packet etc. */
+	long res_busy; /* Responder is busy doing accumulate or RDMA-RD etc. */
+};
+
+struct per_proc { /* one entry per rank, stored inside struct proc_glb */
+	int rank;
+	struct thr_arg thr_arg;
+	long nsec; /* elapsed ns of the timed loop in parent_fn(); main() reports the maximum */
+
+};
+
+struct proc_glb { /* area mapped by main(); its barrier objects are initialised as PTHREAD_PROCESS_SHARED */
+	struct per_proc per_procs[NPROC];
+	volatile int bar_count; /* ranks that have reached the start barrier in parent_fn() */
+	pthread_mutex_t bar_lock;
+	pthread_cond_t bar_cond;
+};
+
+struct proc_glb *proc_glb;
+
+unsigned long mem; /* Per-thread storage */
+int wps = 1; /* work per sec */
+double nspw; /* nsec per work */
+
+#define N_INIT 10000000
+
+/* Calibrate nspw: time N_INIT work units on the thread CPU-time clock and store nanoseconds per unit. */
+void fwq_init() {
+	struct timespec start, end;
+	unsigned long nsec;
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
+	bulk_fsw(N_INIT);
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
+	nsec = (TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec));
+	nspw = nsec / (double)N_INIT;
+	printf("nsec=%lu, nspw=%f\n", nsec, nspw); /* nsec is unsigned long, so %lu rather than %ld */
+}
+
+void fwq(long delay_nsec) { /* busy-wait roughly delay_nsec ns, scaled by the nspw calibration from fwq_init() */
+	if (delay_nsec < 0) {
+		printf("%s: delay_nsec<0\n", __FUNCTION__);
+		return; /* a negative double converted to unsigned long below would be undefined */
+	}
+	bulk_fsw(delay_nsec / nspw);
+}
+/* Responder side of the model: handle res_ev[] entries that are due. Returns 1 if any entry was handled, else 0. */
+int progress_responder(struct thr_arg *thr_arg) {
+	int ret = 0;
+	int j;
+	struct timespec now_ts;
+	long now_long;
+	clock_gettime(CLOCK_REALTIME, &now_ts);
+	now_long = TS2NS(now_ts.tv_sec, now_ts.tv_nsec);
+
+	pthread_mutex_lock(&thr_arg->ep_lock); /* This lock is for consistency */
+	for (j = 0; j < NOPS; j++) { /* due = responder idle, slot armed (non-zero) and its timestamp has passed */
+		if (thr_arg->res_busy <= now_long && thr_arg->res_ev[j] && thr_arg->res_ev[j] <= now_long) {
+			thr_arg->ini_ev[j] = now_long + R2I_OCC + R2I_NET; /* schedule the matching event for progress_initiator() */
+			thr_arg->res_ev[j] = 0;
+			thr_arg->res_busy = now_long + R2I_OCC; /* responder is busy for AM or RDMA-RD etc. */
+			ret = 1;
+		}
+	}
+	pthread_mutex_unlock(&thr_arg->ep_lock);
+	return ret;
+}
+
+/* Initiator side of the model: consume ini_ev[] entries that are due. Returns 1 if any entry was consumed, else 0. */
+int progress_initiator(struct thr_arg* thr_arg) {
+	int ret = 0;
+	int j;
+	struct timespec now_ts;
+	long now_long;
+	clock_gettime(CLOCK_REALTIME, &now_ts);
+	now_long = TS2NS(now_ts.tv_sec, now_ts.tv_nsec);
+
+	pthread_mutex_lock(&thr_arg->ep_lock);
+	for (j = 0; j < NOPS; j++) {
+		if (thr_arg->ini_busy <= now_long && thr_arg->ini_ev[j] && thr_arg->ini_ev[j] <= now_long) {
+			fwq(POLL_CPU); /* Account for cache miss */
+			fwq(REQ_UPDATE_CPU);
+			now_long += POLL_CPU + REQ_UPDATE_CPU; /* advance the local timestamp by the CPU time just modelled */
+			thr_arg->ini_ev[j] = 0; /* Event is consumed */
+			thr_arg->ini_busy = now_long;
+			ret = 1;
+		}
+	}
+	pthread_mutex_unlock(&thr_arg->ep_lock);
+	return ret;
+}
+
+/* Progress thread: report where it runs, meet parent_fn() at the 2-party barrier, then poll both endpoints until terminate is set. */
+void *progress_fn(void *arg) {
+	struct thr_arg *thr_arg = (struct thr_arg *)arg;
+	int rc;
+	int spin_count = 0;
+
+	rc = syscall(732); /* the messages below treat a -1 result as "running on Linux" */
+	if (rc == -1)
+		fprintf(stdout, "CT09100 progress_fn running on Linux OK\n");
+	else {
+		/* The stray rc argument was dropped: the format string has no conversion for it. */
+		fprintf(stdout, "CT09100 progress_fn running on McKernel NG\n");
+	}
+
+	printf("progress,enter,rank=%d\n", thr_arg->rank);
+
+	/* Barrier with parent_fn(): the second arrival broadcasts, the first waits until bar_count is 2. */
+	pthread_mutex_lock(&thr_arg->bar_lock);
+	thr_arg->bar_count++;
+	if (thr_arg->bar_count == 2) {
+		if ((rc = pthread_cond_broadcast(&thr_arg->bar_cond))) {
+			printf("[%d] pthread_cond_broadcast failed,rc=%d\n", thr_arg->rank, rc);
+		}
+	}
+	while (thr_arg->bar_count != 2) {
+		if ((rc = pthread_cond_wait(&thr_arg->bar_cond, &thr_arg->bar_lock))) {
+			printf("[%d] pthread_cond_wait failed,rc=%d\n", thr_arg->rank, rc);
+		}
+    }
+	pthread_mutex_unlock(&thr_arg->bar_lock);
+
+	printf("[%d] progress,after barrier\n", thr_arg->rank);
+	/* Define NO_ASYNC to make this thread return right after the barrier. */
+#ifdef NO_ASYNC
+	return NULL;
+#endif
+	/* Start progress */
+	while(1) {
+		if (thr_arg->terminate) {
+			break;
+		}
+
+		if (progress_responder(thr_arg)) {
+			/* responder progressed; nothing further to do */
+		}
+
+		if (progress_initiator(thr_arg)) {
+			/* initiator progressed; nothing further to do */
+		}
+
+		spin_count++;
+		if (spin_count >= NSPIN) {
+			spin_count = 0;
+			sched_yield();
+		}
+	}
+	printf("progress,exit,rank=%d\n", thr_arg->rank);
+	return NULL;
+}
+
+/* Per-rank body: join the cross-process start barrier, start progress_fn(), run the timed loop and store the elapsed ns in per_proc->nsec. */
+void parent_fn(struct per_proc *per_proc) {
+	int i, j;
+	int rc;
+	char* uti_str;
+	int uti_val;
+	struct timespec start, end;
+	pthread_condattr_t condattr;
+	pthread_mutexattr_t mutexattr;
+	struct timespec now_ts;
+	long now_long;
+
+	printf("[%d] parent_fn,enter,proc_glb=%p,bar_count=%d\n", per_proc->rank, (void *)proc_glb, proc_glb->bar_count); /* %p expects void * */
+
+	pthread_mutex_lock(&proc_glb->bar_lock);
+	proc_glb->bar_count++;
+	if (proc_glb->bar_count == NPROC) {
+		if ((rc = pthread_cond_broadcast(&proc_glb->bar_cond))) {
+			printf("[%d] pthread_cond_broadcast failed,rc=%d\n", per_proc->rank, rc);
+		}
+	}
+	while (proc_glb->bar_count != NPROC) {
+		if ((rc = pthread_cond_wait(&proc_glb->bar_cond, &proc_glb->bar_lock))) {
+			printf("[%d] pthread_cond_wait failed,rc=%d\n", per_proc->rank, rc);
+		}
+    }
+	pthread_mutex_unlock(&proc_glb->bar_lock);
+
+	/* Endpoint lock and 2-party barrier shared with the progress thread (pshared is deliberately left unset). */
+	pthread_mutexattr_init(&mutexattr);
+	//pthread_mutexattr_setpshared(&mutexattr, PTHREAD_PROCESS_SHARED);
+	pthread_mutex_init(&per_proc->thr_arg.ep_lock, &mutexattr);
+
+	per_proc->thr_arg.bar_count = 0;
+
+	pthread_condattr_init(&condattr);
+	//pthread_condattr_setpshared(&condattr, PTHREAD_PROCESS_SHARED);
+	pthread_cond_init(&per_proc->thr_arg.bar_cond, &condattr);
+
+	pthread_mutexattr_init(&mutexattr);
+	//pthread_mutexattr_setpshared(&mutexattr, PTHREAD_PROCESS_SHARED);
+	pthread_mutex_init(&per_proc->thr_arg.bar_lock, &mutexattr);
+
+	uti_str = getenv("DISABLE_UTI");
+	uti_val = uti_str ? atoi(uti_str) : 0;
+	if (!uti_val) {
+		rc = syscall(731, 1, NULL);
+		if (rc) {
+			fprintf(stdout, "CT09003 INFO: uti not available (rc=%d)\n", rc);
+		} else {
+			fprintf(stdout, "CT09003 INFO: uti available\n");
+		}
+	} else {
+		fprintf(stdout, "CT09003 INFO: uti disabled\n"); /* stray rc argument dropped: the format has no conversion */
+	}
+
+	per_proc->thr_arg.rank = per_proc->rank;
+	rc = pthread_create(&per_proc->thr_arg.pthread, NULL, progress_fn, &per_proc->thr_arg);
+	if (rc){
+		fprintf(stdout, "pthread_create: %d\n", rc);
+		exit(1);
+	}
+	
+	pthread_mutex_lock(&per_proc->thr_arg.bar_lock);
+	per_proc->thr_arg.bar_count++;
+	if (per_proc->thr_arg.bar_count == 2) {
+		if ((rc = pthread_cond_broadcast(&per_proc->thr_arg.bar_cond))) {
+			printf("[%d] pthread_cond_broadcast failed,rc=%d\n", per_proc->rank, rc);
+		}
+	}
+	while (per_proc->thr_arg.bar_count != 2) {
+		if ((rc = pthread_cond_wait(&per_proc->thr_arg.bar_cond, &per_proc->thr_arg.bar_lock))) {
+			printf("[%d] pthread_cond_wait failed,rc=%d\n", per_proc->rank, rc);
+		}
+    }
+	pthread_mutex_unlock(&per_proc->thr_arg.bar_lock);
+
+	printf("[%d] parent,after barrier\n", per_proc->rank);
+	/* Timed section: 10000 rounds of "post NOPS requests, compute, reap every outstanding event". */
+
+	//#define TIMER_KIND CLOCK_THREAD_CPUTIME_ID
+#define TIMER_KIND CLOCK_REALTIME
+	clock_gettime(TIMER_KIND, &start);
+	for (i = 0; i < 10000; i++) { /* It takes 1 sec */
+
+		/* Send request-to-send packet */
+		clock_gettime(CLOCK_REALTIME, &now_ts);
+		now_long = TS2NS(now_ts.tv_sec, now_ts.tv_nsec);
+	
+		for (j = 0; j < NOPS; j++) {
+			pthread_mutex_lock(&per_proc->thr_arg.ep_lock); /* Lock is taken per MPI_Accumulate() */
+			fwq(I2R_OCC);
+			now_long += I2R_OCC;
+			per_proc->thr_arg.res_ev[j] = now_long + I2R_NET;
+			per_proc->thr_arg.ini_busy = now_long;
+			/* res_ev[j] is handled by progress_responder() once that timestamp has passed */
+			pthread_mutex_unlock(&per_proc->thr_arg.ep_lock);
+		}
+
+		/* Start calculation */
+		fwq(CALC_CPU);
+
+		/* Progress responder and initiator */
+		int more_reap_needed;
+		while (1) {
+			if (progress_responder(&per_proc->thr_arg)) {
+				/* responder progressed; nothing further to do */
+			}
+
+			if (progress_initiator(&per_proc->thr_arg)) {
+				/* initiator progressed; nothing further to do */
+			}
+
+			more_reap_needed = 0;
+			for (j = 0; j < NOPS; j++) {
+				if (per_proc->thr_arg.res_ev[j] || per_proc->thr_arg.ini_ev[j]) {
+					more_reap_needed = 1;
+					break;
+				}
+			}
+			if (!more_reap_needed) {
+				break;
+			}
+		}
+	}
+	clock_gettime(TIMER_KIND, &end);
+	
+	per_proc->thr_arg.terminate = 1; /* ask progress_fn() to leave its loop */
+	pthread_join(per_proc->thr_arg.pthread, NULL);
+
+	per_proc->nsec = TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec);
+}
+
+static struct option options[] = { /* long options handed to getopt_long() in main() */
+	{
+		.name =		"ppn",
+		.has_arg =	required_argument,
+		.flag =		NULL,
+		.val =		'P', /* NOTE(review): main()'s switch has no case 'P', so --ppn ends in the default (usage) branch - confirm intent */
+	},
+	/* end */
+	{ NULL, 0, NULL, 0, },
+};
+
+/* CT27 driver: parse -a/-r, calibrate fwq(), map the shared proc_glb area, run parent_fn() for every rank and print the slowest time. */
+int main(int argc, char **argv) {
+	int rc;
+	int i;
+	int st;
+	pid_t pid;
+	long max;
+	pthread_condattr_t condattr;
+	pthread_mutexattr_t mutexattr;
+	int fd;
+	key_t key = ftok(argv[0], 0);
+	int shmid;
+	int opt;
+
+	while ((opt = getopt_long(argc, argv, "+ar", options, NULL)) != -1) {
+		switch (opt) {
+		case 'a': /* accumulate */
+			NOPS = 10; /* ten accumulates */
+			R2I_OCC = 400; /* 200 ns to accumulate, 200 ns to send ACK */
+			break;
+		case 'r':
+			NOPS = 6; /* 3D stencil, RDMA */
+			R2I_OCC = 10200; /* 10000 ns to RDMA-RD, 200 ns to send DONE */
+				break;
+		default: /* '?' */
+			printf("usage: [-a] [-r]");
+			exit(1);
+		}
+	}
+
+	fprintf(stdout, "CT09001 MPI progress thread skelton START\n");
+
+	rc = syscall(732);
+	if (rc == -1)
+		fprintf(stdout, "CT09002 main running on Linux INFO\n");
+	else {
+		fprintf(stdout, "CT09002 main running on McKernel INFO\n");
+	}
+
+	fwq_init();
+
+#define SHMPOSIX 1
+#define SHMSYSV 2
+#define SHMANON 3
+#define SHM_METHOD SHMPOSIX
+#if SHM_METHOD==SHMPOSIX
+	printf("posix1\n");
+	if((fd = shm_open("/CT27", O_RDWR | O_CREAT, 0644)) == -1) {
+		fprintf(stdout, "shm_open failed\n");
+		exit(1); /* do not carry on with fd == -1 */
+	}
+	if(ftruncate(fd, sizeof(struct proc_glb))) {
+		fprintf(stdout, "ftruncate failed\n");
+		exit(1); /* the object may be too small to back the mapping */
+	}
+	proc_glb = mmap(0, sizeof(struct proc_glb), PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+	if (proc_glb == (void*)-1) {
+		fprintf(stdout, "mmap failed\n");
+		exit(1);
+	}
+#elif SHM_METHOD==SHMSYSV
+	printf("sysv1\n");
+    if ((shmid = shmget(key, sizeof(struct proc_glb), IPC_CREAT | 0660)) == -1) {
+		fprintf(stdout, "shmget failed: %s\n", strerror(errno));
+	}
+	proc_glb = shmat(shmid, NULL, 0);
+	if (proc_glb == (void*)-1) {
+		fprintf(stdout, "shmat failed\n");
+		exit(1);
+	}
+#elif SHM_METHOD==SHMANON
+	printf("anon1\n");
+	proc_glb = mmap(0, sizeof(struct proc_glb), PROT_READ | PROT_WRITE, MAP_SHARED/* | MAP_ANONYMOUS*/, /*-1*/fd, 0);
+	if (proc_glb == (void*)-1) {
+		fprintf(stdout, "mmap failed\n");
+		exit(1);
+	}
+#endif
+
+	memset(proc_glb, 0, sizeof(struct proc_glb));
+	proc_glb->bar_count = 0;
+
+	pthread_condattr_init(&condattr);
+	pthread_condattr_setpshared(&condattr, PTHREAD_PROCESS_SHARED);
+	pthread_cond_init(&proc_glb->bar_cond, &condattr);
+
+	pthread_mutexattr_init(&mutexattr);
+	pthread_mutexattr_setpshared(&mutexattr, PTHREAD_PROCESS_SHARED);
+	pthread_mutex_init(&proc_glb->bar_lock, &mutexattr);
+
+	for (i = 0; i < NPROC; i++) {
+		proc_glb->per_procs[i].rank = i;
+		printf("[0] i=%d,rank=%d\n", i, proc_glb->per_procs[i].rank);
+	}
+	for (i = 1; i < NPROC; i++) { /* ranks 1..NPROC-1 run in forked children, rank 0 in this process */
+		pid = fork();
+		if(pid < 0) {
+			fprintf(stdout, "fork failed: %s\n", strerror(errno));
+			exit(1);
+		} else if (pid == 0) {
+#if SHM_METHOD==SHMSYSV
+	printf("sysv2\n");
+			proc_glb = shmat(shmid, NULL, 0);
+#endif
+			printf("[%d] rank=%d\n", i, proc_glb->per_procs[i].rank);
+			parent_fn(&proc_glb->per_procs[i]);
+			exit(0);
+		}
+	}
+	parent_fn(&proc_glb->per_procs[0]);
+	
+	while ((pid = waitpid(-1, &st, __WALL)) > 0); /* NOTE(review): this file does not include <sys/wait.h> for waitpid() - confirm */
+
+	max = -1;
+	for (i = 0; i < NPROC; i++) {
+		if (max < proc_glb->per_procs[i].nsec) {
+			max = proc_glb->per_procs[i].nsec;
+		}
+	}
+
+	fprintf(stderr, "max %ld nsec\n", max);
+	fprintf(stdout, "CT09006 END\n");
+}
+
diff --git a/test/uti/CT27.sh b/test/uti/CT27.sh
new file mode 100755
index 00000000..7561b523
--- /dev/null
+++ b/test/uti/CT27.sh
@@ -0,0 +1,64 @@
+#!/usr/bin/bash
+MYHOME="/work/gg10/e29005"
+MCK="${MYHOME}/project/os/install"
+MCEXEC=
+MCEXECOPT="--enable-uti"
+export DISABLE_UTI=0
+
+stop=0
+reset=0
+go=0
+acc=0
+nodes="c[8195]"
+
+while getopts srgamd OPT
+do
+        case ${OPT} in
+	    s) stop=1
+		;;
+            r) reset=1
+                ;;
+            g) go=1
+                ;;
+	    a) acc=1 # accumulate, otherwise RDMA
+		;;
+            m) 
+		MCEXEC="${MCK}/bin/mcexec"
+                ;;
+            d) export DISABLE_UTI=1
+                ;;
+            *) echo "invalid option -${OPT}" >&2
+                exit 1
+        esac
+done
+
+if [ ${acc} -eq 1 ]; then
+    exeopt="-a"
+else
+    exeopt="-r"
+fi
+
+if [ ${stop} -eq 1 ]; then
+    PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w ${nodes} \
+    sudo mount /work
+
+    PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w ${nodes} \
+	sudo ${MCK}/sbin/mcstop+release.sh
+fi
+
+if [ ${reset} -eq 1 ]; then
+    PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w ${nodes} \
+    sudo mount /work
+
+    PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w ${nodes} \
+	sudo ${MCK}/sbin/mcreboot.sh -c 2-17,20-35,36-51,52-67 -r 2-5:0+6-9:1+10-13:68+14-17:69+20-23:136+24-27:137+28-31:204+32-35:205+36-39:18+40-43:19+44-47:86+48-51:87+52-55:154+56-59:155+60-63:222+64-67:223 -m 32G@0,12G@1
+fi
+
+if [ ${go} -eq 1 ]; then
+    make
+
+    > ./log
+    for i in {1..10}; do (${MCEXEC} ${MCEXECOPT} taskset -c 0-7 ./CT27 $exeopt 1>/dev/null 2>> ./log); done
+    perl CT11.pl < ./log
+    #${MCEXEC} ${MCEXECOPT} taskset -c 0-7 ./CT27 $exeopt
+fi
diff --git a/test/uti/CT28.c b/test/uti/CT28.c
new file mode 100644
index 00000000..d579fb52
--- /dev/null
+++ b/test/uti/CT28.c
@@ -0,0 +1,441 @@
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <pthread.h>
+#include <errno.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/shm.h>
+#include <fcntl.h>
+#include <signal.h>
+
+#define DEBUG
+/* Debug helpers: format into a bounded buffer and print "<function>,<message>" to stdout; they expand to nothing useful without DEBUG. */
+#ifdef DEBUG
+#define	dprintf(...)											\
+	do {														\
+		char msg[1024];											\
+		snprintf(msg, sizeof(msg), __VA_ARGS__); /* bounded: an over-long message is truncated instead of overflowing msg */ \
+		fprintf(stdout, "%s,%s", __FUNCTION__, msg);			\
+	} while (0) /* no trailing ';': the caller supplies it, which keeps the macro usable inside if/else */
+#define	eprintf(...)											\
+	do {														\
+		char msg[1024];											\
+		snprintf(msg, sizeof(msg), __VA_ARGS__);				\
+		fprintf(stdout, "%s,%s", __FUNCTION__, msg);			\
+	} while (0)
+#else
+#define dprintf(...) do {  } while (0)
+#define eprintf(...) do {  } while (0)
+#endif
+
+#define NPROC 8
+#define NINC 10000
+#define TS2NS(sec, nsec) ((unsigned long)(sec) * 1000000000ULL + (unsigned long)(nsec))
+#define NSPIN 1
+
+static inline void fixed_size_work() { /* one fixed unit of busy work; bulk_fsw() repeats it */
+	asm volatile(
+	    "movq $0, %%rcx\n\t" /* rcx = 0 */
+		"1:\t"
+		"addq $1, %%rcx\n\t" /* rcx++ */
+		"cmpq $99, %%rcx\n\t"
+		"jle 1b\n\t" /* repeat while rcx <= 99, i.e. 100 additions in total */
+		:
+		: 
+		: "rcx", "cc"); /* no operands; rcx and the flags are declared clobbered */
+}
+
+/* Call fixed_size_work() n times; the index has n's type because an int index overflows (UB) once n exceeds INT_MAX. */
+static inline void bulk_fsw(unsigned long n) {
+	for (unsigned long j = 0; j < n; j++) {
+		fixed_size_work();
+	}
+}
+
+struct thr_arg { /* state shared between parent_fn() and its progress_fn() thread */
+	int rank;
+	volatile int bar_count; /* arrivals at the 2-party barrier: zeroed by init_bar(), counted by bar() */
+	pthread_mutex_t bar_lock;
+	pthread_cond_t bar_cond;
+	pthread_t pthread;
+
+	pthread_mutex_t ep_lock; /* mutex for endpoint manipulation */
+	volatile long count; /* incremented NINC times each by parent_fn() and progress_fn(), under ep_lock */
+	volatile int terminate; /* NOTE(review): not read or written by any function in this file - looks like a leftover */
+};
+
+struct per_proc { /* one entry per rank, stored inside struct proc_glb */
+	int rank;
+	struct thr_arg thr_arg;
+	long nsec; /* time measured by parent_fn() with TIMER_KIND; main() reports the maximum */
+};
+
+struct proc_glb { /* area mapped by main(); its barrier objects are initialised as PTHREAD_PROCESS_SHARED */
+	struct per_proc per_procs[NPROC];
+	volatile int bar_count; /* ranks that have reached the start barrier in parent_fn() */
+	pthread_mutex_t bar_lock;
+	pthread_cond_t bar_cond;
+};
+
+struct proc_glb *proc_glb;
+
+unsigned long mem; /* Per-thread storage */
+int wps = 1; /* work per sec */
+double nspw; /* nsec per work */
+
+#define N_INIT 10000000
+
+static int print_cpu_last_executed_on() {
+	char fn[256];
+	char* result;
+	pid_t tid = syscall(SYS_gettid);
+	int fd;
+	int offset;
+    int mpi_errno = 0;
+
+	sprintf(fn, "/proc/%d/task/%d/stat", getpid(), (int)tid);
+	//printf("fn=%s\n", fn);
+	fd = open(fn, O_RDONLY);
+	if(fd == -1) {
+		printf("open() failed\n");
+		goto fn_fail;
+	}
+
+	result = malloc(65536);
+	if(result == NULL) {
+		printf("malloc() failed");
+		goto fn_fail;
+	}
+
+	int amount = 0;
+	offset = 0;
+	while(1) {
+		amount = read(fd, result + offset, 65536);
+		//		printf("amount=%d\n", amount);
+		if(amount == -1) {
+			printf("read() failed");
+			goto fn_fail;
+		}
+		if(amount == 0) {
+			goto eof;
+		}
+		offset += amount;
+	}
+ eof:;
+    //printf("result:%s\n", result);
+
+	char* next_delim = result;
+	char* field;
+	int i;
+	for(i = 0; i < 39; i++) {
+		field = strsep(&next_delim, " ");
+	}
+
+	int cpu = sched_getcpu();
+	if(cpu == -1) {
+		printf("getpu() failed\n");
+		goto fn_fail;
+	}
+
+	printf("stat-cpu=%02d,sched_getcpu=%02d,tid=%d\n", atoi(field), cpu, tid); fflush(stdout);
+ fn_exit:
+    free(result);
+    return mpi_errno;
+ fn_fail:
+	mpi_errno = -1;
+    goto fn_exit;
+}
+
+/* Calibrate nspw: time N_INIT work units on the thread CPU-time clock and store nanoseconds per unit. */
+void fwq_init() {
+	struct timespec start, end;
+	unsigned long nsec;
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
+	bulk_fsw(N_INIT);
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
+	nsec = (TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec));
+	nspw = nsec / (double)N_INIT;
+	printf("nsec=%lu, nspw=%f\n", nsec, nspw); /* nsec is unsigned long, so %lu rather than %ld */
+}
+
+void fwq(long delay_nsec) { /* busy-wait roughly delay_nsec ns, scaled by the nspw calibration from fwq_init() */
+	if (delay_nsec < 0)
+		printf("%s: delay_nsec<0\n", __FUNCTION__);
+	else /* negative delays are only reported: a negative double converted to unsigned long is undefined */
+		bulk_fsw(delay_nsec / nspw);
+}
+
+void init_bar(struct thr_arg* thr_arg) { /* zero the barrier counter under bar_lock */
+	pthread_mutex_lock(&thr_arg->bar_lock);
+	thr_arg->bar_count= 0; /* NOTE(review): an arrival already counted by bar() is erased by this reset - confirm callers order it safely */
+	pthread_mutex_unlock(&thr_arg->bar_lock);
+}
+
+void bar(struct thr_arg* thr_arg) { /* 2-party barrier on bar_count; the counter is not reset here */
+	int rc;
+	pthread_mutex_lock(&thr_arg->bar_lock);
+	thr_arg->bar_count++;
+	if (thr_arg->bar_count == 2) { /* second arrival wakes the waiter */
+		if ((rc = pthread_cond_broadcast(&thr_arg->bar_cond))) {
+			printf("[%d] pthread_cond_broadcast failed,rc=%d\n", thr_arg->rank, rc);
+		}
+	}
+	while (thr_arg->bar_count != 2) { /* wait until the count is exactly 2; failures are printed and the wait is retried */
+		if ((rc = pthread_cond_wait(&thr_arg->bar_cond, &thr_arg->bar_lock))) {
+			printf("[%d] pthread_cond_wait failed,rc=%d\n", thr_arg->rank, rc);
+		}
+	}
+	pthread_mutex_unlock(&thr_arg->bar_lock);
+}
+
+/* Progress thread: report where it runs, meet parent_fn() at the barrier, add NINC to count under ep_lock, meet parent_fn() again. */
+void *progress_fn(void *arg) {
+	struct thr_arg *thr_arg = (struct thr_arg *)arg;
+	int rc;
+	int i;
+
+	rc = syscall(732);
+	if (rc == -1)
+		fprintf(stdout, "CT09100 progress_fn running on Linux OK\n");
+	else {
+		fprintf(stdout, "CT09100 progress_fn running on McKernel NG\n"); /* stray rc argument dropped: the format has no conversion */
+	}
+
+	printf("[%d] progress,enter,", thr_arg->rank);
+	print_cpu_last_executed_on();
+
+	bar(thr_arg);
+
+	printf("[%d] progress,after barrier\n", thr_arg->rank);
+
+	for (i = 0; i < NINC; i++) {
+		pthread_mutex_lock(&thr_arg->ep_lock);
+		thr_arg->count++;
+		pthread_mutex_unlock(&thr_arg->ep_lock);
+		sched_yield();
+	}
+
+	/* NOTE(review): bar_count is still 2 from the first barrier, so this bar() only completes if */
+	/* parent_fn() has already run init_bar(); arriving here first would wait forever - confirm. */
+	bar(thr_arg);
+	printf("progress,exit,rank=%d\n", thr_arg->rank);
+
+	return NULL;
+}
+
+#define TIMER_KIND CLOCK_THREAD_CPUTIME_ID
+//#define TIMER_KIND CLOCK_REALTIME
+
+/* Per-rank body: join the cross-process start barrier, start progress_fn(), add NINC to count under ep_lock and store the measured ns. */
+void parent_fn(struct per_proc *per_proc) {
+	int i;
+	int rc;
+	char* uti_str;
+	int uti_val;
+	struct timespec start, end;
+	pthread_condattr_t condattr;
+	pthread_mutexattr_t mutexattr;
+
+	printf("[%d] parent_fn,enter,", per_proc->rank);
+	print_cpu_last_executed_on();
+
+	pthread_mutex_lock(&proc_glb->bar_lock);
+	proc_glb->bar_count++;
+	if (proc_glb->bar_count == NPROC) {
+		if ((rc = pthread_cond_broadcast(&proc_glb->bar_cond))) {
+			printf("[%d] pthread_cond_broadcast failed,rc=%d\n", per_proc->rank, rc);
+		}
+	}
+	while (proc_glb->bar_count != NPROC) {
+		if ((rc = pthread_cond_wait(&proc_glb->bar_cond, &proc_glb->bar_lock))) {
+			printf("[%d] pthread_cond_wait failed,rc=%d\n", per_proc->rank, rc);
+		}
+    }
+	pthread_mutex_unlock(&proc_glb->bar_lock);
+
+	/* Endpoint lock and 2-party barrier shared with the progress thread; all set up before the thread starts. */
+	pthread_mutexattr_init(&mutexattr);
+	pthread_mutex_init(&per_proc->thr_arg.ep_lock, &mutexattr);
+
+	per_proc->thr_arg.bar_count = 0;
+
+	pthread_condattr_init(&condattr);
+	pthread_cond_init(&per_proc->thr_arg.bar_cond, &condattr);
+
+	pthread_mutexattr_init(&mutexattr);
+	pthread_mutex_init(&per_proc->thr_arg.bar_lock, &mutexattr);
+
+	uti_str = getenv("DISABLE_UTI");
+	uti_val = uti_str ? atoi(uti_str) : 0;
+	if (!uti_val) {
+		rc = syscall(731, 1, NULL);
+		if (rc) {
+			fprintf(stdout, "CT09003 INFO: uti not available (rc=%d)\n", rc);
+		} else {
+			fprintf(stdout, "CT09003 INFO: uti available\n");
+		}
+	} else {
+		fprintf(stdout, "CT09003 INFO: uti disabled\n"); /* stray rc argument dropped: the format has no conversion */
+	}
+
+	per_proc->thr_arg.rank = per_proc->rank;
+	rc = pthread_create(&per_proc->thr_arg.pthread, NULL, progress_fn, &per_proc->thr_arg);
+	if (rc){
+		fprintf(stdout, "pthread_create: %d\n", rc);
+		exit(1);
+	}
+
+	/* No init_bar() here: bar_count was zeroed before pthread_create(), and resetting it now could erase */
+	/* an arrival the new thread has already recorded in bar(), leaving both sides waiting forever. */
+	bar(&per_proc->thr_arg);
+
+	printf("[%d] parent,after barrier\n", per_proc->rank);
+
+	clock_gettime(TIMER_KIND, &start);
+	for (i = 0; i < NINC; i++) {
+		pthread_mutex_lock(&per_proc->thr_arg.ep_lock); /* Lock is taken per MPI_Accumulate() */
+		per_proc->thr_arg.count++;
+		pthread_mutex_unlock(&per_proc->thr_arg.ep_lock);
+	}
+	init_bar(&per_proc->thr_arg); /* NOTE(review): assumes progress_fn() has left the first bar() and not yet entered its second - confirm */
+	bar(&per_proc->thr_arg);
+	clock_gettime(TIMER_KIND, &end);
+	
+	pthread_join(per_proc->thr_arg.pthread, NULL);
+
+	per_proc->nsec = TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec);
+}
+
+static struct option options[] = { /* long options handed to getopt_long() in main() */
+	{
+		.name =		"ppn",
+		.has_arg =	required_argument,
+		.flag =		NULL,
+		.val =		'P', /* NOTE(review): main()'s switch has only a default branch, so --ppn prints "unknown option" and exits - confirm intent */
+	},
+	/* end */
+	{ NULL, 0, NULL, 0, },
+};
+
+/* CT28 driver: calibrate fwq(), map the shared proc_glb area, run parent_fn() for every rank (forked children plus this process), print the slowest time. */
+int main(int argc, char **argv) {
+	int rc;
+	int i;
+	int st;
+	pid_t pid;
+	long max;
+	pthread_condattr_t condattr;
+	pthread_mutexattr_t mutexattr;
+	int fd;
+	key_t key = ftok(argv[0], 0);
+	int shmid;
+	int opt;
+
+	while ((opt = getopt_long(argc, argv, "+", options, NULL)) != -1) {
+		switch (opt) {
+		default: /* '?' */
+			printf("unknown option: %c\n", optopt);
+			exit(1);
+		}
+	}
+
+	fprintf(stdout, "CT09001 MPI progress thread skelton START\n");
+
+	rc = syscall(732);
+	if (rc == -1)
+		fprintf(stdout, "CT09002 main running on Linux INFO\n");
+	else {
+		fprintf(stdout, "CT09002 main running on McKernel INFO\n");
+	}
+
+	fwq_init();
+
+#define SHMPOSIX 1
+#define SHMSYSV 2
+#define SHMANON 3
+#define SHM_METHOD SHMPOSIX
+#if SHM_METHOD==SHMPOSIX
+	printf("posix1\n");
+	if((fd = shm_open("/CT27", O_RDWR | O_CREAT, 0644)) == -1) { /* NOTE(review): same object name as CT27.c - confirm this is intended */
+		fprintf(stdout, "shm_open failed\n");
+		exit(1); /* do not carry on with fd == -1 */
+	}
+	if(ftruncate(fd, sizeof(struct proc_glb))) {
+		fprintf(stdout, "ftruncate failed\n");
+		exit(1); /* the object may be too small to back the mapping */
+	}
+	proc_glb = mmap(0, sizeof(struct proc_glb), PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+	if (proc_glb == (void*)-1) {
+		fprintf(stdout, "mmap failed\n");
+		exit(1);
+	}
+#elif SHM_METHOD==SHMSYSV
+	printf("sysv1\n");
+    if ((shmid = shmget(key, sizeof(struct proc_glb), IPC_CREAT | 0660)) == -1) {
+		fprintf(stdout, "shmget failed: %s\n", strerror(errno));
+	}
+	proc_glb = shmat(shmid, NULL, 0);
+	if (proc_glb == (void*)-1) {
+		fprintf(stdout, "shmat failed\n");
+		exit(1);
+	}
+#elif SHM_METHOD==SHMANON
+	printf("anon1\n");
+	proc_glb = mmap(0, sizeof(struct proc_glb), PROT_READ | PROT_WRITE, MAP_SHARED/* | MAP_ANONYMOUS*/, /*-1*/fd, 0);
+	if (proc_glb == (void*)-1) {
+		fprintf(stdout, "mmap failed\n");
+		exit(1);
+	}
+#endif
+
+	memset(proc_glb, 0, sizeof(struct proc_glb));
+	proc_glb->bar_count = 0;
+
+	pthread_condattr_init(&condattr);
+	pthread_condattr_setpshared(&condattr, PTHREAD_PROCESS_SHARED);
+	pthread_cond_init(&proc_glb->bar_cond, &condattr);
+
+	pthread_mutexattr_init(&mutexattr);
+	pthread_mutexattr_setpshared(&mutexattr, PTHREAD_PROCESS_SHARED);
+	pthread_mutex_init(&proc_glb->bar_lock, &mutexattr);
+
+	for (i = 0; i < NPROC; i++) {
+		proc_glb->per_procs[i].rank = i;
+		printf("[0] i=%d,rank=%d\n", i, proc_glb->per_procs[i].rank);
+	}
+	for (i = 1; i < NPROC; i++) { /* ranks 1..NPROC-1 run in forked children, rank 0 in this process */
+		pid = fork();
+		if(pid < 0) {
+			fprintf(stdout, "fork failed: %s\n", strerror(errno));
+			exit(1);
+		} else if (pid == 0) {
+#if SHM_METHOD==SHMSYSV
+	printf("sysv2\n");
+			proc_glb = shmat(shmid, NULL, 0);
+#endif
+			printf("[%d] rank=%d\n", i, proc_glb->per_procs[i].rank);
+			parent_fn(&proc_glb->per_procs[i]);
+			exit(0);
+		}
+	}
+	parent_fn(&proc_glb->per_procs[0]);
+	
+	while ((pid = waitpid(-1, &st, __WALL)) > 0); /* NOTE(review): this file does not include <sys/wait.h> for waitpid() - confirm */
+
+	max = -1;
+	for (i = 0; i < NPROC; i++) {
+		if (max < proc_glb->per_procs[i].nsec) {
+			max = proc_glb->per_procs[i].nsec;
+		}
+	}
+
+	fprintf(stderr, "max %ld nsec\n", max);
+	fprintf(stdout, "CT09006 END\n");
+}
+
diff --git a/test/uti/CT28.sh b/test/uti/CT28.sh
new file mode 100755
index 00000000..7ccef343
--- /dev/null
+++ b/test/uti/CT28.sh
@@ -0,0 +1,76 @@
+#!/usr/bin/bash -x
+# CT28 driver. Options: -s stop McKernel, -r reboot McKernel, -g build and run CT28, -l run ten times and summarise, -m run under mcexec, -d export DISABLE_UTI=1.
+MYHOME="/work/gg10/e29005"
+MCK="${MYHOME}/project/os/install"
+MCEXECOPT= # set together with MCEXEC below; on its own "--enable-uti" would be executed as the command name
+export DISABLE_UTI=0
+
+stop=0
+reset=0
+go=0
+mck=0;
+loop=0
+nodes="c[8195]"
+NPROC=8
+
+while getopts srglamd OPT
+do
+        case ${OPT} in
+	    s) stop=1
+		;;
+            r) reset=1
+                ;;
+            g) go=1
+                ;;
+	    l) loop=1
+		;;
+            m) 
+		mck=1
+                ;;
+            d) export DISABLE_UTI=1
+                ;;
+            *) echo "invalid option -${OPT}" >&2
+                exit 1
+        esac
+done
+
+if [ ${mck} -eq 1 ]; then
+    MCEXEC="${MCK}/bin/mcexec"
+    MCEXECOPT="--enable-uti"
+    cpus="0-7"
+    NUMACTL=
+else
+    MCEXEC=
+    cpus="2-9"
+    NUMACTL="numactl -C $cpus"
+fi
+
+# "${nodes}" is quoted below: unquoted, c[8195] is a glob that can match files such as ./c1 in the current directory.
+if [ ${stop} -eq 1 ]; then
+    PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w "${nodes}" \
+    sudo mount /work
+
+    PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w "${nodes}" \
+	sudo ${MCK}/sbin/mcstop+release.sh
+fi
+
+if [ ${reset} -eq 1 ]; then
+    PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w "${nodes}" \
+    sudo mount /work
+
+    PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w "${nodes}" \
+	sudo ${MCK}/sbin/mcreboot.sh `${HOME}/project/src/tools/cpus.pl $NPROC` -m 32G@0,12G@1
+    #sudo ${MCK}/sbin/mcreboot.sh -c 2-17,20-35,36-51,52-67 -r 2-5:0+6-9:1+10-13:68+14-17:69+20-23:136+24-27:137+28-31:204+32-35:205+36-39:18+40-43:19+44-47:86+48-51:87+52-55:154+56-59:155+60-63:222+64-67:223 -m 32G@0,12G@1
+fi
+
+if [ ${go} -eq 1 ]; then
+    cd $MYHOME/project/os/mckernel/test/uti || exit 1 # never run the rm below in the wrong directory
+    rm -f ./CT28
+    make CPPFLAGS="-DNPROC=$NPROC" || exit 1 # make has no -D option. NOTE(review): CT28.c also hard-codes NPROC; confirm the Makefile honours CPPFLAGS
+    if [ ${loop} -eq 1 ]; then
+	> ./log
+	for i in {1..10}; do (${MCEXEC} ${MCEXECOPT} $NUMACTL ./CT28 1> ./log1 2>> ./log); done
+	perl CT11.pl < ./log
+    else
+	${MCEXEC} ${MCEXECOPT} $NUMACTL ./CT28
+    fi
+fi
diff --git a/test/uti/CT29.c b/test/uti/CT29.c
new file mode 100644
index 00000000..d5f15ca7
--- /dev/null
+++ b/test/uti/CT29.c
@@ -0,0 +1,117 @@
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <pthread.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <sys/syscall.h>   /* For SYS_xxx definitions */
+#include <linux/futex.h>
+#include <sys/time.h>
+#include <string.h>
+#include <signal.h>
+
+int passed = 0;
+pthread_t thr;
+
+unsigned long mem; /* delay functions issue ld/st instructions on this address */
+double nspw; /* nsec per work */
+
+/* Timer related macros */
+#define TS2NS(sec, nsec) ((unsigned long)(sec) * 1000000000ULL + (unsigned long)(nsec))
+
+static inline void fixed_size_work() { /* one unit of work: a fixed-length counting loop in rcx */
+	asm volatile(
+	    "movq $0, %%rcx\n\t"	/* rcx = 0 */
+		"1:\t"
+		"addq $1, %%rcx\n\t"	/* rcx++ */
+		"cmpq $99, %%rcx\n\t"
+		"jle 1b\n\t"		/* loop while rcx <= 99, i.e. 100 iterations */
+		:
+		: 
+		: "rcx", "cc");	/* no operands; rcx and the flags are declared clobbered */
+}
+
+static inline void bulk_fsw(unsigned long n) { /* run fixed_size_work() n times */
+	unsigned long j; /* same type as n: an int counter overflows once n exceeds INT_MAX */
+	for (j = 0; j < n; j++) {
+		fixed_size_work();
+	}
+}
+
+#define N_INIT 1000000
+
+void fwq_init() {
+	/* Calibrate nspw: thread CPU time (ns) consumed per fixed_size_work() call. */
+	struct timespec start, end;
+	unsigned long nsec;
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
+	bulk_fsw(N_INIT);
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
+	nsec = (TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec));
+	nspw = nsec / (double)N_INIT;
+	printf("nsec=%lu, nspw=%f\n", nsec, nspw); /* %lu: nsec is unsigned long */
+}
+
+void fwq(long delay_nsec) { /* busy-work for about delay_nsec ns, using the nspw calibration */
+	if (delay_nsec < 0) {
+		printf("%s: delay_nsec<0\n", __FUNCTION__);
+		return; /* a negative double converted to bulk_fsw()'s unsigned long count is undefined */
+	}
+	bulk_fsw(delay_nsec / nspw);
+}
+
+void *util_thread(void *arg) {
+	/* Thread body: do fwq(1000*1000), i.e. about 1 ms of calibrated work, then exit. */
+	(void)arg; /* unused */
+	fwq(1000*1000);
+
+	return NULL;
+}
+
+int
+main(int argc, char **argv)
+{
+	int rc;
+	pthread_attr_t attr;
+	/* Flow: calibrate fwq, issue syscall 731, start a detached thread, busy-work, exit. */
+
+	fwq_init();
+
+	fprintf(stderr, "CT29001 INFO start (tid=%d)\n", (int)syscall(__NR_gettid)); /* syscall() returns long */
+	rc = syscall(731, 1, NULL);
+	if (rc) {
+		fprintf(stderr, "CT29002 INFO uti not supported (rc=%d, errno=%d)\n", rc, errno);
+		fflush(stderr);
+	}
+
+	rc = pthread_attr_init(&attr);
+	if (rc){
+		fprintf(stderr, "pthread_attr_init: %d\n", rc);
+		exit(1);
+	}
+#if 1
+	rc = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
+	if (rc){
+		fprintf(stderr, "pthread_attr_setdetachstate: %d\n", rc);
+		exit(1);
+	}
+#endif
+	rc = pthread_create(&thr, &attr, util_thread, NULL);
+	if (rc){
+		fprintf(stderr, "pthread_create: %d\n", rc);
+		exit(1);
+	}
+	fprintf(stderr, "CT29003 pthread_create OK\n");
+
+	fwq(100*1000*1000); /* the thread is detached, so main only keeps the process busy for a while */
+
+#if 0
+	pthread_join(thr, NULL);
+	fprintf(stderr, "CT29004 pthread_join OK\n");
+#endif
+	exit(0);
+}
diff --git a/test/uti/CT30.c b/test/uti/CT30.c
new file mode 100644
index 00000000..34a97ef0
--- /dev/null
+++ b/test/uti/CT30.c
@@ -0,0 +1,177 @@
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <pthread.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <signal.h>
+#include "util.h"
+
+#define NTHR 1
+#define TS2NS(sec, nsec) ((unsigned long)(sec) * 1000000000ULL + (unsigned long)(nsec))
+#define CALC_DELAY (93000) /* 93   usec */
+#define INIT_DELAY  (2000) /*  2   usec, CPU sends CTS packet */
+#define NIC_DELAY   (3000) /*  3   usec, NIC reads by RDMA-read  */
+#define POLL_DELAY  (200) /*    .2 usec, CPU fetches event queue entry from DRAM */
+#define RESP_DELAY  (2000) /*  2   usec, CPU sends DONE packet and updates MPI_Request */
+#define NSPIN 1
+static inline void FIXED_SIZE_WORK(unsigned long *ptr) { /* one unit of work: *ptr += 1 through rax */
+	asm volatile("movq %0, %%rax\n\t"
+		     "addq $1, %%rax\n\t"
+		     "movq %%rax, %0\n\t"
+		     : "+rm" (*ptr)
+		     : /* no input-only operands */
+		     : "rax", "cc", "memory");
+}
+
+static inline void BULK_FSW(unsigned long n, unsigned long *ptr) { /* n units of work on *ptr */
+	unsigned long j; /* same type as n: an int counter overflows once n exceeds INT_MAX */
+	for (j = 0; j < n; j++) {
+		FIXED_SIZE_WORK(ptr);
+	}
+}
+
+
+pthread_mutex_t ep_lock; /* Ownership of channel instance */
+pthread_barrier_t bar;
+
+struct thr_arg {
+	pthread_t pthread;
+	unsigned long mem; /* Per-thread storage */
+};
+
+struct thr_arg thr_args[NTHR];
+
+unsigned long mem; /* Per-thread storage */
+volatile int nevents;
+volatile int terminate;
+int wps = 1; /* work per sec */
+double nspw; /* nsec per work */
+
+#define N_INIT 10000000
+
+void fwq_init(unsigned long *mem) {
+	/* Calibrate nspw: thread CPU time (ns) consumed per FIXED_SIZE_WORK() call on *mem. */
+	struct timespec start, end;
+	unsigned long nsec;
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
+	BULK_FSW(N_INIT, mem);
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
+	nsec = (TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec));
+	nspw = nsec / (double)N_INIT;
+	printf("[INFO] nsec=%lu, nspw=%f\n", nsec, nspw); /* %lu: nsec is unsigned long */
+}
+
+void fwq(unsigned long delay_nsec, unsigned long* mem) {
+	/* Busy-work for about delay_nsec ns: delay_nsec / nspw units of FIXED_SIZE_WORK on *mem. */
+	BULK_FSW(delay_nsec / nspw, mem);
+}
+
+void fwq_omp(unsigned long delay_nsec, unsigned long* mem) { /* fwq() body run inside an OpenMP parallel region */
+#pragma omp parallel
+	{
+		BULK_FSW(delay_nsec / nspw, mem); /* run by every thread of the team, all on the same *mem */
+	}
+}
+
+void mydelay(long delay_nsec, long *mem) { /* spin on *mem until delay_nsec ns of thread CPU time have passed */
+	struct timespec start, end;
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
+	if (delay_nsec < 0) return; /* compared as unsigned below, a negative delay would never expire */
+	while (1) {
+		clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
+		if (TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec) > (unsigned long)delay_nsec) {
+			break;
+		}
+		FIXED_SIZE_WORK((unsigned long *)mem); /* helper takes unsigned long *; long * does not convert implicitly */
+	}
+}
+
+void *util_fn(void *_arg) { /* progress thread: consumes events counted in nevents until terminate is set */
+	struct thr_arg *arg = (struct thr_arg *)_arg;
+	int ret;
+	int i;
+
+	ret = syscall(732); /* the result is only used by the check on the next line */
+	OKNGNOJUMP(ret == -1, "util_fn running on Linux, tid=%d\n", syscall(SYS_gettid)); /* macro is not defined in this file; presumably from util.h */
+
+	pthread_barrier_wait(&bar); /* rendezvous with main(), which waits on the same barrier */
+
+	/* Start progress */
+	while (1) {
+		pthread_mutex_lock(&ep_lock);
+		if (terminate) {
+			pthread_mutex_unlock(&ep_lock); /* release the lock before leaving the loop */
+			break;
+		}
+
+		if (nevents > 0) {
+			nevents--;
+			fwq(random() % 100000000, &mem); /* 0 - 0.1 sec */
+		}
+		pthread_mutex_unlock(&ep_lock); /* note: the fwq() above runs with ep_lock held */
+	}
+
+ fn_fail:
+	return NULL;
+}
+
+int main(int argc, char **argv) { /* posts 10 events to the progress thread(s) and times the whole exchange */
+	int ret;
+	int i;
+	struct timespec start, end;
+
+	ret = syscall(732);
+	OKNGNOJUMP(ret != -1, "Master is running on McKernel\n");
+
+	fwq_init(&mem);
+	pthread_mutex_init(&ep_lock, NULL);
+
+	pthread_barrier_init(&bar, NULL, NTHR + 1); /* NTHR workers plus this thread */
+
+	if ((ret = syscall(731, 1, NULL))) {
+		fprintf(stdout, "Error: util_indicate_clone: %s\n", strerror(errno));
+	}
+
+	for (i = 0; i < NTHR; i++) {
+		if ((ret = pthread_create(&thr_args[i].pthread, NULL, util_fn, &thr_args[i]))) {
+			fprintf(stdout, "Error: pthread_create: %s\n", strerror(ret)); /* pthread_* return the error number; errno is not set */
+			exit(1);
+		}
+	}
+
+	pthread_barrier_wait(&bar);
+
+#pragma omp parallel for
+	for (i = 0; i < omp_get_num_threads(); i++) {
+		printf("[INFO] thread_num=%d,tid=%d\n", i, (int)syscall(SYS_gettid)); /* syscall() returns long */
+	}
+
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
+	for (i = 0; i < 10; i++) {
+		pthread_mutex_lock(&ep_lock);
+		nevents++;
+		fwq_omp(random() % 100000000, &mem); /* 0 - 0.1 sec */
+		pthread_mutex_unlock(&ep_lock);
+
+		while (nevents > 0) { /* spin until a progress thread has consumed the event */
+			FIXED_SIZE_WORK(&mem);
+		}
+	}
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
+	pthread_mutex_lock(&ep_lock); /* util_fn() reads terminate under ep_lock, so write it under the same lock */
+	terminate = 1;
+	pthread_mutex_unlock(&ep_lock);
+	for (i = 0; i < NTHR; i++) {
+		pthread_join(thr_args[i].pthread, NULL);
+	}
+
+	printf("[INFO] Time: %lu usec\n", (TS2NS(end.tv_sec, end.tv_nsec) - TS2NS(start.tv_sec, start.tv_nsec)) / 1000);
+
+	ret = 0;
+ fn_fail:
+	return ret;
+}
diff --git a/test/uti/CT30.sh b/test/uti/CT30.sh
new file mode 100755
index 00000000..fc0198fa
--- /dev/null
+++ b/test/uti/CT30.sh
@@ -0,0 +1,92 @@
+#!/usr/bin/bash
+
+bn=`basename $0`
+fn=`echo $bn | sed 's/.sh//'`
+
+stop=0
+reboot=0
+go=0
+mck=0
+NNODES=1
+NPROC=$((16 * NNODES))
+LASTNODE=8200
+
+while getopts srgmN:P:L: OPT
+do
+        case ${OPT} in
+	    s) stop=1
+		;;
+            r) reboot=1
+                ;;
+            g) go=1
+                ;;
+            m) mck=1
+                ;;
+	    N) NNODES=$OPTARG
+		;;
+	    P) NPROC=$OPTARG
+		;;
+	    L) LASTNODE=$OPTARG
+		;;
+            *) echo "invalid option -${OPT}" >&2
+                exit 1
+        esac
+done
+
+MYHOME=/work/gg10/e29005
+ABS_SRCDIR=${MYHOME}/project/os/mckernel/test/uti
+MCK=${MYHOME}/project/os/install
+
+NODES=`echo $(seq -s ",c" $(($LASTNODE + 1 - $NNODES)) $LASTNODE) | sed 's/^/c/'`
+PPN=$((NPROC / NNODES))
+echo NPROC=$NPROC NNODES=$NNODES PPN=$PPN NODES=$NODES
+
+if [ ${mck} -eq 1 ]; then
+    MCEXEC="${MCK}/bin/mcexec"
+    mcexecopt="--enable-uti"
+else
+    MCEXEC=
+    mcexecopt=
+fi
+
+if [ ${stop} -eq 1 ]; then
+    PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $NODES \
+	sudo mount /work
+
+    if [ ${mck} -eq 1 ]; then
+	PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $NODES \
+	    /sbin/pidof mcexec \| xargs -r kill -9
+	PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $NODES \
+	    sudo ${MCK}/sbin/mcstop+release.sh
+    else
+	:
+    fi
+fi
+
+if [ ${reboot} -eq 1 ]; then
+    PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $NODES \
+    sudo mount /work
+
+    if [ ${mck} -eq 1 ]; then
+	PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $NODES \
+	    sudo ${MCK}/sbin/mcreboot.sh -c 2-17,70-85,138-153,206-221,20-35,88-103,156-171,224-239,36-51,104-119,172-187,240-255,52-67,120-135,188-203,256-271 -r 2-5,70-73,138-141,206-209:0+6-9,74-77,142-145,210-213:1+10-13,78-81,146-149,214-217:68+14-17,82-85,150-153,218-221:69+20-23,88-91,156-159,224-227:136+24-27,92-95,160-163,228-231:137+28-31,96-99,164-167,232-235:204+32-35,100-103,168-171,236-239:205+36-39,104-107,172-175,240-243:18+40-43,108-111,176-179,244-247:19+44-47,112-115,180-183,248-251:86+48-51,116-119,184-187,252-255:87+52-55,120-123,188-191,256-259:154+56-59,124-127,192-195,260-263:155+60-63,128-131,196-199,264-267:222+64-67,132-135,200-203,268-271:223 -m 32G@0,12G@1
+    else
+	:
+    fi
+fi
+
+if [ ${go} -eq 1 ]; then  # -g: run the test binary named after this script
+    cd "$ABS_SRCDIR" || exit 1  # ./$fn below is resolved relative to this directory
+    PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $NODES \
+	ulimit -u 16384; 
+    PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $NODES \
+	ulimit -s unlimited
+    PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $NODES \
+	ulimit -c unlimited
+
+    export KMP_STACKSIZE=64M
+    export OMP_NUM_THREADS=4
+
+    $MCEXEC $mcexecopt ./$fn
+fi
+
diff --git a/test/uti/CT31.c b/test/uti/CT31.c
new file mode 100644
index 00000000..e5f839de
--- /dev/null
+++ b/test/uti/CT31.c
@@ -0,0 +1,158 @@
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <pthread.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <signal.h>
+#include <getopt.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include "util.h"
+
+#define WAITER_CPU 0
+#define WAKER_CPU 1
+
+pthread_mutex_t mutex;
+pthread_cond_t cond;
+pthread_barrier_t bar;
+int flag;
+pthread_t thr;
+long t_cond_wait, t_fwq;
+long nloop;
+long blocktime = 10L * 1000 * 1000;
+
+void *util_fn(void *arg)
+{ /* waker side: nloop rounds of fwq(blocktime) followed by a condition-variable signal */
+	int i;
+	int ret;
+    long start, end;
+
+	print_cpu_last_executed_on("Utility thread");
+
+	ret = syscall(732); /* the result is only used by the check on the next line */
+	OKNGNOJUMP(ret == -1, "Utility thread is running on Linux\n");
+
+	pthread_barrier_wait(&bar); /* start the rounds together with main() */
+	for (i = 0; i < nloop; i++) {
+		start = rdtsc_light();
+
+		fwq(blocktime);
+
+		end = rdtsc_light();
+		t_fwq += end - start; /* accumulates only the fwq() part of each round */
+
+		pthread_mutex_lock(&mutex);
+		flag = 1; /* predicate that main() tests in its pthread_cond_wait() loop */
+		pthread_cond_signal(&cond);
+		pthread_mutex_unlock(&mutex);
+		
+	}
+
+ fn_fail:
+	return NULL;
+}
+
+static struct option options[] = {
+	/* end */
+	{ NULL, 0, NULL, 0, }
+};
+
+int main(int argc, char **argv)
+{ /* waiter side: times nloop condition-variable waits against the waker thread's fwq() time */
+	int i, ret, opt;
+	long start, end;
+	cpu_set_t cpuset;
+	pthread_attr_t attr;
+	pthread_barrierattr_t bar_attr;
+	struct sched_param param = { .sched_priority = 99 };
+
+	while ((opt = getopt_long(argc, argv, "+b:", options, NULL)) != -1) {
+		switch (opt) {
+			case 'b':
+				blocktime = atoi(optarg);
+				break;
+			default: /* '?' */
+				printf("unknown option %c\n", optopt);
+				exit(1);
+		}
+	}
+	if (blocktime <= 0) { /* it is the divisor below; atoi() yields 0 for non-numeric input */
+		printf("Error: blocktime must be positive\n");
+		exit(1);
+	}
+	nloop = (10 * 1000000000UL) / blocktime;
+	printf("[INFO] nloop=%ld,blocktime=%ld\n", nloop, blocktime);
+	CPU_ZERO(&cpuset); /* pin this (waiter) thread to WAITER_CPU */
+	CPU_SET(WAITER_CPU, &cpuset);
+	if ((ret = sched_setaffinity(0, sizeof(cpu_set_t), &cpuset))) {
+		printf("Error: sched_setaffinity: %s\n", strerror(errno));
+		goto fn_fail;
+	}
+	print_cpu_last_executed_on("Master thread");
+
+	fwq_init();
+
+	pthread_mutex_init(&mutex, NULL);
+	pthread_cond_init(&cond, NULL);
+
+	pthread_barrierattr_init(&bar_attr);
+	pthread_barrier_init(&bar, &bar_attr, 2); /* two parties: this thread and util_fn() */
+
+	ret = syscall(732);
+	OKNGNOJUMP(ret != -1, "Master thread is running on McKernel\n");
+
+	ret = syscall(731, 1, NULL);
+	OKNGNOJUMP(ret != -1, "util_indicate_clone\n");
+
+	if ((ret = pthread_attr_init(&attr))) {
+		printf("%s: Error: pthread_attr_init failed (%d)\n", __FUNCTION__, ret);
+		goto fn_fail;
+	}
+
+	CPU_ZERO(&cpuset); /* the new thread is pinned to WAKER_CPU through its attributes */
+	CPU_SET(WAKER_CPU, &cpuset);
+
+	if ((ret = pthread_attr_setaffinity_np(&attr, sizeof(cpu_set_t), &cpuset))) {
+		printf("%s: Error: pthread_attr_setaffinity_np failed (%d)\n", __FUNCTION__, ret);
+		goto fn_fail;
+	}
+
+	if ((ret = pthread_create(&thr, &attr, util_fn, NULL))) {
+		fprintf(stderr, "Error: pthread_create failed (%d)\n", ret);
+		goto fn_fail;
+	}
+
+	if ((ret = sched_setscheduler(0, SCHED_FIFO, &param))) {
+		fprintf(stderr, "Error: sched_setscheduler failed (%d)\n", errno); /* ret is always -1 here; errno has the cause */
+		goto fn_fail;
+	}
+
+	syscall(701, 1 | 2); /* NOTE(review): project-specific syscall; meaning of the flags is not visible here */
+	pthread_barrier_wait(&bar);
+	for (i = 0; i < nloop; i++) {
+		start = rdtsc_light();
+		
+		pthread_mutex_lock(&mutex); /* no futex */
+		while(!flag) {
+			pthread_cond_wait(&cond, &mutex); /* 1st futex */
+		}
+		flag = 0;
+		pthread_mutex_unlock(&mutex); /* 2nd futex */
+
+		end = rdtsc_light();
+		t_cond_wait += end - start;
+	}
+	syscall(701, 4 | 8);
+
+	pthread_join(thr, NULL);
+	printf("[INFO] waker: %ld cycles, waiter: %ld cycles, (waiter - waker) / nloop: %ld cycles\n", t_fwq, t_cond_wait, (t_cond_wait - t_fwq) / nloop);
+
+	ret = 0;
+ fn_fail:
+	return ret;
+}
diff --git a/test/uti/CT31.sh b/test/uti/CT31.sh
new file mode 100755
index 00000000..8a6a6e28
--- /dev/null
+++ b/test/uti/CT31.sh
@@ -0,0 +1,102 @@
+#!/usr/bin/bash
+
+bn=`basename $0`
+fn=`echo $bn | sed 's/.sh//'`
+
+nloop=800
+stop=0
+reboot=0
+go=0
+mck=0
+NNODES=1
+NPROC=$((1 * NNODES))
+LASTNODE=8200
+use_hfi=0
+
+while getopts srgmh:N:P:L: OPT
+do
+        case ${OPT} in
+	    s) stop=1
+		;;
+            r) reboot=1
+                ;;
+            g) go=1
+                ;;
+            m) mck=1
+                ;;
+	    h) use_hfi=1
+		;;
+	    N) NNODES=$OPTARG
+		;;
+	    P) NPROC=$OPTARG
+		;;
+	    L) LASTNODE=$OPTARG
+		;;
+            *) echo "invalid option -${OPT}" >&2
+                exit 1
+        esac
+done
+
+MYHOME=/work/gg10/e29005
+ABS_SRCDIR=${MYHOME}/project/os/mckernel/test/uti
+MCK=${MYHOME}/project/os/install
+
+NODES=`echo $(seq -s ",c" $(($LASTNODE + 1 - $NNODES)) $LASTNODE) | sed 's/^/c/'`
+PPN=$((NPROC / NNODES))
+echo NPROC=$NPROC NNODES=$NNODES PPN=$PPN NODES=$NODES
+
+if [ ${mck} -eq 1 ]; then
+    MCEXEC="${MCK}/bin/mcexec"
+    mcexecopt="--enable-uti"
+    if [ ${use_hfi} -eq 1 ]; then
+	mcexecopt="--enable-hfi1 $mcexecopt"
+    fi
+else
+    MCEXEC=
+    mcexecopt=
+fi
+
+if [ ${stop} -eq 1 ]; then
+    PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $NODES \
+	sudo mount /work
+
+    if [ ${mck} -eq 1 ]; then
+	PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $NODES \
+	    /sbin/pidof mcexec \| xargs -r kill -9
+	PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $NODES \
+	    sudo ${MCK}/sbin/mcstop+release.sh
+    else
+	:
+    fi
+fi
+
+if [ ${reboot} -eq 1 ]; then
+    PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $NODES \
+    sudo mount /work
+
+    if [ ${mck} -eq 1 ]; then
+	PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $NODES \
+	    sudo ${MCK}/sbin/mcreboot.sh -c 2-17,70-85,138-153,206-221,20-35,88-103,156-171,224-239,36-51,104-119,172-187,240-255,52-67,120-135,188-203,256-271 -r 2-5,70-73,138-141,206-209:0+6-9,74-77,142-145,210-213:1+10-13,78-81,146-149,214-217:68+14-17,82-85,150-153,218-221:69+20-23,88-91,156-159,224-227:136+24-27,92-95,160-163,228-231:137+28-31,96-99,164-167,232-235:204+32-35,100-103,168-171,236-239:205+36-39,104-107,172-175,240-243:18+40-43,108-111,176-179,244-247:19+44-47,112-115,180-183,248-251:86+48-51,116-119,184-187,252-255:87+52-55,120-123,188-191,256-259:154+56-59,124-127,192-195,260-263:155+60-63,128-131,196-199,264-267:222+64-67,132-135,200-203,268-271:223 -m 32G@0,12G@1
+    else
+	:
+    fi
+fi
+
+if [ ${go} -eq 1 ]; then  # -g: build the test named after this script and run it nloop times
+    cd "$ABS_SRCDIR" || exit 1  # make and ./$fn are both relative to this directory
+    make $fn || exit 1  # do not loop over a missing or stale binary
+
+    PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $NODES \
+	ulimit -u 16384; 
+    PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $NODES \
+	ulimit -s unlimited
+
+    for((count=0;count<nloop;count++)); do
+	sudo $MCEXEC $mcexecopt ./$fn
+	echo =====
+	echo $count
+	echo =====
+    done
+
+fi
+
diff --git a/test/uti/CT32.c b/test/uti/CT32.c
new file mode 100644
index 00000000..432ad91e
--- /dev/null
+++ b/test/uti/CT32.c
@@ -0,0 +1,191 @@
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <pthread.h>
+#include <errno.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <linux/futex.h>
+#include <getopt.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdint.h>
+#include <uti.h>
+#include "util.h"
+
+#define WAITER_CPU 0
+#define WAKER_CPU 1
+
+int sem;
+pthread_barrier_t bar;
+int flag;
+pthread_t thr;
+long t_futex_wait, t_fwq;
+long nloop;
+long blocktime = 10L * 1000 * 1000;
+
+void *util_fn(void *arg)
+{ /* waker side: nloop rounds of fwq(blocktime) followed by a FUTEX_WAKE on sem */
+	int i;
+	int ret;
+    long start, end;
+	int testid = 32101; /* not referenced again in this function */
+
+	print_cpu_last_executed_on("Utility thread");
+
+	ret = syscall(732); /* the result is only used by the check on the next line */
+	OKNGNOJUMP(ret == -1, "Utility thread is running on Linux\n");
+
+	pthread_barrier_wait(&bar); /* start the rounds together with main() */
+
+	for (i = 0; i < nloop; i++) {
+		start = rdtsc_light();
+
+		fwq(blocktime);
+
+		end = rdtsc_light();
+		t_fwq += end - start; /* accumulates only the fwq() part of each round */
+
+		if ((ret = syscall(__NR_futex, &sem, FUTEX_WAKE, 1, NULL, NULL, 0)) == -1) { /* wake at most one waiter on sem */
+			printf("Error: futex wake: %s\n", strerror(errno));
+		}
+
+		//pthread_barrier_wait(&bar);
+
+	}
+
+	ret = 0;
+ fn_fail:
+	return NULL;
+}
+
+static struct option options[] = {
+	/* end */
+	{ NULL, 0, NULL, 0, }
+};
+
+int main(int argc, char **argv)
+{ /* waiter side: times nloop FUTEX_WAITs on sem against the waker thread's fwq() time */
+	int i, ret, opt;
+	long start, end;
+	cpu_set_t cpuset;
+	pthread_attr_t attr;
+	pthread_barrierattr_t bar_attr;
+	struct sched_param param = { .sched_priority = 99 };
+
+	while ((opt = getopt_long(argc, argv, "+b:", options, NULL)) != -1) {
+		switch (opt) {
+			case 'b':
+				blocktime = atoi(optarg);
+				break;
+			default: /* '?' */
+				printf("unknown option %c\n", optopt);
+				exit(1);
+		}
+	}
+	if (blocktime <= 0) { /* it is the divisor below; atoi() yields 0 for non-numeric input */
+		printf("Error: blocktime must be positive\n");
+		exit(1);
+	}
+	nloop = (10 * 1000000000UL) / blocktime;
+	printf("[INFO] nloop=%ld,blocktime=%ld\n", nloop, blocktime);
+	CPU_ZERO(&cpuset); /* pin this (waiter) thread to WAITER_CPU */
+	CPU_SET(WAITER_CPU, &cpuset);
+	if ((ret = sched_setaffinity(0, sizeof(cpu_set_t), &cpuset))) {
+		printf("Error: sched_setaffinity: %s\n", strerror(errno));
+		goto fn_fail;
+	}
+	print_cpu_last_executed_on("Master thread");
+
+	fwq_init();
+
+	pthread_barrierattr_init(&bar_attr);
+	pthread_barrier_init(&bar, &bar_attr, 2); /* two parties: this thread and util_fn() */
+
+	if ((ret = pthread_attr_init(&attr))) {
+		printf("Error: pthread_attr_init: %s\n", strerror(ret)); /* pthread_* return the error number; errno is not set */
+		goto fn_fail;
+	}
+
+#if 0
+	uti_attr_t uti_attr;
+	ret = uti_attr_init(&uti_attr);
+	if (ret) {
+		printf("%s: Error: uti_attr_init failed (%d)\n", __FUNCTION__, ret);
+		exit(1);
+	}
+
+	/* Give a hint that it's beneficial to prioritize it in scheduling. */
+	ret = UTI_ATTR_HIGH_PRIORITY(&uti_attr);
+	if (ret) {
+		printf("%s: Error: UTI_ATTR_HIGH_PRIORITY failed (%d)\n", __FUNCTION__, ret);
+		exit(1);
+	}
+	
+	if ((ret = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED))) {
+		printf("%s: Error: pthread_attr_setdetachstate failed (%d)\n", __FUNCTION__, ret);
+		exit(1);
+	}
+	
+	if ((ret = uti_pthread_create(&thr, &attr, progress_function, NULL, &uti_attr))) {
+		printf("%s: Error: uti_pthread_create: %s\n", __FUNCTION__, strerror(errno));
+		exit(1);
+	}
+	
+	if ((ret = uti_attr_destroy(&uti_attr))) {
+		printf("%s: Error: uti_attr_destroy failed (%d)\n", __FUNCTION__, ret);
+		exit(1);
+	}
+#else
+	CPU_ZERO(&cpuset); /* the new thread is pinned to WAKER_CPU through its attributes */
+	CPU_SET(WAKER_CPU, &cpuset);
+
+	if ((ret = pthread_attr_setaffinity_np(&attr, sizeof(cpu_set_t), &cpuset))) {
+		printf("Error: pthread_attr_setaffinity_np: %s\n", strerror(ret));
+		goto fn_fail;
+	}
+
+	ret = syscall(732);
+	OKNGNOJUMP(ret != -1, "Master thread is running on McKernel\n");
+
+	ret = syscall(731, 1, NULL);
+	OKNGNOJUMP(ret != -1, "util_indicate_clone\n");
+
+	if ((ret = pthread_create(&thr, &attr, util_fn, NULL))) {
+		printf("Error: pthread_create: %s\n", strerror(ret));
+		goto fn_fail;
+	}
+
+#endif
+
+	if ((ret = sched_setscheduler(0, SCHED_FIFO, &param))) {
+		ret = -errno; /* capture errno before printf() can overwrite it */
+		printf("Error: sched_setscheduler: %s\n", strerror(-ret));
+		goto fn_fail;
+	}
+
+	syscall(701, 1 | 2); /* NOTE(review): project-specific syscall; meaning of the flags is not visible here */
+	pthread_barrier_wait(&bar);
+	start = rdtsc_light();
+	for (i = 0; i < nloop; i++) {
+		
+		if ((ret = syscall(__NR_futex, &sem, FUTEX_WAIT, 0, NULL, NULL, 0))) {
+			printf("Error: futex wait failed (%s)\n", strerror(errno));
+		}
+
+		//pthread_barrier_wait(&bar); /* 2nd futex */
+	}
+	end = rdtsc_light();
+	t_futex_wait += end - start;
+	syscall(701, 4 | 8);
+
+	pthread_join(thr, NULL);
+	/* t_fwq is measured by the waker thread, t_futex_wait by this waiter: label them accordingly */
+	printf("[INFO] waker: %ld cycles, waiter: %ld cycles, (waiter - waker) / nloop: %ld cycles\n", t_fwq, t_futex_wait, (t_futex_wait - t_fwq) / nloop);
+	ret = 0;
+ fn_fail:
+	return ret;
+}
diff --git a/test/uti/CT32.sh b/test/uti/CT32.sh
new file mode 100755
index 00000000..854cc27f
--- /dev/null
+++ b/test/uti/CT32.sh
@@ -0,0 +1,104 @@
+#!/usr/bin/bash
+
+bn=`basename $0`
+fn=`echo $bn | sed 's/.sh//'`
+
+stop=0
+reboot=0
+go=0
+mck=0
+disable_uti=1
+NNODES=1
+NPROC=$((1 * NNODES))
+LASTNODE=8200
+use_hfi=0
+
+while getopts srgmh:N:P:L:d: OPT
+do
+        case ${OPT} in
+	    s) stop=1
+		;;
+            r) reboot=1
+                ;;
+            g) go=1
+                ;;
+            m) mck=1
+                ;;
+	    h) use_hfi=1
+		;;
+            d) disable_uti=$OPTARG
+                ;;
+	    N) NNODES=$OPTARG
+		;;
+	    P) NPROC=$OPTARG
+		;;
+	    L) LASTNODE=$OPTARG
+		;;
+            *) echo "invalid option -${OPT}" >&2
+                exit 1
+        esac
+done
+
+MYHOME=/work/gg10/e29005
+ABS_SRCDIR=${MYHOME}/project/os/mckernel/test/uti
+MCK=${MYHOME}/project/os/install
+
+NODES=`echo $(seq -s ",c" $(($LASTNODE + 1 - $NNODES)) $LASTNODE) | sed 's/^/c/'`
+PPN=$((NPROC / NNODES))
+echo NPROC=$NPROC NNODES=$NNODES PPN=$PPN NODES=$NODES
+
+if [ $disable_uti -eq 1 ]; then
+    export DISABLE_UTI=1
+else
+    unset DISABLE_UTI
+fi
+
+if [ ${mck} -eq 1 ]; then
+    MCEXEC="${MCK}/bin/mcexec"
+    mcexecopt="--enable-uti"
+    if [ ${use_hfi} -eq 1 ]; then
+	mcexecopt="--enable-hfi1 $mcexecopt"
+    fi
+else
+    MCEXEC=
+    mcexecopt=
+fi
+
+if [ ${stop} -eq 1 ]; then
+    PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $NODES \
+	sudo mount /work
+
+    if [ ${mck} -eq 1 ]; then
+	PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $NODES \
+	    /sbin/pidof mcexec \| xargs -r kill -9
+	PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $NODES \
+	    sudo ${MCK}/sbin/mcstop+release.sh
+    else
+	:
+    fi
+fi
+
+if [ ${reboot} -eq 1 ]; then
+    PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $NODES \
+    sudo mount /work
+
+    if [ ${mck} -eq 1 ]; then
+	PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $NODES \
+	    sudo ${MCK}/sbin/mcreboot.sh -c 2-17,70-85,138-153,206-221,20-35,88-103,156-171,224-239,36-51,104-119,172-187,240-255,52-67,120-135,188-203,256-271 -r 2-5,70-73,138-141,206-209:0+6-9,74-77,142-145,210-213:1+10-13,78-81,146-149,214-217:68+14-17,82-85,150-153,218-221:69+20-23,88-91,156-159,224-227:136+24-27,92-95,160-163,228-231:137+28-31,96-99,164-167,232-235:204+32-35,100-103,168-171,236-239:205+36-39,104-107,172-175,240-243:18+40-43,108-111,176-179,244-247:19+44-47,112-115,180-183,248-251:86+48-51,116-119,184-187,252-255:87+52-55,120-123,188-191,256-259:154+56-59,124-127,192-195,260-263:155+60-63,128-131,196-199,264-267:222+64-67,132-135,200-203,268-271:223 -m 32G@0,12G@1
+    else
+	:
+    fi
+fi
+
+if [ ${go} -eq 1 ]; then  # -g: build the test named after this script and run it once
+    cd "$ABS_SRCDIR" || exit 1  # make and ./$fn are both relative to this directory
+    make $fn || exit 1  # do not run a missing or stale binary
+
+    PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $NODES \
+	ulimit -u 16384; 
+    PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $NODES \
+	ulimit -s unlimited
+
+    sudo $MCEXEC $mcexecopt ./$fn
+fi
+
diff --git a/test/uti/CT33.c b/test/uti/CT33.c
new file mode 100644
index 00000000..7a2a9f96
--- /dev/null
+++ b/test/uti/CT33.c
@@ -0,0 +1,167 @@
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <pthread.h>
+#include <errno.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <linux/futex.h>
+#include <getopt.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdint.h>
+#include "util.h"
+
+#define WAITER_CPU 0
+#define WAKER_CPU 1
+
+int sem;
+pthread_barrier_t bar;
+int flag;
+pthread_t thr;
+long t_fwq, t_futex_wake, t_futex_wait;
+long t_fwq2;
+long nloop;
+long blocktime = 10 * 1000 * 1000L;
+
+void *util_fn(void *arg)
+{ /* waker side: first times nloop x fwq() alone, then nloop x (fwq() + FUTEX_WAKE) */
+	int i;
+	int ret;
+    long start, end;
+    long start2, end2;
+
+	print_cpu_last_executed_on("Utility thread");
+
+	ret = syscall(732); /* the result is only used by the check on the next line */
+	OKNGNOJUMP(ret == -1, "Utility thread is running on Linux\n");
+
+	/* Measure fwq time */
+	start = rdtsc_light();
+	for (i = 0; i < nloop; i++) {
+		fwq(blocktime);
+	}
+	end = rdtsc_light();
+	t_fwq2 += end - start; /* total for the fwq-only pass */
+
+	/* Measure fwq + futex time */
+	syscall(701, 1 | 2 | 0x80000000); /* NOTE(review): project-specific syscall; meaning of the flags is not visible here */
+	pthread_barrier_wait(&bar); /* start the second pass together with main() */
+	start = rdtsc_light();
+	for (i = 0; i < nloop; i++) {
+		start2 = rdtsc_light();
+
+		fwq(blocktime);
+
+		end2 = rdtsc_light();
+		t_fwq += end2 - start2; /* fwq() share of this pass */
+
+		if ((ret = syscall(__NR_futex, &sem, FUTEX_WAKE, 1, NULL, NULL, 0)) != 1) { /* anything but exactly one woken waiter is reported */
+			printf("Error: futex wake failed (%d,%s)\n", ret, strerror(errno));
+		}
+
+		//pthread_barrier_wait(&bar);
+	}
+	end = rdtsc_light();
+	t_futex_wake += end - start; /* whole second pass: fwq() plus the wake calls */
+
+	syscall(701, 4 | 8 | 0x80000000);
+
+ fn_fail:
+	return NULL;
+}
+
+static struct option options[] = {
+	/* end */
+	{ NULL, 0, NULL, 0, }
+};
+
+int main(int argc, char **argv)
+{ /* waiter side: times nloop FUTEX_WAITs on sem and prints them next to the waker's timings */
+	int i, ret, opt;
+	long start, end;
+	cpu_set_t cpuset;
+	pthread_attr_t attr;
+	pthread_barrierattr_t bar_attr;
+	struct sched_param param = { .sched_priority = 99 };
+
+	while ((opt = getopt_long(argc, argv, "+b:", options, NULL)) != -1) {
+		switch (opt) {
+			case 'b':
+				blocktime = atoi(optarg);
+				break;
+			default: /* '?' */
+				printf("unknown option %c\n", optopt);
+				exit(1);
+		}
+	}
+	if (blocktime <= 0) { /* it is the divisor below; atoi() yields 0 for non-numeric input */
+		printf("Error: blocktime must be positive\n");
+		exit(1);
+	}
+	nloop = 10 * 1000000000UL / blocktime;
+	printf("[INFO] nloop=%ld,blocktime=%ld\n", nloop, blocktime);
+	CPU_ZERO(&cpuset); /* pin this (waiter) thread to WAITER_CPU */
+	CPU_SET(WAITER_CPU, &cpuset);
+	if ((ret = sched_setaffinity(0, sizeof(cpu_set_t), &cpuset))) {
+		printf("Error: sched_setaffinity: %s\n", strerror(errno));
+		goto fn_fail;
+	}
+	print_cpu_last_executed_on("Master thread");
+
+	fwq_init();
+
+	pthread_barrierattr_init(&bar_attr);
+	pthread_barrier_init(&bar, &bar_attr, 2); /* two parties: this thread and util_fn() */
+
+	ret = syscall(732);
+	OKNGNOJUMP(ret != -1, "Master thread is running on McKernel\n");
+
+	ret = syscall(731, 1, NULL);
+	OKNGNOJUMP(ret != -1, "util_indicate_clone\n");
+
+	if ((ret = pthread_attr_init(&attr))) {
+		printf("Error: pthread_attr_init failed: %s\n", strerror(ret)); /* pthread_* return the error number; errno is not set */
+		goto fn_fail;
+	}
+
+	CPU_ZERO(&cpuset); /* the new thread is pinned to WAKER_CPU through its attributes */
+	CPU_SET(WAKER_CPU, &cpuset);
+
+	if ((ret = pthread_attr_setaffinity_np(&attr, sizeof(cpu_set_t), &cpuset))) {
+		printf("Error: pthread_attr_setaffinity_np: %s\n", strerror(ret));
+		goto fn_fail;
+	}
+
+	if ((ret = pthread_create(&thr, &attr, util_fn, NULL))) {
+		printf("Error: pthread_create: %s\n", strerror(ret));
+		goto fn_fail;
+	}
+
+	if ((ret = sched_setscheduler(0, SCHED_FIFO, &param))) {
+		printf("Error: sched_setscheduler: %s\n", strerror(errno));
+		goto fn_fail;
+	}
+
+	pthread_barrier_wait(&bar); /* util_fn() reaches this barrier after its fwq-only pass */
+	start = rdtsc_light();
+	for (i = 0; i < nloop; i++) {
+		
+		if ((ret = syscall(__NR_futex, &sem, FUTEX_WAIT, 0, NULL, NULL, 0))) {
+			printf("Error: futex wait: %s\n", strerror(errno));
+		}
+
+		//pthread_barrier_wait(&bar);
+	}
+	end = rdtsc_light();
+	t_futex_wait += end - start;
+
+	pthread_join(thr, NULL);
+	printf("[INFO] compute: %ld, wake: %ld, wait: %ld, wake - compute: %ld, wait - compute: %ld (cycles)\n", t_fwq, t_futex_wake, t_futex_wait, (t_futex_wake - t_fwq) / nloop, (t_futex_wait - t_fwq) / nloop);
+
+ fn_fail:
+	return ret;
+}
diff --git a/test/uti/CT33.sh b/test/uti/CT33.sh
new file mode 100755
index 00000000..5c83ba1c
--- /dev/null
+++ b/test/uti/CT33.sh
@@ -0,0 +1,93 @@
+#!/usr/bin/bash
+
+bn=`basename $0`
+fn=`echo $bn | sed 's/.sh//'`
+
+stop=0
+reboot=0
+go=0
+mck=0
+NNODES=1
+NPROC=$((1 * NNODES))
+LASTNODE=8200
+use_hfi=0
+
+while getopts srgmh:N:P:L: OPT
+do
+        case ${OPT} in
+	    s) stop=1
+		;;
+            r) reboot=1
+                ;;
+            g) go=1
+                ;;
+            m) mck=1
+                ;;
+	    h) use_hfi=1
+		;;
+	    N) NNODES=$OPTARG
+		;;
+	    P) NPROC=$OPTARG
+		;;
+	    L) LASTNODE=$OPTARG
+		;;
+            *) echo "invalid option -${OPT}" >&2
+                exit 1
+        esac
+done
+
+MYHOME=/work/gg10/e29005
+ABS_SRCDIR=${MYHOME}/project/os/mckernel/test/uti
+MCK=${MYHOME}/project/os/install
+
+nodes=`echo $(seq -s ",c" $(($LASTNODE + 1 - $NNODES)) $LASTNODE) | sed 's/^/c/'`
+PPN=$((NPROC / NNODES))
+echo NPROC=$NPROC NNODES=$NNODES PPN=$PPN nodes=$nodes
+
+if ! cut -d" " -f 2 /etc/mtab | grep -q /work; then  # scan every mount point; the old 'read line' swallowed the first entry
+    PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes sudo mount /work
+fi
+
+if [ ${mck} -eq 1 ]; then
+    MCEXEC="${MCK}/bin/mcexec"
+    mcexecopt="--enable-uti"
+    if [ ${use_hfi} -eq 1 ]; then
+	mcexecopt="--enable-hfi1 $mcexecopt"
+    fi
+else
+    MCEXEC=
+    mcexecopt=
+fi
+
+if [ ${stop} -eq 1 ]; then
+    if [ ${mck} -eq 1 ]; then
+	PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \
+	    /sbin/pidof mcexec \| xargs -r kill -9
+	PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \
+	    sudo ${MCK}/sbin/mcstop+release.sh
+    else
+	:
+    fi
+fi
+
+if [ ${reboot} -eq 1 ]; then
+    if [ ${mck} -eq 1 ]; then
+	PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \
+	    sudo ${MCK}/sbin/mcreboot.sh -c 2-17,70-85,138-153,206-221,20-35,88-103,156-171,224-239,36-51,104-119,172-187,240-255,52-67,120-135,188-203,256-271 -r 2-5,70-73,138-141,206-209:0+6-9,74-77,142-145,210-213:1+10-13,78-81,146-149,214-217:68+14-17,82-85,150-153,218-221:69+20-23,88-91,156-159,224-227:136+24-27,92-95,160-163,228-231:137+28-31,96-99,164-167,232-235:204+32-35,100-103,168-171,236-239:205+36-39,104-107,172-175,240-243:18+40-43,108-111,176-179,244-247:19+44-47,112-115,180-183,248-251:86+48-51,116-119,184-187,252-255:87+52-55,120-123,188-191,256-259:154+56-59,124-127,192-195,260-263:155+60-63,128-131,196-199,264-267:222+64-67,132-135,200-203,268-271:223 -m 32G@0,12G@1
+    else
+	:
+    fi
+fi
+
+if [ ${go} -eq 1 ]; then  # -g: build the test named after this script and run it once
+    cd "$ABS_SRCDIR" || exit 1  # make and ./$fn are both relative to this directory
+    make $fn || exit 1  # do not run a missing or stale binary
+
+    PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \
+	ulimit -u 16384; 
+    PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \
+	ulimit -s unlimited
+
+    sudo $MCEXEC $mcexecopt ./$fn
+fi
+
diff --git a/test/uti/CT34.c b/test/uti/CT34.c
new file mode 100644
index 00000000..f4c8a98b
--- /dev/null
+++ b/test/uti/CT34.c
@@ -0,0 +1,62 @@
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <pthread.h>
+#include <errno.h>
+#include <unistd.h>
+#include <string.h>
+#include <stdint.h>
+#include <sys/syscall.h>
+#include <sys/mman.h>
+#include <sched.h>
+#include "util.h"
+
+void *util_fn(void *arg)
+{ /* thread body: issue syscall 732 once and hand the outcome to the OKNGNOJUMP check */
+	int ret;
+	ret = syscall(732);
+	OKNGNOJUMP(ret == -1, "Utility thread is running on Linux\n"); /* macro is not defined in this file; presumably from util.h */
+ fn_fail:
+	return NULL; /* arg is unused and no result is passed back to the joiner */
+}
+
+int my_thread_create()
+{ /* issue syscall 731, then create and join one util_fn thread; returns 0 on success, non-zero on failure */
+	pthread_t thr;
+	int ret = 0;
+
+	ret = syscall(731, 1, NULL);
+	OKNGNOJUMP(ret == 0, "util_indicate_clone,ret=%d,errno=%d\n", ret, errno);
+
+	if ((ret = pthread_create(&thr, NULL, util_fn, NULL))) {
+		printf("Error: pthread_create: %s\n", strerror(ret)); /* pthread_* return the error number; errno is not set */
+		goto fn_fail; /* thr was never started, so it must not be joined */
+	}
+	if ((ret = pthread_join(thr, NULL))) {
+		printf("Error: pthread_join: %s\n", strerror(ret));
+	}
+
+ fn_exit:
+	return ret;
+
+ fn_fail:
+	ret = -1;
+	goto fn_exit;
+}
+
+int
+main(int argc, char **argv)
+{
+	/*
+	 * Entry point: run the single create/join scenario once.
+	 * The value returned by my_thread_create() is passed through as the
+	 * return value of main(); a non-zero value is also reported on stdout.
+	 */
+	int status = my_thread_create();
+
+	if (status != 0) {
+		printf("Error: my_thread_create,ret=%d\n", status);
+	}
+
+	return status;
+}
diff --git a/test/uti/CT35.sh b/test/uti/CT35.sh
new file mode 100755
index 00000000..48bac59f
--- /dev/null
+++ b/test/uti/CT35.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/bash
+
+mck_dir=/work/gg10/e29005/project/os/install
+nloop=800
+exe=CT26
+mck=1
+
+mcexec="${mck_dir}/bin/mcexec"
+
+sudo ${mck_dir}/sbin/mcstop+release.sh
+sudo ${mck_dir}/sbin/mcreboot.sh -c 1,2,3 -m 512M
+
+ulimit -c unlimited
+
+for((count=0;count<nloop;count++)); do
+    if [ $mck -eq 1 ]; then
+	export MCKERNEL_LD_PRELOAD=./preloadlib.so
+	#    $mcexec --enable-uti ./$exe
+#	$mcexec gdb -batch -ex "run" -ex "bt" ./$exe
+	$mcexec ./$exe
+    else
+	export LD_PRELOAD=./preloadlib.so
+	./$exe
+    fi
+    # $? here is the status of the test program run in the branch above.
+    rc=$?
+    if [ $rc -ne 0 ]; then
+	echo mcexec returned $rc
+	exit $rc  # a bare "exit" would return the status of the echo above (0)
+    fi
+
+    echo =====
+    echo $count
+    echo =====
+    
+done
diff --git a/test/uti/Makefile b/test/uti/Makefile
new file mode 100644
index 00000000..2b0be066
--- /dev/null
+++ b/test/uti/Makefile
@@ -0,0 +1,46 @@
+.SUFFIXES:	# Disable implicit rules
+
+SYSCALLL_INTERCEPT_DIR=$(HOME)/usr
+UTI_DIR=$(HOME)/project/uti/install
+
+CC = gcc
+
+CPPFLAGS = -I$(UTI_DIR)/include
+CCFLAGS = -g -O0
+LDFLAGS = -L$(UTI_DIR)/lib -Wl,-rpath,$(UTI_DIR)/lib -luti -lpthread -lrt 
+
+SRCS = $(shell ls CT*.c)
+EXES = $(SRCS:.c=)
+OBJS = $(SRCS:.c=.o)
+
+CFLAGS_SO = -g -O2 -I$(SYSCALLL_INTERCEPT_DIR)/include
+LDFLAGS_SO = -L$(SYSCALLL_INTERCEPT_DIR)/lib64 -Wl,-rpath,$(SYSCALLL_INTERCEPT_DIR)/lib64 -lsyscall_intercept -fpic -shared
+SO_SRCS = preloadlib.c
+SOS = $(SO_SRCS:.c=.so)
+
+
+all: $(EXES) file $(SOS)
+
+file::
+	dd bs=4096 count=1000 if=/dev/zero of=./file
+
+CT30.o:: CT30.c
+	icc $(CCFLAGS) -qopenmp $(CPPFLAGS) -c $<
+
+CT30: CT30.o
+	icc -o $@ $^ $(LDFLAGS) -qopenmp
+
+%.o:: %.c
+	$(CC) $(CCFLAGS) $(CPPFLAGS) -c $<
+
+%: %.o util.o
+	$(CC) -o $@ $^ $(LDFLAGS)
+
+util.o:: util.c
+	$(CC) $(CCFLAGS) $(CPPFLAGS) -c $<
+
+preloadlib.so: preloadlib.c # libraries go after the source so linkers defaulting to --as-needed keep them
+	$(CC) $(CFLAGS_SO) $^ -o $@ $(LDFLAGS_SO)
+
+clean : # also remove util.o, the preload library and the dd-generated data file that "all" builds
+	rm -f core $(EXES) $(OBJS) util.o $(SOS) file
diff --git a/test/uti/README b/test/uti/README
new file mode 100644
index 00000000..b3ce5467
--- /dev/null
+++ b/test/uti/README
@@ -0,0 +1,322 @@
+Linuxへのスレッド生成
+結合テスト仕様
+
+CT01 システムコールテスト mmap/munmap/futex/exit
+□ CT01001 mmap/munmap/futex/exit START
+   Linuxにスレッドを生成 (pthread_create)。成功
+□ CT01002 pthread_create OK
+   get_system() の戻り値が -1 (Linux で動作)
+□ CT01003 get_system OK
+   mmap 発行。戻り値が (void *)-1 以外
+□ CT01004 mmap OK
+   mmap 領域に "mmap OK" を書き込む
+   Linuxスレッドからメインスレッドに対して cond_signal
+   メインスレッドにて、mmap 領域参照 (mmap 領域の内容を表示)
+□ CT01005 mmap OK
+   メインスレッドからLinuxスレッドに cond_signal
+   Linux スレッドにて mmap 領域を munmap。戻り値が 0
+□ CT01006 munmap OK
+   Linuxスレッドからメインスレッドに対して cond_signal
+   メインスレッドが mmap 領域を参照 -> SIGSEGV発生
+□ CT01007 munmap OK (SIGSEGV)
+   Linux スレッド終了
+   メインスレッドにて pthread_join。成功
+□ CT01008 exit (pthread_join) OK
+□ CT01009 futex (pthread_mutex/pthread_cond) OK
+□ CT01010 END
+
+CT02 システムコールテスト mremap
+□ CT02001 mremap START
+   Linuxにスレッドを生成 (pthread_create)。成功
+□ CT02002 pthread_create OK
+   get_system() の戻り値が -1 (Linux で動作)
+□ CT02003 get_system OK
+   mmap 発行。戻り値が (void *)-1 以外
+□ CT02004 mmap OK
+   mmap 領域の縮小予定の領域に "mmap OK" を書き込む
+   Linuxスレッドからメインスレッドに対して cond_signal
+   メインスレッドにて、mmap 領域の書き込んだ領域を参照 (mmap 領域の内容を表示)
+□ CT02005 mmap OK
+   メインスレッドからLinuxスレッドに cond_signal
+   Linux スレッドにて mmap 領域を mremap して縮小。戻り値が 0
+□ CT02006 mremap OK
+   Linuxスレッドからメインスレッドに対して cond_signal
+   メインスレッドが mmap 領域の縮小した領域を参照 -> SIGSEGV発生
+□ CT02007 mremap OK (SIGSEGV)
+   メインスレッドからLinuxスレッドに cond_signal
+   Linux スレッドにて mmap 領域を munmap。戻り値が 0
+□ CT02008 munmap OK
+   Linux スレッド終了
+   メインスレッドにて pthread_join。成功
+□ CT02009 pthread_join OK
+□ CT02010 END
+
+CT03 システムコールテスト mprotect
+□ CT03001 mprotect START
+   Linuxにスレッドを生成 (pthread_create)。成功
+□ CT03002 pthread_create OK
+   get_system() の戻り値が -1 (Linux で動作)
+□ CT03003 get_system OK
+   mmap 発行。戻り値が (void *)-1 以外
+□ CT03004 mmap OK
+   mmap 領域に "mmap OK" を書き込む
+   Linuxスレッドからメインスレッドに対して cond_signal
+   メインスレッドにて、mmap 領域を参照 (mmap 領域の内容を表示)
+□ CT03005 mmap OK
+   メインスレッドからLinuxスレッドに cond_signal
+   Linux スレッドにて mmap 領域を mprotect して参照権のみ設定。戻り値が 0
+□ CT03006 mprotect OK
+   Linuxスレッドからメインスレッドに対して cond_signal
+   メインスレッドが mmap 領域に書き込み -> SIGSEGV発生
+□ CT03007 mremap OK (SIGSEGV)
+   メインスレッドからLinuxスレッドに cond_signal
+   Linux スレッドにて mmap 領域を munmap。戻り値が 0
+□ CT03008 munmap OK
+   Linux スレッド終了
+   メインスレッドにて pthread_join。成功
+□ CT03009 pthread_join OK
+□ CT03010 END
+
+CT04 システムコールテスト brk
+□ CT04001 brk START
+   Linuxにスレッドを生成 (pthread_create)。成功
+□ CT04002 pthread_create OK
+   get_system() の戻り値が -1 (Linux で動作)
+□ CT04003 get_system OK
+   sbrk(0)発行。戻り値を保存…(A)
+□ CT04004 sbrk OK
+   sbrk(4096)発行。戻り値を保存…(B)
+   (A)の場所に "sbrk OK" を書き込む
+   Linuxスレッドからメインスレッドに対して cond_signal
+   メインスレッドにて、(A) 領域を参照 (領域の内容を表示)
+□ CT04005 sbrk OK
+   メインスレッドにてsbrk(0)発行。戻り値を保存…(C)
+   メインスレッドからLinuxスレッドに cond_signal
+   Linuxスレッドでsbrk(0)発行。戻り値が(C)と一致している
+□ CT04006 sbrk OK
+   Linux スレッド終了
+   メインスレッドにて pthread_join。成功
+□ CT04007 pthread_join OK
+□ CT04008 END
+
+CT05 システムコールテスト gettid
+□ CT05001 gettid START
+   McKernelにスレッドを生成 (pthread_create)。成功
+□ CT05002 pthread_create OK
+   get_system() の戻り値が 0 (McKernel で動作)
+□ CT05003 get_system OK
+   gettid() の戻り値を保存…(A)
+□ CT05004 gettid OK %d 
+   util_migrate_inter_kernel 発行。戻り値が 0
+□ CT05005 util_migrate_inter_kernel OK
+   get_system() の戻り値が -1 (Linux で動作)
+□ CT05006 get_system OK
+   gettid() の戻り値が(A)と一致
+□ CT05007 gettid OK %d 
+   Linux スレッド終了
+   メインスレッドにて pthread_join。成功
+□ CT05008 pthread_join OK
+□ CT05009 END
+
+CT06 システムコールテスト exit_group
+□ CT06001 exit_group START
+   fork して子プロセス生成。以下、子プロセスの処理
+   Linuxにスレッドを生成 (pthread_create)。成功
+□ CT06002 pthread_create OK
+   get_system() の戻り値が -1 (Linux で動作)
+□ CT06003 get_system OK
+   Linuxスレッドが exit_group(99)
+□ CT06004 pthread_join NG が表示されない
+   親プロセスが wait。子プロセスの終了ステータスが 99
+□ CT06004 exit_group OK
+□ CT06005 END
+
+CT07 システムコールテスト エラー系
+□ CT07001 error START
+   Linuxにスレッドを生成 (pthread_create)。成功
+□ CT07002 pthread_create OK
+   get_system() の戻り値が -1 (Linux で動作)
+□ CT07003 get_system OK
+   clone() の戻り値が -1 で errno が ENOSYS
+□ CT07004 clone OK %d 
+   fork() の戻り値が -1 で errno が ENOSYS
+□ CT07005 fork OK %d 
+   vfork() の戻り値が -1 で errno が ENOSYS
+
+□  CT07006 vfork OK %d 
+   execve() の戻り値が -1 で errno が ENOSYS
+※ syscall_interceptの不具合によりvforkはSegmentation faultを起こすため、除外している
+
+□ CT07007 execve OK %d 
+   Linux スレッド終了
+   メインスレッドにて pthread_join。成功
+□ CT07008 pthread_join OK
+□ CT07009 END
+
+CT08 uti_attr_t関連
+uti_attr_t の動作は実行環境によって変化するため、機械的にOK/NGの判断ができない。
+このため、affinityとschedulerを目視確認して、OK/NGを判断して下さい。
+UTI_FLAG_SAME_NUMA_DOMAIN のテストを容易にするため、mcreboot では特定のNUMA
+ドメインにCPUを寄せて下さい。
+また、UTI_FLAG_SAME_L1のテストを容易にするために、論理コアを1つ以上空けるように
+CPUを割り当てて下さい。
+
+sched cpu には Linux に生成したスレッドの sched_getaffinity の結果を表示する。
+sched には同じく sched_getscheduler の結果を表示する。
+
+□ CT08001 UTI_FLAG_NUMA_SET
+   sched cpu に NUMA domain 2 に属すLinux CPU集合が表示されること。
+   sched=0 であること。
+□ CT08002 UTI_FLAG_NUMA_SET|UTI_FLAG_EXCLUSIVE_CPU
+   sched cpu に NUMA domain 2 に属すLinux CPUの内、1つが表示されること。
+   (CT08001 のCPU集合のメンバであること)
+   sched=1 であること。
+□ CT08003 UTI_FLAG_NUMA_SET|UTI_FLAG_EXCLUSIVE_CPU(2)
+   sched cpu に NUMA domain 2 に属すLinux CPUの内、1つが表示されること。
+   sched cpu は CT08002 とは異なるCPUが表示されていること(ラウンドロビン)。
+   sched=1 であること。
+□ CT08004 UTI_FLAG_SAME_NUMA_DOMAIN
+   sched cpu にMcKernelに割り当てたCPUと同じNUMAドメインに属すLinux CPU集合が
+   表示されること。
+   sched=0 であること。
+□ CT08005 UTI_FLAG_SAME_NUMA_DOMAIN|UTI_FLAG_CPU_INTENSIVE
+   sched cpu に NUMA domain 2 に属すLinux CPUの内、1つが表示されること。
+   (CT08004 のCPU集合のメンバであること)
+   sched=0 であること。
+□ CT08006 UTI_FLAG_DIFFERENT_NUMA_DOMAIN
+   sched cpu にMcKernelに割り当てたCPUと異なるNUMAドメインに属すLinux CPU集合が
+   表示されること。
+   sched=0 であること。
+□ CT08007 UTI_FLAG_DIFFERENT_NUMA_DOMAIN|UTI_FLAG_HIGH_PRIORITY
+   sched cpu にMcKernelに割り当てたCPUと異なるNUMAドメインに属すLinux CPU集合の
+   内、1つが表示されること。(CT08006 のCPU集合のメンバであること)
+   sched=1 であること。
+□ CT08008 UTI_FLAG_SAME_L1
+   sched cpu にMcKernelの親プロセスが実行するCPUとL1キャッシュを共有するLinuxの
+   CPU集合が表示されること。(McKernelへのCPU割り当て状態に依存するが、2論理コア
+   /物理コアの場合、高々1CPUのみが該当する。該当コアが存在しない場合は、全ての
+   コアが対象となる)。
+   sched=0 であること。
+□ CT08009 UTI_FLAG_SAME_L1|UTI_FLAG_NON_COOPERATIVE
+   sched cpu にMcKernelの親プロセスが実行するCPUとL1キャッシュを共有するLinuxの
+   CPUの内1つが表示されること。(CT08008のCPU集合のメンバ。但し、CT08008で該当
+   CPUが存在しない場合は、全てのコアが対象になる)。
+   sched=0 であること。
+□ CT08010 UTI_FLAG_SAME_L2
+   sched cpu にMcKernelの親プロセスが実行するCPUとL2キャッシュを共有するLinuxの
+   CPU集合が表示されること。(McKernelへのCPU割り当て状態に依存するが、2論理コア
+   /物理コアの場合、高々1CPUのみが該当する。該当コアが存在しない場合は、全ての
+   コアが対象となる)。
+   sched=0 であること。
+□ CT08011 UTI_FLAG_SAME_L2|UTI_FLAG_CPU_INTENSIVE
+   sched cpu にMcKernelの親プロセスが実行するCPUとL2キャッシュを共有するLinuxの
+   CPUの内1つが表示されること。(CT08010のCPU集合のメンバ。但し、CT08010で該当
+   CPUが存在しない場合は、全てのコアが対象になる)。
+   sched=0 であること。
+□ CT08012 UTI_FLAG_SAME_L3
+   sched cpu にMcKernelの親プロセスが実行するCPUとL3キャッシュを共有するLinuxの
+   CPU集合が表示されること。
+   sched=0 であること。
+□ CT08013 UTI_FLAG_SAME_L3|UTI_FLAG_CPU_INTENSIVE
+   sched cpu にMcKernelの親プロセスが実行するCPUとL3キャッシュを共有するLinuxの
+   CPUの内1つが表示されること。(CT08012のCPU集合のメンバ)。
+   sched=0 であること。
+□ CT08014 UTI_FLAG_DIFFERENT_L1
+   sched cpu にMcKernelの親プロセスが実行するCPUとL1キャッシュを共有しない
+   LinuxのCPU集合が表示されること。
+   sched=0 であること。
+□ CT08015 UTI_FLAG_DIFFERENT_L1|UTI_FLAG_CPU_INTENSIVE
+   sched cpu にMcKernelの親プロセスが実行するCPUとL1キャッシュを共有しない
+   LinuxのCPUの内、1つが表示されること(CT08014のCPU集合のメンバ)。
+   sched=0 であること。
+□ CT08016 UTI_FLAG_DIFFERENT_L2
+   sched cpu にMcKernelの親プロセスが実行するCPUと
+   L2キャッシュを共有しない
+   LinuxのCPU集合が表示されること。
+   sched=0 であること。
+□ CT08017 UTI_FLAG_DIFFERENT_L2|UTI_FLAG_CPU_INTENSIVE
+   sched cpu にMcKernelの親プロセスが実行するCPUとL2キャッシュを共有しない
+   LinuxのCPUの内、1つが表示されること(CT08016のCPU集合のメンバ)。
+   sched=0 であること。
+□ CT08018 UTI_FLAG_DIFFERENT_L3
+   sched cpu にMcKernelの親プロセスが実行するCPUとL3キャッシュを共有しない
+   LinuxのCPU集合が表示されること。
+   sched=0 であること。
+□ CT08019 UTI_FLAG_DIFFERENT_L3|UTI_FLAG_CPU_INTENSIVE
+   sched cpu にMcKernelの親プロセスが実行するCPUとL3キャッシュを共有しない
+   LinuxのCPUの内、1つが表示されること(CT08018のCPU集合のメンバ)。
+   sched=0 であること。
+
+CT09 プログレス処理オーバーヘッド測定
+
+MPI通信処理とMPIプログレス処理とのロック競合を模すことで、MPIプログレス処理の
+オーバーヘッドを測定する。
+
+MPI通信処理のステップは以下の通り。
+(1) 1usの間オブジェクトをロック
+(2) 30usの間計算を行う
+MPIプログレス処理のステップは以下の通り。
+(1) 10msに一回オブジェクトをロック
+(2) 通信が終了したタイミングに重なった場合は2usの処理を行う。そうでない
+    場合は直ちにアンロックする
+
+CT10 pthread_cond_{wait,signal}() [OK]
+
+CT11 measure time of system calls [OK]
+
+CT12 child (helper thread) futex() wait [OK]
+
+CT13 parent futex() wait [OK]
+
+CT14 child pthread_lock wait [OK]
+
+CT15 parent pthread_lock wait [OK]
+
+CT16 child pthread_cond_wait [OK]
+Linuxはcondで起きる。その後mutexで起きたりしない。
+
+CT17 parent pthread_cond_wait [OK]
+McKernelはcondで起きる。その後mutexで起きる。
+
+CT18 child (helper thread) futex() wait with FUTEX_WAIT_BITSET | FUTEX_CLOCK_REALTIME and non-zero timeout [OK]
+
+CT19 child (helper thread) futex() wait with FUTEX_WAIT_BITSET and non-zero timeout [OK]
+
+CT20 child (helper thread) futex() wait with FUTEX_WAIT and non-zero timeout [OK]
+
+CT21 progress-threadのlockタイミングを変化させたテスト
+
+CT22 compute-threadのlockタイミングを変化させたテスト
+
+CT23 progress-threadのcond_waitタイミングを変化させたテスト
+
+CT24 compute-threadのcond_waitタイミングを変化させたテスト
+
+CT25 MPI_Isend()でのプロセス終了時メモリ破壊不具合のスケルトン。パラメタは以下の通り。
+* 1MB x 250 (./CT25 20 250) 
+* 128K x 1024 (./CT25 17 1024)
+
+CT26 終了時レースコンディションのテスト
+* thread->statusがPS_EXITEDの場合もhold_thread()を呼んでデッドロックする不具合のテスト
+
+CT27 プログレス処理オーバーヘッド測定
+* CT09の複数プロセス版。async progressによってオーバーサブスクライブになった場合のオーバーヘッドを測定する。
+
+CT28 taskset -c 0-7 lock-inc-lock x 10000
+
+CT29 no reverse offload
+
+CT30 CT21にopenmpスレッドを追加したテスト
+
+CT31 pthread_cond_waitオーバーヘッド測定
+* waiterとwakerのCPUは、それぞれ、WAITER_CPU、WAKER_CPUで設定
+
+CT32 futex waitオーバーヘッド測定
+* waiterとwakerのCPUは、それぞれ、WAITER_CPU、WAKER_CPUで設定
+
+CT33 futex wakeオーバーヘッド測定
+* waiterとwakerのCPUは、それぞれ、WAITER_CPU、WAKER_CPUで設定
+
+CT34 繰り返しpthread_create
+
+CT35 LD_PRELOADでsyscall_interceptを用いたsoをつけた場合のテスト
\ No newline at end of file
diff --git a/test/uti/driver/Makefile b/test/uti/driver/Makefile
new file mode 100644
index 00000000..5d0a015b
--- /dev/null
+++ b/test/uti/driver/Makefile
@@ -0,0 +1,12 @@
+obj-m += hello.o
+
+hello-y = driver.o
+
+.PHONY: clean install modules
+
+modules:
+	$(MAKE) -C /lib/modules/$(shell uname -r)/build M=$(PWD) modules
+
+clean:
+	$(RM) .*.cmd *.mod.c *.o *.ko* Module.symvers modules.order -r .tmp*
+
diff --git a/test/uti/driver/driver.c b/test/uti/driver/driver.c
new file mode 100644
index 00000000..262aaea9
--- /dev/null
+++ b/test/uti/driver/driver.c
@@ -0,0 +1,80 @@
+/*
+ * This file is created by mixing the following two codes.
+ *
+ * URL: https://www.apriorit.com/dev-blog/195-simple-driver-for-linux-os
+ * Author: Danil Ishkov, Apriorit
+ *
+ * URL: http://www.linuxdevcenter.com/pub/a/linux/2007/07/05/devhelloworld-a-simple-introduction-to-device-drivers-under-linux.html
+ * Author: Valerie Henson <val@nmt.edu>
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <asm/uaccess.h>
+#include <asm/errno.h>
+#include <linux/init.h>
+
+static int hello_open(struct inode *inode, struct file *file)
+{
+	return 0;
+}
+
+static int hello_release(struct inode *inode, struct file *file)
+{
+	return 0;
+}
+
+static long hello_ioctl(struct file *file, unsigned int request, unsigned long arg)
+{
+	return 0;
+}
+
+static struct file_operations fops = {
+	.open = hello_open,
+	.release = hello_release,
+	.unlocked_ioctl = hello_ioctl,
+};
+
+static int device_file_major_number = 0;
+static const char device_name[] = "hello";
+/* Register the "hello" char device; returns 0 on success or the negative value from register_chrdev(). */
+static int register_device(void)
+{
+	int result = register_chrdev(0, device_name, &fops); /* major 0 asks the kernel to pick one */
+	if (result < 0) {
+		printk(KERN_WARNING "hello: register_chrdev failed,result=%i\n", result);
+		return result;
+	}
+	device_file_major_number = result; /* remembered for unregister_chrdev() */
+	printk(KERN_NOTICE "hello: major number=%i,try \"grep hello /proc/devices\"\n", device_file_major_number);
+	return 0;
+}
+
+void unregister_device(void)
+{
+	/* Kernel log messages end with a newline so the line is not left open for a later printk. */
+	printk(KERN_NOTICE "hello: unregister_device() is called\n");
+	if (device_file_major_number != 0)	/* non-zero only after a successful registration */
+		unregister_chrdev(device_file_major_number, device_name);
+}
+
+static int __init hello_init(void)
+{
+	/* Return the registration result so module load fails when no device was registered. */
+	return register_device();
+}
+
+module_init(hello_init);
+
+static void __exit hello_exit(void)
+{
+	unregister_device();
+}
+
+module_exit(hello_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR(" Danil Ishkov, Apriorit and Valerie Henson");
+MODULE_DESCRIPTION("Module that does nothing");
+MODULE_VERSION("1.0");
diff --git a/test/uti/mpi/001.c b/test/uti/mpi/001.c
new file mode 100755
index 00000000..2584f3a0
--- /dev/null
+++ b/test/uti/mpi/001.c
@@ -0,0 +1,216 @@
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <time.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <sys/mman.h>
+#include <mpi.h>
+#include <unistd.h>
+#include <sys/syscall.h>   /* For SYS_xxx definitions */
+#include <sched.h>
+
+//#define DEBUG
+#ifdef DEBUG
+#define dprintf printf
+#else
+#define dprintf {}
+#endif
+
+#define SZENTRY_DEFAULT (65536) /* Size of one slot */
+#define NENTRY_DEFAULT 10000 /* Number of slots */
+
+#define DIFFNSEC(end, start) ((end.tv_sec - start.tv_sec) * 1000000000UL + (end.tv_nsec - start.tv_nsec))
+
+/* Print the CPU recorded in field 39 of /proc/<pid>/task/<tid>/stat next to
+ * sched_getcpu() for the calling thread. Returns 0 on success, -1 on failure. */
+static int print_cpu_last_executed_on() {
+	enum { STAT_BUFSZ = 65536, STAT_CPU_FIELD = 39 };
+	char fn[256];
+	char *result = NULL; /* stays NULL on early failure so free() below is safe */
+	pid_t tid = syscall(SYS_gettid);
+	int fd = -1;
+	int offset = 0;
+	int mpi_errno = 0;
+
+	snprintf(fn, sizeof(fn), "/proc/%d/task/%d/stat", getpid(), (int)tid);
+	fd = open(fn, O_RDONLY);
+	if (fd == -1) {
+		printf("open() failed\n");
+		goto fn_fail;
+	}
+
+	result = malloc(STAT_BUFSZ);
+	if (result == NULL) {
+		printf("malloc() failed");
+		goto fn_fail;
+	}
+
+	/* Read to EOF, never past the buffer, keeping one byte for the NUL. */
+	while (offset < STAT_BUFSZ - 1) {
+		ssize_t amount = read(fd, result + offset, STAT_BUFSZ - 1 - offset);
+		if (amount == -1) {
+			printf("read() failed");
+			goto fn_fail;
+		}
+		if (amount == 0) {
+			break;
+		}
+		offset += amount;
+	}
+	result[offset] = '\0'; /* strsep() below needs a terminated string */
+
+	char *next_delim = result;
+	char *field = NULL;
+	for (int i = 0; i < STAT_CPU_FIELD; i++) {
+		field = strsep(&next_delim, " ");
+	}
+	if (field == NULL) { printf("stat has fewer than 39 fields\n"); goto fn_fail; }
+	const char *pmi_rank = getenv("PMI_RANK"); /* getenv() returns NULL when the variable is unset */
+	int cpu = sched_getcpu();
+	if (cpu == -1) {
+		printf("sched_getcpu() failed\n");
+		goto fn_fail;
+	}
+
+	printf("compute thread,pmi_rank=%02d,stat-cpu=%02d,sched_getcpu=%02d,tid=%d\n", pmi_rank ? atoi(pmi_rank) : -1, atoi(field), cpu, tid); fflush(stdout);
+ fn_exit:
+	if (fd != -1) close(fd);
+	free(result);
+	return mpi_errno;
+ fn_fail:
+	mpi_errno = -1;
+	goto fn_exit;
+}
+
+void sendrecv(int rank, int nentry, char **sendv, char **recvv, int szentry, int src, int dest, MPI_Request* reqs, MPI_Status* status, double usec) {
+	int i;
+	if(rank == 1) {
+		for(i = 0; i < nentry; i++) {
+			MPI_Isend(sendv[i], szentry, MPI_CHAR, dest, 0, MPI_COMM_WORLD, &reqs[i]);
+			if (nentry > 10 && i % (nentry / 10) == 0) {
+				printf("s"); fflush(stdout);
+			}
+		}
+		MPI_Waitall(nentry, reqs, status);
+		printf("w\n"); fflush(stdout);
+	} else {
+		for(i = 0; i < nentry; i++) {
+			MPI_Irecv(recvv[i], szentry, MPI_CHAR, src, 0, MPI_COMM_WORLD, &reqs[i]);
+			if (nentry > 10 && i % (nentry / 10) == 0) {
+				printf("r"); fflush(stdout);
+			}
+		}
+		usleep(usec);
+		MPI_Waitall(nentry, reqs, status);
+		printf("W\n"); fflush(stdout);
+	}
+}
+
+int main(int argc, char **argv) {
+	int my_rank = -1, size = -1;
+	int i, j;
+	char **sendv, **recvv;
+	MPI_Status* status;
+	MPI_Request* reqs;
+    long szentry;
+    long nentry;
+	int src, dest;
+    struct timespec start, end;
+	double diffusec;
+
+    if(argc == 3) {
+        szentry = atoi(argv[1]);
+        nentry = atoi(argv[2]);
+    } else {
+        szentry = SZENTRY_DEFAULT;
+		nentry = NENTRY_DEFAULT;
+    }
+	printf("szentry=%ld,nentry=%ld\n", szentry, nentry);
+
+    status = (MPI_Status*)malloc(sizeof(MPI_Status) * nentry);
+    reqs = (MPI_Request*)malloc(sizeof(MPI_Request) * nentry);
+
+    int actual;
+
+    MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &actual);
+	printf("Thread support level is %d\n", actual);
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+    src = (size + my_rank - 1) % size;
+    dest = (my_rank + 1) % size;
+
+    printf("rank=%d, size=%d, src=%d, dest=%d\n", my_rank, size, src, dest);
+
+	sendv = malloc(sizeof(char *) * nentry);
+	if(!sendv) { printf("malloc failed"); goto fn_fail; }
+	for (i = 0; i < nentry; i++) {
+#if 0
+		int fd;
+		fd = open("./file", O_RDWR);
+		if(fd == -1) { printf("open failed\n"); goto fn_fail; }
+		sendv[i] = (char*)mmap(0, szentry, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
+#else
+		sendv[i] = (char*)mmap(0, szentry, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+#endif
+		if(sendv[i] == MAP_FAILED) { printf("mmap failed"); goto fn_fail; }
+		dprintf("[%d] sendv[%d]=%p\n", my_rank, i, sendv[i]);
+		memset(sendv[i], 0xaa, szentry);
+	}
+
+	recvv = malloc(sizeof(char *) * nentry);
+	if(!recvv) { printf("malloc failed"); goto fn_fail; }
+	for (i = 0; i < nentry; i++) {
+#if 0
+		int fd;
+		fd = open("./file", O_RDWR);
+		if(fd == -1) { printf("open failed\n"); goto fn_fail; }
+		recvv[i] = (char*)mmap(0, szentry, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
+#else
+		recvv[i] = (char*)mmap(0, szentry, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+#endif
+		if(recvv[i] == MAP_FAILED) { printf("mmap failed"); goto fn_fail; }
+		dprintf("[%d] recvv[%d]=%p\n", my_rank, i, recvv[i]);
+		memset(recvv[i], 0, szentry);
+	}
+
+	printf("after memset\n");
+
+	print_cpu_last_executed_on();
+
+	for (i = 0; i < 1; i++) {
+		MPI_Barrier(MPI_COMM_WORLD);
+		if(my_rank == 0) {
+			clock_gettime(CLOCK_REALTIME, &start);
+		}
+		sendrecv(my_rank, nentry, sendv, recvv, szentry, src, dest, reqs, status, 0);
+		MPI_Barrier(MPI_COMM_WORLD);
+		if(my_rank == 0) {
+			clock_gettime(CLOCK_REALTIME, &end);
+			diffusec = DIFFNSEC(end, start) / (double)1000;
+			printf("%4.4f sec\n", DIFFNSEC(end, start) / (double)1000000000); fflush(stdout);
+		}
+
+		MPI_Barrier(MPI_COMM_WORLD);
+		if(my_rank == 0) {
+			clock_gettime(CLOCK_REALTIME, &start);
+		}
+		sendrecv(my_rank, nentry, sendv, recvv, szentry, src, dest, reqs, status, diffusec);
+		MPI_Barrier(MPI_COMM_WORLD);
+		if(my_rank == 0) {
+			clock_gettime(CLOCK_REALTIME, &end);
+			printf("%4.4f sec\n", DIFFNSEC(end, start) / (double)1000000000); fflush(stdout);
+		}
+	}
+
+ fn_exit:
+    MPI_Finalize();
+	return 0;
+ fn_fail:
+    goto fn_exit;
+}
+
diff --git a/test/uti/mpi/002.c b/test/uti/mpi/002.c
new file mode 100755
index 00000000..5a85014c
--- /dev/null
+++ b/test/uti/mpi/002.c
@@ -0,0 +1,127 @@
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <time.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <sys/mman.h>
+#include <mpi.h>
+#include <unistd.h>
+#include <sys/syscall.h>   /* For SYS_xxx definitions */
+#include <sched.h>
+
+//#define DEBUG
+#ifdef DEBUG
+#define dprintf printf
+#else
+#define dprintf {}
+#endif
+
+#define SZENTRY_DEFAULT (65536) /* Size of one slot */
+#define NENTRY_DEFAULT 10000 /* Number of slots */
+
+#define DIFFNSEC(end, start) ((end.tv_sec - start.tv_sec) * 1000000000UL + (end.tv_nsec - start.tv_nsec))
+
+/* Print the CPU recorded in field 39 of /proc/<pid>/task/<tid>/stat next to
+ * sched_getcpu() for the calling thread. Returns 0 on success, -1 on failure. */
+static int print_cpu_last_executed_on() {
+	enum { STAT_BUFSZ = 65536, STAT_CPU_FIELD = 39 };
+	char fn[256];
+	char *result = NULL; /* stays NULL on early failure so free() below is safe */
+	pid_t tid = syscall(SYS_gettid);
+	int fd = -1;
+	int offset = 0;
+	int mpi_errno = 0;
+
+	snprintf(fn, sizeof(fn), "/proc/%d/task/%d/stat", getpid(), (int)tid);
+	fd = open(fn, O_RDONLY);
+	if (fd == -1) {
+		printf("open() failed\n");
+		goto fn_fail;
+	}
+
+	result = malloc(STAT_BUFSZ);
+	if (result == NULL) {
+		printf("malloc() failed");
+		goto fn_fail;
+	}
+
+	/* Read to EOF, never past the buffer, keeping one byte for the NUL. */
+	while (offset < STAT_BUFSZ - 1) {
+		ssize_t amount = read(fd, result + offset, STAT_BUFSZ - 1 - offset);
+		if (amount == -1) {
+			printf("read() failed");
+			goto fn_fail;
+		}
+		if (amount == 0) {
+			break;
+		}
+		offset += amount;
+	}
+	result[offset] = '\0'; /* strsep() below needs a terminated string */
+
+	char *next_delim = result;
+	char *field = NULL;
+	for (int i = 0; i < STAT_CPU_FIELD; i++) {
+		field = strsep(&next_delim, " ");
+	}
+	if (field == NULL) { printf("stat has fewer than 39 fields\n"); goto fn_fail; }
+	const char *pmi_rank = getenv("PMI_RANK"); /* getenv() returns NULL when the variable is unset */
+	int cpu = sched_getcpu();
+	if (cpu == -1) {
+		printf("sched_getcpu() failed\n");
+		goto fn_fail;
+	}
+
+	printf("compute thread,pmi_rank=%02d,stat-cpu=%02d,sched_getcpu=%02d,tid=%d\n", pmi_rank ? atoi(pmi_rank) : -1, atoi(field), cpu, tid); fflush(stdout);
+ fn_exit:
+	if (fd != -1) close(fd);
+	free(result);
+	return mpi_errno;
+ fn_fail:
+	mpi_errno = -1;
+	goto fn_exit;
+}
+
+int main(int argc, char **argv) {
+	int my_rank = -1, size = -1;
+	int i, j;
+    struct timespec start, end;
+
+    int actual;
+
+    printf("nloop=%d\n", atoi(argv[1]));
+
+    MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &actual);
+	printf("Thread support level is %d\n", actual);
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+    print_cpu_last_executed_on();
+
+	printf("Before 1st barrier\n"); fflush(stdout);
+    MPI_Barrier(MPI_COMM_WORLD);
+
+	printf("Before 2nd barrier\n"); fflush(stdout);
+    if(my_rank == 0) {
+      clock_gettime(CLOCK_REALTIME, &start);
+    }
+    for (i = 0; i < atoi(argv[1]); i++) {
+		MPI_Barrier(MPI_COMM_WORLD);
+	}
+    if(my_rank == 0) {
+		clock_gettime(CLOCK_REALTIME, &end);
+        printf("%4.4f sec\n", DIFFNSEC(end, start) / (double)1000000000); fflush(stdout);
+	}
+
+
+ fn_exit:
+    //MPI_Finalize();
+	usleep(100000);
+	return 0;
+ fn_fail:
+    goto fn_exit;
+}
diff --git a/test/uti/mpi/003.c b/test/uti/mpi/003.c
new file mode 100755
index 00000000..fa696ee2
--- /dev/null
+++ b/test/uti/mpi/003.c
@@ -0,0 +1,188 @@
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <time.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <sys/mman.h>
+#include <mpi.h>
+#include <unistd.h>
+#include <sys/syscall.h>   /* For SYS_xxx definitions */
+#include <sched.h>
+
+//#define DEBUG
+#ifdef DEBUG
+#define dprintf printf
+#else
+#define dprintf {}
+#endif
+
+#define SZENTRY_DEFAULT (65536) /* Size of one slot */
+#define NENTRY_DEFAULT 10000 /* Number of slots */
+
+#define DIFFNSEC(end, start) ((end.tv_sec - start.tv_sec) * 1000000000UL + (end.tv_nsec - start.tv_nsec))
+
+/* Print the CPU recorded in field 39 of /proc/<pid>/task/<tid>/stat next to
+ * sched_getcpu() for the calling thread. Returns 0 on success, -1 on failure. */
+static int print_cpu_last_executed_on() {
+	enum { STAT_BUFSZ = 65536, STAT_CPU_FIELD = 39 };
+	char fn[256];
+	char *result = NULL; /* stays NULL on early failure so free() below is safe */
+	pid_t tid = syscall(SYS_gettid);
+	int fd = -1;
+	int offset = 0;
+	int mpi_errno = 0;
+
+	snprintf(fn, sizeof(fn), "/proc/%d/task/%d/stat", getpid(), (int)tid);
+	fd = open(fn, O_RDONLY);
+	if (fd == -1) {
+		printf("open() failed\n");
+		goto fn_fail;
+	}
+
+	result = malloc(STAT_BUFSZ);
+	if (result == NULL) {
+		printf("malloc() failed");
+		goto fn_fail;
+	}
+
+	/* Read to EOF, never past the buffer, keeping one byte for the NUL. */
+	while (offset < STAT_BUFSZ - 1) {
+		ssize_t amount = read(fd, result + offset, STAT_BUFSZ - 1 - offset);
+		if (amount == -1) {
+			printf("read() failed");
+			goto fn_fail;
+		}
+		if (amount == 0) {
+			break;
+		}
+		offset += amount;
+	}
+	result[offset] = '\0'; /* strsep() below needs a terminated string */
+
+	char *next_delim = result;
+	char *field = NULL;
+	for (int i = 0; i < STAT_CPU_FIELD; i++) {
+		field = strsep(&next_delim, " ");
+	}
+	if (field == NULL) { printf("stat has fewer than 39 fields\n"); goto fn_fail; }
+	const char *pmi_rank = getenv("PMI_RANK"); /* getenv() returns NULL when the variable is unset */
+	int cpu = sched_getcpu();
+	if (cpu == -1) {
+		printf("sched_getcpu() failed\n");
+		goto fn_fail;
+	}
+
+	printf("compute thread,pmi_rank=%02d,stat-cpu=%02d,sched_getcpu=%02d,tid=%d\n", pmi_rank ? atoi(pmi_rank) : -1, atoi(field), cpu, tid); fflush(stdout);
+ fn_exit:
+	if (fd != -1) close(fd);
+	free(result);
+	return mpi_errno;
+ fn_fail:
+	mpi_errno = -1;
+	goto fn_exit;
+}
+
+/* Rank 1 posts nentry MPI_Isend calls from sendv[0]; every other rank posts
+ * nentry MPI_Irecv calls into recvv[0] and sleeps usec microseconds. Both then
+ * MPI_Waitall. A progress character is printed every nentry/10 posts. */
+void sendrecv(int rank, int nentry, char **sendv, char **recvv, int szentry, int src, int dest, MPI_Request* reqs, MPI_Status* status, double usec) {
+	int i, step = nentry / 10; /* step is 0 when nentry < 10; guarded below to avoid dividing by zero */
+	if(rank == 1) {
+		for(i = 0; i < nentry; i++) {
+			if (step > 0 && i % step == 0) { printf("s"); fflush(stdout); }
+			MPI_Isend(sendv[0], szentry, MPI_CHAR, dest, 0, MPI_COMM_WORLD, &reqs[i]);
+		}
+		printf("\n"); fflush(stdout);
+		MPI_Waitall(nentry, reqs, status);
+	} else {
+		for(i = 0; i < nentry; i++) {
+			if (step > 0 && i % step == 0) { printf("r"); fflush(stdout); }
+			MPI_Irecv(recvv[0], szentry, MPI_CHAR, src, 0, MPI_COMM_WORLD, &reqs[i]);
+		}
+		usleep(usec);
+		MPI_Waitall(nentry, reqs, status);
+	}
+}
+
+int main(int argc, char **argv) {
+	int my_rank = -1, size = -1;
+	int i, j;
+	char **sendv, **recvv;
+	MPI_Status* status;
+	MPI_Request* reqs;
+    long szentry;
+    long nentry;
+	int src, dest;
+    struct timespec start, end;
+	double diffusec;
+
+    if(argc == 3) {
+        szentry = atoi(argv[1]);
+        nentry = atoi(argv[2]);
+    } else {
+        szentry = SZENTRY_DEFAULT;
+		nentry = NENTRY_DEFAULT;
+    }
+
+    status = (MPI_Status*)malloc(sizeof(MPI_Status) * nentry);
+    reqs = (MPI_Request*)malloc(sizeof(MPI_Request) * nentry);
+
+    int actual;
+
+    MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &actual);
+	printf("Thread support level is %d\n", actual);
+
+    MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
+    MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+    src = (size + my_rank - 1) % size;
+    dest = (my_rank + 1) % size;
+
+    printf("rank=%d, size=%d, src=%d, dest=%d\n", my_rank, size, src, dest);
+
+	sendv = malloc(sizeof(char *) * nentry);
+	if(!sendv) { printf("malloc failed"); goto fn_fail; }
+	for (i = 0; i < 1; i++) {
+		sendv[i] = (char*)mmap(0, szentry, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+		if(sendv[i] == MAP_FAILED) { printf("mmap failed"); goto fn_fail; }
+		dprintf("[%d] sendv[%d]=%p\n", my_rank, i, sendv[i]);
+		memset(sendv[i], 0xaa, szentry);
+	}
+
+	recvv = malloc(sizeof(char *) * nentry);
+	if(!recvv) { printf("malloc failed"); goto fn_fail; }
+	for (i = 0; i < 1; i++) {
+		recvv[i] = (char*)mmap(0, szentry, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+		if(recvv[i] == MAP_FAILED) { printf("mmap failed"); goto fn_fail; }
+		dprintf("[%d] recvv[%d]=%p\n", my_rank, i, recvv[i]);
+		memset(recvv[i], 0, szentry);
+	}
+
+	printf("after memset\n");
+
+    print_cpu_last_executed_on();
+
+	printf("Before 1st barrier\n"); fflush(stdout);
+    MPI_Barrier(MPI_COMM_WORLD);
+    if(my_rank == 0) {
+      clock_gettime(CLOCK_REALTIME, &start);
+    }
+	sendrecv(my_rank, nentry, sendv, recvv, szentry, src, dest, reqs, status, 0);
+	printf("Before 2nd barrier\n"); fflush(stdout);
+    MPI_Barrier(MPI_COMM_WORLD);
+    if(my_rank == 0) {
+		clock_gettime(CLOCK_REALTIME, &end);
+		diffusec = DIFFNSEC(end, start) / (double)1000;
+        printf("%4.4f sec\n", DIFFNSEC(end, start) / (double)1000000000); fflush(stdout);
+	}
+
+ fn_exit:
+    MPI_Finalize();
+	return 0;
+ fn_fail:
+    goto fn_exit;
+}
diff --git a/test/uti/mpi/004.c b/test/uti/mpi/004.c
new file mode 100755
index 00000000..bf92ca19
--- /dev/null
+++ b/test/uti/mpi/004.c
@@ -0,0 +1,281 @@
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <time.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <sys/mman.h>
+#include <mpi.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <sys/syscall.h>   /* For SYS_xxx definitions */
+#include <sched.h>
+
+//#define DEBUG
+#ifdef DEBUG
+#define dprintf printf
+#else
+#define dprintf {}
+#endif
+
+#define DIFFNSEC(end, start) ((end.tv_sec - start.tv_sec) * 1000000000UL + (end.tv_nsec - start.tv_nsec))
+
+static inline void fixed_size_work() {
+	asm volatile(
+	    "movq $0, %%rcx\n\t"
+		"1:\t"
+		"addq $1, %%rcx\n\t"
+		"cmpq $99, %%rcx\n\t"
+		"jle 1b\n\t"
+		:
+		: 
+		: "rcx", "cc");
+}
+
+/* Run fixed_size_work() n times. */
+static inline void bulk_fsw(unsigned long n) {
+	unsigned long j; /* same type as n: an int counter would overflow for n > INT_MAX */
+	for (j = 0; j < n; j++)
+		fixed_size_work();
+}
+
+double nspw; /* nsec per work */
+unsigned long nsec;
+
+void fwq_init() {
+	struct timespec start, end;
+	int i;
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
+#define N_INIT 10000000
+	bulk_fsw(N_INIT);
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
+	nsec = DIFFNSEC(end, start);
+	nspw = nsec / (double)N_INIT;
+}
+
+#if 1
+void fwq(long delay_nsec) {
+	if (delay_nsec < 0) { 
+        return;
+		//printf("%s: delay_nsec < 0\n", __FUNCTION__);
+	}
+	bulk_fsw(delay_nsec / nspw);
+}
+#else /* For machines with large core-to-core performance variation (e.g. OFP) */
+void fwq(long delay_nsec) {
+	struct timespec start, end;
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
+
+	while (1) {
+		clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
+		if (DIFFNSEC(end, start) >= delay_nsec) {
+			break;
+		}
+		bulk_fsw(2); /* ~150 ns per iteration on FOP */
+	}
+}
+#endif
+
+
+/* Print the CPU recorded in field 39 of /proc/<pid>/task/<tid>/stat next to
+ * sched_getcpu() for the calling thread. Returns 0 on success, -1 on failure. */
+static int print_cpu_last_executed_on() {
+	enum { STAT_BUFSZ = 65536, STAT_CPU_FIELD = 39 };
+	char fn[256];
+	char *result = NULL; /* stays NULL on early failure so free() below is safe */
+	pid_t tid = syscall(SYS_gettid);
+	int fd = -1;
+	int offset = 0;
+	int mpi_errno = 0;
+
+	snprintf(fn, sizeof(fn), "/proc/%d/task/%d/stat", getpid(), (int)tid);
+	fd = open(fn, O_RDONLY);
+	if (fd == -1) {
+		printf("open() failed\n");
+		goto fn_fail;
+	}
+
+	result = malloc(STAT_BUFSZ);
+	if (result == NULL) {
+		printf("malloc() failed");
+		goto fn_fail;
+	}
+
+	/* Read to EOF, never past the buffer, keeping one byte for the NUL. */
+	while (offset < STAT_BUFSZ - 1) {
+		ssize_t amount = read(fd, result + offset, STAT_BUFSZ - 1 - offset);
+		if (amount == -1) {
+			printf("read() failed");
+			goto fn_fail;
+		}
+		if (amount == 0) {
+			break;
+		}
+		offset += amount;
+	}
+	result[offset] = '\0'; /* strsep() below needs a terminated string */
+
+	char *next_delim = result;
+	char *field = NULL;
+	for (int i = 0; i < STAT_CPU_FIELD; i++) {
+		field = strsep(&next_delim, " ");
+	}
+	if (field == NULL) { printf("stat has fewer than 39 fields\n"); goto fn_fail; }
+	const char *pmi_rank = getenv("PMI_RANK"); /* getenv() returns NULL when the variable is unset */
+	int cpu = sched_getcpu();
+	if (cpu == -1) {
+		printf("sched_getcpu() failed\n");
+		goto fn_fail;
+	}
+
+	printf("compute thread,pmi_rank=%02d,stat-cpu=%02d,sched_getcpu=%02d,pid=%d,tid=%d\n", pmi_rank ? atoi(pmi_rank) : -1, atoi(field), cpu, getpid(), tid); fflush(stdout);
+ fn_exit:
+	if (fd != -1) close(fd);
+	free(result);
+	return mpi_errno;
+ fn_fail:
+	mpi_errno = -1;
+	goto fn_exit;
+}
+
+static inline int on_same_node(int ppn, int me, int you) {
+	return (you / ppn == me / ppn) ? 1 : 0; /* same node index (rank / ppn)? */
+}
+
+/* isend-calc-wait: exchange one ndoubles-sized slice with every off-node rank,
+ * spin for calc_nsec via fwq() while the requests are in flight, then wait. */
+void my_send(int nproc, int ppn, int rank, double *sbuf, double *rbuf, int ndoubles, MPI_Request* reqs, long calc_nsec) {
+	int peer;
+	int nslices = 0; /* off-node peers handled so far; each one uses two requests */
+
+	for (peer = 0; peer < nproc; peer++) {
+		if (on_same_node(ppn, rank, peer)) {
+			continue;
+		}
+		MPI_Irecv(rbuf + nslices * ndoubles, ndoubles, MPI_DOUBLE, peer, 0, MPI_COMM_WORLD, &reqs[2 * nslices]);
+		MPI_Isend(sbuf + nslices * ndoubles, ndoubles, MPI_DOUBLE, peer, 0, MPI_COMM_WORLD, &reqs[2 * nslices + 1]);
+		nslices++;
+	}
+
+	fwq(calc_nsec);
+	MPI_Waitall(2 * nslices, reqs, MPI_STATUSES_IGNORE);
+}
+
+/* Long options for getopt_long(): "--ppn <arg>" is returned as 'P'. */
+static struct option options[] = {
+	{
+		.name    = "ppn",
+		.has_arg = required_argument,
+		.flag    = NULL, /* NULL flag: getopt_long() returns .val */
+		.val     = 'P',
+	},
+	{ .name = NULL, .has_arg = 0, .flag = NULL, .val = 0 }, /* terminator */
+};
+
+/* Times my_send() rounds without and with overlapped fwq() work; rank 0 prints the maxima. */
+int main(int argc, char **argv) {
+	int actual;
+	int ppn = -1;
+	int nproc;
+	int ndoubles = -1;
+	int my_rank = -1;
+	int i, opt, ret = 0;
+	double *sbuf = NULL, *rbuf = NULL; /* NULL so fn_exit can free() unconditionally */
+	MPI_Request* reqs = NULL;
+	struct timespec start, end;
+	long t_pure_l, t_overall_l;
+	long t_pure, t_overall;
+
+	fwq_init();
+
+	while ((opt = getopt_long(argc, argv, "+d:P:", options, NULL)) != -1) {
+		switch (opt) {
+			case 'd':
+				i = atoi(optarg); /* ndoubles = 2^i; only 0..30 fits in an int */
+				ndoubles = (i >= 0 && i < 31) ? (1 << i) : -1;
+				break;
+			case 'P':
+				ppn = atoi(optarg);
+				break;
+			default: /* '?' */
+				printf("unknown option %c\n", optopt);
+				exit(1);
+		}
+	}
+
+	if (ndoubles <= 0 || ppn <= 0) { /* ppn is a divisor in on_same_node() */
+		printf("specify ndoubles with -d and ppn with --ppn\n");
+		exit(1);
+	}
+
+	MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &actual);
+	if (actual != MPI_THREAD_MULTIPLE) {
+		printf("ERROR: Thread support level is %d (it should be %d)\n", actual, MPI_THREAD_MULTIPLE);
+		exit(1);
+	}
+
+	MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &nproc);
+
+	if (my_rank == 0) {
+		printf("tid=%d,pid=%d,ndoubles=%d,nproc=%d\n", (int)syscall(__NR_gettid), getpid(), ndoubles, nproc);
+		printf("nsec=%ld, nspw=%f\n", nsec, nspw);
+	}
+
+	reqs = (MPI_Request*)malloc(sizeof(MPI_Request) * nproc * 2);
+	if(!reqs) { printf("malloc failed"); goto fn_fail; }
+	sbuf = malloc(sizeof(double) * ndoubles * nproc);
+	if(!sbuf) { printf("malloc failed"); goto fn_fail; }
+	memset(sbuf, 0, sizeof(double) * ndoubles * nproc); /* whole buffer: every slice may be sent */
+	printf("tid=%d,pid=%d,sbuf=%p\n", (int)syscall(__NR_gettid), getpid(), (void *)sbuf);
+	rbuf = malloc(sizeof(double) * ndoubles * nproc);
+	if(!rbuf) { printf("malloc failed"); goto fn_fail; }
+	memset(rbuf, 0, sizeof(double) * ndoubles * nproc);
+	printf("tid=%d,pid=%d,rbuf=%p\n", (int)syscall(__NR_gettid), getpid(), (void *)rbuf);
+	print_cpu_last_executed_on();
+
+	/* Measure isend-wait time */
+	MPI_Barrier(MPI_COMM_WORLD);
+#define NSKIP 5
+#define NPURE 30
+	for (i = 0; i < NPURE + NSKIP; i++) {
+		if (i == NSKIP) { /* the first NSKIP rounds are warm-up and not timed */
+			clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
+		}
+		my_send(nproc, ppn, my_rank, sbuf, rbuf, ndoubles, reqs, 0);
+	}
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
+	t_pure_l = DIFFNSEC(end, start) / NPURE;
+	MPI_Allreduce(&t_pure_l, &t_pure, 1, MPI_LONG, MPI_MAX, MPI_COMM_WORLD);
+	if (my_rank == 0) printf("t_pure (max): %ld usec\n", t_pure / 1000);
+
+	/* Measure isend-calc-wait time */
+	MPI_Barrier(MPI_COMM_WORLD);
+#define NOVERALL 30
+	for (i = 0; i < NOVERALL + NSKIP; i++) {
+		if (i == NSKIP) {
+			clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
+		}
+		my_send(nproc, ppn, my_rank, sbuf, rbuf, ndoubles, reqs, t_pure);
+	}
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
+	t_overall_l = DIFFNSEC(end, start) / NOVERALL;
+	MPI_Allreduce(&t_overall_l, &t_overall, 1, MPI_LONG, MPI_MAX, MPI_COMM_WORLD);
+	if (my_rank == 0) printf("t_overall (max): %ld usec\n", t_overall / 1000);
+	if (my_rank == 0) {
+		long t_abs = (t_pure * 2) - t_overall;
+		printf("overlap: %.2f %%\n", (t_abs * 100) / (double)t_pure);
+	}
+
+ fn_exit:
+	free(rbuf);
+	free(sbuf);
+	free(reqs);
+	MPI_Finalize();
+	return ret;
+ fn_fail:
+	ret = 1; /* report the failure instead of exiting with status 0 */
+	goto fn_exit;
+}
diff --git a/test/uti/mpi/005.c b/test/uti/mpi/005.c
new file mode 100755
index 00000000..0803ebb8
--- /dev/null
+++ b/test/uti/mpi/005.c
@@ -0,0 +1,338 @@
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <time.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <sys/mman.h>
+#include <mpi.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <sys/syscall.h>   /* For SYS_xxx definitions */
+#include <sched.h>
+
+//#define DEBUG
+#ifdef DEBUG
+#define dprintf(...) printf(__VA_ARGS__)
+#else
+#define dprintf(...) do { } while (0) /* a single statement, safe in unbraced if/else */
+#endif
+
+#define DIFFNSEC(end, start) ((end.tv_sec - start.tv_sec) * 1000000000UL + (end.tv_nsec - start.tv_nsec))
+
+#if 1
+#define BEGIN_EPOCH(win) do { MPI_Win_fence(0, win); } while(0)
+#define END_EPOCH(win) do { MPI_Win_fence(0, win); } while(0)
+#define BAR_EPOCH do { } while(0)
+#else
+#define BEGIN_EPOCH(win) do { MPI_Win_lock_all(0, win); } while(0)
+#define END_EPOCH(win) do { MPI_Win_unlock_all(win); } while(0)
+#define BAR_EPOCH do { MPI_Barrier(MPI_COMM_WORLD); } while(0)
+#endif
+
+
+static inline void fixed_size_work() { /* one fixed unit of busy work; clobbers only rcx and the flags */
+	asm volatile(
+	    "movq $0, %%rcx\n\t" /* rcx = 0 */
+		"1:\t"
+		"addq $1, %%rcx\n\t" /* rcx++ */
+		"cmpq $99, %%rcx\n\t"
+		"jle 1b\n\t" /* loop while rcx <= 99, i.e. 100 passes */
+		: /* no outputs */
+		: /* no inputs */
+		: "rcx", "cc");
+}
+
+static inline void bulk_fsw(unsigned long n) { /* run n units of fixed_size_work() */
+	unsigned long j; /* same type as n: an int counter would overflow for n > INT_MAX */
+	for (j = 0; j < n; j++) {
+		fixed_size_work();
+	}
+}
+
+double nspw; /* nsec per work */
+unsigned long nsec;
+
+void fwq_init() { /* calibrate nspw by timing N_INIT units of fixed work */
+	struct timespec t0, t1;
+
+#define N_INIT 10000000
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t0);
+	bulk_fsw(N_INIT);
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t1);
+	nsec = DIFFNSEC(t1, t0);
+	nspw = (double)nsec / N_INIT;
+}
+
+#if 1
+/* Spin for about delay_nsec ns: delay_nsec / nspw units of fixed work. */
+void fwq(long delay_nsec) {
+	if (delay_nsec >= 0) {
+		unsigned long nwork = delay_nsec / nspw;
+		bulk_fsw(nwork);
+	}
+}
+#else /* For machines with large core-to-core performance variation (e.g. OFP) */
+/* Same contract, but polls the thread CPU clock instead of relying on nspw. */
+void fwq(long delay_nsec) {
+	struct timespec t0, now;
+
+	if (delay_nsec < 0) {
+		return;
+	}
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t0);
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &now);
+	while (DIFFNSEC(now, t0) < delay_nsec) {
+		bulk_fsw(2); /* ~150 ns per iteration on OFP */
+		clock_gettime(CLOCK_THREAD_CPUTIME_ID, &now);
+	}
+}
+#endif
+
+
+/* Print the 39th space-separated field of /proc/<pid>/task/<tid>/stat next to
+ * sched_getcpu() for the calling thread.  Returns 0 on success, -1 on failure. */
+static int print_cpu_last_executed_on() {
+	char fn[256];
+	char *result = NULL; /* NULL until malloc() so fn_exit can always free() it */
+	char *pmi_rank = getenv("PMI_RANK"); /* may be unset; -1 is printed then */
+	pid_t tid = syscall(SYS_gettid);
+	int fd = -1;
+	int offset = 0;
+	int mpi_errno = 0;
+
+	snprintf(fn, sizeof(fn), "/proc/%d/task/%d/stat", getpid(), (int)tid);
+	fd = open(fn, O_RDONLY);
+	if(fd == -1) {
+		printf("open() failed\n");
+		goto fn_fail;
+	}
+
+	result = malloc(65536);
+	if(result == NULL) {
+		printf("malloc() failed");
+		goto fn_fail;
+	}
+
+	/* Read up to EOF without running past the buffer; one byte is kept for the NUL */
+	while(offset < 65536 - 1) {
+		int amount = read(fd, result + offset, 65536 - 1 - offset);
+		if(amount == -1) {
+			printf("read() failed");
+			goto fn_fail;
+		}
+		if(amount == 0) {
+			break;
+		}
+		offset += amount;
+	}
+	result[offset] = '\0'; /* strsep() below needs a terminated string */
+
+	char* next_delim = result;
+	char* field = NULL;
+	int i;
+	for(i = 0; i < 39; i++) {
+		field = strsep(&next_delim, " ");
+	}
+	if(field == NULL) { printf("stat has fewer than 39 fields\n"); goto fn_fail; }
+
+	int cpu = sched_getcpu();
+	if(cpu == -1) {
+		printf("getpu() failed\n");
+		goto fn_fail;
+	}
+	printf("compute thread,pmi_rank=%02d,stat-cpu=%02d,sched_getcpu=%02d,tid=%d\n", pmi_rank ? atoi(pmi_rank) : -1, atoi(field), cpu, tid); fflush(stdout);
+ fn_exit:
+	if(fd != -1) { close(fd); }
+	free(result);
+	return mpi_errno;
+ fn_fail:
+	mpi_errno = -1;
+	goto fn_exit;
+}
+
+static inline int on_same_node(int ppn, int me, int you) {
+	return (you / ppn == me / ppn) ? 1 : 0; /* same node index (rank / ppn)? */
+}
+
+/* fence-accumulate-calc-fence: inside one epoch, MPI_SUM each element of every off-node
+ * rank's slice of rbuf into that rank's window, then spin calc_nsec.  wbuf is unused here. */
+void accumulate(int nproc, int ppn, int rank, double *wbuf, double *rbuf, int ndoubles, MPI_Win win, long calc_nsec) {
+	int target, k;
+	BEGIN_EPOCH(win);
+	for (target = 0; target < nproc; target++) {
+		if (on_same_node(ppn, rank, target)) {
+			continue;
+		}
+		for (k = 0; k < ndoubles; k++) {
+			int disp = target * ndoubles + k; /* same offset locally and in the target window */
+			MPI_Accumulate(rbuf + disp, 1, MPI_DOUBLE, target, disp, 1, MPI_DOUBLE, MPI_SUM, win);
+		}
+	}
+	fwq(calc_nsec);
+	END_EPOCH(win);
+}
+
+/* Long options for getopt_long(): "--ppn <arg>" is returned as 'P'. */
+static struct option options[] = {
+	{
+		.name    = "ppn",
+		.has_arg = required_argument,
+		.flag    = NULL, /* NULL flag: getopt_long() returns .val */
+		.val     = 'P',
+	},
+	{ .name = NULL, .has_arg = 0, .flag = NULL, .val = 0 }, /* terminator */
+};
+
+/* Times fence-fence, fence-acc-fence and fence-acc-calc-fence rounds; rank 0 prints the maxima. */
+int main(int argc, char **argv) {
+	int rc;
+	int actual;
+	int ppn = -1;
+	int nproc;
+	int ndoubles = -1;
+	int my_rank = -1;
+	int i, j, opt, ret = 0;
+	double *wbuf = NULL, *rbuf = NULL; /* NULL so fn_exit can free() unconditionally */
+	MPI_Win win = MPI_WIN_NULL; /* lets fn_exit tell whether a window was created */
+	struct timespec start, end;
+	long t_fence_l, t_pure_l, t_overall_l;
+	long t_fence, t_pure, t_overall;
+
+	fwq_init();
+
+	while ((opt = getopt_long(argc, argv, "+d:P:", options, NULL)) != -1) {
+		switch (opt) {
+			case 'd':
+				i = atoi(optarg); /* ndoubles = 2^i; only 0..30 fits in an int */
+				ndoubles = (i >= 0 && i < 31) ? (1 << i) : -1;
+				break;
+			case 'P':
+				ppn = atoi(optarg);
+				break;
+			default: /* '?' */
+				printf("unknown option %c\n", optopt);
+				exit(1);
+		}
+	}
+
+	if (ndoubles <= 0 || ppn <= 0) { /* ppn is a divisor in on_same_node() */
+		printf("specify ndoubles with -d and ppn with --ppn\n");
+		exit(1);
+	}
+
+	MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &actual);
+	if (actual != MPI_THREAD_MULTIPLE) {
+		printf("ERROR: MPI_THREAD_MULTIPLE not available (level was set to %d)\n", actual);
+		exit(1);
+	}
+
+	MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &nproc);
+
+	if (my_rank == 0) {
+		printf("ndoubles=%d,nproc=%d\n", ndoubles, nproc);
+		printf("nsec=%lu, nspw=%f\n", nsec, nspw);
+	}
+
+	/* write-to buffer */
+	wbuf = malloc(sizeof(double) * ndoubles * nproc);
+	if(!wbuf) { printf("malloc failed"); goto fn_fail; }
+	memset(wbuf, 0, sizeof(double) * ndoubles * nproc);
+
+	/* read-from buffer */
+	rbuf = malloc(sizeof(double) * ndoubles * nproc);
+	if(!rbuf) { printf("malloc failed"); goto fn_fail; }
+	memset(rbuf, 0, sizeof(double) * ndoubles * nproc);
+
+	if ((rc = MPI_Win_create(wbuf, sizeof(double) * ndoubles * nproc, sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &win)) != MPI_SUCCESS) {
+		printf("MPI_Win_create failed,rc=%d\n", rc);
+		goto fn_fail; /* the fences below need a valid window */
+	}
+
+	print_cpu_last_executed_on();
+
+	for (i = 0; i < nproc; i++) {
+		for (j = 0; j < ndoubles; j++) {
+			wbuf[i * ndoubles + j] = i + 1 + j;
+			rbuf[i * ndoubles + j] = (i + 1) * 2 + j;
+		}
+	}
+#if 0
+	for (i = 0; i < nproc; i++) {
+		for (j = 0; j < ndoubles; j++) {
+			printf("wbuf,proc=%d,j=%d,val=%f\n", i, j, wbuf[i * ndoubles + j]);
+			printf("rbuf,proc=%d,j=%d,val=%f\n", i, j, rbuf[i * ndoubles + j]);
+		}
+	}
+#endif
+	/* Measure fence-fence time */
+	MPI_Barrier(MPI_COMM_WORLD);
+#define NSKIP 5
+#define NFENCE 30
+	for (i = 0; i < NFENCE + NSKIP; i++) {
+		if (i == NSKIP) { /* the first NSKIP rounds are warm-up and not timed */
+			clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
+		}
+		BEGIN_EPOCH(win);
+		END_EPOCH(win);
+	}
+	BAR_EPOCH;
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
+	t_fence_l = DIFFNSEC(end, start) / NFENCE;
+	MPI_Allreduce(&t_fence_l, &t_fence, 1, MPI_LONG, MPI_MAX, MPI_COMM_WORLD);
+	if (my_rank == 0) printf("t_fence (max): %ld usec\n", t_fence / 1000);
+
+	/* Measure fence-acc-fence time */
+	MPI_Barrier(MPI_COMM_WORLD);
+#define NPURE 30
+	for (i = 0; i < NPURE + NSKIP; i++) {
+		if (i == NSKIP) {
+			clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
+		}
+		accumulate(nproc, ppn, my_rank, wbuf, rbuf, ndoubles, win, 0);
+	}
+	BAR_EPOCH;
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
+	t_pure_l = DIFFNSEC(end, start) / NPURE;
+	MPI_Allreduce(&t_pure_l, &t_pure, 1, MPI_LONG, MPI_MAX, MPI_COMM_WORLD);
+	if (my_rank == 0) printf("t_pure (max): %ld usec\n", t_pure / 1000);
+
+#if 0
+	for (i = 0; i < nproc; i++) {
+		for (j = 0; j < ndoubles; j++) {
+			printf("wbuf,proc=%d,j=%d,val=%f\n", i, j, wbuf[i * ndoubles + j]);
+			printf("rbuf,proc=%d,j=%d,val=%f\n", i, j, rbuf[i * ndoubles + j]);
+		}
+	}
+#endif
+	/* Measure fence-acc-calc-fence time */
+	MPI_Barrier(MPI_COMM_WORLD);
+#define NOVERALL 30
+	for (i = 0; i < NOVERALL + NSKIP; i++) {
+		if (i == NSKIP) {
+			clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
+		}
+		accumulate(nproc, ppn, my_rank, wbuf, rbuf, ndoubles, win, t_pure - t_fence);
+	}
+	BAR_EPOCH;
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
+	t_overall_l = DIFFNSEC(end, start) / NOVERALL;
+	MPI_Allreduce(&t_overall_l, &t_overall, 1, MPI_LONG, MPI_MAX, MPI_COMM_WORLD);
+	if (my_rank == 0) printf("t_overall (max): %ld usec\n", t_overall / 1000);
+	if (my_rank == 0) {
+		long t_abs = (t_pure * 2) - t_overall;
+		printf("overlap: %.2f %%\n", (t_abs * 100) / (double)t_pure);
+	}
+ fn_exit:
+	if (win != MPI_WIN_NULL) { MPI_Win_free(&win); } /* must precede freeing wbuf */
+	free(rbuf);
+	free(wbuf);
+	MPI_Finalize();
+	return ret;
+ fn_fail:
+	ret = 1; /* report the failure instead of exiting with status 0 */
+	goto fn_exit;
+}
diff --git a/test/uti/mpi/005.sh b/test/uti/mpi/005.sh
new file mode 100755
index 00000000..416e9ecc
--- /dev/null
+++ b/test/uti/mpi/005.sh
@@ -0,0 +1,127 @@
+#!/usr/bin/bash
+# Generates job.sh for the ./005 test; optionally stops/reboots McKernel and runs the job.
+# (use "#!/usr/bin/bash -x" as the first line to trace execution)
+
+MYHOME=$HOME
+UTI_MPI_TOP=${MYHOME}/project/os/mckernel/test/uti/mpi
+
+MCK=${MYHOME}/project/os/install
+unset DISABLE_UTI
+
+cmdline="./005"
+
+stop=0
+reboot=0
+go=0
+
+mck=0
+nloops=1
+ppn=1
+
+# Parse the command line.
+# Options:
+#   -s  stop McKernel (with -m)        -r  reboot McKernel (with -m)
+#   -g  build and run job.sh           -a  enable I_MPI_ASYNC_PROGRESS
+#   -c <cmdline>                       -n <ndoubles>
+#   -m  use McKernel                   -d  export DISABLE_UTI=1
+#   -P <ppn>                           -o <omp_num_threads>
+#   -l <nloops>  (was in the optstring without a case; presumably meant for nloops)
+while getopts srgac:n:mdl:P:o: OPT
+do
+    case ${OPT} in
+	s) stop=1 ;;
+	r) reboot=1 ;;
+	g) go=1 ;;
+	a) async=1 ;;
+	c) cmdline=$OPTARG ;;
+	n) ndoubles=$OPTARG ;;
+	m) mck=1 ;;
+	d) export DISABLE_UTI=1 ;;
+	l) nloops=$OPTARG ;;
+	P) ppn=$OPTARG ;;
+	o) omp_num_threads=$OPTARG ;;
+	*) echo "invalid option -${OPT}" >&2
+	    exit 1
+	    ;;
+    esac
+done
+
+if [ ${mck} -eq 1 ]; then
+    mcexec="${mck_dir}/bin/mcexec"
+    mcexecopt="--enable-uti --uti-thread-rank=$uti_thread_rank"
+    if [ "${use_hfi:-0}" -eq 1 ]; then   # use_hfi is never set here; default it to 0
+	mcexecopt="--enable-hfi1 $mcexecopt"
+    fi
+    mcexecopt="-n $ppn -t $((256 / ppn + 4)) -m 1 $mcexecopt"
+else
+    mcexec=
+    mcexecopt=
+fi
+# Intel MPI pinning is switched off when McKernel is used and on otherwise.
+if [ ${mck} -eq 1 ]; then
+    i_mpi_pin=off
+else
+    i_mpi_pin=on
+fi
+# With pinning on, each rank gets a domain of omp_num_threads + 1 CPUs.
+if [ "$i_mpi_pin" == on ] ; then
+    i_mpi_pin_domain="export I_MPI_PIN_DOMAIN=$((omp_num_threads + 1)):scatter"
+else
+    i_mpi_pin_domain=
+fi
+# async is only set by -a, so default it to 0 for the numeric test.
+if [ "${async:-0}" -eq 0 ] || [ "$async_progress_pin" == "" ] ; then
+    i_mpi_async_progress_pin=
+else
+    i_mpi_async_progress_pin="export I_MPI_ASYNC_PROGRESS_PIN=$async_progress_pin"
+fi
+
+if [ ${stop} -eq 1 ]; then   # -s: stop and release McKernel (only with -m)
+    if [ ${mck} -eq 1 ]; then
+	    sudo ${MCK}/sbin/mcstop+release.sh
+    else
+	:   # nothing to stop without -m
+    fi
+fi
+# -r: reboot McKernel (only with -m); hosts whose name contains "ofp" get their own CPU/memory layout
+if [ ${reboot} -eq 1 ]; then
+    if [ ${mck} -eq 1 ]; then
+	if hostname  | grep ofp &>/dev/null; then
+	    sudo ${MCK}/sbin/mcreboot.sh -s -c 2-17,70-85,138-153,206-221,20-35,88-103,156-171,224-239,36-51,104-119,172-187,240-255,52-67,120-135,188-203,256-271 -r 2-5,70-73,138-141,206-209:0+6-9,74-77,142-145,210-213:1+10-13,78-81,146-149,214-217:68+14-17,82-85,150-153,218-221:69+20-23,88-91,156-159,224-227:136+24-27,92-95,160-163,228-231:137+28-31,96-99,164-167,232-235:204+32-35,100-103,168-171,236-239:205+36-39,104-107,172-175,240-243:18+40-43,108-111,176-179,244-247:19+44-47,112-115,180-183,248-251:86+48-51,116-119,184-187,252-255:87+52-55,120-123,188-191,256-259:154+56-59,124-127,192-195,260-263:155+60-63,128-131,196-199,264-267:222+64-67,132-135,200-203,268-271:223 -m 32G@0,12G@1
+	else
+	    sudo ${MCK}/sbin/mcreboot.sh -s -c 1-15,65-79,129-143,193-207,17-31,81-95,145-159,209-223,33-47,97-111,161-175,225-239,49-63,113-127,177-191,241-255 -r 1-15:0+65-79:64+129-143:128+193-207:192+17-31:16+81-95:80+145-159:144+209-223:208+33-47:32+97-111:96+161-175:160+225-239:224+49-63:48+113-127:112+177-191:176+241-255:240 -m 12G@0,12G@1,12G@2,12G@3,3920M@4,3920M@5,3920M@6,3920M@7
+	fi
+    else
+	:   # nothing to reboot without -m
+    fi
+fi
+
+# Write job.sh; the heredoc is unquoted, so the variables are expanded now.
+cd ${UTI_MPI_TOP} || exit 1
+(
+cat <<EOF
+#!/bin/sh
+
+export I_MPI_DEBUG=4
+export I_MPI_HYDRA_DEBUG=on
+export PSM2_RCVTHREAD=0
+
+export I_MPI_PIN=$i_mpi_pin
+$i_mpi_pin_domain
+export KMP_AFFINITY=granularity=thread,scatter
+
+export I_MPI_ASYNC_PROGRESS=${async:-0}
+$i_mpi_async_progress_pin
+
+${MCK}/bin/mcexec taskset -c 3 ./005 --ppn 16${ndoubles:+ -d ${ndoubles}}
+EOF
+) > ./job.sh
+chmod +x ./job.sh   # a freshly created job.sh is not executable
+
+if [ ${go} -eq 1 ]; then
+    make CC=gcc 005   # build the binary job.sh runs (was 008)
+    ./job.sh
+fi
+
+
+
diff --git a/test/uti/mpi/006.c b/test/uti/mpi/006.c
new file mode 100755
index 00000000..d7aa6e61
--- /dev/null
+++ b/test/uti/mpi/006.c
@@ -0,0 +1,625 @@
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <time.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <sys/mman.h>
+#include <mpi.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <sys/syscall.h>   /* For SYS_xxx definitions */
+#include <sched.h>
+#include <errno.h>
+#include <pthread.h>       /* pthread_create/mutex/cond used by main() and progress_fn() */
+#include <psm2.h>     /* required for core PSM2 functions */
+#include <psm2_mq.h>  /* required for PSM2 MQ functions (send, recv, etc) */
+
+//#define DEBUG
+#ifdef DEBUG
+#define dprintf(...) printf(__VA_ARGS__)
+#else
+#define dprintf(...) do { } while (0) /* a single statement, safe in unbraced if/else */
+#endif
+
+#define BUFFER_LENGTH 8000000
+#define CONNECT_ARRAY_SIZE 8
+void die(char *msg, int rc) { /* prints "msg: rc" to stderr and returns; it does not exit */
+  fprintf(stderr, "%s: %d\n", msg, rc);
+}
+
+#define DIFFNSEC(end, start) ((end.tv_sec - start.tv_sec) * 1000000000UL + (end.tv_nsec - start.tv_nsec))
+
+static inline void fixed_size_work() { /* one fixed unit of busy work; clobbers only rcx and the flags */
+	asm volatile(
+	    "movq $0, %%rcx\n\t" /* rcx = 0 */
+		"1:\t"
+		"addq $1, %%rcx\n\t" /* rcx++ */
+		"cmpq $99, %%rcx\n\t"
+		"jle 1b\n\t" /* loop while rcx <= 99, i.e. 100 passes */
+		: /* no outputs */
+		: /* no inputs */
+		: "rcx", "cc");
+}
+
+static inline void bulk_fsw(unsigned long n) { /* run n units of fixed_size_work() */
+	unsigned long j; /* same type as n: an int counter would overflow for n > INT_MAX */
+	for (j = 0; j < n; j++) {
+		fixed_size_work();
+	}
+}
+
+double nspw; /* nsec per work */
+unsigned long nsec;
+
+void fwq_init() { /* calibrate nspw by timing N_INIT units of fixed work */
+	struct timespec t0, t1;
+
+#define N_INIT 10000000
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t0);
+	bulk_fsw(N_INIT);
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t1);
+	nsec = DIFFNSEC(t1, t0);
+	nspw = (double)nsec / N_INIT;
+}
+
+#if 1
+/* Spin for about delay_nsec ns by running delay_nsec / nspw units of fixed work. */
+void fwq(long delay_nsec) {
+	if (delay_nsec < 0) {
+		return; /* nothing to wait for */
+	}
+	bulk_fsw(delay_nsec / nspw);
+}
+#else /* For machines with large core-to-core performance variation (e.g. OFP) */
+void fwq(long delay_nsec) {
+	struct timespec start, end;
+	if (delay_nsec < 0) { return; } /* DIFFNSEC() is unsigned: a negative delay would never be reached */
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
+	while (1) {
+		clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
+		if (DIFFNSEC(end, start) >= delay_nsec) {
+			break;
+		}
+		bulk_fsw(2); /* ~150 ns per iteration on OFP */
+	}
+}
+#endif
+
+
+/* Print the 39th space-separated field of /proc/<pid>/task/<tid>/stat next to
+ * sched_getcpu() for the calling thread.  Returns 0 on success, -1 on failure. */
+static int print_cpu_last_executed_on() {
+	char fn[256];
+	char *result = NULL; /* NULL until malloc() so fn_exit can always free() it */
+	char *pmi_rank = getenv("PMI_RANK"); /* may be unset; -1 is printed then */
+	pid_t tid = syscall(SYS_gettid);
+	int fd = -1;
+	int offset = 0;
+	int mpi_errno = 0;
+
+	snprintf(fn, sizeof(fn), "/proc/%d/task/%d/stat", getpid(), (int)tid);
+	fd = open(fn, O_RDONLY);
+	if(fd == -1) {
+		printf("open() failed\n");
+		goto fn_fail;
+	}
+
+	result = malloc(65536);
+	if(result == NULL) {
+		printf("malloc() failed");
+		goto fn_fail;
+	}
+
+	/* Read up to EOF without running past the buffer; one byte is kept for the NUL */
+	while(offset < 65536 - 1) {
+		int amount = read(fd, result + offset, 65536 - 1 - offset);
+		if(amount == -1) {
+			printf("read() failed");
+			goto fn_fail;
+		}
+		if(amount == 0) {
+			break;
+		}
+		offset += amount;
+	}
+	result[offset] = '\0'; /* strsep() below needs a terminated string */
+
+	char* next_delim = result;
+	char* field = NULL;
+	int i;
+	for(i = 0; i < 39; i++) {
+		field = strsep(&next_delim, " ");
+	}
+	if(field == NULL) { printf("stat has fewer than 39 fields\n"); goto fn_fail; }
+
+	int cpu = sched_getcpu();
+	if(cpu == -1) {
+		printf("getpu() failed\n");
+		goto fn_fail;
+	}
+	printf("compute thread,pmi_rank=%02d,stat-cpu=%02d,sched_getcpu=%02d,pid=%d,tid=%d\n", pmi_rank ? atoi(pmi_rank) : -1, atoi(field), cpu, getpid(), tid); fflush(stdout);
+ fn_exit:
+	if(fd != -1) { close(fd); }
+	free(result);
+	return mpi_errno;
+ fn_fail:
+	mpi_errno = -1;
+	goto fn_exit;
+}
+
+static inline int on_same_node(int ppn, int me, int you) {
+	return (you / ppn == me / ppn) ? 1 : 0; /* same node index (rank / ppn)? */
+}
+
+/* isend-calc-wait: exchange one ndoubles-sized slice with every off-node rank,
+ * spin for calc_nsec via fwq() while the requests are in flight, then wait. */
+void my_send(int nproc, int ppn, int rank, double *sbuf, double *rbuf, int ndoubles, MPI_Request* reqs, long calc_nsec) {
+	int peer;
+	int nslices = 0; /* off-node peers handled so far; each one uses two requests */
+
+	for (peer = 0; peer < nproc; peer++) {
+		if (on_same_node(ppn, rank, peer)) {
+			continue;
+		}
+		MPI_Irecv(rbuf + nslices * ndoubles, ndoubles, MPI_DOUBLE, peer, 0, MPI_COMM_WORLD, &reqs[2 * nslices]);
+		MPI_Isend(sbuf + nslices * ndoubles, ndoubles, MPI_DOUBLE, peer, 0, MPI_COMM_WORLD, &reqs[2 * nslices + 1]);
+		nslices++;
+	}
+
+	fwq(calc_nsec);
+	MPI_Waitall(2 * nslices, reqs, MPI_STATUSES_IGNORE);
+}
+
+
+/* Poll once per second until the epid file written for `rank` can be opened and parsed. */
+psm2_epid_t find_server(int rank) {
+  FILE *fp;
+  psm2_epid_t server_epid = 0;
+  char fn[256];
+  snprintf(fn, sizeof(fn), "psm2-demo-server-epid-%d", rank);
+  printf("PSM2 client waiting for epid mapping file to appear...\n");
+  do {
+    sleep(1);
+    fp = fopen(fn, "r");
+    if (fp && fscanf(fp, "%lx", &server_epid) != 1) { fclose(fp); fp = NULL; } /* not written yet: retry */
+  } while (!fp);
+  fclose(fp);
+  printf("PSM2 client found server epid = 0x%lx\n", server_epid);
+  return server_epid;
+}
+
+/* Publish myepid in the file "psm2-demo-server-epid-<rank>" that find_server() polls for. */
+void write_epid_to_file(int rank, psm2_epid_t myepid) {
+  FILE *fp;
+  char fn[256];
+  snprintf(fn, sizeof(fn), "psm2-demo-server-epid-%d", rank);
+  fp = fopen(fn, "w");
+  if (!fp) {
+    fprintf(stderr, "Exiting, couldn't write server's epid mapping file: ");
+    die(strerror(errno), errno);
+    return; /* die() only reports in this file; a NULL fp must not be written to */
+  }
+  fprintf(fp, "0x%lx", myepid);
+  fclose(fp);
+  printf("PSM2 server wrote epid = 0x%lx to file.\n", myepid);
+}
+
+/* Demo PSM2 exchange: the rank equal to `receiver` publishes its epid and receives one message;
+ * any other rank connects to it and sends one.  `sender` is unused.  Returns 0, or -1 on error. */
+int psm2_sendrecv(int rank, int sender, int receiver) {
+  struct psm2_ep_open_opts o;
+  psm2_uuid_t uuid; /* 16 byte */
+  psm2_ep_t myep;
+  psm2_epid_t myepid;
+  psm2_epid_t server_epid;
+  psm2_epid_t epid_array[CONNECT_ARRAY_SIZE];
+  int epid_array_mask[CONNECT_ARRAY_SIZE];
+  psm2_error_t epid_connect_errors[CONNECT_ARRAY_SIZE];
+  psm2_epaddr_t epaddr_array[CONNECT_ARRAY_SIZE];
+  int rc;
+  int ver_major = PSM2_VERNO_MAJOR;
+  int ver_minor = PSM2_VERNO_MINOR;
+  static char msgbuf[BUFFER_LENGTH]; /* static: 8 MB would overflow a thread stack (not reentrant) */
+  char fn[256];
+  psm2_mq_t q;
+  psm2_mq_req_t req_mq;
+  memset(uuid, 0, sizeof(psm2_uuid_t)); /* Use a UUID of zero */
+  *((int *)&uuid) = rand();
+  /* Try to initialize PSM2 with the requested library version.  Passing the
+   * PSM2_VERNO_MAJOR/MINOR we compiled against checks the library we link with. */
+  if ((rc = psm2_init(&ver_major, &ver_minor)) != PSM2_OK) {
+    die("couldn't init", rc);
+	return -1;
+  }
+  printf("PSM2 init done.\n");
+  /* Setup the endpoint options struct */
+  if ((rc = psm2_ep_open_opts_get_defaults(&o)) != PSM2_OK) {
+    die("couldn't set default opts", rc);
+	return -1;
+  }
+  printf("PSM2 opts_get_defaults done.\n");
+  /* Attempt to open a PSM2 endpoint. This allocates hardware resources. */
+  if ((rc = psm2_ep_open(uuid, &o, &myep, &myepid)) != PSM2_OK) {
+    die("couldn't psm2_ep_open()", rc);
+	return -1;
+  }
+  printf("PSM2 endpoint open done.\n");
+  int is_server = (rank == receiver) ? 1 : 0;
+  if (is_server) {
+	  write_epid_to_file(rank, myepid);
+  } else {
+	  server_epid = find_server(receiver);
+  }
+  if (is_server) {
+    /* Server does nothing here. A connection does not have to be established to receive messages. */
+    printf("PSM2 server up.\n");
+  } else {
+    /* Setup connection request info */
+    /* PSM2 can connect to a single epid per request,
+     * or an arbitrary number of epids in a single connect call.
+     * For this example, use part of an array of
+     * connection requests. */
+    memset(epid_array_mask, 0, sizeof(int) * CONNECT_ARRAY_SIZE);
+    epid_array[0] = server_epid;
+    epid_array_mask[0] = 1;
+    /* Begin the connection process.
+     * note that if a requested epid is not responding,
+     * the connect call will still return OK.
+     * The errors array will contain the state of individual
+     * connection requests. */
+    if ((rc = psm2_ep_connect(myep,
+                              CONNECT_ARRAY_SIZE,
+                              epid_array,
+                              epid_array_mask,
+                              epid_connect_errors,
+                              epaddr_array,
+                              0 /* no timeout */
+    )) != PSM2_OK) {
+      die("couldn't ep_connect", rc);
+	  return -1;
+    }
+    printf("PSM2 connect request processed.\n");
+    /* Now check if our connection to the server is ready */
+    if (epid_connect_errors[0] != PSM2_OK) {
+      die("couldn't connect to server",
+          epid_connect_errors[0]);
+	  return -1;
+    }
+    printf("PSM2 client-server connection established.\n");
+  }
+  /* Setup our PSM2 message queue */
+  if ((rc = psm2_mq_init(myep, PSM2_MQ_ORDERMASK_NONE, NULL, 0, &q))
+      != PSM2_OK) {
+    die("couldn't initialize PSM2 MQ", rc);
+	return -1;
+  }
+  printf("PSM2 MQ init done.\n");
+  if (is_server) {
+    psm2_mq_tag_t t = {0xABCD};
+    psm2_mq_tag_t tm = {-1};
+    /* Post the receive request */
+    if ((rc = psm2_mq_irecv2(q, PSM2_MQ_ANY_ADDR,
+                            &t, /* message tag */
+                            &tm, /* message tag mask */
+                            0, /* no flags */
+                            msgbuf, BUFFER_LENGTH,
+                            NULL, /* no context to add */
+                            &req_mq /* track irecv status */
+    )) != PSM2_OK) {
+      die("couldn't post psm2_mq_irecv()", rc);
+	  return -1;
+    }
+    printf("PSM2 MQ irecv() posted\n");
+    /* Wait until the message arrives */
+    if ((rc = psm2_mq_wait(&req_mq, NULL)) != PSM2_OK) {
+      die("couldn't wait for the irecv", rc);
+	  return -1;
+	}
+    printf("PSM2 MQ wait() done.\n");
+    printf("Message from client:\n");
+    printf("%s", msgbuf);
+    snprintf(fn, sizeof(fn), "psm2-demo-server-epid-%d", rank);
+    unlink(fn); /* remove the file write_epid_to_file() created, not the unsuffixed name */
+  } else {
+    /* Say hello */
+    snprintf(msgbuf, BUFFER_LENGTH,
+             "Hello world from epid=0x%lx, pid=%d.\n",
+             myepid, getpid());
+    psm2_mq_tag_t t = {0xABCD};
+    if ((rc = psm2_mq_send2(q,
+                           epaddr_array[0], /* destination epaddr */
+                           PSM2_MQ_FLAG_SENDSYNC, /* flags */
+                           &t, /* tag */
+                           msgbuf, BUFFER_LENGTH
+    )) != PSM2_OK) {
+      die("couldn't post psm2_mq_isend", rc);
+	  return -1;
+    }
+    printf("PSM2 MQ send() done.\n");
+  }
+  /* Close down the MQ */
+  if ((rc = psm2_mq_finalize(q)) != PSM2_OK) {
+	  die("couldn't psm2_mq_finalize()", rc);
+	  return -1;
+  }
+  printf("PSM2 MQ finalized.\n");
+  /* Close our ep, releasing all hardware resources.
+   * Try to close all connections properly */
+  if ((rc = psm2_ep_close(myep, PSM2_EP_CLOSE_GRACEFUL,
+                          0 /* no timeout */)) != PSM2_OK) {
+    die("couldn't psm2_ep_close()", rc);
+	return -1;
+  }
+  printf("PSM2 ep closed.\n");
+  /* Release all local PSM2 resources */
+  if ((rc = psm2_finalize()) != PSM2_OK) {
+	  die("couldn't psm2_finalize()", rc);
+	  return -1;
+  }
+  printf("PSM2 shut down, exiting.\n");
+  return 0;
+}
+
+/* Long options for getopt_long(): "--ppn <arg>" is returned as 'P'. */
+static struct option options[] = {
+	{
+		.name    = "ppn",
+		.has_arg = required_argument,
+		.flag    = NULL, /* NULL flag: getopt_long() returns .val */
+		.val     = 'P',
+	},
+	{ .name = NULL, .has_arg = 0, .flag = NULL, .val = 0 }, /* terminator */
+};
+
+struct thr_arg { /* state shared between main() and the progress_fn() thread */
+	volatile int bar_count; /* threads currently inside the barrier: raised to 2 on entry, lowered to 0 on exit */
+	pthread_mutex_t bar_lock; /* held around every bar_count update and wait */
+	pthread_cond_t bar_cond; /* broadcast when bar_count reaches 2 or 0 */
+	pthread_t pthread; /* handle filled in by pthread_create() in main() */
+	int rank; /* copy of main()'s my_rank */
+	int ppn; /* copy of main()'s ppn */
+	int nproc; /* copy of main()'s nproc */
+};
+
+struct thr_arg thr_arg; /* the single instance handed to progress_fn() */
+
+void *progress_fn(void *arg) { /* thread body: meets main() at two bar_count barriers, then returns NULL */
+	struct thr_arg *thr_arg = (struct thr_arg *)arg;
+	int rc;
+	int i;
+	/* NOTE(review): 732 is presumably a McKernel-only syscall number; -1 is reported as "running on Linux" */
+	rc = syscall(732);
+	if (rc == -1)
+		fprintf(stdout, "CT09100 progress_fn running on Linux OK\n");
+	else {
+		fprintf(stdout, "CT09100 progress_fn running on McKernel NG (%d)\n", rc);
+	}
+
+	printf("progress,enter\n");
+
+	/* barrier */
+	pthread_mutex_lock(&thr_arg->bar_lock);
+	thr_arg->bar_count++;
+	if (thr_arg->bar_count == 2) { /* second arrival wakes the first */
+		if ((rc = pthread_cond_broadcast(&thr_arg->bar_cond))) {
+			printf("pthread_cond_broadcast failed,rc=%d\n", rc);
+		}
+	}
+	while (thr_arg->bar_count != 2) { /* first arrival waits for the second */
+		if ((rc = pthread_cond_wait(&thr_arg->bar_cond, &thr_arg->bar_lock))) {
+			printf("pthread_cond_wait failed,rc=%d\n", rc);
+		}
+    }
+	pthread_mutex_unlock(&thr_arg->bar_lock);
+
+#if 0
+	printf("progress,after barrier\n");
+	for (i = 0; i < thr_arg->nproc; i++) {
+		if (!on_same_node(thr_arg->ppn, thr_arg->rank, i)) {
+			if (thr_arg->rank < i) {
+				psm2_sendrecv(thr_arg->rank, thr_arg->rank, i);
+			} else {
+				psm2_sendrecv(thr_arg->rank, i, thr_arg->rank);
+			}
+		}
+	}
+#endif
+
+	/* barrier */
+	pthread_mutex_lock(&thr_arg->bar_lock);
+	thr_arg->bar_count--;
+	if (thr_arg->bar_count == 0) { /* last one to leave wakes the other */
+		if ((rc = pthread_cond_broadcast(&thr_arg->bar_cond))) {
+			printf("pthread_cond_broadcast failed,rc=%d\n", rc);
+		}
+	}
+	while (thr_arg->bar_count != 0) {
+		if ((rc = pthread_cond_wait(&thr_arg->bar_cond, &thr_arg->bar_lock))) {
+			printf("pthread_cond_wait failed,rc=%d\n", rc);
+		}
+    }
+	pthread_mutex_unlock(&thr_arg->bar_lock);
+
+
+	printf("progress,exit\n");
+	return NULL;
+}
+
+/* Like the plain isend/wait overlap test, but with a progress_fn() thread alive during the timing. */
+int main(int argc, char **argv) {
+	int rc;
+	int actual;
+	int nproc;
+	int ppn = -1;
+	int ndoubles = -1;
+	int my_rank = -1;
+	int i, opt, ret = 0;
+	double *sbuf = NULL, *rbuf = NULL; /* NULL so fn_exit can free() unconditionally */
+	MPI_Request* reqs = NULL;
+	struct timespec start, end;
+	long t_pure_l, t_overall_l;
+	long t_pure, t_overall;
+	pthread_condattr_t condattr;
+	pthread_mutexattr_t mutexattr;
+
+	fwq_init();
+
+	while ((opt = getopt_long(argc, argv, "+d:P:", options, NULL)) != -1) {
+		switch (opt) {
+			case 'd':
+				i = atoi(optarg); /* ndoubles = 2^i; only 0..30 fits in an int */
+				ndoubles = (i >= 0 && i < 31) ? (1 << i) : -1;
+				break;
+			case 'P':
+				ppn = atoi(optarg);
+				break;
+			default: /* '?' */
+				printf("unknown option %c\n", optopt);
+				exit(1);
+		}
+	}
+
+	if (ndoubles <= 0 || ppn <= 0) { /* ppn is a divisor in on_same_node() */
+		printf("specify ndoubles with -d and ppn with --ppn\n");
+		exit(1);
+	}
+
+	MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &actual);
+	if (actual != MPI_THREAD_MULTIPLE) {
+		printf("ERROR: Thread support level is %d (it should be %d)\n", actual, MPI_THREAD_MULTIPLE);
+		exit(1);
+	}
+
+	MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &nproc);
+
+	if (my_rank == 0) {
+		printf("tid=%d,pid=%d,ndoubles=%d,nproc=%d\n", (int)syscall(__NR_gettid), getpid(), ndoubles, nproc);
+		printf("nsec=%lu, nspw=%f\n", nsec, nspw);
+	}
+
+	/* Spawn a thread */
+	thr_arg.rank = my_rank;
+	thr_arg.ppn = ppn;
+	thr_arg.nproc = nproc;
+	thr_arg.bar_count = 0;
+
+	pthread_condattr_init(&condattr);
+	pthread_cond_init(&thr_arg.bar_cond, &condattr);
+
+	pthread_mutexattr_init(&mutexattr);
+	pthread_mutex_init(&thr_arg.bar_lock, &mutexattr);
+
+	char *uti_str = getenv("DISABLE_UTI");
+	int uti_val = uti_str ? atoi(uti_str) : 0;
+	if (!uti_val) {
+		rc = syscall(731, 1, NULL);
+		if (rc) {
+			fprintf(stdout, "CT09003 INFO: uti not available (rc=%d)\n", rc);
+		} else {
+			fprintf(stdout, "CT09003 INFO: uti available\n");
+		}
+	} else {
+		fprintf(stdout, "CT09003 INFO: uti disabled\n");
+	}
+
+	rc = pthread_create(&thr_arg.pthread, NULL, progress_fn, &thr_arg);
+	if (rc){
+		fprintf(stdout, "pthread_create: %d\n", rc);
+		exit(1);
+	}
+
+	/* barrier */
+	pthread_mutex_lock(&thr_arg.bar_lock);
+	thr_arg.bar_count++;
+	if (thr_arg.bar_count == 2) {
+		if ((rc = pthread_cond_broadcast(&thr_arg.bar_cond))) {
+			printf("pthread_cond_broadcast failed,rc=%d\n", rc);
+		}
+	}
+	while (thr_arg.bar_count != 2) {
+		if ((rc = pthread_cond_wait(&thr_arg.bar_cond, &thr_arg.bar_lock))) {
+			printf("pthread_cond_wait failed,rc=%d\n", rc);
+		}
+	}
+	pthread_mutex_unlock(&thr_arg.bar_lock);
+
+	printf("parent,after barrier\n");
+
+	reqs = (MPI_Request*)malloc(sizeof(MPI_Request) * nproc * 2);
+	if(!reqs) { printf("malloc failed"); goto fn_fail; }
+
+	sbuf = malloc(sizeof(double) * ndoubles * nproc);
+	if(!sbuf) { printf("malloc failed"); goto fn_fail; }
+	memset(sbuf, 0, sizeof(double) * ndoubles * nproc); /* whole buffer: every slice may be sent */
+	printf("tid=%d,pid=%d,sbuf=%p\n", (int)syscall(__NR_gettid), getpid(), (void *)sbuf);
+
+	rbuf = malloc(sizeof(double) * ndoubles * nproc);
+	if(!rbuf) { printf("malloc failed"); goto fn_fail; }
+	memset(rbuf, 0, sizeof(double) * ndoubles * nproc);
+	printf("tid=%d,pid=%d,rbuf=%p\n", (int)syscall(__NR_gettid), getpid(), (void *)rbuf);
+
+	print_cpu_last_executed_on();
+
+	/* Measure isend-wait time */
+	MPI_Barrier(MPI_COMM_WORLD);
+#define NSKIP 5
+#define NPURE 30
+	for (i = 0; i < NPURE + NSKIP; i++) {
+		if (i == NSKIP) { /* the first NSKIP rounds are warm-up and not timed */
+			clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
+		}
+		my_send(nproc, ppn, my_rank, sbuf, rbuf, ndoubles, reqs, 0);
+	}
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
+	t_pure_l = DIFFNSEC(end, start) / NPURE;
+	MPI_Allreduce(&t_pure_l, &t_pure, 1, MPI_LONG, MPI_MAX, MPI_COMM_WORLD);
+	if (my_rank == 0) printf("t_pure (max): %ld usec\n", t_pure / 1000);
+
+	/* Measure isend-calc-wait time */
+	MPI_Barrier(MPI_COMM_WORLD);
+#define NOVERALL 30
+	for (i = 0; i < NOVERALL + NSKIP; i++) {
+		if (i == NSKIP) {
+			clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
+		}
+		my_send(nproc, ppn, my_rank, sbuf, rbuf, ndoubles, reqs, t_pure);
+	}
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
+	t_overall_l = DIFFNSEC(end, start) / NOVERALL;
+	MPI_Allreduce(&t_overall_l, &t_overall, 1, MPI_LONG, MPI_MAX, MPI_COMM_WORLD);
+	if (my_rank == 0) printf("t_overall (max): %ld usec\n", t_overall / 1000);
+	if (my_rank == 0) {
+		long t_abs = (t_pure * 2) - t_overall;
+		printf("overlap: %.2f %%\n", (t_abs * 100) / (double)t_pure);
+	}
+
+	/* barrier */
+	pthread_mutex_lock(&thr_arg.bar_lock);
+	thr_arg.bar_count--;
+	if (thr_arg.bar_count == 0) {
+		if ((rc = pthread_cond_broadcast(&thr_arg.bar_cond))) {
+			printf("pthread_cond_broadcast failed,rc=%d\n", rc);
+		}
+	}
+	while (thr_arg.bar_count != 0) {
+		if ((rc = pthread_cond_wait(&thr_arg.bar_cond, &thr_arg.bar_lock))) {
+			printf("pthread_cond_wait failed,rc=%d\n", rc);
+		}
+	}
+	pthread_mutex_unlock(&thr_arg.bar_lock);
+	pthread_join(thr_arg.pthread, NULL);
+
+ fn_exit:
+	free(rbuf);
+	free(sbuf);
+	free(reqs);
+	MPI_Finalize();
+	return ret;
+ fn_fail:
+	ret = 1; /* report the failure; the progress thread is not joined on this path */
+	goto fn_exit;
+}
diff --git a/test/uti/mpi/007.c b/test/uti/mpi/007.c
new file mode 100755
index 00000000..af31c581
--- /dev/null
+++ b/test/uti/mpi/007.c
@@ -0,0 +1,563 @@
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <time.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <sys/mman.h>
+#include <mpi.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <sys/syscall.h>   /* For SYS_xxx definitions */
+#include <sched.h>
+#include <errno.h>
+
+#include <psm2.h>     /* required for core PSM2 functions */
+#include <psm2_mq.h>  /* required for PSM2 MQ functions (send, recv, etc) */
+
+//#define DEBUG
+#ifdef DEBUG
+#define dprintf printf
+#else
+#define dprintf {}
+#endif
+
+#define BUFFER_LENGTH 8000000
+#define CONNECT_ARRAY_SIZE 8
+void die(const char *msg, int rc) { /* Print "<msg>: <rc>" on stderr, then terminate the process with exit status 1. */
+  fprintf(stderr, "%s: %d\n", msg, rc);
+  exit(1);
+}
+
+#define DIFFNSEC(end, start) (((end).tv_sec - (start).tv_sec) * 1000000000UL + ((end).tv_nsec - (start).tv_nsec)) /* nanoseconds from start to end; arguments are struct timespec expressions */
+
+static inline void fixed_size_work() { /* One fixed unit of busy work: a counting loop on %rcx written in x86-64 assembly. */
+	asm volatile(
+	    "movq $0, %%rcx\n\t" /* rcx = 0 */
+		"1:\t"
+		"addq $1, %%rcx\n\t" /* rcx++ */
+		"cmpq $99, %%rcx\n\t"
+		"jle 1b\n\t" /* repeat while rcx <= 99, i.e. 100 increments in total */
+		:
+		: 
+		: "rcx", "cc"); /* tell the compiler that rcx and the flags are clobbered */
+}
+
+static inline void bulk_fsw(unsigned long n) { /* Run fixed_size_work() n times. */
+	unsigned long j; /* same type as n: an int counter would overflow (undefined behavior) before reaching n > INT_MAX */
+	for (j = 0; j < n; j++) {
+		fixed_size_work();
+	}
+}
+
+double nspw; /* nsec per work */
+unsigned long nsec;
+
+void fwq_init() {
+	/* Calibrate fwq(): time N_INIT work units on this thread's CPU clock and store the result in nsec and nspw. */
+	struct timespec start, end;
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
+#define N_INIT 10000000
+	bulk_fsw(N_INIT);
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
+	nsec = DIFFNSEC(end, start); /* thread CPU nanoseconds spent on the whole batch */
+	nspw = nsec / (double)N_INIT; /* average nanoseconds per work unit */
+}
+
+#if 1
+/* Burn roughly delay_nsec nanoseconds of CPU by running delay_nsec / nspw work units. */
+void fwq(long delay_nsec) {
+	if (delay_nsec >= 0) {
+		/* A negative delay is silently treated as "nothing to do". */
+		bulk_fsw(delay_nsec / nspw);
+	}
+}
+#else /* For machines with large core-to-core performance variation (e.g. OFP) */
+void fwq(long delay_nsec) {
+	struct timespec t0, now;
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t0);
+	/* Sample the thread CPU clock, running two work units between samples, until delay_nsec has elapsed. */
+	for (;;) {
+		clock_gettime(CLOCK_THREAD_CPUTIME_ID, &now);
+		if (!(DIFFNSEC(now, t0) < delay_nsec)) {
+			break;
+		}
+		bulk_fsw(2); /* ~150 ns per iteration on OFP */
+	}
+}
+#endif
+
+
+static int print_cpu_last_executed_on() {
+	/* Print this thread's CPU as reported by field 39 of /proc/<pid>/task/<tid>/stat and by sched_getcpu(). Returns 0 on success, -1 on failure. */
+	char fn[256];
+	char* result = NULL; /* stays NULL until malloc() succeeds so fn_exit can free() it on every path */
+	pid_t tid = syscall(SYS_gettid);
+	int fd = -1; /* -1 means "nothing to close" at fn_exit */
+	int offset;
+    int mpi_errno = 0;
+	const int bufsz = 65536;
+
+	sprintf(fn, "/proc/%d/task/%d/stat", getpid(), (int)tid);
+	fd = open(fn, O_RDONLY);
+	if(fd == -1) {
+		printf("open() failed\n");
+		goto fn_fail;
+	}
+
+	result = malloc(bufsz);
+	if(result == NULL) {
+		printf("malloc() failed");
+		goto fn_fail;
+	}
+
+	int amount = 0;
+	offset = 0;
+	/* Read until EOF or until the buffer is full, always leaving one byte for the terminating NUL. */
+	while(offset < bufsz - 1) {
+		amount = read(fd, result + offset, bufsz - 1 - offset);
+		if(amount == -1) {
+			printf("read() failed");
+			goto fn_fail;
+		}
+		if(amount == 0) {
+			break;
+		}
+		offset += amount;
+	}
+	result[offset] = '\0'; /* strsep() below requires a NUL-terminated string */
+	char* next_delim = result;
+	char* field = NULL; /* ends up NULL if the line has fewer than 39 space-separated fields */
+	int i;
+	for(i = 0; i < 39; i++) {
+		field = strsep(&next_delim, " ");
+	}
+	int cpu = sched_getcpu();
+	if(cpu == -1) {
+		printf("getpu() failed\n");
+		goto fn_fail;
+	}
+
+	char *rank_str = getenv("PMI_RANK"); /* may be unset; print -1 instead of calling atoi(NULL) */
+	printf("compute thread,pmi_rank=%02d,stat-cpu=%02d,sched_getcpu=%02d,pid=%d,tid=%d\n", rank_str ? atoi(rank_str) : -1, field ? atoi(field) : -1, cpu, getpid(), tid); fflush(stdout);
+ fn_exit:
+	if (fd != -1) close(fd);
+    free(result);
+    return mpi_errno;
+ fn_fail:
+	mpi_errno = -1;
+    goto fn_exit;
+}
+
+static inline int on_same_node(int ppn, int me, int you) { /* 1 when ranks me and you have the same quotient rank / ppn, otherwise 0 */
+	return !((me / ppn) != (you / ppn));
+}
+
+/* isend-calc-wait: post one Irecv/Isend pair per off-node rank, spin for calc_nsec, then wait for every request. */
+void my_send(int nproc, int ppn, int rank, double *sbuf, double *rbuf, int ndoubles, MPI_Request* reqs, long calc_nsec) {
+	int peer;
+	int nreq = 0; /* requests posted so far: two per off-node peer */
+	int slot = 0; /* index of the next ndoubles-sized chunk used in both rbuf and sbuf */
+	for (peer = 0; peer < nproc; peer++) {
+		if (on_same_node(ppn, rank, peer)) {
+			continue;
+		}
+		MPI_Irecv(rbuf + slot * ndoubles, ndoubles, MPI_DOUBLE, peer, 0, MPI_COMM_WORLD, &reqs[nreq]);
+		MPI_Isend(sbuf + slot * ndoubles, ndoubles, MPI_DOUBLE, peer, 0, MPI_COMM_WORLD, &reqs[nreq + 1]);
+		nreq += 2;
+		slot++;
+	}
+	/* Compute phase, executed while the posted transfers are outstanding. */
+	fwq(calc_nsec);
+	MPI_Waitall(nreq, reqs, MPI_STATUSES_IGNORE);
+}
+
+
+/* Client side of the epid exchange: poll once per second for the file "psm2-demo-server-epid-<rank>" and return the hexadecimal epid stored in it. */
+psm2_epid_t find_server(int rank) {
+  FILE *fp;
+  psm2_epid_t server_epid = 0;
+  char fn[256];
+  sprintf(fn, "psm2-demo-server-epid-%d", rank);
+  printf("PSM2 client waiting for epid mapping file to appear...\n");
+  for (int parsed = 0; parsed != 1; ) {
+    sleep(1);
+    if (!(fp = fopen(fn, "r"))) continue; /* file not created yet */
+    parsed = fscanf(fp, "%lx", &server_epid); /* != 1 when the file exists but the writer has not stored the epid yet: retry */
+    fclose(fp);
+  }
+  printf("PSM2 client found server epid = 0x%lx\n", server_epid);
+  return server_epid;
+}
+
+void write_epid_to_file(int rank, psm2_epid_t myepid) {
+  /* Server side of the epid exchange: store myepid as hexadecimal text in "psm2-demo-server-epid-<rank>". */
+  char path[256];
+  FILE *out;
+
+  sprintf(path, "psm2-demo-server-epid-%d", rank);
+  out = fopen(path, "w");
+  if (out == NULL) {
+    fprintf(stderr, "Exiting, couldn't write server's epid mapping file: ");
+    die(strerror(errno), errno); /* die() in this file calls exit(1), so control does not continue past this point */
+  }
+  fprintf(out, "0x%lx", myepid);
+  fclose(out);
+  printf("PSM2 server wrote epid = 0x%lx to file.\n", myepid);
+}
+
+int psm2_sendrecv(int rank, int sender, int receiver) {
+  /* One complete PSM2 session (init, open, connect, one message, teardown). The rank equal to receiver is the server and receives; any other rank is the client and sends. sender is unused. Returns 0. */
+  struct psm2_ep_open_opts o;
+  psm2_uuid_t uuid;
+  psm2_ep_t myep;
+  psm2_epid_t myepid;
+  psm2_epid_t server_epid;
+  psm2_epid_t epid_array[CONNECT_ARRAY_SIZE];
+  int epid_array_mask[CONNECT_ARRAY_SIZE];
+  psm2_error_t epid_connect_errors[CONNECT_ARRAY_SIZE];
+  psm2_epaddr_t epaddr_array[CONNECT_ARRAY_SIZE];
+  int rc;
+  int ver_major = PSM2_VERNO_MAJOR;
+  int ver_minor = PSM2_VERNO_MINOR;
+  char msgbuf[BUFFER_LENGTH]; /* NOTE(review): BUFFER_LENGTH (8000000) bytes of automatic storage, and the caller is a thread created with default attributes - confirm its stack is large enough */
+  psm2_mq_t q;
+  psm2_mq_req_t req_mq;
+  memset(uuid, 0, sizeof(psm2_uuid_t)); /* Use a UUID of zero */
+  /* Try to initialize PSM2 with the requested library version.
+   * Using PSM2_VERNO_MAJOR and MINOR as defined in the PSM2 headers ensures
+   * that we are linking with the same version of PSM2 as we compiled against. */
+
+  if ((rc = psm2_init(&ver_major, &ver_minor)) != PSM2_OK) {
+    die("couldn't init", rc);
+  }
+  printf("PSM2 init done.\n");
+  /* Setup the endpoint options struct */
+  if ((rc = psm2_ep_open_opts_get_defaults(&o)) != PSM2_OK) {
+    die("couldn't set default opts", rc);
+  }
+  printf("PSM2 opts_get_defaults done.\n");
+  /* Attempt to open a PSM2 endpoint. This allocates hardware resources. */
+  if ((rc = psm2_ep_open(uuid, &o, &myep, &myepid)) != PSM2_OK) {
+    die("couldn't psm2_ep_open()", rc);
+  }
+  printf("PSM2 endpoint open done.\n");
+  int is_server = (rank == receiver) ? 1 : 0;
+  if (is_server) {
+	  write_epid_to_file(rank, myepid);
+  } else {
+	  server_epid = find_server(receiver);
+  }
+  if (is_server) {
+    /* Server does nothing here. A connection does not have to be
+     * established to receive messages. */
+    printf("PSM2 server up.\n");
+  } else {
+    /* Setup connection request info */
+    /* PSM2 can connect to a single epid per request,
+     * or an arbitrary number of epids in a single connect call.
+     * For this example, use part of an array of
+     * connection requests. */
+    memset(epid_array_mask, 0, sizeof(int) * CONNECT_ARRAY_SIZE);
+    epid_array[0] = server_epid;
+    epid_array_mask[0] = 1;
+    /* Begin the connection process. Note that if a requested epid is not
+     * responding, the connect call will still return OK; the errors array
+     * will contain the state of individual connection requests. */
+    if ((rc = psm2_ep_connect(myep,
+                              CONNECT_ARRAY_SIZE,
+                              epid_array,
+                              epid_array_mask,
+                              epid_connect_errors,
+                              epaddr_array,
+                              0 /* no timeout */
+    )) != PSM2_OK) {
+      die("couldn't ep_connect", rc);
+    }
+    printf("PSM2 connect request processed.\n");
+    /* Now check if our connection to the server is ready */
+    if (epid_connect_errors[0] != PSM2_OK) {
+      die("couldn't connect to server",
+          epid_connect_errors[0]);
+    }
+    printf("PSM2 client-server connection established.\n");
+  }
+  /* Setup our PSM2 message queue */
+  if ((rc = psm2_mq_init(myep, PSM2_MQ_ORDERMASK_NONE, NULL, 0, &q))
+      != PSM2_OK) {
+    die("couldn't initialize PSM2 MQ", rc);
+  }
+  printf("PSM2 MQ init done.\n");
+  if (is_server) {
+    psm2_mq_tag_t t = {0xABCD};
+    psm2_mq_tag_t tm = {-1};
+    /* Post the receive request */
+    if ((rc = psm2_mq_irecv2(q, PSM2_MQ_ANY_ADDR,
+                            &t, /* message tag */
+                            &tm, /* message tag mask */
+                            0, /* no flags */
+                            msgbuf, BUFFER_LENGTH,
+                            NULL, /* no context to add */
+                            &req_mq /* track irecv status */
+    )) != PSM2_OK) {
+      die("couldn't post psm2_mq_irecv()", rc);
+    }
+    printf("PSM2 MQ irecv() posted\n");
+    /* Wait until the message arrives */
+    if ((rc = psm2_mq_wait(&req_mq, NULL)) != PSM2_OK) {
+      die("couldn't wait for the irecv", rc);
+    }
+    printf("PSM2 MQ wait() done.\n");
+    printf("Message from client:\n");
+    printf("%s", msgbuf);
+    char fn[256]; /* must be the same name write_epid_to_file(rank, ...) used above, including the -<rank> suffix */
+    sprintf(fn, "psm2-demo-server-epid-%d", rank);
+    unlink(fn); /* remove the mapping file so a later run cannot pick up a stale epid */
+  } else {
+    /* Say hello */
+    snprintf(msgbuf, BUFFER_LENGTH,
+             "Hello world from epid=0x%lx, pid=%d.\n",
+             myepid, getpid());
+    psm2_mq_tag_t t = {0xABCD};
+    if ((rc = psm2_mq_send2(q,
+                           epaddr_array[0], /* destination epaddr */
+                           PSM2_MQ_FLAG_SENDSYNC, /* flags */
+                           &t, /* tag */
+                           msgbuf, BUFFER_LENGTH
+    )) != PSM2_OK) {
+      die("couldn't post psm2_mq_isend", rc);
+    }
+    printf("PSM2 MQ send() done.\n");
+  }
+  /* Close down the MQ */
+  if ((rc = psm2_mq_finalize(q)) != PSM2_OK) {
+    die("couldn't psm2_mq_finalize()", rc);
+  }
+  printf("PSM2 MQ finalized.\n");
+  /* Close our ep, releasing all hardware resources.
+   * Try to close all connections properly */
+  if ((rc = psm2_ep_close(myep, PSM2_EP_CLOSE_GRACEFUL,
+                          0 /* no timeout */)) != PSM2_OK) {
+    die("couldn't psm2_ep_close()", rc);
+  }
+  printf("PSM2 ep closed.\n");
+  /* Release all local PSM2 resources */
+  if ((rc = psm2_finalize()) != PSM2_OK) {
+    die("couldn't psm2_finalize()", rc);
+  }
+  printf("PSM2 shut down, exiting.\n");
+  return 0;
+}
+
+static struct option options[] = {
+	/* --ppn <value>: getopt_long() reports it as 'P', the same code as the short -P option */
+	{
+		.name =		"ppn",
+		.has_arg =	required_argument,
+		.flag =		NULL,
+		.val =		'P',
+	},
+	{ NULL, 0, NULL, 0, }, /* end: all-zero terminator entry */
+};
+
+struct thr_arg { /* State shared between main() and the progress_fn() thread */
+	volatile int bar_count; /* Barrier before entering loop */
+	pthread_mutex_t bar_lock; /* held around every bar_count update and condition wait */
+	pthread_cond_t bar_cond; /* broadcast when bar_count reaches 2 (entry barrier) or 0 (exit barrier) */
+	pthread_t pthread; /* handle filled in by pthread_create() in main() */
+	int rank; /* this process's rank, parsed from PMI_RANK */
+	int ppn; /* value given with --ppn / -P */
+	int nproc; /* number of ranks; main() sets it to 2 */
+};
+
+struct thr_arg thr_arg; /* the single shared instance */
+
+void *progress_fn(void *arg) { /* Progress-thread body: report where it runs, meet main() at an entry barrier, do the PSM2 exchange with each off-node rank, meet main() at an exit barrier. Always returns NULL. */
+	struct thr_arg *thr_arg = (struct thr_arg *)arg;
+	int rc;
+	int i;
+	/* The messages below treat -1 from syscall number 732 as "running on Linux" and any other value as "running on McKernel" (NG). */
+	rc = syscall(732);
+	if (rc == -1)
+		fprintf(stdout, "CT09100 progress_fn running on Linux OK\n");
+	else {
+		fprintf(stdout, "CT09100 progress_fn running on McKernel NG (%d)\n", rc);
+	}
+
+	printf("progress,enter\n");
+
+	/* barrier */ /* entry: each of the two threads increments bar_count; whoever reaches 2 broadcasts */
+	pthread_mutex_lock(&thr_arg->bar_lock);
+	thr_arg->bar_count++;
+	if (thr_arg->bar_count == 2) {
+		if ((rc = pthread_cond_broadcast(&thr_arg->bar_cond))) {
+			printf("pthread_cond_broadcast failed,rc=%d\n", rc);
+		}
+	}
+	while (thr_arg->bar_count != 2) { /* NOTE(review): if the other thread already decremented bar_count in its exit barrier before this waiter re-checks, the value is 1 and the wait never ends - confirm */
+		if ((rc = pthread_cond_wait(&thr_arg->bar_cond, &thr_arg->bar_lock))) {
+			printf("pthread_cond_wait failed,rc=%d\n", rc);
+		}
+    }
+	pthread_mutex_unlock(&thr_arg->bar_lock);
+
+	printf("progress,after barrier\n");
+#if 1
+	for (i = 0; i < thr_arg->nproc; i++) {
+		if (!on_same_node(thr_arg->ppn, thr_arg->rank, i)) {
+			if (thr_arg->rank < i) { /* the lower rank of the pair is passed as sender, the higher as receiver */
+				psm2_sendrecv(thr_arg->rank, thr_arg->rank, i);
+			} else {
+				psm2_sendrecv(thr_arg->rank, i, thr_arg->rank);
+			}
+		}
+	}
+#endif
+
+	/* barrier */ /* exit: each thread decrements bar_count; whoever reaches 0 broadcasts */
+	pthread_mutex_lock(&thr_arg->bar_lock);
+	thr_arg->bar_count--;
+	if (thr_arg->bar_count == 0) {
+		if ((rc = pthread_cond_broadcast(&thr_arg->bar_cond))) {
+			printf("pthread_cond_broadcast failed,rc=%d\n", rc);
+		}
+	}
+	while (thr_arg->bar_count != 0) {
+		if ((rc = pthread_cond_wait(&thr_arg->bar_cond, &thr_arg->bar_lock))) {
+			printf("pthread_cond_wait failed,rc=%d\n", rc);
+		}
+    }
+	pthread_mutex_unlock(&thr_arg->bar_lock);
+
+
+	printf("progress,exit\n");
+	return NULL;
+}
+
+int main(int argc, char **argv) {
+	int rc;
+    int actual;
+	int nproc;
+	int ppn = -1;
+    int ndoubles = -1;
+	int my_rank = -1, size = -1;
+	int i, j;
+	double *sbuf, *rbuf;
+	MPI_Request* reqs;
+    struct timespec start, end;
+	long t_pure_l, t_overall_l;
+	long t_pure, t_overall;
+	int opt;
+	pthread_condattr_t condattr;
+	pthread_mutexattr_t mutexattr;
+	/* Test driver: parse -d <log2 ndoubles> and --ppn/-P, read the rank from PMI_RANK, start progress_fn() in a second thread and meet it at an entry and an exit barrier. */
+	fwq_init();
+
+	while ((opt = getopt_long(argc, argv, "+d:P:", options, NULL)) != -1) {
+		switch (opt) {
+			case 'd':
+				ndoubles = (1ULL << atoi(optarg));
+				break;
+			case 'P':
+				ppn = atoi(optarg);
+				break;
+			default: /* '?' */
+				printf("unknown option %c\n", optopt);
+				exit(1);
+		}
+	}
+
+	if (ndoubles == -1 || ppn == -1) {
+		printf("specify ndoubles with -d and ppn with --ppn");
+		exit(1);
+	}
+
+	char *rank_str = getenv("PMI_RANK");
+	if (!rank_str) {
+		printf("getenv failed\n");
+		exit(1);
+	}
+	my_rank = atoi(rank_str);
+    nproc = 2;
+
+	if (my_rank == 0) {
+		printf("tid=%d,pid=%d,ndoubles=%d,nproc=%d\n", (int)syscall(__NR_gettid), getpid(), ndoubles, nproc); /* syscall() returns long; cast so the argument matches %d */
+		printf("nsec=%ld, nspw=%f\n", nsec, nspw);
+	}
+
+	/* Spawn a thread */
+	thr_arg.rank = my_rank;
+	thr_arg.ppn = ppn;
+	thr_arg.nproc = nproc;
+	thr_arg.bar_count = 0;
+
+	pthread_condattr_init(&condattr);
+	pthread_cond_init(&thr_arg.bar_cond, &condattr);
+
+	pthread_mutexattr_init(&mutexattr);
+	pthread_mutex_init(&thr_arg.bar_lock, &mutexattr);
+
+	char *uti_str = getenv("DISABLE_UTI");
+	int uti_val = uti_str ? atoi(uti_str) : 0;
+	if (!uti_val) {
+		rc = syscall(731, 1, NULL); /* a zero return is reported below as "uti available" */
+		if (rc) {
+			fprintf(stdout, "CT09003 INFO: uti not available (rc=%d)\n", rc);
+		} else {
+			fprintf(stdout, "CT09003 INFO: uti available\n");
+		}
+	} else {
+		fprintf(stdout, "CT09003 INFO: uti disabled\n");
+	}
+
+	rc = pthread_create(&thr_arg.pthread, NULL, progress_fn, &thr_arg);
+	if (rc){
+		fprintf(stdout, "pthread_create: %d\n", rc);
+		exit(1);
+	}
+	
+	/* barrier */ /* entry: wait until both this thread and progress_fn() have incremented bar_count */
+	pthread_mutex_lock(&thr_arg.bar_lock);
+	thr_arg.bar_count++;
+	if (thr_arg.bar_count == 2) {
+		if ((rc = pthread_cond_broadcast(&thr_arg.bar_cond))) {
+			printf("pthread_cond_broadcast failed,rc=%d\n", rc);
+		}
+	}
+	while (thr_arg.bar_count != 2) {
+		if ((rc = pthread_cond_wait(&thr_arg.bar_cond, &thr_arg.bar_lock))) {
+			printf("pthread_cond_wait failed,rc=%d\n", rc);
+		}
+    }
+	pthread_mutex_unlock(&thr_arg.bar_lock);
+
+	printf("parent,after barrier\n");
+
+
+	print_cpu_last_executed_on();
+
+	/* barrier */ /* exit: wait until both threads have decremented bar_count back to 0 */
+	pthread_mutex_lock(&thr_arg.bar_lock);
+	thr_arg.bar_count--;
+	if (thr_arg.bar_count == 0) {
+		if ((rc = pthread_cond_broadcast(&thr_arg.bar_cond))) {
+			printf("pthread_cond_broadcast failed,rc=%d\n", rc);
+		}
+	}
+	while (thr_arg.bar_count != 0) {
+		if ((rc = pthread_cond_wait(&thr_arg.bar_cond, &thr_arg.bar_lock))) {
+			printf("pthread_cond_wait failed,rc=%d\n", rc);
+		}
+    }
+	pthread_mutex_unlock(&thr_arg.bar_lock);
+
+
+	pthread_join(thr_arg.pthread, NULL);
+
+ fn_exit:
+	return 0;
+ fn_fail:
+    goto fn_exit;
+}
diff --git a/test/uti/mpi/008.c b/test/uti/mpi/008.c
new file mode 100755
index 00000000..6db6e3ae
--- /dev/null
+++ b/test/uti/mpi/008.c
@@ -0,0 +1,589 @@
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <time.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <sys/syscall.h>   /* For SYS_xxx definitions */
+#include <sched.h>
+#include <errno.h>
+
+#include <psm2.h>     /* required for core PSM2 functions */
+#include <psm2_mq.h>  /* required for PSM2 MQ functions (send, recv, etc) */
+
+//#define DEBUG
+#ifdef DEBUG
+#define dprintf printf
+#else
+#define dprintf {}
+#endif
+
+#define BUFFER_LENGTH /*8000000*/(1ULL<<12)
+#define CONNECT_ARRAY_SIZE 8
+void die(const char *msg, int rc) { /* Print "<msg>: <rc>" on stderr and flush. Despite the name, this variant returns to the caller. */
+  fprintf(stderr, "%s: %d\n", msg, rc);
+  fflush(stderr);
+}
+
+#define DIFFNSEC(end, start) (((end).tv_sec - (start).tv_sec) * 1000000000UL + ((end).tv_nsec - (start).tv_nsec)) /* nanoseconds from start to end; arguments are struct timespec expressions */
+
+static inline void fixed_size_work() { /* One fixed unit of busy work: a counting loop on %rcx written in x86-64 assembly. */
+	asm volatile(
+	    "movq $0, %%rcx\n\t" /* rcx = 0 */
+		"1:\t"
+		"addq $1, %%rcx\n\t" /* rcx++ */
+		"cmpq $99, %%rcx\n\t"
+		"jle 1b\n\t" /* repeat while rcx <= 99, i.e. 100 increments in total */
+		:
+		: 
+		: "rcx", "cc"); /* tell the compiler that rcx and the flags are clobbered */
+}
+
+static inline void bulk_fsw(unsigned long n) { /* Run fixed_size_work() n times. */
+	unsigned long j; /* same type as n: an int counter would overflow (undefined behavior) before reaching n > INT_MAX */
+	for (j = 0; j < n; j++) {
+		fixed_size_work();
+	}
+}
+
+double nspw; /* nsec per work */
+unsigned long nsec;
+
+void fwq_init() {
+	/* Calibrate fwq(): time N_INIT work units on this thread's CPU clock and store the result in nsec and nspw. */
+	struct timespec start, end;
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
+#define N_INIT 10000000
+	bulk_fsw(N_INIT);
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
+	nsec = DIFFNSEC(end, start); /* thread CPU nanoseconds spent on the whole batch */
+	nspw = nsec / (double)N_INIT; /* average nanoseconds per work unit */
+}
+
+#if 1
+/* Burn roughly delay_nsec nanoseconds of CPU by running delay_nsec / nspw work units. */
+void fwq(long delay_nsec) {
+	if (delay_nsec >= 0) {
+		/* A negative delay is silently treated as "nothing to do". */
+		bulk_fsw(delay_nsec / nspw);
+	}
+}
+#else /* For machines with large core-to-core performance variation (e.g. OFP) */
+void fwq(long delay_nsec) {
+	struct timespec t0, now;
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t0);
+	/* Sample the thread CPU clock, running two work units between samples, until delay_nsec has elapsed. */
+	for (;;) {
+		clock_gettime(CLOCK_THREAD_CPUTIME_ID, &now);
+		if (!(DIFFNSEC(now, t0) < delay_nsec)) {
+			break;
+		}
+		bulk_fsw(2); /* ~150 ns per iteration on OFP */
+	}
+}
+#endif
+
+
+static int print_cpu_last_executed_on() {
+	/* Print this thread's CPU as reported by field 39 of /proc/<pid>/task/<tid>/stat and by sched_getcpu(). Returns 0 on success, -1 on failure. */
+	char fn[256];
+	char* result = NULL; /* stays NULL until malloc() succeeds so fn_exit can free() it on every path */
+	pid_t tid = syscall(SYS_gettid);
+	int fd = -1; /* -1 means "nothing to close" at fn_exit */
+	int offset;
+    int mpi_errno = 0;
+	const int bufsz = 65536;
+
+	sprintf(fn, "/proc/%d/task/%d/stat", getpid(), (int)tid);
+	fd = open(fn, O_RDONLY);
+	if(fd == -1) {
+		printf("open() failed\n");
+		goto fn_fail;
+	}
+
+	result = malloc(bufsz);
+	if(result == NULL) {
+		printf("malloc() failed");
+		goto fn_fail;
+	}
+
+	int amount = 0;
+	offset = 0;
+	/* Read until EOF or until the buffer is full, always leaving one byte for the terminating NUL. */
+	while(offset < bufsz - 1) {
+		amount = read(fd, result + offset, bufsz - 1 - offset);
+		if(amount == -1) {
+			printf("read() failed");
+			goto fn_fail;
+		}
+		if(amount == 0) {
+			break;
+		}
+		offset += amount;
+	}
+	result[offset] = '\0'; /* strsep() below requires a NUL-terminated string */
+	char* next_delim = result;
+	char* field = NULL; /* ends up NULL if the line has fewer than 39 space-separated fields */
+	int i;
+	for(i = 0; i < 39; i++) {
+		field = strsep(&next_delim, " ");
+	}
+	int cpu = sched_getcpu();
+	if(cpu == -1) {
+		printf("getpu() failed\n");
+		goto fn_fail;
+	}
+
+	char *rank_str = getenv("PMI_RANK"); /* may be unset; print -1 instead of calling atoi(NULL) */
+	printf("compute thread,pmi_rank=%02d,stat-cpu=%02d,sched_getcpu=%02d,pid=%d,tid=%d\n", rank_str ? atoi(rank_str) : -1, field ? atoi(field) : -1, cpu, getpid(), tid); fflush(stdout);
+ fn_exit:
+	if (fd != -1) close(fd);
+    free(result);
+    return mpi_errno;
+ fn_fail:
+	mpi_errno = -1;
+    goto fn_exit;
+}
+
+static inline int on_same_node(int ppn, int me, int you) { /* 1 when ranks me and you have the same quotient rank / ppn, otherwise 0 */
+	return !((me / ppn) != (you / ppn));
+}
+
+/* Client side of the epid exchange: poll every 250 ms for the file "psm2-demo-server-epid-<rank>" and return the hexadecimal epid stored in it. */
+psm2_epid_t find_server(int rank) {
+  FILE *fp;
+  psm2_epid_t server_epid = 0;
+  char fn[256];
+  printf("%s: enter\n", __FUNCTION__); fflush(stdout);
+
+  sprintf(fn, "psm2-demo-server-epid-%d", rank);
+  printf("PSM2 client waiting for epid mapping file to appear...\n"); fflush(stdout);
+  for (int parsed = 0; parsed != 1; ) {
+    usleep(250*1000);
+    if (!(fp = fopen(fn, "r"))) continue; /* file not created yet */
+    parsed = fscanf(fp, "%lx", &server_epid); /* != 1 when the file exists but the writer has not stored the epid yet: retry */
+    fclose(fp);
+  }
+  printf("PSM2 client found server epid = 0x%lx\n", server_epid);
+  return server_epid;
+}
+
+void write_epid_to_file(int rank, psm2_epid_t myepid) {
+  /* Server side of the epid exchange: store myepid as hexadecimal text in "psm2-demo-server-epid-<rank>"; exits with status 1 if the file cannot be created. */
+  FILE *fp;
+  char fn[256];
+  printf("%s: enter\n", __FUNCTION__);
+  sprintf(fn, "psm2-demo-server-epid-%d", rank);
+  fp = fopen(fn, "w");
+  if (!fp) {
+    fprintf(stderr, "Exiting, couldn't write server's epid mapping file: ");
+    die(strerror(errno), errno);
+    exit(1); /* die() in this file only reports; without this the code below would call fprintf() on a NULL stream */
+  }
+  fprintf(fp, "0x%lx", myepid);
+  fclose(fp);
+  printf("PSM2 server wrote epid = 0x%lx to file.\n", myepid);
+}
+
+psm2_uuid_t uuid; /* zeroed by my_psm2_init() and passed to psm2_ep_open() */
+psm2_ep_t myep; /* endpoint opened by my_psm2_init() */
+psm2_epid_t myepid; /* this endpoint's id, filled in by psm2_ep_open() */
+psm2_epid_t server_epid; /* client only: epid read from the mapping file by find_server() */
+psm2_epid_t epid_array[CONNECT_ARRAY_SIZE]; /* connect request list; only entry 0 is set */
+int epid_array_mask[CONNECT_ARRAY_SIZE]; /* entry 0 is set to 1, the rest are zeroed */
+psm2_error_t epid_connect_errors[CONNECT_ARRAY_SIZE]; /* per-entry result of psm2_ep_connect() */
+psm2_epaddr_t epaddr_array[CONNECT_ARRAY_SIZE]; /* entry 0 is the destination used for sending */
+
+int my_psm2_init(int my_rank, int server_rank) {
+  /* Initialize PSM2 and open the endpoint into the globals myep / myepid. Both parameters are currently unused. Returns 0 on success, -1 on failure. */
+  struct psm2_ep_open_opts o;
+  int rc;
+  int ver_major = PSM2_VERNO_MAJOR;
+  int ver_minor = PSM2_VERNO_MINOR;
+  memset(uuid, 0, sizeof(psm2_uuid_t)); /* Use a UUID of zero */
+  /* Initialize PSM2 with the version from the PSM2 headers, so that we
+   * link with the same version of PSM2 as we compiled against. */
+  if ((rc = psm2_init(&ver_major, &ver_minor)) != PSM2_OK) {
+    die("couldn't init", rc);
+    return -1; /* die() in this file returns, so stop here rather than continue after a failed step */
+  }
+  printf("PSM2 init done.\n");
+  /* Setup the endpoint options struct */
+  if ((rc = psm2_ep_open_opts_get_defaults(&o)) != PSM2_OK) {
+    die("couldn't set default opts", rc);
+    return -1;
+  }
+  printf("PSM2 opts_get_defaults done.\n");
+  /* Attempt to open a PSM2 endpoint. This allocates hardware resources. */
+  if ((rc = psm2_ep_open(uuid, &o, &myep, &myepid)) != PSM2_OK) {
+    die("couldn't psm2_ep_open()", rc);
+    return -1;
+  }
+  printf("PSM2 endpoint open done.\n");
+  return 0;
+}
+
+psm2_mq_t q; /* PSM2 message queue: initialized by my_psm2_connect(), used and finalized by my_psm2_sendrecv() */
+
+int my_psm2_connect(int my_rank, int server_rank) {
+  /* Exchange epids through the mapping file, connect the client to the server, then create the message queue q. Returns 0 on success, -1 on failure. */
+	int rc;
+  int is_server = (my_rank == server_rank) ? 1 : 0;
+  printf("%s: enter\n", __FUNCTION__); fflush(stdout);
+  if (is_server) {
+	  write_epid_to_file(my_rank, myepid);
+  } else {
+	  server_epid = find_server(server_rank);
+  }
+  printf("%s: epid exchange done\n", __FUNCTION__); fflush(stdout);
+  if (is_server) {
+    /* Server does nothing here. A connection does not have to be
+     * established to receive messages. */
+    printf("PSM2 server up.\n");
+  } else {
+    /* Setup connection request info */
+    /* PSM2 can connect to a single epid per request,
+     * or an arbitrary number of epids in a single connect call.
+     * For this example, use part of an array of
+     * connection requests. */
+    memset(epid_array_mask, 0, sizeof(int) * CONNECT_ARRAY_SIZE);
+    epid_array[0] = server_epid;
+    epid_array_mask[0] = 1;
+    /* Begin the connection process. Note that if a requested epid is not
+     * responding, the connect call will still return OK; the errors array
+     * will contain the state of individual connection requests. */
+	printf("calling ep_connect\n");
+	int count = 0;
+    while ((rc = psm2_ep_connect(myep,
+                              CONNECT_ARRAY_SIZE,
+                              epid_array,
+                              epid_array_mask,
+                              epid_connect_errors,
+                              epaddr_array,
+                              1 /* timeout argument; NOTE(review): the 0.5 sec pause per attempt comes from nanosleep() below - confirm the unit PSM2 uses here */
+    )) != PSM2_OK) {
+		struct timespec ts = { .tv_sec = 0, .tv_nsec = 500*1000*1000 };
+		nanosleep(&ts, NULL);
+		printf("."); fflush(stdout);
+		count++;
+		if (count > 30) { /* give up after 31 failed attempts */
+			break;
+		}
+    }
+
+	if (rc != PSM2_OK) {
+		printf("psm2_ep_connect timed-out\n");
+		return -1;
+	}
+
+    printf("PSM2 connect request processed.\n");
+    /* Now check if our connection to the server is ready */
+    if (epid_connect_errors[0] != PSM2_OK) {
+		die("couldn't connect to server", epid_connect_errors[0]);
+		return -1;
+    }
+    printf("PSM2 client-server connection established.\n");
+  }
+
+  /* Setup our PSM2 message queue */
+  if ((rc = psm2_mq_init(myep, PSM2_MQ_ORDERMASK_NONE, NULL, 0, &q))
+      != PSM2_OK) {
+    die("couldn't initialize PSM2 MQ", rc);
+    return -1; /* die() in this file returns; do not report success when q was not initialized */
+  }
+  printf("PSM2 MQ init done.\n");
+
+	return 0;
+}
+char msgbuf[BUFFER_LENGTH]; /* message buffer used by my_psm2_sendrecv(); defined at file scope instead of as a local array */
+
+int my_psm2_sendrecv(int rank, int sender, int receiver) { /* sender is unused; the role is decided by rank == receiver */
+  int is_server = (rank == receiver) ? 1 : 0;
+  int rc;
+  psm2_mq_req_t req_mq;
+  /* Exchange one message over the queue q using the file-scope msgbuf (server receives, client sends), polling for completion with a bounded number of retries, then finalize the MQ, close the endpoint and shut PSM2 down. Always returns 0. */
+
+  register long rsp asm ("rsp");
+  printf("rsp=%lx.msgbuf=%p\n", rsp, msgbuf); fflush(stdout); /* debug output: stack pointer and buffer address */
+
+  memset(msgbuf, 0, BUFFER_LENGTH);
+
+  if (is_server) {
+    psm2_mq_tag_t t = {0xABCD};
+    psm2_mq_tag_t tm = {-1};
+    /* Post the receive request */
+    if ((rc = psm2_mq_irecv2(q, PSM2_MQ_ANY_ADDR,
+                            &t, /* message tag */
+                            &tm, /* message tag mask */
+                            0, /* no flags */
+                            msgbuf, BUFFER_LENGTH,
+                            NULL, /* no context to add */
+                            &req_mq /* track irecv status */
+    )) != PSM2_OK) {
+      die("couldn't post psm2_mq_irecv()", rc);
+    }
+    printf("PSM2 MQ irecv() posted\n");
+
+#if 0
+    /* Wait until the message arrives */
+    if ((rc = psm2_mq_wait(&req_mq, NULL)) != PSM2_OK) {
+      die("couldn't wait for the irecv", rc);
+    }
+    printf("PSM2 MQ wait() done.\n");
+    printf("Message from client:\n");
+    printf("%s", msgbuf);
+
+	if (is_server) {
+		char fn[256];
+		sprintf(fn, "psm2-demo-server-epid-%d", rank);
+		unlink(fn);
+	}
+#else
+	int count = 0;
+    while ((rc = psm2_mq_ipeek(q, &req_mq, NULL)) != PSM2_OK) { /* poll for a completed request, sleeping 0.5 s between attempts */
+		struct timespec ts = { .tv_sec = 0, .tv_nsec = 500*1000*1000 };
+		nanosleep(&ts, NULL);
+		printf("."); fflush(stdout);
+		count++;
+		if (count > 2) { /* give up after 3 failed polls */
+			break;
+		}
+	}
+	if (rc == PSM2_OK) {
+		if ((rc = psm2_mq_test(&req_mq, NULL)) != PSM2_OK) {
+			printf("psm2_mq_test failed\n");
+		} else  {
+			printf("PSM2 MQ test() done.\n");
+			printf("Message from client:\n");
+			printf("%s", msgbuf);
+		}
+		char fn[256];
+		sprintf(fn, "psm2-demo-server-epid-%d", rank);
+		unlink(fn); /* remove the epid mapping file once a request has completed */
+	} else {
+		printf("PSM2 MQ test() timed-out.\n");
+	}
+#endif
+  } else {
+    /* Say hello */
+    snprintf(msgbuf, BUFFER_LENGTH,
+             "Hello world from epid=0x%lx, pid=%d.\n",
+             myepid, getpid());
+    psm2_mq_tag_t t = {0xABCD};
+#if 0
+    if ((rc = psm2_mq_send2(q,
+                           epaddr_array[0], /* destination epaddr */
+                           PSM2_MQ_FLAG_SENDSYNC, /* flags */
+                           &t, /* tag */
+                           msgbuf, BUFFER_LENGTH
+    )) != PSM2_OK) {
+      die("couldn't post psm2_mq_isend", rc);
+    }
+    printf("PSM2 MQ send() done.\n");
+#else
+    if ((rc = psm2_mq_isend2(q,
+                           epaddr_array[0], /* destination epaddr */
+                           PSM2_MQ_FLAG_SENDSYNC, /* flags */
+                           &t, /* tag */
+							 msgbuf, BUFFER_LENGTH,
+							 NULL, /* no context to add */
+							 &req_mq /* track isend status */
+    )) != PSM2_OK) {
+      die("couldn't post psm2_mq_isend", rc);
+    }
+    printf("PSM2 MQ isend() posted\n");
+
+	int count = 0;
+    while ((rc = psm2_mq_ipeek2(q, &req_mq, NULL)) != PSM2_OK) { /* poll for a completed request, sleeping 0.5 s between attempts */
+		struct timespec ts = { .tv_sec = 0, .tv_nsec = 500*1000*1000 };
+		nanosleep(&ts, NULL);
+		printf("."); fflush(stdout);
+		count++;
+		if (count > 30) { /* give up after 31 failed polls */
+			break;
+		}
+	}
+	if (rc == PSM2_OK) {
+		if ((rc = psm2_mq_test2(&req_mq, NULL)) != PSM2_OK) {
+			printf("PSM2 MQ test() failed.\n");
+		} else {
+			printf("PSM2 MQ test() done.\n");
+		}
+	} else {
+		printf("PSM2 MQ test() timeout.\n");
+	}
+#endif
+  }
+/* Close down the MQ */
+  if ((rc = psm2_mq_finalize(q)) != PSM2_OK) {
+    die("couldn't psm2_mq_finalize()", rc);
+  }
+  printf("PSM2 MQ finalized.\n");
+/* Close our ep, releasing all hardware resources.
+ * Try to close all connections properly */
+  if ((rc = psm2_ep_close(myep, PSM2_EP_CLOSE_GRACEFUL,
+                          0 /* no timeout */)) != PSM2_OK) {
+    die("couldn't psm2_ep_close()", rc);
+  }
+  printf("PSM2 ep closed.\n");
+  /* Release all local PSM2 resources */
+  if ((rc = psm2_finalize()) != PSM2_OK) {
+    die("couldn't psm2_finalize()", rc);
+  }
+  printf("PSM2 shut down, exiting.\n");
+  return 0;
+}
+
+static struct option options[] = {
+	/* --ppn <value>: getopt_long() reports it as 'P', the same code as the short -P option */
+	{
+		.name =		"ppn",
+		.has_arg =	required_argument,
+		.flag =		NULL,
+		.val =		'P',
+	},
+	{ NULL, 0, NULL, 0, }, /* end: all-zero terminator entry */
+};
+
+struct thr_arg { /* State shared between main() and the progress_fn() thread */
+	pthread_barrier_t bar; /* rendezvous point; both main() and progress_fn() wait on it twice */
+	pthread_t pthread; /* handle filled in by pthread_create() in main() */
+	int rank; /* this process's rank, parsed from PMI_RANK */
+	int ppn; /* value given with --ppn / -P */
+	int nproc; /* number of ranks; main() sets it to 2 */
+};
+
+struct thr_arg thr_arg; /* the single shared instance */
+
+void *progress_fn(void *arg) {
+	/* Progress-thread body: report where it runs, then run the PSM2 exchange with every off-node rank between two barrier waits shared with main(). Always returns NULL. */
+	struct thr_arg *ctx = arg;
+	int peer;
+	int where = syscall(732);
+	/* -1 from syscall number 732 is reported as "running on Linux"; any other value as "running on McKernel" (NG). */
+	if (where != -1) {
+		fprintf(stdout, "CT09100 progress_fn running on McKernel NG (%d)\n", where);
+	} else {
+		fprintf(stdout, "CT09100 progress_fn running on Linux OK\n");
+	}
+
+	printf("progress,enter\n");
+
+	pthread_barrier_wait(&ctx->bar); /* first rendezvous with main() */
+
+#if 1
+	for (peer = 0; peer < ctx->nproc; peer++) {
+		int sender, receiver;
+		if (on_same_node(ctx->ppn, ctx->rank, peer)) {
+			continue;
+		}
+		/* The lower rank of the pair is passed as sender, the higher one as receiver. */
+		sender = (ctx->rank < peer) ? ctx->rank : peer;
+		receiver = (ctx->rank < peer) ? peer : ctx->rank;
+		my_psm2_sendrecv(ctx->rank, sender, receiver);
+	}
+#endif
+
+	pthread_barrier_wait(&ctx->bar); /* second rendezvous with main() */
+
+#if 0
+	printf("progress,entering infinite loop\n");
+	while(1) { }
+#endif
+	printf("progress,returning\n");
+	return NULL;
+}
+
+int main(int argc, char **argv) {
+	int rc;
+    int actual;
+	int nproc;
+	int ppn = -1;
+	int my_rank = -1, size = -1;
+	int i, j;
+    struct timespec start, end;
+	long t_pure_l, t_overall_l;
+	long t_pure, t_overall;
+	int opt;
+	pthread_barrierattr_t barrierattr;
+	/* Test driver: parse --ppn/-P, read the rank from PMI_RANK, set up PSM2, then run progress_fn() in a second thread and meet it at two barrier waits. */
+	fwq_init();
+
+	while ((opt = getopt_long(argc, argv, "+P:", options, NULL)) != -1) {
+		switch (opt) {
+			case 'P':
+				ppn = atoi(optarg);
+				break;
+			default: /* '?' */
+				printf("unknown option %c\n", optopt);
+				exit(1);
+		}
+	}
+
+	if (ppn == -1) {
+		printf("specify ppn with --ppn");
+		exit(1);
+	}
+
+	char *rank_str = getenv("PMI_RANK");
+	if (!rank_str) {
+		printf("getenv failed\n");
+		exit(1);
+	}
+	my_rank = atoi(rank_str);
+	printf("my_rank=%d\n", my_rank); fflush(stdout);
+
+    nproc = 2;
+
+	if (my_rank == 0) {
+		printf("tid=%d,pid=%d,nproc=%d\n", (int)syscall(__NR_gettid), getpid(), nproc); /* syscall() returns long; cast so the argument matches %d */
+		printf("nsec=%ld, nspw=%f\n", nsec, nspw);
+	}
+	
+	int server_rank = ppn + (my_rank % ppn);
+	if (my_psm2_init(my_rank, server_rank)) { printf("my_psm2_init failed\n"); exit(1); } /* without an open endpoint nothing below can work */
+	if (my_psm2_connect(my_rank, server_rank)) { printf("my_psm2_connect failed\n"); exit(1); } /* on failure the queue and destination address may be unset */
+
+	/* Spawn a thread */
+	thr_arg.rank = my_rank;
+	thr_arg.ppn = ppn;
+	thr_arg.nproc = nproc;
+
+	pthread_barrierattr_init(&barrierattr);
+	pthread_barrier_init(&thr_arg.bar, &barrierattr, 2); /* two participants: this thread and progress_fn(); independent of the number of ranks */
+
+	char *uti_str = getenv("DISABLE_UTI");
+	int uti_val = uti_str ? atoi(uti_str) : 0;
+	if (!uti_val) {
+		rc = syscall(731, 1, NULL); /* a zero return is reported below as "uti available" */
+		if (rc) {
+			fprintf(stdout, "CT09003 INFO: uti not available (rc=%d)\n", rc);
+		} else {
+			fprintf(stdout, "CT09003 INFO: uti available\n");
+		}
+	} else {
+		fprintf(stdout, "CT09003 INFO: uti disabled\n");
+	}
+
+	rc = pthread_create(&thr_arg.pthread, NULL, progress_fn, &thr_arg);
+	if (rc){
+		fprintf(stdout, "pthread_create: %d\n", rc);
+		exit(1);
+	}
+	
+	pthread_barrier_wait(&thr_arg.bar);
+
+	pthread_barrier_wait(&thr_arg.bar);
+
+	pthread_join(thr_arg.pthread, NULL);
+
+ fn_exit:
+	return 0;
+ fn_fail:
+    goto fn_exit;
+}
diff --git a/test/uti/mpi/008.sh b/test/uti/mpi/008.sh
new file mode 100755
index 00000000..24a310cf
--- /dev/null
+++ b/test/uti/mpi/008.sh
@@ -0,0 +1,89 @@
+#!/usr/bin/bash
+
+#!/usr/bin/bash -x
+
+MYHOME=$HOME
+UTI_MPI_TOP=${MYHOME}/project/os/mckernel/test/uti/mpi
+
+MCK=${MYHOME}/project/os/install
+unset DISABLE_UTI
+
+cmdline="./008"
+
+stop=0
+reboot=0
+go=0
+
+mck=0
+nloops=1
+
+while getopts srgac:n:mdl: OPT
+do
+        case ${OPT} in
+            s) stop=1
+                ;;
+            r) reboot=1
+                ;;
+	    g) go=1
+		;;
+	    a) async=1
+		;;
+	    c) cmdline=$OPTARG
+		;;
+	    n) ndoubles=$OPTARG
+		;;
+            m) 
+		mck=1
+                ;;
+            d) export DISABLE_UTI=1
+                ;;
+	    l) nloops=$OPTARG
+		;;
+            *) echo "invalid option -${OPT}" >&2
+                exit 1
+        esac
+done
+
+if [ ${mck} -eq 1 ]; then  # -m given: remember the mcexec launcher path
+    MCEXEC="${MCK}/bin/mcexec"
+else
+    MCEXEC=
+fi
+# -s: release McKernel resources (only meaningful together with -m).
+if [ ${stop} -eq 1 ]; then
+    if [ ${mck} -eq 1 ]; then
+	    sudo ${MCK}/sbin/mcstop+release.sh
+    else
+	:
+    fi
+fi
+
+if [ ${reboot} -eq 1 ]; then
+    if [ ${mck} -eq 1 ]; then
+	if hostname  | grep ofp &>/dev/null; then
+	    sudo ${MCK}/sbin/mcreboot.sh -s -c 2-17,70-85,138-153,206-221,20-35,88-103,156-171,224-239,36-51,104-119,172-187,240-255,52-67,120-135,188-203,256-271 -r 2-5,70-73,138-141,206-209:0+6-9,74-77,142-145,210-213:1+10-13,78-81,146-149,214-217:68+14-17,82-85,150-153,218-221:69+20-23,88-91,156-159,224-227:136+24-27,92-95,160-163,228-231:137+28-31,96-99,164-167,232-235:204+32-35,100-103,168-171,236-239:205+36-39,104-107,172-175,240-243:18+40-43,108-111,176-179,244-247:19+44-47,112-115,180-183,248-251:86+48-51,116-119,184-187,252-255:87+52-55,120-123,188-191,256-259:154+56-59,124-127,192-195,260-263:155+60-63,128-131,196-199,264-267:222+64-67,132-135,200-203,268-271:223 -m 32G@0,12G@1
+	else
+	    sudo ${MCK}/sbin/mcreboot.sh -s -c 1-15,65-79,129-143,193-207,17-31,81-95,145-159,209-223,33-47,97-111,161-175,225-239,49-63,113-127,177-191,241-255 -r 1-15:0+65-79:64+129-143:128+193-207:192+17-31:16+81-95:80+145-159:144+209-223:208+33-47:32+97-111:96+161-175:160+225-239:224+49-63:48+113-127:112+177-191:176+241-255:240 -m 12G@0,12G@1,12G@2,12G@3,3920M@4,3920M@5,3920M@6,3920M@7
+	fi
+    else
+	:
+    fi
+fi
+
+if [ ${go} -eq 1 ]; then
+    # Abort when the test directory or the build is unavailable: the rm/run steps must not execute elsewhere.
+    cd "${UTI_MPI_TOP}" || exit 1
+    make CC=gcc 008 || exit 1
+    for i in $(seq 1 "${nloops}"); do
+	rm -f psm2-demo-server-epid-*
+	#PSM2_RCVTHREAD=0 PMI_RANK=0 DISABLE_UTI=1 ${MCK}/bin/mcexec --enable-uti taskset -c 2 ./008 --ppn 1 &
+	PSM2_RCVTHREAD=0 PMI_RANK=1 DISABLE_UTI=0 ${MCK}/bin/mcexec --enable-uti taskset -c 3 ./008 --ppn 1
+	#wait
+	echo =====
+	echo $i
+	echo =====
+    done
+fi
+
+
+
diff --git a/test/uti/mpi/009.c b/test/uti/mpi/009.c
new file mode 100755
index 00000000..3a1209a5
--- /dev/null
+++ b/test/uti/mpi/009.c
@@ -0,0 +1,537 @@
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <time.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <sys/mman.h>
+#include <mpi.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <sys/syscall.h>   /* For SYS_xxx definitions */
+#include <sched.h>
+#include <errno.h>
+
+#include <psm2.h>     /* required for core PSM2 functions */
+#include <psm2_mq.h>  /* required for PSM2 MQ functions (send, recv, etc) */
+
+//#define DEBUG
+#ifdef DEBUG
+#define dprintf printf
+#else
+#define dprintf {}
+#endif
+
+#define BUFFER_LENGTH /*8000000*/(1ULL<<12)
+#define CONNECT_ARRAY_SIZE 8
+void die(char *msg, int rc) { /* Report "<msg>: <rc>" on stderr. NOTE: despite its name this returns to the caller. */
+  fprintf(stderr, "%s: %d\n", msg, rc);
+  fflush(stderr);
+}
+
+#define DIFFNSEC(end, start) (((end).tv_sec - (start).tv_sec) * 1000000000UL + ((end).tv_nsec - (start).tv_nsec)) /* ns between two struct timespec */
+
+static inline void fixed_size_work() { /* One unit of busy work: a 100-pass counting loop on %rcx (x86-64 asm). */
+	asm volatile(
+	    "movq $0, %%rcx\n\t"      /* rcx = 0 */
+		"1:\t"
+		"addq $1, %%rcx\n\t"   /* rcx++ */
+		"cmpq $99, %%rcx\n\t"
+		"jle 1b\n\t"            /* repeat while rcx <= 99 */
+		:
+		: 
+		: "rcx", "cc");
+}
+
+static inline void bulk_fsw(unsigned long n) { /* Run fixed_size_work() n times. */
+	unsigned long j; /* same type as n: an int counter would overflow (UB) once n exceeds INT_MAX */
+	for (j = 0; j < n; j++) {
+		fixed_size_work();
+	}
+}
+
+double nspw; /* nsec per work */
+unsigned long nsec;
+
+/* Calibrate fwq(): time N_INIT work units on this thread's CPU clock; sets nsec (total) and nspw (ns per unit). */
+void fwq_init() {
+	struct timespec start, end;
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
+#define N_INIT 10000000
+	bulk_fsw(N_INIT);
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
+	nsec = DIFFNSEC(end, start);
+	nspw = nsec / (double)N_INIT;
+}
+
+#if 1
+/* Busy-wait for roughly delay_nsec nanoseconds, using the per-unit cost measured by fwq_init(). */
+void fwq(long delay_nsec) {
+	if (delay_nsec < 0 || nspw <= 0.0) { /* nspw stays 0 until fwq_init() has run; never divide by it */
+		return;
+	}
+	bulk_fsw(delay_nsec / nspw);
+}
+#else /* For machines with large core-to-core performance variation (e.g. OFP) */
+void fwq(long delay_nsec) {
+	struct timespec start, end;
+	if (delay_nsec < 0) return; /* would otherwise be compared as a huge unsigned value below */
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
+	while (1) {
+		clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
+		if (DIFFNSEC(end, start) >= (unsigned long)delay_nsec) {
+			break;
+		}
+		bulk_fsw(2); /* ~150 ns per iteration on OFP */
+	}
+}
+#endif
+
+
+/* Print the CPU from /proc/<pid>/task/<tid>/stat (39th field) next to sched_getcpu(). Returns 0, or -1 on failure. */
+static int print_cpu_last_executed_on() {
+	char fn[256];
+	char *result = NULL; /* NULL so the shared cleanup path may free() it unconditionally */
+	pid_t tid = syscall(SYS_gettid);
+	int fd = -1, offset = 0, mpi_errno = 0;
+	const int cap = 65536;
+
+	snprintf(fn, sizeof(fn), "/proc/%d/task/%d/stat", getpid(), (int)tid);
+	fd = open(fn, O_RDONLY);
+	if (fd == -1) {
+		printf("open() failed\n");
+		goto fn_fail;
+	}
+
+	result = malloc(cap);
+	if (result == NULL) {
+		printf("malloc() failed");
+		goto fn_fail;
+	}
+
+	/* Read until EOF but never past cap - 1, so the text can be NUL-terminated for strsep(). */
+	while (offset < cap - 1) {
+		int amount = read(fd, result + offset, cap - 1 - offset);
+		if (amount == -1) {
+			printf("read() failed");
+			goto fn_fail;
+		}
+		if (amount == 0) {
+			break;
+		}
+		offset += amount;
+	}
+	result[offset] = '\0';
+
+	/* Walk to the 39th space-separated field; strsep() yields NULL if the line is shorter. */
+	char *next_delim = result;
+	char *field = NULL;
+	for (int i = 0; i < 39; i++) {
+		field = strsep(&next_delim, " ");
+	}
+	const char *rank_str = getenv("PMI_RANK");
+	if (field == NULL || rank_str == NULL) {
+		printf("stat field or PMI_RANK missing\n");
+		goto fn_fail;
+	}
+	int cpu = sched_getcpu();
+	if (cpu == -1) {
+		printf("sched_getcpu() failed\n");
+		goto fn_fail;
+	}
+	printf("compute thread,pmi_rank=%02d,stat-cpu=%02d,sched_getcpu=%02d,pid=%d,tid=%d\n", atoi(rank_str), atoi(field), cpu, getpid(), tid); fflush(stdout);
+ fn_exit:
+	if (fd != -1) close(fd);
+	free(result);
+	return mpi_errno;
+ fn_fail:
+	mpi_errno = -1;
+	goto fn_exit;
+}
+
+static inline int on_same_node(int ppn, int me, int you) { /* 1 when ranks me and you share a ppn-sized group, else 0 */
+	return (me / ppn != you / ppn) ? 0 : 1;
+}
+
+/* isend-calc-wait: post an Irecv/Isend pair per off-node rank, busy-work calc_nsec, then wait for all requests. */
+void my_send(int nproc, int ppn, int rank, double *sbuf, double *rbuf, int ndoubles, MPI_Request* reqs, long calc_nsec) {
+	int peer;
+	int slot = 0; /* index of the next ndoubles-sized chunk in sbuf/rbuf */
+	int nreq = 0; /* requests posted so far: two per off-node rank */
+	for (peer = 0; peer < nproc; peer++) {
+		if (on_same_node(ppn, rank, peer)) {
+			continue;
+		}
+		MPI_Irecv(rbuf + slot * ndoubles, ndoubles, MPI_DOUBLE, peer, 0, MPI_COMM_WORLD, reqs + nreq);
+		nreq++;
+		MPI_Isend(sbuf + slot * ndoubles, ndoubles, MPI_DOUBLE, peer, 0, MPI_COMM_WORLD, reqs + nreq);
+		nreq++;
+		slot++;
+	}
+	fwq(calc_nsec);
+	MPI_Waitall(nreq, reqs, MPI_STATUSES_IGNORE);
+}
+
+
+/* Poll once per second until the server's epid file for `rank` exists and holds a parsable hex epid; return it. */
+psm2_epid_t find_server(int rank) {
+  FILE *fp;
+  psm2_epid_t server_epid = 0;
+  char fn[256];
+  int parsed = 0;
+  printf("%s: enter\n", __FUNCTION__); fflush(stdout);
+  snprintf(fn, sizeof(fn), "psm2-demo-server-epid-%d", rank);
+  printf("PSM2 client waiting for epid mapping file to appear...\n"); fflush(stdout);
+  while (!parsed) {
+    sleep(1);
+    if ((fp = fopen(fn, "r")) == NULL) continue;
+    parsed = (fscanf(fp, "%lx", &server_epid) == 1); /* the file may exist but still be empty: retry */
+    fclose(fp);
+  }
+  printf("PSM2 client found server epid = 0x%lx\n", server_epid);
+  return server_epid;
+}
+
+/* Publish myepid as hex text in the file "psm2-demo-server-epid-<rank>"; on open failure report and return. */
+void write_epid_to_file(int rank, psm2_epid_t myepid) {
+  char fn[256];
+  printf("%s: enter\n", __FUNCTION__);
+  snprintf(fn, sizeof(fn), "psm2-demo-server-epid-%d", rank);
+  FILE *fp = fopen(fn, "w");
+  if (!fp) {
+    int err = errno; /* capture before further stdio calls can overwrite it */
+    fprintf(stderr, "Exiting, couldn't write server's epid mapping file: ");
+    die(strerror(err), err);
+    return; /* die() only reports; do not fall through and write to a NULL stream */
+  }
+  fprintf(fp, "0x%lx", myepid);
+  fclose(fp);
+  printf("PSM2 server wrote epid = 0x%lx to file.\n", myepid);
+}
+
+psm2_uuid_t uuid;
+psm2_ep_t myep;
+psm2_epid_t myepid;
+psm2_epid_t server_epid;
+psm2_epid_t epid_array[CONNECT_ARRAY_SIZE];
+int epid_array_mask[CONNECT_ARRAY_SIZE];
+psm2_error_t epid_connect_errors[CONNECT_ARRAY_SIZE];
+psm2_epaddr_t epaddr_array[CONNECT_ARRAY_SIZE];
+
+/* Initialize PSM2 and open the global endpoint (myep/myepid). Returns 0 on success, -1 on the first failure. */
+int my_psm2_init(int my_rank, int server_rank) {
+  struct psm2_ep_open_opts o;
+  int rc;
+  int ver_major = PSM2_VERNO_MAJOR;
+  int ver_minor = PSM2_VERNO_MINOR;
+
+  printf("%s: my_rank=%d,server_rank=%d\n", __FUNCTION__, my_rank, server_rank); fflush(stdout);
+  memset(uuid, 0, sizeof(psm2_uuid_t)); /* Use a UUID of zero */
+  /* Initialize PSM2 with the version from the PSM2 headers (PSM2_VERNO_MAJOR/MINOR), to ensure
+   * that we are linking with the same version of PSM2 as we compiled against. */
+  if ((rc = psm2_init(&ver_major, &ver_minor)) != PSM2_OK) {
+    die("couldn't init", rc);
+    return -1; /* die() does not exit; stop here like my_psm2_connect() does */
+  }
+  printf("PSM2 init done.\n");
+  /* Setup the endpoint options struct */
+  if ((rc = psm2_ep_open_opts_get_defaults(&o)) != PSM2_OK) {
+    die("couldn't set default opts", rc);
+    return -1;
+  }
+  printf("PSM2 opts_get_defaults done.\n");
+  /* Attempt to open a PSM2 endpoint. This allocates hardware resources. */
+  if ((rc = psm2_ep_open(uuid, &o, &myep, &myepid)) != PSM2_OK) {
+    die("couldn't psm2_ep_open()", rc);
+    return -1;
+  }
+  printf("PSM2 endpoint open done.\n");
+  return 0;
+}
+/* Exchange epids through the mapping file, then (client only) connect to the server.
+ * Returns 0 on success, -1 if the connect request or the server connection failed. */
+int my_psm2_connect(int my_rank, int server_rank) {
+  int rc;
+  const int serving = (my_rank == server_rank);
+
+  printf("%s: my_rank=%d,server_rank=%d\n", __FUNCTION__, my_rank, server_rank); fflush(stdout);
+  if (serving) {
+    write_epid_to_file(my_rank, myepid);
+  } else {
+    server_epid = find_server(server_rank);
+  }
+  printf("%s: epid exchange done\n", __FUNCTION__); fflush(stdout);
+
+  if (serving) {
+    /* Server does nothing here: a connection does not have to be established to receive messages. */
+    printf("PSM2 server up.\n");
+    return 0;
+  }
+
+  /* Client: PSM2 can connect to any number of epids in one call; here only slot 0 of the
+   * CONNECT_ARRAY_SIZE-entry request arrays is enabled through the mask. */
+  memset(epid_array_mask, 0, sizeof(int) * CONNECT_ARRAY_SIZE);
+  epid_array[0] = server_epid;
+  epid_array_mask[0] = 1;
+
+  /* Begin the connection process. If a requested epid is not responding the call still returns OK;
+   * the errors array holds the state of the individual connection requests. */
+  rc = psm2_ep_connect(myep,
+                       CONNECT_ARRAY_SIZE,
+                       epid_array,
+                       epid_array_mask,
+                       epid_connect_errors,
+                       epaddr_array,
+                       0 /* no timeout */);
+  if (rc != PSM2_OK) {
+    die("couldn't ep_connect", rc);
+    return -1;
+  }
+  printf("PSM2 connect request processed.\n");
+
+  /* Now check if our connection to the server is ready */
+  if (epid_connect_errors[0] != PSM2_OK) {
+    die("couldn't connect to server", epid_connect_errors[0]);
+    return -1;
+  }
+  printf("PSM2 client-server connection established.\n");
+  return 0;
+}
+char msgbuf[BUFFER_LENGTH];
+
+/* Exchange one BUFFER_LENGTH message over a PSM2 MQ (receiver: irecv + wait, sender: synchronous send), then shut
+ * down the MQ, the endpoint and PSM2. Returns 0, or -1 when the MQ or the receive request could not be set up. */
+int my_psm2_sendrecv(int rank, int sender, int receiver) {
+  int is_server = (rank == receiver) ? 1 : 0;
+  int rc;
+  psm2_mq_t q;
+  psm2_mq_req_t req_mq;
+  register long rsp asm ("rsp");
+  printf("rsp=%lx.msgbuf=%p\n", rsp, msgbuf); fflush(stdout);
+
+  memset(msgbuf, 0, BUFFER_LENGTH);
+  /* Setup our PSM2 message queue */
+  if ((rc = psm2_mq_init(myep, PSM2_MQ_ORDERMASK_NONE, NULL, 0, &q))
+      != PSM2_OK) {
+    die("couldn't initialize PSM2 MQ", rc);
+    return -1; /* q was not initialized; every call below would use garbage */
+  }
+  printf("PSM2 MQ init done.\n");
+  if (is_server) {
+    psm2_mq_tag_t t = {0xABCD};
+    psm2_mq_tag_t tm = {-1};
+    /* Post the receive request */
+    if ((rc = psm2_mq_irecv2(q, PSM2_MQ_ANY_ADDR,
+                            &t, /* message tag */
+                            &tm, /* message tag mask */
+                            0, /* no flags */
+                            msgbuf, BUFFER_LENGTH,
+                            NULL, /* no context to add */
+                            &req_mq /* track irecv status */
+    )) != PSM2_OK) {
+      die("couldn't post psm2_mq_irecv()", rc);
+      return -1; /* req_mq was never filled in, so it cannot be waited on */
+    }
+    printf("PSM2 MQ irecv() posted\n");
+    /* Wait until the message arrives */
+    if ((rc = psm2_mq_wait(&req_mq, NULL)) != PSM2_OK) {
+      die("couldn't wait for the irecv", rc);
+    }
+    printf("PSM2 MQ wait() done.\n");
+    printf("Message from client:\n");
+    printf("%s", msgbuf);
+
+    /* Remove this rank's epid mapping file. */
+    char fn[256];
+    snprintf(fn, sizeof(fn), "psm2-demo-server-epid-%d", rank);
+    unlink(fn);
+  } else {
+    /* Say hello */
+    snprintf(msgbuf, BUFFER_LENGTH,
+             "Hello world from epid=0x%lx, pid=%d.\n",
+             myepid, getpid());
+    psm2_mq_tag_t t = {0xABCD};
+    if ((rc = psm2_mq_send2(q,
+                           epaddr_array[0], /* destination epaddr */
+                           PSM2_MQ_FLAG_SENDSYNC, /* synchronous-send flag */
+                           &t, /* tag */
+                           msgbuf, BUFFER_LENGTH
+    )) != PSM2_OK) {
+      die("couldn't post psm2_mq_isend", rc);
+    }
+    printf("PSM2 MQ send() done.\n");
+  }
+  /* Close down the MQ */
+  if ((rc = psm2_mq_finalize(q)) != PSM2_OK) {
+    die("couldn't psm2_mq_finalize()", rc);
+  }
+  printf("PSM2 MQ finalized.\n");
+  /* Close our ep, releasing all hardware resources.
+   * Try to close all connections properly */
+  if ((rc = psm2_ep_close(myep, PSM2_EP_CLOSE_GRACEFUL,
+                          0 /* no timeout */)) != PSM2_OK) {
+    die("couldn't psm2_ep_close()", rc);
+  }
+  printf("PSM2 ep closed.\n");
+  /* Release all local PSM2 resources */
+  if ((rc = psm2_finalize()) != PSM2_OK) {
+    die("couldn't psm2_finalize()", rc);
+  }
+  printf("PSM2 shut down, exiting.\n");
+  return 0;
+}
+
+static struct option options[] = {
+	{
+		.name =		"ppn",
+		.has_arg =	required_argument,
+		.flag =		NULL,
+		.val =		'P',
+	},
+	/* end */
+	{ NULL, 0, NULL, 0, },
+};
+
+struct thr_arg { /* State shared between main() and progress_fn() */
+	pthread_barrier_t bar; /* main() and progress_fn() each wait on it twice */
+	pthread_t pthread; /* handle filled in by pthread_create() in main() */
+	int rank; /* parsed from the PMI_RANK environment variable */
+	int ppn; /* value of --ppn */
+	int nproc; /* set to 2 by main() */
+	int server_rank; /* ppn + (rank % ppn), computed in main() */
+};
+
+struct thr_arg thr_arg;
+
+/* Progress thread body: prints the CT09100 result of probing syscall 732, then, between two barrier
+ * rendezvous with main(), brings up PSM2 and exchanges a message with every off-node rank. */
+void *progress_fn(void *arg) {
+	struct thr_arg *ctx = arg;
+	int peer;
+	int rc = syscall(732);
+
+	if (rc != -1) {
+		printf("CT09100 progress_fn running on McKernel NG (%d)\n", rc);
+	} else {
+		printf("CT09100 progress_fn running on Linux OK\n");
+	}
+	printf("progress,enter\n");
+
+	pthread_barrier_wait(&ctx->bar);
+
+	my_psm2_init(ctx->rank, ctx->server_rank);
+	my_psm2_connect(ctx->rank, ctx->server_rank);
+
+	for (peer = 0; peer < ctx->nproc; peer++) {
+		int lower, higher;
+
+		if (on_same_node(ctx->ppn, ctx->rank, peer)) {
+			continue;
+		}
+		/* The lower-numbered rank of the pair is passed as sender, the higher one as receiver. */
+		lower = (ctx->rank < peer) ? ctx->rank : peer;
+		higher = (ctx->rank < peer) ? peer : ctx->rank;
+		my_psm2_sendrecv(ctx->rank, lower, higher);
+	}
+
+	/* Second rendezvous with main() before exiting. */
+	pthread_barrier_wait(&ctx->bar);
+
+	printf("progress,exit\n");
+	return NULL;
+}
+
+/* Test driver. Options: -d <log2 of ndoubles>, --ppn <processes per node>; the rank comes from PMI_RANK.
+ * Unless DISABLE_UTI is set to a non-zero value, syscall 731 is issued before the progress thread is
+ * created. main() then meets that thread at the barrier twice and joins it.
+ * Returns 0; exits with status 1 on bad arguments, missing PMI_RANK or pthread_create() failure. */
+int main(int argc, char **argv) {
+	int rc;
+	int nproc;
+	int ppn = -1;
+	int ndoubles = -1;
+	int my_rank = -1;
+	int opt;
+	pthread_barrierattr_t barrierattr;
+
+	fwq_init();
+
+	while ((opt = getopt_long(argc, argv, "+d:P:", options, NULL)) != -1) {
+		switch (opt) {
+			case 'd': {
+				/* The value is a shift count: keep 1 << n representable in an int. */
+				int log2_ndoubles = atoi(optarg);
+				if (log2_ndoubles < 0 || log2_ndoubles > 30) {
+					printf("-d must be between 0 and 30\n");
+					exit(1);
+				}
+				ndoubles = 1 << log2_ndoubles;
+				break;
+			}
+			case 'P':
+				ppn = atoi(optarg);
+				break;
+			default: /* '?' */
+				printf("unknown option %c\n", optopt);
+				exit(1);
+		}
+	}
+
+	/* ppn is used as a divisor below (rank grouping), so zero and negative values are rejected too. */
+	if (ndoubles == -1 || ppn <= 0) {
+		printf("specify ndoubles with -d and ppn with --ppn\n");
+		exit(1);
+	}
+
+	char *rank_str = getenv("PMI_RANK");
+	if (!rank_str) {
+		printf("getenv failed\n");
+		exit(1);
+	}
+	my_rank = atoi(rank_str);
+	printf("my_rank=%d\n", my_rank); fflush(stdout);
+
+	nproc = 2;
+
+	if (my_rank == 0) {
+		/* syscall() returns long and nsec is unsigned long: make the arguments match the conversions. */
+		printf("tid=%d,pid=%d,ndoubles=%d,nproc=%d\n", (int)syscall(__NR_gettid), getpid(), ndoubles, nproc);
+		printf("nsec=%lu, nspw=%f\n", nsec, nspw);
+	}
+
+	/* Spawn a thread */
+	thr_arg.rank = my_rank;
+	thr_arg.ppn = ppn;
+	thr_arg.nproc = nproc;
+	thr_arg.server_rank = ppn + (my_rank % ppn);
+
+	pthread_barrierattr_init(&barrierattr);
+	pthread_barrier_init(&thr_arg.bar, &barrierattr, nproc); /* nproc is 2: main plus the progress thread */
+
+	char *uti_str = getenv("DISABLE_UTI");
+	int uti_val = uti_str ? atoi(uti_str) : 0;
+	if (!uti_val) {
+		rc = syscall(731, 1, NULL);
+		if (rc) {
+			fprintf(stdout, "CT09003 INFO: uti not available (rc=%d)\n", rc);
+		} else {
+			fprintf(stdout, "CT09003 INFO: uti available\n");
+		}
+	} else {
+		fprintf(stdout, "CT09003 INFO: uti disabled\n");
+	}
+
+	rc = pthread_create(&thr_arg.pthread, NULL, progress_fn, &thr_arg);
+	if (rc){
+		fprintf(stdout, "pthread_create: %d\n", rc);
+		exit(1);
+	}
+
+	pthread_barrier_wait(&thr_arg.bar);
+	pthread_barrier_wait(&thr_arg.bar);
+	pthread_join(thr_arg.pthread, NULL);
+
+	return 0;
+}
diff --git a/test/uti/mpi/010.c b/test/uti/mpi/010.c
new file mode 100755
index 00000000..65ed6d55
--- /dev/null
+++ b/test/uti/mpi/010.c
@@ -0,0 +1,508 @@
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <time.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <sys/syscall.h>   /* For SYS_xxx definitions */
+#include <sched.h>
+#include <errno.h>
+
+#include <psm2.h>     /* required for core PSM2 functions */
+#include <psm2_mq.h>  /* required for PSM2 MQ functions (send, recv, etc) */
+
+//#define DEBUG
+#ifdef DEBUG
+#define dprintf printf
+#else
+#define dprintf {}
+#endif
+
+#define BUFFER_LENGTH /*8000000*/(1ULL<<12)
+#define CONNECT_ARRAY_SIZE 8
+void die(char *msg, int rc) { /* Report "<msg>: <rc>" on stderr. NOTE: despite its name this returns to the caller. */
+  fprintf(stderr, "%s: %d\n", msg, rc);
+  fflush(stderr);
+}
+
+#define DIFFNSEC(end, start) (((end).tv_sec - (start).tv_sec) * 1000000000UL + ((end).tv_nsec - (start).tv_nsec)) /* ns between two struct timespec */
+
+static inline void fixed_size_work() { /* One unit of busy work: a 100-pass counting loop on %rcx (x86-64 asm). */
+	asm volatile(
+	    "movq $0, %%rcx\n\t"      /* rcx = 0 */
+		"1:\t"
+		"addq $1, %%rcx\n\t"   /* rcx++ */
+		"cmpq $99, %%rcx\n\t"
+		"jle 1b\n\t"            /* repeat while rcx <= 99 */
+		:
+		: 
+		: "rcx", "cc");
+}
+
+static inline void bulk_fsw(unsigned long n) { /* Run fixed_size_work() n times. */
+	unsigned long j; /* same type as n: an int counter would overflow (UB) once n exceeds INT_MAX */
+	for (j = 0; j < n; j++) {
+		fixed_size_work();
+	}
+}
+
+double nspw; /* nsec per work */
+unsigned long nsec;
+
+/* Calibrate fwq(): time N_INIT work units on this thread's CPU clock; sets nsec (total) and nspw (ns per unit). */
+void fwq_init() {
+	struct timespec start, end;
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
+#define N_INIT 10000000
+	bulk_fsw(N_INIT);
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
+	nsec = DIFFNSEC(end, start);
+	nspw = nsec / (double)N_INIT;
+}
+
+#if 1
+/* Busy-wait for roughly delay_nsec nanoseconds, using the per-unit cost measured by fwq_init(). */
+void fwq(long delay_nsec) {
+	if (delay_nsec < 0 || nspw <= 0.0) { /* nspw stays 0 until fwq_init() has run; never divide by it */
+		return;
+	}
+	bulk_fsw(delay_nsec / nspw);
+}
+#else /* For machines with large core-to-core performance variation (e.g. OFP) */
+void fwq(long delay_nsec) {
+	struct timespec start, end;
+	if (delay_nsec < 0) return; /* would otherwise be compared as a huge unsigned value below */
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
+	while (1) {
+		clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
+		if (DIFFNSEC(end, start) >= (unsigned long)delay_nsec) {
+			break;
+		}
+		bulk_fsw(2); /* ~150 ns per iteration on OFP */
+	}
+}
+#endif
+
+
+/* Print the CPU from /proc/<pid>/task/<tid>/stat (39th field) next to sched_getcpu(). Returns 0, or -1 on failure. */
+static int print_cpu_last_executed_on() {
+	char fn[256];
+	char *result = NULL; /* NULL so the shared cleanup path may free() it unconditionally */
+	pid_t tid = syscall(SYS_gettid);
+	int fd = -1, offset = 0, mpi_errno = 0;
+	const int cap = 65536;
+
+	snprintf(fn, sizeof(fn), "/proc/%d/task/%d/stat", getpid(), (int)tid);
+	fd = open(fn, O_RDONLY);
+	if (fd == -1) {
+		printf("open() failed\n");
+		goto fn_fail;
+	}
+
+	result = malloc(cap);
+	if (result == NULL) {
+		printf("malloc() failed");
+		goto fn_fail;
+	}
+
+	/* Read until EOF but never past cap - 1, so the text can be NUL-terminated for strsep(). */
+	while (offset < cap - 1) {
+		int amount = read(fd, result + offset, cap - 1 - offset);
+		if (amount == -1) {
+			printf("read() failed");
+			goto fn_fail;
+		}
+		if (amount == 0) {
+			break;
+		}
+		offset += amount;
+	}
+	result[offset] = '\0';
+
+	/* Walk to the 39th space-separated field; strsep() yields NULL if the line is shorter. */
+	char *next_delim = result;
+	char *field = NULL;
+	for (int i = 0; i < 39; i++) {
+		field = strsep(&next_delim, " ");
+	}
+	const char *rank_str = getenv("PMI_RANK");
+	if (field == NULL || rank_str == NULL) {
+		printf("stat field or PMI_RANK missing\n");
+		goto fn_fail;
+	}
+	int cpu = sched_getcpu();
+	if (cpu == -1) {
+		printf("sched_getcpu() failed\n");
+		goto fn_fail;
+	}
+	printf("compute thread,pmi_rank=%02d,stat-cpu=%02d,sched_getcpu=%02d,pid=%d,tid=%d\n", atoi(rank_str), atoi(field), cpu, getpid(), tid); fflush(stdout);
+ fn_exit:
+	if (fd != -1) close(fd);
+	free(result);
+	return mpi_errno;
+ fn_fail:
+	mpi_errno = -1;
+	goto fn_exit;
+}
+
+static inline int on_same_node(int ppn, int me, int you) { /* 1 when ranks me and you share a ppn-sized group, else 0 */
+	return (me / ppn != you / ppn) ? 0 : 1;
+}
+
+/* Poll once per second until the server's epid file for `rank` exists and holds a parsable hex epid; return it. */
+psm2_epid_t find_server(int rank) {
+  FILE *fp;
+  psm2_epid_t server_epid = 0;
+  char fn[256];
+  int parsed = 0;
+  printf("%s: enter\n", __FUNCTION__); fflush(stdout);
+  snprintf(fn, sizeof(fn), "psm2-demo-server-epid-%d", rank);
+  printf("PSM2 client waiting for epid mapping file to appear...\n"); fflush(stdout);
+  while (!parsed) {
+    sleep(1);
+    if ((fp = fopen(fn, "r")) == NULL) continue;
+    parsed = (fscanf(fp, "%lx", &server_epid) == 1); /* the file may exist but still be empty: retry */
+    fclose(fp);
+  }
+  printf("PSM2 client found server epid = 0x%lx\n", server_epid);
+  return server_epid;
+}
+
+/* Publish myepid as hex text in the file "psm2-demo-server-epid-<rank>"; on open failure report and return. */
+void write_epid_to_file(int rank, psm2_epid_t myepid) {
+  char fn[256];
+  printf("%s: enter\n", __FUNCTION__);
+  snprintf(fn, sizeof(fn), "psm2-demo-server-epid-%d", rank);
+  FILE *fp = fopen(fn, "w");
+  if (!fp) {
+    int err = errno; /* capture before further stdio calls can overwrite it */
+    fprintf(stderr, "Exiting, couldn't write server's epid mapping file: ");
+    die(strerror(err), err);
+    return; /* die() only reports; do not fall through and write to a NULL stream */
+  }
+  fprintf(fp, "0x%lx", myepid);
+  fclose(fp);
+  printf("PSM2 server wrote epid = 0x%lx to file.\n", myepid);
+}
+
+psm2_uuid_t uuid;
+psm2_ep_t myep;
+psm2_epid_t myepid;
+psm2_epid_t server_epid;
+psm2_epid_t epid_array[CONNECT_ARRAY_SIZE];
+int epid_array_mask[CONNECT_ARRAY_SIZE];
+psm2_error_t epid_connect_errors[CONNECT_ARRAY_SIZE];
+psm2_epaddr_t epaddr_array[CONNECT_ARRAY_SIZE];
+
+/* Initialize PSM2 and open the global endpoint (myep/myepid). Returns 0 on success, -1 on the first failure. */
+int my_psm2_init(int my_rank, int server_rank) {
+  struct psm2_ep_open_opts o;
+  int rc;
+  int ver_major = PSM2_VERNO_MAJOR;
+  int ver_minor = PSM2_VERNO_MINOR;
+
+  printf("%s: my_rank=%d,server_rank=%d\n", __FUNCTION__, my_rank, server_rank); fflush(stdout);
+  memset(uuid, 0, sizeof(psm2_uuid_t)); /* Use a UUID of zero */
+  /* Initialize PSM2 with the version from the PSM2 headers (PSM2_VERNO_MAJOR/MINOR), to ensure
+   * that we are linking with the same version of PSM2 as we compiled against. */
+  if ((rc = psm2_init(&ver_major, &ver_minor)) != PSM2_OK) {
+    die("couldn't init", rc);
+    return -1; /* die() does not exit; stop here like my_psm2_connect() does */
+  }
+  printf("PSM2 init done.\n");
+  /* Setup the endpoint options struct */
+  if ((rc = psm2_ep_open_opts_get_defaults(&o)) != PSM2_OK) {
+    die("couldn't set default opts", rc);
+    return -1;
+  }
+  printf("PSM2 opts_get_defaults done.\n");
+  /* Attempt to open a PSM2 endpoint. This allocates hardware resources. */
+  if ((rc = psm2_ep_open(uuid, &o, &myep, &myepid)) != PSM2_OK) {
+    die("couldn't psm2_ep_open()", rc);
+    return -1;
+  }
+  printf("PSM2 endpoint open done.\n");
+  return 0;
+}
+/* Exchange epids through the mapping file, then (client only) connect to the server.
+ * Returns 0 on success, -1 if the connect request or the server connection failed. */
+int my_psm2_connect(int my_rank, int server_rank) {
+  int rc;
+  const int serving = (my_rank == server_rank);
+
+  printf("%s: my_rank=%d,server_rank=%d\n", __FUNCTION__, my_rank, server_rank); fflush(stdout);
+  if (serving) {
+    write_epid_to_file(my_rank, myepid);
+  } else {
+    server_epid = find_server(server_rank);
+  }
+  printf("%s: epid exchange done\n", __FUNCTION__); fflush(stdout);
+
+  if (serving) {
+    /* Server does nothing here: a connection does not have to be established to receive messages. */
+    printf("PSM2 server up.\n");
+    return 0;
+  }
+
+  /* Client: PSM2 can connect to any number of epids in one call; here only slot 0 of the
+   * CONNECT_ARRAY_SIZE-entry request arrays is enabled through the mask. */
+  memset(epid_array_mask, 0, sizeof(int) * CONNECT_ARRAY_SIZE);
+  epid_array[0] = server_epid;
+  epid_array_mask[0] = 1;
+
+  /* Begin the connection process. If a requested epid is not responding the call still returns OK;
+   * the errors array holds the state of the individual connection requests. */
+  rc = psm2_ep_connect(myep,
+                       CONNECT_ARRAY_SIZE,
+                       epid_array,
+                       epid_array_mask,
+                       epid_connect_errors,
+                       epaddr_array,
+                       0 /* no timeout */);
+  if (rc != PSM2_OK) {
+    die("couldn't ep_connect", rc);
+    return -1;
+  }
+  printf("PSM2 connect request processed.\n");
+
+  /* Now check if our connection to the server is ready */
+  if (epid_connect_errors[0] != PSM2_OK) {
+    die("couldn't connect to server", epid_connect_errors[0]);
+    return -1;
+  }
+  printf("PSM2 client-server connection established.\n");
+  return 0;
+}
+char msgbuf[BUFFER_LENGTH];
+
+/* Exchange one BUFFER_LENGTH message over a PSM2 MQ (receiver: irecv + wait, sender: synchronous send), then shut
+ * down the MQ, the endpoint and PSM2. Returns 0, or -1 when the MQ or the receive request could not be set up. */
+int my_psm2_sendrecv(int rank, int sender, int receiver) {
+  int is_server = (rank == receiver) ? 1 : 0;
+  int rc;
+  psm2_mq_t q;
+  psm2_mq_req_t req_mq;
+  register long rsp asm ("rsp");
+  printf("rsp=%lx.msgbuf=%p\n", rsp, msgbuf); fflush(stdout);
+
+  memset(msgbuf, 0, BUFFER_LENGTH);
+  /* Setup our PSM2 message queue */
+  if ((rc = psm2_mq_init(myep, PSM2_MQ_ORDERMASK_NONE, NULL, 0, &q))
+      != PSM2_OK) {
+    die("couldn't initialize PSM2 MQ", rc);
+    return -1; /* q was not initialized; every call below would use garbage */
+  }
+  printf("PSM2 MQ init done.\n");
+  if (is_server) {
+    psm2_mq_tag_t t = {0xABCD};
+    psm2_mq_tag_t tm = {-1};
+    /* Post the receive request */
+    if ((rc = psm2_mq_irecv2(q, PSM2_MQ_ANY_ADDR,
+                            &t, /* message tag */
+                            &tm, /* message tag mask */
+                            0, /* no flags */
+                            msgbuf, BUFFER_LENGTH,
+                            NULL, /* no context to add */
+                            &req_mq /* track irecv status */
+    )) != PSM2_OK) {
+      die("couldn't post psm2_mq_irecv()", rc);
+      return -1; /* req_mq was never filled in, so it cannot be waited on */
+    }
+    printf("PSM2 MQ irecv() posted\n");
+    /* Wait until the message arrives */
+    if ((rc = psm2_mq_wait(&req_mq, NULL)) != PSM2_OK) {
+      die("couldn't wait for the irecv", rc);
+    }
+    printf("PSM2 MQ wait() done.\n");
+    printf("Message from client:\n");
+    printf("%s", msgbuf);
+
+    /* Remove this rank's epid mapping file. */
+    char fn[256];
+    snprintf(fn, sizeof(fn), "psm2-demo-server-epid-%d", rank);
+    unlink(fn);
+  } else {
+    /* Say hello */
+    snprintf(msgbuf, BUFFER_LENGTH,
+             "Hello world from epid=0x%lx, pid=%d.\n",
+             myepid, getpid());
+    psm2_mq_tag_t t = {0xABCD};
+    if ((rc = psm2_mq_send2(q,
+                           epaddr_array[0], /* destination epaddr */
+                           PSM2_MQ_FLAG_SENDSYNC, /* synchronous-send flag */
+                           &t, /* tag */
+                           msgbuf, BUFFER_LENGTH
+    )) != PSM2_OK) {
+      die("couldn't post psm2_mq_isend", rc);
+    }
+    printf("PSM2 MQ send() done.\n");
+  }
+  /* Close down the MQ */
+  if ((rc = psm2_mq_finalize(q)) != PSM2_OK) {
+    die("couldn't psm2_mq_finalize()", rc);
+  }
+  printf("PSM2 MQ finalized.\n");
+  /* Close our ep, releasing all hardware resources.
+   * Try to close all connections properly */
+  if ((rc = psm2_ep_close(myep, PSM2_EP_CLOSE_GRACEFUL,
+                          0 /* no timeout */)) != PSM2_OK) {
+    die("couldn't psm2_ep_close()", rc);
+  }
+  printf("PSM2 ep closed.\n");
+  /* Release all local PSM2 resources */
+  if ((rc = psm2_finalize()) != PSM2_OK) {
+    die("couldn't psm2_finalize()", rc);
+  }
+  printf("PSM2 shut down, exiting.\n");
+  return 0;
+}
+
+static struct option options[] = {
+	{
+		.name =		"ppn",
+		.has_arg =	required_argument,
+		.flag =		NULL,
+		.val =		'P',
+	},
+	/* end */
+	{ NULL, 0, NULL, 0, },
+};
+
+struct thr_arg { /* State shared between main() and progress_fn() */
+	pthread_barrier_t bar; /* main() and progress_fn() each wait on it twice */
+	pthread_t pthread; /* handle filled in by pthread_create() in main() */
+	int rank; /* parsed from the PMI_RANK environment variable */
+	int ppn; /* value of --ppn */
+	int nproc; /* set to 2 by main() */
+	int server_rank; /* ppn + (rank % ppn), computed in main() */
+};
+
+struct thr_arg thr_arg;
+
+/* Progress thread body: prints the CT09100 result of probing syscall 732, then only meets main() at the
+ * barrier twice; in this variant the PSM2 calls are made by main() between the two rendezvous. */
+void *progress_fn(void *arg) {
+	struct thr_arg *ctx = arg;
+	int rc = syscall(732);
+
+	if (rc != -1) {
+		printf("CT09100 progress_fn running on McKernel NG (%d)\n", rc);
+	} else {
+		printf("CT09100 progress_fn running on Linux OK\n");
+	}
+	printf("progress,enter\n");
+
+	/* First rendezvous with main(). */
+	pthread_barrier_wait(&ctx->bar);
+
+	/* Second rendezvous with main(). */
+	pthread_barrier_wait(&ctx->bar);
+
+	printf("progress,exit\n");
+	return NULL;
+}
+
+/* Test driver (PSM2 calls issued from main()). Option: --ppn <processes per node>; the rank comes from
+ * PMI_RANK. Unless DISABLE_UTI is set to a non-zero value, syscall 731 is issued before the progress
+ * thread is created. Between the two barrier rendezvous main() brings up PSM2 and exchanges a message
+ * with each off-node rank. Returns 0; exits with status 1 on bad arguments, missing PMI_RANK or
+ * pthread_create() failure. */
+int main(int argc, char **argv) {
+	int rc;
+	int nproc;
+	int ppn = -1;
+	int my_rank = -1;
+	int i;
+	int opt;
+	pthread_barrierattr_t barrierattr;
+
+	fwq_init();
+
+	while ((opt = getopt_long(argc, argv, "+P:", options, NULL)) != -1) {
+		switch (opt) {
+			case 'P':
+				ppn = atoi(optarg);
+				break;
+			default: /* '?' */
+				printf("unknown option %c\n", optopt);
+				exit(1);
+		}
+	}
+
+	/* ppn is used as a divisor below (rank grouping), so zero and negative values are rejected too. */
+	if (ppn <= 0) {
+		printf("specify ppn with --ppn\n");
+		exit(1);
+	}
+
+	char *rank_str = getenv("PMI_RANK");
+	if (!rank_str) {
+		printf("getenv failed\n");
+		exit(1);
+	}
+	my_rank = atoi(rank_str);
+	printf("my_rank=%d\n", my_rank); fflush(stdout);
+
+	nproc = 2;
+
+	if (my_rank == 0) {
+		/* syscall() returns long and nsec is unsigned long: make the arguments match the conversions. */
+		printf("tid=%d,pid=%d,nproc=%d\n", (int)syscall(__NR_gettid), getpid(), nproc);
+		printf("nsec=%lu, nspw=%f\n", nsec, nspw);
+	}
+
+	/* Spawn a thread */
+	thr_arg.rank = my_rank;
+	thr_arg.ppn = ppn;
+	thr_arg.nproc = nproc;
+	thr_arg.server_rank = ppn + (my_rank % ppn);
+
+	pthread_barrierattr_init(&barrierattr);
+	pthread_barrier_init(&thr_arg.bar, &barrierattr, nproc); /* nproc is 2: main plus the progress thread */
+
+	char *uti_str = getenv("DISABLE_UTI");
+	int uti_val = uti_str ? atoi(uti_str) : 0;
+	if (!uti_val) {
+		rc = syscall(731, 1, NULL);
+		if (rc) {
+			fprintf(stdout, "CT09003 INFO: uti not available (rc=%d)\n", rc);
+		} else {
+			fprintf(stdout, "CT09003 INFO: uti available\n");
+		}
+	} else {
+		fprintf(stdout, "CT09003 INFO: uti disabled\n");
+	}
+
+	rc = pthread_create(&thr_arg.pthread, NULL, progress_fn, &thr_arg);
+	if (rc){
+		fprintf(stdout, "pthread_create: %d\n", rc);
+		exit(1);
+	}
+
+	pthread_barrier_wait(&thr_arg.bar);
+
+	/* The PSM2 work runs on this thread while the progress thread waits at the second barrier. */
+	my_psm2_init(thr_arg.rank, thr_arg.server_rank);
+	my_psm2_connect(thr_arg.rank, thr_arg.server_rank);
+
+	for (i = 0; i < thr_arg.nproc; i++) {
+		if (!on_same_node(thr_arg.ppn, thr_arg.rank, i)) {
+			if (thr_arg.rank < i) {
+				my_psm2_sendrecv(thr_arg.rank, thr_arg.rank, i);
+			} else {
+				my_psm2_sendrecv(thr_arg.rank, i, thr_arg.rank);
+			}
+		}
+	}
+
+	pthread_barrier_wait(&thr_arg.bar);
+
+	pthread_join(thr_arg.pthread, NULL);
+
+	return 0;
+}
diff --git a/test/uti/mpi/011.c b/test/uti/mpi/011.c
new file mode 100755
index 00000000..0cc48cb3
--- /dev/null
+++ b/test/uti/mpi/011.c
@@ -0,0 +1,220 @@
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <time.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <sys/mman.h>
+#include <mpi.h>
+#include <unistd.h>
+#include <sys/syscall.h>   /* For SYS_xxx definitions */
+#include <sched.h>
+
+//#define DEBUG
+#ifdef DEBUG
+#define dprintf printf
+#else
+#define dprintf {}
+#endif
+
+#define SZENTRY_DEFAULT (65536) /* Size of one slot */
+#define NENTRY_DEFAULT 10000 /* Number of slots */
+
+#define DIFFNSEC(end, start) (((end).tv_sec - (start).tv_sec) * 1000000000UL + ((end).tv_nsec - (start).tv_nsec)) /* nanoseconds from start to end (struct timespec values) */
+
+/* Print the 39th space-separated field of /proc/<pid>/task/<tid>/stat (as "stat-cpu")
+ * next to the sched_getcpu() value. Returns 0 on success, -1 on any failure. */
+static int print_cpu_last_executed_on() {
+	enum { STAT_BUFSZ = 65536, STAT_CPU_FIELD = 39 };
+	char fn[256];
+	char *result = NULL; /* NULL so the shared free() below is safe on early failure */
+	char *next_delim, *field = NULL;
+	const char *rank_str = getenv("PMI_RANK");
+	pid_t tid = syscall(SYS_gettid);
+	int fd = -1, offset = 0, amount, i, cpu, mpi_errno = 0;
+
+	snprintf(fn, sizeof(fn), "/proc/%d/task/%d/stat", getpid(), (int)tid);
+	fd = open(fn, O_RDONLY);
+	if (fd == -1) {
+		printf("open() failed\n");
+		goto fn_fail;
+	}
+
+	result = malloc(STAT_BUFSZ);
+	if (result == NULL) {
+		printf("malloc() failed");
+		goto fn_fail;
+	}
+
+	/* Read up to EOF without overrunning the buffer; one byte is kept for the NUL */
+	while (offset < STAT_BUFSZ - 1) {
+		amount = read(fd, result + offset, STAT_BUFSZ - 1 - offset);
+		if (amount == -1) {
+			printf("read() failed");
+			goto fn_fail;
+		}
+		if (amount == 0) { break; }
+		offset += amount;
+	}
+	result[offset] = '\0'; /* strsep() requires a terminated string */
+
+	next_delim = result;
+	for (i = 0; i < STAT_CPU_FIELD; i++) {
+		field = strsep(&next_delim, " ");
+	}
+	if (field == NULL) {
+		printf("stat has fewer than %d fields\n", STAT_CPU_FIELD);
+		goto fn_fail;
+	}
+
+	cpu = sched_getcpu();
+	if (cpu == -1) {
+		printf("sched_getcpu() failed\n");
+		goto fn_fail;
+	}
+
+	printf("compute thread,pmi_rank=%02d,stat-cpu=%02d,sched_getcpu=%02d,tid=%d\n", rank_str ? atoi(rank_str) : -1, atoi(field), cpu, tid); fflush(stdout);
+ fn_exit:
+	if (fd != -1) { close(fd); }
+	free(result);
+	return mpi_errno;
+ fn_fail:
+	mpi_errno = -1;
+	goto fn_exit;
+}
+
+/* Post nentry nonblocking transfers of szentry bytes each, then wait for all of them.
+ * Rank 1 sends to dest; every other rank receives from src and sleeps usec microseconds
+ * between posting the receives and waiting. Progress marks go to stdout. */
+void sendrecv(int rank, int nentry, char **sendv, char **recvv, int szentry, int src, int dest, MPI_Request* reqs, MPI_Status* status, double usec) {
+	const int sender = (rank == 1);
+	const int stride = (nentry > 10) ? nentry / 10 : 0; /* 0 disables the progress marks */
+	int k;
+	for (k = 0; k < nentry; k++) {
+		if (sender) {
+			MPI_Isend(sendv[k], szentry, MPI_CHAR, dest, 0, MPI_COMM_WORLD, &reqs[k]);
+		} else {
+			MPI_Irecv(recvv[k], szentry, MPI_CHAR, src, 0, MPI_COMM_WORLD, &reqs[k]);
+		}
+		if (stride && k % stride == 0) {
+			fputs(sender ? "s" : "r", stdout); fflush(stdout);
+		}
+	}
+	if (!sender) {
+		usleep(usec);
+	}
+	MPI_Waitall(nentry, reqs, status);
+	fputs(sender ? "w\n" : "W\n", stdout); fflush(stdout);
+}
+
+/* Times sendrecv() twice: first with no delay, then with the receiver sleeping for the first
+ * measured duration. Usage: 011 [szentry nentry]. Returns 0, or 1 if an allocation fails. */
+int main(int argc, char **argv) {
+	int my_rank = -1, size = -1;
+	int i, actual, src, dest, ret = 0;
+	char **sendv, **recvv;
+	MPI_Status* status;
+	MPI_Request* reqs;
+	long szentry, nentry;
+	struct timespec start, end;
+	double diffusec = 0; /* measured on rank 0 only; zeroed so other ranks pass a defined value */
+
+	if(argc == 3) {
+		szentry = atoi(argv[1]);
+		nentry = atoi(argv[2]);
+	} else {
+		szentry = SZENTRY_DEFAULT;
+		nentry = NENTRY_DEFAULT;
+	}
+	printf("szentry=%ld,nentry=%ld\n", szentry, nentry);
+
+	MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &actual);
+	printf("Thread support level is %d\n", actual);
+
+	status = malloc(sizeof(MPI_Status) * nentry);
+	reqs = malloc(sizeof(MPI_Request) * nentry);
+	if(!status || !reqs) { printf("malloc failed"); goto fn_fail; } /* checked after MPI init so fn_fail may finalize */
+
+	MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
+
+	src = (size + my_rank - 1) % size;
+	dest = (my_rank + 1) % size;
+
+	printf("rank=%d, size=%d, src=%d, dest=%d\n", my_rank, size, src, dest);
+
+	sendv = malloc(sizeof(char *) * nentry);
+	if(!sendv) { printf("malloc failed"); goto fn_fail; }
+	for (i = 0; i < nentry; i++) {
+#if 0
+		int fd;
+		fd = open("./file", O_RDWR);
+		if(fd == -1) { printf("open failed\n"); goto fn_fail; }
+		sendv[i] = (char*)mmap(0, szentry, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
+#else
+		sendv[i] = (char*)mmap(0, szentry, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+#endif
+		if(sendv[i] == MAP_FAILED) { printf("mmap failed"); goto fn_fail; }
+		dprintf("[%d] sendv[%d]=%p\n", my_rank, i, sendv[i]);
+		memset(sendv[i], 0xaa, szentry);
+	}
+
+	recvv = malloc(sizeof(char *) * nentry);
+	if(!recvv) { printf("malloc failed"); goto fn_fail; }
+	for (i = 0; i < nentry; i++) {
+#if 0
+		int fd;
+		fd = open("./file", O_RDWR);
+		if(fd == -1) { printf("open failed\n"); goto fn_fail; }
+		recvv[i] = (char*)mmap(0, szentry, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
+#else
+		recvv[i] = (char*)mmap(0, szentry, PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+#endif
+		if(recvv[i] == MAP_FAILED) { printf("mmap failed"); goto fn_fail; }
+		dprintf("[%d] recvv[%d]=%p\n", my_rank, i, recvv[i]);
+		memset(recvv[i], 0, szentry);
+	}
+
+	printf("after memset\n");
+
+	print_cpu_last_executed_on();
+
+#pragma omp parallel for
+	for (i = 0; i < omp_get_num_threads(); i++) {
+		printf("thread_num=%d,tid=%d\n", i, (int)syscall(SYS_gettid)); /* syscall() returns long; %d needs int */
+	}
+
+	for (i = 0; i < 1; i++) {
+		MPI_Barrier(MPI_COMM_WORLD);
+		if(my_rank == 0) {
+			clock_gettime(CLOCK_REALTIME, &start);
+		}
+		sendrecv(my_rank, nentry, sendv, recvv, szentry, src, dest, reqs, status, 0);
+		MPI_Barrier(MPI_COMM_WORLD);
+		if(my_rank == 0) {
+			clock_gettime(CLOCK_REALTIME, &end);
+			diffusec = DIFFNSEC(end, start) / (double)1000;
+			printf("%4.4f sec\n", DIFFNSEC(end, start) / (double)1000000000); fflush(stdout);
+		}
+
+		MPI_Barrier(MPI_COMM_WORLD);
+		if(my_rank == 0) {
+			clock_gettime(CLOCK_REALTIME, &start);
+		}
+		sendrecv(my_rank, nentry, sendv, recvv, szentry, src, dest, reqs, status, diffusec);
+		MPI_Barrier(MPI_COMM_WORLD);
+		if(my_rank == 0) {
+			clock_gettime(CLOCK_REALTIME, &end);
+			printf("%4.4f sec\n", DIFFNSEC(end, start) / (double)1000000000); fflush(stdout);
+		}
+	}
+
+ fn_exit:
+	MPI_Finalize();
+	return ret;
+ fn_fail:
+	ret = 1;
+	goto fn_exit;
+}
diff --git a/test/uti/mpi/012.c b/test/uti/mpi/012.c
new file mode 100755
index 00000000..9510de5e
--- /dev/null
+++ b/test/uti/mpi/012.c
@@ -0,0 +1,338 @@
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <time.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <sys/mman.h>
+#include <mpi.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <sys/syscall.h>   /* For SYS_xxx definitions */
+#include <sched.h>
+
+//#define DEBUG
+#ifdef DEBUG
+#define dprintf printf
+#else
+#define dprintf {}
+#endif
+
+#define DIFFNSEC(end, start) (((end).tv_sec - (start).tv_sec) * 1000000000UL + ((end).tv_nsec - (start).tv_nsec)) /* nanoseconds from start to end (struct timespec values) */
+
+#define BEGIN_EPOCH(win) do { MPI_Win_lock_all(0, win); } while(0)
+#define END_EPOCH(win) do { MPI_Win_unlock_all(win); } while(0)
+#define FLUSH(win) do { MPI_Win_flush_local_all(win); } while(0)
+
+
+static inline void fixed_size_work() { /* one fixed unit of busy work: count a register up to 100 */
+	asm volatile(
+	    "movq $0, %%rcx\n\t"       /* rcx = 0 */
+		"1:\t"
+		"addq $1, %%rcx\n\t"       /* rcx += 1 */
+		"cmpq $99, %%rcx\n\t"
+		"jle 1b\n\t"               /* loop while rcx <= 99, i.e. 100 additions in total */
+		:
+		: 
+		: "rcx", "cc");            /* clobber list: rcx and the condition codes */
+}
+
+static inline void bulk_fsw(unsigned long n) { /* run fixed_size_work() n times */
+	unsigned long j; /* same type as n: an int counter would overflow for n > INT_MAX */
+	for (j = 0; j < n; j++) {
+		fixed_size_work();
+	}
+}
+
+double nspw; /* nsec per work */
+unsigned long nsec;
+
+void fwq_init() {
+	struct timespec start, end;
+	/* Calibration: time N_INIT units of fixed work on this thread's CPU-time clock */
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
+#define N_INIT 10000000
+	bulk_fsw(N_INIT);
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
+	nsec = DIFFNSEC(end, start); /* nanoseconds spent in the whole calibration run */
+	nspw = nsec / (double)N_INIT; /* average nanoseconds per unit of work */
+}
+
+#if 0
+void fwq(long delay_nsec) {
+	if (delay_nsec < 0) { 
+        return;
+		//printf("%s: delay_nsec < 0\n", __FUNCTION__);
+	}
+	bulk_fsw(delay_nsec / nspw);
+}
+#else /* For machines with large core-to-core performance variation (e.g. OFP) */
+/* Busy-wait until at least delay_nsec nanoseconds of this thread's CPU time have
+ * elapsed; a negative delay returns immediately. */
+void fwq(long delay_nsec) {
+	struct timespec t0, now;
+
+	if (delay_nsec < 0) {
+		return;
+	}
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t0);
+	for (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &now); DIFFNSEC(now, t0) < delay_nsec;
+	     clock_gettime(CLOCK_THREAD_CPUTIME_ID, &now)) {
+		bulk_fsw(2); /* ~150 ns per iteration on OFP */
+	}
+}
+#endif
+
+
+/* Print the 39th space-separated field of /proc/<pid>/task/<tid>/stat (as "stat-cpu")
+ * next to the sched_getcpu() value. Returns 0 on success, -1 on any failure. */
+static int print_cpu_last_executed_on() {
+	enum { STAT_BUFSZ = 65536, STAT_CPU_FIELD = 39 };
+	char fn[256];
+	char *result = NULL; /* NULL so the shared free() below is safe on early failure */
+	char *next_delim, *field = NULL;
+	const char *rank_str = getenv("PMI_RANK");
+	pid_t tid = syscall(SYS_gettid);
+	int fd = -1, offset = 0, amount, i, cpu, mpi_errno = 0;
+
+	snprintf(fn, sizeof(fn), "/proc/%d/task/%d/stat", getpid(), (int)tid);
+	fd = open(fn, O_RDONLY);
+	if (fd == -1) {
+		printf("open() failed\n");
+		goto fn_fail;
+	}
+
+	result = malloc(STAT_BUFSZ);
+	if (result == NULL) {
+		printf("malloc() failed");
+		goto fn_fail;
+	}
+
+	/* Read up to EOF without overrunning the buffer; one byte is kept for the NUL */
+	while (offset < STAT_BUFSZ - 1) {
+		amount = read(fd, result + offset, STAT_BUFSZ - 1 - offset);
+		if (amount == -1) {
+			printf("read() failed");
+			goto fn_fail;
+		}
+		if (amount == 0) { break; }
+		offset += amount;
+	}
+	result[offset] = '\0'; /* strsep() requires a terminated string */
+
+	next_delim = result;
+	for (i = 0; i < STAT_CPU_FIELD; i++) {
+		field = strsep(&next_delim, " ");
+	}
+	if (field == NULL) {
+		printf("stat has fewer than %d fields\n", STAT_CPU_FIELD);
+		goto fn_fail;
+	}
+
+	cpu = sched_getcpu();
+	if (cpu == -1) {
+		printf("sched_getcpu() failed\n");
+		goto fn_fail;
+	}
+
+	printf("compute thread,pmi_rank=%02d,stat-cpu=%02d,sched_getcpu=%02d,tid=%d\n", rank_str ? atoi(rank_str) : -1, atoi(field), cpu, tid); fflush(stdout);
+ fn_exit:
+	if (fd != -1) { close(fd); }
+	free(result);
+	return mpi_errno;
+ fn_fail:
+	mpi_errno = -1;
+	goto fn_exit;
+}
+/* Nonzero when ranks me and you fall in the same group of ppn consecutive ranks. */
+static inline int on_same_node(int ppn, int me, int you) {
+	int my_node = me / ppn, your_node = you / ppn; return my_node == your_node;
+}
+
+/* get_acc-calc-flush_local: issue one MPI_Get_accumulate(MPI_SUM) per double to every rank
+ * for which on_same_node() is false, spin for calc_nsec via fwq(), then FLUSH the window. */
+void rma(int nproc, int ppn, int rank, double *wbuf, double *rbuf, double *result, int ndoubles, MPI_Win win, long calc_nsec) {
+	int i, j;
+	for (i = 0; i < nproc; i++) {
+		if (on_same_node(ppn, rank, i)) {
+			continue; /* only ranks outside this rank's group are targeted */
+		}
+		for (j = 0; j < ndoubles; j++) {
+			int idx = i * ndoubles + j; /* offset into rbuf/result, also used as target displacement */
+			MPI_Get_accumulate(rbuf + idx, 1, MPI_DOUBLE,
+							   result + idx, 1, MPI_DOUBLE,
+							   i, idx, 1, MPI_DOUBLE,
+							   MPI_SUM, win);
+		}
+	}
+	fwq(calc_nsec);
+	FLUSH(win);
+}
+
+static struct option options[] = {
+	{
+		.name =		"ppn",
+		.has_arg =	required_argument,
+		.flag =		NULL,
+		.val =		'P',
+	},
+	/* end */
+	{ NULL, 0, NULL, 0, },
+};
+/* RMA overlap benchmark. Options: -d <log2 of doubles per rank>, --ppn/-P <ranks per node>.
+ * Rank 0 prints flush, get_acc+flush and get_acc+calc+flush times. Returns 0, or 1 on failure. */
+int main(int argc, char **argv) {
+	int rc, ret = 0;
+	int actual, opt;
+	int ppn = -1;
+	int nproc;
+	int ndoubles = -1;
+	int my_rank = -1;
+	int i, j;
+	double *wbuf = NULL, *rbuf = NULL, *result = NULL; /* NULL so fn_exit can free() unconditionally */
+	MPI_Win win;
+	struct timespec start, end;
+	long t_flush_l, t_pure_l, t_overall_l;
+	long t_flush, t_pure, t_overall;
+
+	fwq_init();
+
+	while ((opt = getopt_long(argc, argv, "+d:P:", options, NULL)) != -1) {
+		switch (opt) {
+			case 'd':
+				ndoubles = (1ULL << atoi(optarg));
+				break;
+			case 'P':
+				ppn = atoi(optarg);
+				break;
+			default: /* '?' */
+				printf("unknown option %c\n", optopt);
+				exit(1);
+		}
+	}
+
+	if (ndoubles == -1 || ppn == -1) {
+		printf("specify ndoubles with -d and ppn with --ppn");
+		exit(1);
+	}
+
+	MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &actual);
+	if (actual != MPI_THREAD_MULTIPLE) {
+		printf("ERROR: MPI_THREAD_MULTIPLE not available (level was set to %d)\n", actual);
+		goto fn_fail;
+	}
+
+	MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &nproc);
+
+	if (my_rank == 0) {
+		printf("ndoubles=%d,nproc=%d\n", ndoubles, nproc);
+		printf("nsec=%lu, nspw=%f\n", nsec, nspw);
+	}
+
+	/* accumulate-to buffer */
+	wbuf = malloc(sizeof(double) * ndoubles * nproc);
+	if(!wbuf) { printf("malloc failed"); goto fn_fail; }
+	memset(wbuf, 0, sizeof(double) * ndoubles * nproc);
+
+	/* read-from buffer */
+	rbuf = malloc(sizeof(double) * ndoubles * nproc);
+	if(!rbuf) { printf("malloc failed"); goto fn_fail; }
+	memset(rbuf, 0, sizeof(double) * ndoubles * nproc);
+
+	/* fetch-to buffer */
+	result = malloc(sizeof(double) * ndoubles * nproc);
+	if(!result) { printf("malloc failed"); goto fn_fail; }
+	memset(result, 0, sizeof(double) * ndoubles * nproc);
+
+	/* Expose accumulate-to buffer*/
+	rc = MPI_Win_create(wbuf, sizeof(double) * ndoubles * nproc, sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &win);
+	if (rc != MPI_SUCCESS) {
+		printf("MPI_Win_create failed,rc=%d\n", rc);
+		goto fn_fail;
+	}
+
+	for (i = 0; i < nproc; i++) {
+		for (j = 0; j < ndoubles; j++) {
+			wbuf[i * ndoubles + j] = (i + 1) * 1000 + (j + 1);
+			rbuf[i * ndoubles + j] = (i + 1) * 10000 + (j + 1);
+			result[i * ndoubles + j] = (i + 1) * 100000 + (j + 1);
+		}
+	}
+
+#if 0
+	for (i = 0; i < nproc; i++) {
+		for (j = 0; j < ndoubles; j++) {
+			printf("wbuf,proc=%d,j=%d,val=%f\n", i, j, wbuf[i * ndoubles + j]);
+			printf("rbuf,proc=%d,j=%d,val=%f\n", i, j, rbuf[i * ndoubles + j]);
+			printf("result,proc=%d,j=%d,val=%f\n", i, j, result[i * ndoubles + j]);
+		}
+	}
+#endif
+	/* Measure flush time */
+	MPI_Barrier(MPI_COMM_WORLD);
+#define NFENCE 10
+	BEGIN_EPOCH(win);
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
+	for (i = 0; i < NFENCE; i++) {
+		FLUSH(win);
+	}
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
+	END_EPOCH(win);
+	t_flush_l = DIFFNSEC(end, start) / NFENCE;
+	MPI_Allreduce(&t_flush_l, &t_flush, 1, MPI_LONG, MPI_MAX, MPI_COMM_WORLD);
+	if (my_rank == 0) printf("t_flush (max): %ld usec\n", t_flush / 1000);
+
+	/* Measure get_acc-flush time */
+	MPI_Barrier(MPI_COMM_WORLD);
+#define NPURE 10
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
+	for (i = 0; i < NPURE; i++) {
+		BEGIN_EPOCH(win);
+		rma(nproc, ppn, my_rank, wbuf, rbuf, result, ndoubles, win, 0);
+		END_EPOCH(win);
+	}
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
+	t_pure_l = DIFFNSEC(end, start) / NPURE;
+	MPI_Allreduce(&t_pure_l, &t_pure, 1, MPI_LONG, MPI_MAX, MPI_COMM_WORLD);
+	if (my_rank == 0) printf("t_pure (max): %ld usec\n", t_pure / 1000);
+
+#if 0
+	for (i = 0; i < nproc; i++) {
+		for (j = 0; j < ndoubles; j++) {
+			printf("wbuf,proc=%d,j=%d,val=%f\n", i, j, wbuf[i * ndoubles + j]);
+			printf("rbuf,proc=%d,j=%d,val=%f\n", i, j, rbuf[i * ndoubles + j]);
+			printf("result,proc=%d,j=%d,val=%f\n", i, j, result[i * ndoubles + j]);
+		}
+	}
+#endif
+
+	/* Measure get_acc-calc-flush time; the calc phase lasts t_pure - t_flush nanoseconds */
+	MPI_Barrier(MPI_COMM_WORLD);
+#define NOVERALL 10
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
+	for (i = 0; i < NOVERALL; i++) {
+		BEGIN_EPOCH(win);
+		rma(nproc, ppn, my_rank, wbuf, rbuf, result, ndoubles, win, t_pure - t_flush);
+		END_EPOCH(win);
+	}
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
+	t_overall_l = DIFFNSEC(end, start) / NOVERALL;
+	MPI_Allreduce(&t_overall_l, &t_overall, 1, MPI_LONG, MPI_MAX, MPI_COMM_WORLD);
+	if (my_rank == 0) {
+		long t_abs = (t_pure * 2) - t_overall;
+		printf("t_overall (max): %ld usec\n", t_overall / 1000);
+		printf("overlap: %.2f %%\n", (t_abs * 100) / (double)t_pure);
+	}
+
+	MPI_Win_free(&win); /* normal path only: the window exists and every rank reaches this call */
+ fn_exit:
+	free(result); free(rbuf); free(wbuf);
+	MPI_Finalize();
+	return ret;
+ fn_fail:
+	ret = 1;
+	goto fn_exit;
+}
diff --git a/test/uti/mpi/012.sh b/test/uti/mpi/012.sh
new file mode 100755
index 00000000..e5ff9bb6
--- /dev/null
+++ b/test/uti/mpi/012.sh
@@ -0,0 +1,174 @@
+#!/usr/bin/bash
+
+#!/usr/bin/bash -x
+
+MYHOME=/work/gg10/e29005
+UTI_MPI_TOP=${MYHOME}/project/os/mckernel/test/uti/mpi
+
+MCK=${MYHOME}/project/os/install
+unset DISABLE_UTI
+
+stop=0
+reboot=0
+go=0
+
+async=0
+mck=0
+nnodes=2
+LASTNODE=8200
+ndoubles=10 #12-15
+omp_num_threads=1
+ppn=1 #16
+async_progress_pin=64,132,200,268,65,133,201,269,66,134,202,270,67,135,203,271
+lpp=4 # logical-per-physical
+ncpu_mt=256 # number of CPUs for main-thread
+
+while getopts srga:c:n:mdl:N:P:o: OPT
+do
+        case ${OPT} in
+            s) stop=1
+                ;;
+            r) reboot=1
+                ;;
+	    g) go=1
+		;;
+	    a) async=$OPTARG
+		;;
+	    n) ndoubles=$OPTARG
+		;;
+            m) mck=1
+                ;;
+            d) export DISABLE_UTI=1
+                ;;
+	    N) nnodes=$OPTARG
+		;;
+	    P) ppn=$OPTARG
+		;;
+	    o) omp_num_threads=$OPTARG
+		;;
+            *) echo "invalid option -${OPT}" >&2
+                exit 1
+        esac
+done
+
+nprocs=$((ppn * nnodes))
+nodes=`echo $(seq -s ",c" $(($LASTNODE + 1 - $nnodes)) $LASTNODE) | sed 's/^/c/'`
+echo nprocs=$nprocs nnodes=$nnodes ppn=$ppn nodes=$nodes
+
+if [ ${mck} -eq 1 ]; then
+    # mck_dir, uti_thread_rank and use_hfi are not assigned anywhere in this script:
+    # honor them when exported by the caller, otherwise fall back to $MCK / no option / 0.
+    mcexec="${mck_dir:-${MCK}}/bin/mcexec"
+    nmcexecthr=$((omp_num_threads + 4))
+    mcexecopt="${uti_thread_rank:+--uti-thread-rank=$uti_thread_rank}"
+    if [ "${use_hfi:-0}" -eq 1 ]; then mcexecopt="--enable-hfi1 $mcexecopt"; fi
+    mcexecopt="-n $ppn -t $nmcexecthr $mcexecopt"
+else
+    mcexec=
+    mcexecopt=
+fi
+
+if [ ${mck} -eq 1 ]; then
+    i_mpi_pin=off
+    i_mpi_pin_domain=
+    i_mpi_pin_order=
+else
+    # Let each domain have all logical cores and use KMP_AFFINITY=scatter if you want to use only physical cores
+    i_mpi_pin=on
+    if [ $((omp_num_threads * lpp * ppn)) -le $ncpu_mt ]; then
+	domain=$((omp_num_threads * lpp)) # Prefer physical but adjacent physicals share L1
+    else
+	domain=$((ncpu_mt / ppn)) # Use logical as well
+    fi 
+    i_mpi_pin_domain="export I_MPI_PIN_DOMAIN=$domain"
+    i_mpi_pin_order="export I_MPI_PIN_ORDER=compact"
+fi
+
+if [ $async -eq 0 ] || [ "$async_progress_pin" == "" ] ; then
+    i_mpi_async_progress_pin=
+else
+    i_mpi_async_progress_pin="export I_MPI_ASYNC_PROGRESS_PIN=$async_progress_pin"
+fi
+
+if [ ${stop} -eq 1 ]; then
+    PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes sudo mount /work
+
+    if [ ${mck} -eq 1 ]; then
+	    sudo ${MCK}/sbin/mcstop+release.sh
+    else
+	:
+    fi
+fi
+
+if [ ${reboot} -eq 1 ]; then
+    PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes sudo mount /work
+
+    if [ ${mck} -eq 1 ]; then
+	if hostname  | grep ofp &>/dev/null; then
+	    sudo ${MCK}/sbin/mcreboot.sh -s -c 2-17,70-85,138-153,206-221,20-35,88-103,156-171,224-239,36-51,104-119,172-187,240-255,52-67,120-135,188-203,256-271 -r 2-5,70-73,138-141,206-209:0+6-9,74-77,142-145,210-213:1+10-13,78-81,146-149,214-217:68+14-17,82-85,150-153,218-221:69+20-23,88-91,156-159,224-227:136+24-27,92-95,160-163,228-231:137+28-31,96-99,164-167,232-235:204+32-35,100-103,168-171,236-239:205+36-39,104-107,172-175,240-243:18+40-43,108-111,176-179,244-247:19+44-47,112-115,180-183,248-251:86+48-51,116-119,184-187,252-255:87+52-55,120-123,188-191,256-259:154+56-59,124-127,192-195,260-263:155+60-63,128-131,196-199,264-267:222+64-67,132-135,200-203,268-271:223 -m 32G@0,12G@1
+	else
+	    sudo ${MCK}/sbin/mcreboot.sh -s -c 1-15,65-79,129-143,193-207,17-31,81-95,145-159,209-223,33-47,97-111,161-175,225-239,49-63,113-127,177-191,241-255 -r 1-15:0+65-79:64+129-143:128+193-207:192+17-31:16+81-95:80+145-159:144+209-223:208+33-47:32+97-111:96+161-175:160+225-239:224+49-63:48+113-127:112+177-191:176+241-255:240 -m 12G@0,12G@1,12G@2,12G@3,3920M@4,3920M@5,3920M@6,3920M@7
+	fi
+    else
+	:
+    fi
+fi
+
+cd ${UTI_MPI_TOP}
+(
+cat <<EOF
+#!/bin/sh
+
+export I_MPI_HYDRA_BOOTSTRAP_EXEC=/usr/bin/ssh
+export I_MPI_HYDRA_BOOTSTRAP=ssh
+
+export OMP_NUM_THREADS=$omp_num_threads
+#export OMP_STACKSIZE=64M
+export KMP_BLOCKTIME=1
+export PSM2_RCVTHREAD=0
+
+export I_MPI_PIN=$i_mpi_pin
+$i_mpi_pin_domain
+$i_mpi_pin_order
+
+export HFI_NO_CPUAFFINITY=1
+export I_MPI_COLL_INTRANODE_SHM_THRESHOLD=4194304
+export I_MPI_FABRICS=shm:tmi
+export PSM2_RCVTHREAD=0
+export I_MPI_TMI_PROVIDER=psm2
+export I_MPI_FALLBACK=0
+export PSM2_MQ_RNDV_HFI_WINDOW=4194304
+export PSM2_MQ_EAGER_SDMA_SZ=65536
+export PSM2_MQ_RNDV_HFI_THRESH=200000
+
+export MCKERNEL_RLIMIT_STACK=32M,16G
+export KMP_STACKSIZE=64m
+export KMP_AFFINITY=granularity=thread,scatter
+#export KMP_HW_SUBSET=64c,1t
+
+export I_MPI_ASYNC_PROGRESS=$async
+$i_mpi_async_progress_pin
+
+#export I_MPI_STATS=native:20,ipm
+export I_MPI_STATS=ipm
+export I_MPI_DEBUG=4
+#export I_MPI_HYDRA_DEBUG=on
+
+mpiexec.hydra -l -n $nprocs -ppn $ppn -hosts $nodes $ilpopt $mcexec $mcexecopt `pwd`/012 --ppn $ppn -d $ndoubles
+
+EOF
+) > ./job.sh
+chmod u+x ./job.sh
+
+if [ ${go} -eq 1 ]; then
+    cd ${UTI_MPI_TOP}
+    if [ $mck -eq 1 ]; then
+	make clean && make 012
+    else
+	make clean && make CC=mpiicc 012
+    fi
+    ./job.sh
+fi
+
+
+
diff --git a/test/uti/mpi/013.c b/test/uti/mpi/013.c
new file mode 100755
index 00000000..0f3bc2b1
--- /dev/null
+++ b/test/uti/mpi/013.c
@@ -0,0 +1,335 @@
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <time.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <sys/mman.h>
+#include <mpi.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <sys/syscall.h>   /* For SYS_xxx definitions */
+#include <sched.h>
+
+//#define DEBUG
+#ifdef DEBUG
+#define dprintf printf
+#else
+#define dprintf {}
+#endif
+
+#define DIFFNSEC(end, start) (((end).tv_sec - (start).tv_sec) * 1000000000UL + ((end).tv_nsec - (start).tv_nsec)) /* nanoseconds from start to end (struct timespec values) */
+
+#define BEGIN_EPOCH(win) do { MPI_Win_lock_all(0, win); } while(0)
+#define END_EPOCH(win) do { MPI_Win_unlock_all(win); } while(0)
+
+
+static inline void fixed_size_work() { /* one fixed unit of busy work: count a register up to 100 */
+	asm volatile(
+	    "movq $0, %%rcx\n\t"       /* rcx = 0 */
+		"1:\t"
+		"addq $1, %%rcx\n\t"       /* rcx += 1 */
+		"cmpq $99, %%rcx\n\t"
+		"jle 1b\n\t"               /* loop while rcx <= 99, i.e. 100 additions in total */
+		:
+		: 
+		: "rcx", "cc");            /* clobber list: rcx and the condition codes */
+}
+
+static inline void bulk_fsw(unsigned long n) { /* run fixed_size_work() n times */
+	unsigned long j; /* same type as n: an int counter would overflow for n > INT_MAX */
+	for (j = 0; j < n; j++) {
+		fixed_size_work();
+	}
+}
+
+double nspw; /* nsec per work */
+unsigned long nsec;
+
+void fwq_init() {
+	struct timespec start, end;
+	/* Calibration: time N_INIT units of fixed work on this thread's CPU-time clock */
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
+#define N_INIT 10000000
+	bulk_fsw(N_INIT);
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
+	nsec = DIFFNSEC(end, start); /* nanoseconds spent in the whole calibration run */
+	nspw = nsec / (double)N_INIT; /* average nanoseconds per unit of work */
+}
+
+#if 0
+void fwq(long delay_nsec) {
+	if (delay_nsec < 0) { 
+        return;
+		//printf("%s: delay_nsec < 0\n", __FUNCTION__);
+	}
+	bulk_fsw(delay_nsec / nspw);
+}
+#else /* For machines with large core-to-core performance variation (e.g. OFP) */
+/* Busy-wait until at least delay_nsec nanoseconds of this thread's CPU time have
+ * elapsed; a negative delay returns immediately. */
+void fwq(long delay_nsec) {
+	struct timespec t0, now;
+
+	if (delay_nsec < 0) {
+		return;
+	}
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t0);
+	for (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &now); DIFFNSEC(now, t0) < delay_nsec;
+	     clock_gettime(CLOCK_THREAD_CPUTIME_ID, &now)) {
+		bulk_fsw(2); /* ~150 ns per iteration on OFP */
+	}
+}
+#endif
+
+
+/* Print the 39th space-separated field of /proc/<pid>/task/<tid>/stat (as "stat-cpu")
+ * next to the sched_getcpu() value. Returns 0 on success, -1 on any failure. */
+static int print_cpu_last_executed_on() {
+	enum { STAT_BUFSZ = 65536, STAT_CPU_FIELD = 39 };
+	char fn[256];
+	char *result = NULL; /* NULL so the shared free() below is safe on early failure */
+	char *next_delim, *field = NULL;
+	const char *rank_str = getenv("PMI_RANK");
+	pid_t tid = syscall(SYS_gettid);
+	int fd = -1, offset = 0, amount, i, cpu, mpi_errno = 0;
+
+	snprintf(fn, sizeof(fn), "/proc/%d/task/%d/stat", getpid(), (int)tid);
+	fd = open(fn, O_RDONLY);
+	if (fd == -1) {
+		printf("open() failed\n");
+		goto fn_fail;
+	}
+
+	result = malloc(STAT_BUFSZ);
+	if (result == NULL) {
+		printf("malloc() failed");
+		goto fn_fail;
+	}
+
+	/* Read up to EOF without overrunning the buffer; one byte is kept for the NUL */
+	while (offset < STAT_BUFSZ - 1) {
+		amount = read(fd, result + offset, STAT_BUFSZ - 1 - offset);
+		if (amount == -1) {
+			printf("read() failed");
+			goto fn_fail;
+		}
+		if (amount == 0) { break; }
+		offset += amount;
+	}
+	result[offset] = '\0'; /* strsep() requires a terminated string */
+
+	next_delim = result;
+	for (i = 0; i < STAT_CPU_FIELD; i++) {
+		field = strsep(&next_delim, " ");
+	}
+	if (field == NULL) {
+		printf("stat has fewer than %d fields\n", STAT_CPU_FIELD);
+		goto fn_fail;
+	}
+
+	cpu = sched_getcpu();
+	if (cpu == -1) {
+		printf("sched_getcpu() failed\n");
+		goto fn_fail;
+	}
+
+	printf("compute thread,pmi_rank=%02d,stat-cpu=%02d,sched_getcpu=%02d,tid=%d\n", rank_str ? atoi(rank_str) : -1, atoi(field), cpu, tid); fflush(stdout);
+ fn_exit:
+	if (fd != -1) { close(fd); }
+	free(result);
+	return mpi_errno;
+ fn_fail:
+	mpi_errno = -1;
+	goto fn_exit;
+}
+/* Nonzero when ranks me and you fall in the same group of ppn consecutive ranks. */
+static inline int on_same_node(int ppn, int me, int you) {
+	int my_node = me / ppn, your_node = you / ppn; return my_node == your_node;
+}
+
+/* For every rank where on_same_node() is false: per double, MPI_Accumulate(MPI_SUM) (skipped
+ * when flush_only) followed by MPI_Win_flush_local(); finally spin for calc_nsec via fwq(). */
+void rma(int nproc, int ppn, int rank, double *wbuf, double *rbuf, int ndoubles, MPI_Win win, long calc_nsec, int flush_only) {
+	int i, j;
+	for (i = 0; i < nproc; i++) {
+		if (on_same_node(ppn, rank, i)) {
+			continue; /* only ranks outside this rank's group are targeted */
+		}
+		for (j = 0; j < ndoubles; j++) {
+			int idx = i * ndoubles + j; /* offset into rbuf, also used as target displacement */
+			if (!flush_only) {
+				MPI_Accumulate(rbuf + idx, 1, MPI_DOUBLE,
+							   i, idx, 1, MPI_DOUBLE,
+							   MPI_SUM, win);
+			}
+			MPI_Win_flush_local(i, win);
+		}
+	}
+	fwq(calc_nsec);
+}
+
+static struct option options[] = {
+	{
+		.name =		"ppn",
+		.has_arg =	required_argument,
+		.flag =		NULL,
+		.val =		'P',
+	},
+	/* end */
+	{ NULL, 0, NULL, 0, },
+};
+/* RMA overlap benchmark. Options: -d <log2 of doubles per rank>, --ppn/-P <ranks per node>.
+ * Rank 0 prints flush, accumulate+flush and accumulate+calc times. Returns 0, or 1 on failure. */
+int main(int argc, char **argv) {
+	int rc, ret = 0;
+	int actual, opt;
+	int ppn = -1;
+	int nproc;
+	int ndoubles = -1;
+	int my_rank = -1;
+	int i, j;
+	double *wbuf = NULL, *rbuf = NULL; /* NULL so fn_exit can free() unconditionally */
+	MPI_Win win;
+	struct timespec start, end;
+	long t_flush_l, t_pure_l, t_overall_l;
+	long t_flush, t_pure, t_overall;
+
+	fwq_init();
+
+	while ((opt = getopt_long(argc, argv, "+d:P:", options, NULL)) != -1) {
+		switch (opt) {
+			case 'd':
+				ndoubles = (1ULL << atoi(optarg));
+				break;
+			case 'P':
+				ppn = atoi(optarg);
+				break;
+			default: /* '?' */
+				printf("unknown option %c\n", optopt);
+				exit(1);
+		}
+	}
+
+	if (ndoubles == -1 || ppn == -1) {
+		printf("specify ndoubles with -d and ppn with --ppn");
+		exit(1);
+	}
+
+	MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &actual);
+	if (actual != MPI_THREAD_MULTIPLE) {
+		printf("ERROR: MPI_THREAD_MULTIPLE not available (level was set to %d)\n", actual);
+		goto fn_fail;
+	}
+
+	MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &nproc);
+
+	if (my_rank == 0) {
+		printf("ndoubles=%d,nproc=%d\n", ndoubles, nproc);
+		printf("nsec=%lu, nspw=%f\n", nsec, nspw);
+	}
+
+	/* accumulate-to buffer */
+	wbuf = malloc(sizeof(double) * ndoubles * nproc);
+	if(!wbuf) { printf("malloc failed"); goto fn_fail; }
+	memset(wbuf, 0, sizeof(double) * ndoubles * nproc);
+
+	/* read-from buffer */
+	rbuf = malloc(sizeof(double) * ndoubles * nproc);
+	if(!rbuf) { printf("malloc failed"); goto fn_fail; }
+	memset(rbuf, 0, sizeof(double) * ndoubles * nproc);
+
+	/* Expose accumulate-to buffer*/
+	rc = MPI_Win_create(wbuf, sizeof(double) * ndoubles * nproc, sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &win);
+	if (rc != MPI_SUCCESS) {
+		printf("MPI_Win_create failed,rc=%d\n", rc);
+		goto fn_fail;
+	}
+
+	for (i = 0; i < nproc; i++) {
+		for (j = 0; j < ndoubles; j++) {
+			wbuf[i * ndoubles + j] = (i + 1) * 1000 + (j + 1);
+			rbuf[i * ndoubles + j] = (i + 1) * 10000 + (j + 1);
+		}
+	}
+
+#if 0
+	for (i = 0; i < nproc; i++) {
+		for (j = 0; j < ndoubles; j++) {
+			printf("wbuf,proc=%d,j=%d,val=%f\n", i, j, wbuf[i * ndoubles + j]);
+			printf("rbuf,proc=%d,j=%d,val=%f\n", i, j, rbuf[i * ndoubles + j]);
+		}
+	}
+#endif
+
+	/* Measure flush time */
+	MPI_Barrier(MPI_COMM_WORLD);
+#define NFENCE 10
+	BEGIN_EPOCH(win);
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
+	for (i = 0; i < NFENCE; i++) {
+		rma(nproc, ppn, my_rank, wbuf, rbuf, ndoubles, win, 0, 1);
+	}
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
+	END_EPOCH(win);
+	t_flush_l = DIFFNSEC(end, start) / NFENCE;
+	MPI_Allreduce(&t_flush_l, &t_flush, 1, MPI_LONG, MPI_MAX, MPI_COMM_WORLD);
+	if (my_rank == 0) printf("t_flush (max): %ld usec\n", t_flush / 1000);
+
+	/* Measure accumulate-flush time */
+	MPI_Barrier(MPI_COMM_WORLD);
+#define NPURE 10
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
+	//MPI_Pcontrol(1, "rma");
+	for (i = 0; i < NPURE; i++) {
+		BEGIN_EPOCH(win);
+		rma(nproc, ppn, my_rank, wbuf, rbuf, ndoubles, win, 0, 0);
+		END_EPOCH(win);
+	}
+	//MPI_Pcontrol(-1, "rma");
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
+	t_pure_l = DIFFNSEC(end, start) / NPURE;
+	MPI_Allreduce(&t_pure_l, &t_pure, 1, MPI_LONG, MPI_MAX, MPI_COMM_WORLD);
+	if (my_rank == 0) printf("t_pure (max): %ld usec\n", t_pure / 1000);
+
+#if 0
+	for (i = 0; i < nproc; i++) {
+		for (j = 0; j < ndoubles; j++) {
+			printf("wbuf,proc=%d,j=%d,val=%f\n", i, j, wbuf[i * ndoubles + j]);
+			printf("rbuf,proc=%d,j=%d,val=%f\n", i, j, rbuf[i * ndoubles + j]);
+		}
+	}
+#endif
+
+	/* Measure accumulate-flush-calc time; the calc phase lasts t_pure - t_flush nanoseconds */
+	MPI_Barrier(MPI_COMM_WORLD);
+#define NOVERALL 10
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
+	//MPI_Pcontrol(1, "rma-calc");
+	for (i = 0; i < NOVERALL; i++) {
+		BEGIN_EPOCH(win);
+		rma(nproc, ppn, my_rank, wbuf, rbuf, ndoubles, win, t_pure - t_flush, 0);
+		END_EPOCH(win);
+	}
+	//MPI_Pcontrol(-1, "rma-calc");
+	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
+	t_overall_l = DIFFNSEC(end, start) / NOVERALL;
+	MPI_Allreduce(&t_overall_l, &t_overall, 1, MPI_LONG, MPI_MAX, MPI_COMM_WORLD);
+	if (my_rank == 0) {
+		long t_abs = (t_pure * 2) - t_overall;
+		printf("t_overall (max): %ld usec\n", t_overall / 1000);
+		printf("overlap: %.2f %%\n", (t_abs * 100) / (double)t_pure);
+	}
+
+	MPI_Win_free(&win); /* normal path only: the window exists and every rank reaches this call */
+ fn_exit:
+	free(rbuf); free(wbuf);
+	MPI_Finalize();
+	return ret;
+ fn_fail:
+	ret = 1;
+	goto fn_exit;
+}
diff --git a/test/uti/mpi/013.sh b/test/uti/mpi/013.sh
new file mode 100755
index 00000000..56edfe86
--- /dev/null
+++ b/test/uti/mpi/013.sh
@@ -0,0 +1,176 @@
+#!/usr/bin/bash
+
+#!/usr/bin/bash -x
+
+MYHOME=/work/gg10/e29005
+UTI_MPI_TOP=${MYHOME}/project/os/mckernel/test/uti/mpi
+
+MCK=${MYHOME}/project/os/install
+unset DISABLE_UTI
+
+stop=0
+reboot=0
+go=0
+
+async=0
+mck=0
+nnodes=4
+LASTNODE=8200
+ndoubles=10 #12-15
+omp_num_threads=1
+ppn=16 #16
+async_progress_pin=64,132,200,268,65,133,201,269,66,134,202,270,67,135,203,271
+lpp=4 # logical-per-physical
+ncpu_mt=256 # number of CPUs for main-thread
+exe=`basename $0 | sed 's/\.sh$//'`
+
+while getopts srga:c:n:mdl:N:P:o: OPT
+do
+        case ${OPT} in
+            s) stop=1
+                ;;
+            r) reboot=1
+                ;;
+	    g) go=1
+		;;
+	    a) async=$OPTARG
+		;;
+	    n) ndoubles=$OPTARG
+		;;
+            m) mck=1
+                ;;
+            d) export DISABLE_UTI=1
+                ;;
+	    N) nnodes=$OPTARG
+		;;
+	    P) ppn=$OPTARG
+		;;
+	    o) omp_num_threads=$OPTARG
+		;;
+            *) echo "invalid option -${OPT}" >&2
+                exit 1
+        esac
+done
+
+nprocs=$((ppn * nnodes))
+nodes=`echo $(seq -s ",c" $(($LASTNODE + 1 - $nnodes)) $LASTNODE) | sed 's/^/c/'`
+echo nprocs=$nprocs nnodes=$nnodes ppn=$ppn nodes=$nodes
+
+if [ ${mck} -eq 1 ]; then
+    mcexec="${mck_dir:-${MCK}}/bin/mcexec" # this script names the install prefix MCK; mck_dir is honored only if already set
+    nmcexecthr=$((omp_num_threads + 4))
+    mcexecopt="--uti-thread-rank=$uti_thread_rank" # NOTE(review): uti_thread_rank is never set in this script; presumably inherited from the environment
+    if [ "${use_hfi:-0}" -eq 1 ]; then # use_hfi has no default in this script; treat unset as 0 instead of a test error
+	mcexecopt="--enable-hfi1 $mcexecopt"
+    fi
+    mcexecopt="-n $ppn -t $nmcexecthr $mcexecopt"
+else
+    mcexec=
+    mcexecopt=
+fi
+
+if [ ${mck} -eq 1 ]; then
+    i_mpi_pin=off
+    i_mpi_pin_domain=
+    i_mpi_pin_order=
+else
+    # Let each domain have all logical cores and use KMP_AFFINITY=scatter if you want to use only physical cores
+    i_mpi_pin=on
+    if [ $((omp_num_threads * lpp * ppn)) -le $ncpu_mt ]; then
+	domain=$((omp_num_threads * lpp)) # Prefer physical but adjacent physicals share L1
+    else
+	domain=$((ncpu_mt / ppn)) # Use logical as well
+    fi 
+    i_mpi_pin_domain="export I_MPI_PIN_DOMAIN=$domain"
+    i_mpi_pin_order="export I_MPI_PIN_ORDER=compact"
+fi
+
+if [ $async -eq 0 ] || [ "$async_progress_pin" == "" ] ; then
+    i_mpi_async_progress_pin=
+else
+    i_mpi_async_progress_pin="export I_MPI_ASYNC_PROGRESS_PIN=$async_progress_pin"
+fi
+
+if [ ${stop} -eq 1 ]; then
+    
+    PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes sudo mount /work
+
+    if [ ${mck} -eq 1 ]; then
+	    sudo ${MCK}/sbin/mcstop+release.sh
+    else
+	:
+    fi
+fi
+
+if [ ${reboot} -eq 1 ]; then
+    PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes sudo mount /work
+
+    if [ ${mck} -eq 1 ]; then
+	if hostname | grep ofp &>/dev/null; then
+	    sudo ${MCK}/sbin/mcreboot.sh -s -c 2-17,70-85,138-153,206-221,20-35,88-103,156-171,224-239,36-51,104-119,172-187,240-255,52-67,120-135,188-203,256-271 -r 2-5,70-73,138-141,206-209:0+6-9,74-77,142-145,210-213:1+10-13,78-81,146-149,214-217:68+14-17,82-85,150-153,218-221:69+20-23,88-91,156-159,224-227:136+24-27,92-95,160-163,228-231:137+28-31,96-99,164-167,232-235:204+32-35,100-103,168-171,236-239:205+36-39,104-107,172-175,240-243:18+40-43,108-111,176-179,244-247:19+44-47,112-115,180-183,248-251:86+48-51,116-119,184-187,252-255:87+52-55,120-123,188-191,256-259:154+56-59,124-127,192-195,260-263:155+60-63,128-131,196-199,264-267:222+64-67,132-135,200-203,268-271:223 -m 32G@0,12G@1
+	else
+	    sudo ${MCK}/sbin/mcreboot.sh -s -c 1-15,65-79,129-143,193-207,17-31,81-95,145-159,209-223,33-47,97-111,161-175,225-239,49-63,113-127,177-191,241-255 -r 1-15:0+65-79:64+129-143:128+193-207:192+17-31:16+81-95:80+145-159:144+209-223:208+33-47:32+97-111:96+161-175:160+225-239:224+49-63:48+113-127:112+177-191:176+241-255:240 -m 12G@0,12G@1,12G@2,12G@3,3920M@4,3920M@5,3920M@6,3920M@7
+	fi
+    else
+	:
+    fi
+fi
+
+cd ${UTI_MPI_TOP}
+(
+cat <<EOF
+#!/bin/sh
+
+export I_MPI_HYDRA_BOOTSTRAP_EXEC=/usr/bin/ssh
+export I_MPI_HYDRA_BOOTSTRAP=ssh
+
+export OMP_NUM_THREADS=$omp_num_threads
+#export OMP_STACKSIZE=64M
+export KMP_BLOCKTIME=1
+export PSM2_RCVTHREAD=0
+
+export I_MPI_PIN=$i_mpi_pin
+$i_mpi_pin_domain
+$i_mpi_pin_order
+
+export HFI_NO_CPUAFFINITY=1
+export I_MPI_COLL_INTRANODE_SHM_THRESHOLD=4194304
+export I_MPI_FABRICS=shm:tmi
+export PSM2_RCVTHREAD=0
+export I_MPI_TMI_PROVIDER=psm2
+export I_MPI_FALLBACK=0
+export PSM2_MQ_RNDV_HFI_WINDOW=4194304
+export PSM2_MQ_EAGER_SDMA_SZ=65536
+export PSM2_MQ_RNDV_HFI_THRESH=200000
+
+export MCKERNEL_RLIMIT_STACK=32M,16G
+export KMP_STACKSIZE=64m
+export KMP_AFFINITY=granularity=thread,scatter
+#export KMP_HW_SUBSET=64c,1t
+
+export I_MPI_ASYNC_PROGRESS=$async
+$i_mpi_async_progress_pin
+
+#export I_MPI_STATS=native:20,ipm
+export I_MPI_STATS=ipm
+#export I_MPI_DEBUG=4
+#export I_MPI_HYDRA_DEBUG=on
+
+mpiexec.hydra -l -n $nprocs -ppn $ppn -hosts $nodes $ilpopt $mcexec $mcexecopt `pwd`/$exe --ppn $ppn -d $ndoubles
+
+EOF
+) > ./job.sh
+chmod u+x ./job.sh
+
+if [ ${go} -eq 1 ]; then
+    cd ${UTI_MPI_TOP}
+    if [ $mck -eq 1 ]; then
+	make clean && make $exe
+    else
+	make clean && make CC=mpiicc $exe
+    fi
+    ./job.sh
+fi
+
+
+
diff --git a/test/uti/mpi/014.c b/test/uti/mpi/014.c
new file mode 100755
index 00000000..6fa95045
--- /dev/null
+++ b/test/uti/mpi/014.c
@@ -0,0 +1,242 @@
+#define _GNU_SOURCE         /* See feature_test_macros(7) */
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <time.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <mpi.h>
+#include <unistd.h>
+#include <sys/syscall.h>   /* For SYS_xxx definitions */
+#include <getopt.h>
+#include <sched.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include "async_progress.h"
+#include "util.h"
+
+//#define DEBUG
+#ifdef DEBUG
+#define dprintf printf
+#else
+#define dprintf(...) do { } while (0) /* swallows the whole call (arguments are not evaluated); safe inside unbraced if/else */
+#endif
+
+static struct option options[] = {
+	/* end */
+	{ NULL, 0, NULL, 0, },
+};
+
+/* Overlap benchmark: times nsamples rounds of MPI_Get_accumulate() plus a busy-wait of 0..100% of a baseline, for k == 0 and again for k == 1 (between INIT_/FINALIZE_ASYNC_THREAD_()). */
+int main(int argc, char **argv) {
+	int rc, actual;
+	int nproc;
+	int nsamples = -1;
+	int my_rank = -1;
+	int i, j, k, l, m;
+	double *wbuf, *rbuf, *result;
+	MPI_Win win;
+	long start, end;
+	long t_pure_l, t_pure, t_pure0 = 0; /* t_pure0: max per-sample time measured with k == 0, l == 0 */
+	int opt;
+	int szbuf = 8; /* doubles per buffer */
+	struct rusage ru_start, ru_end;
+	struct timeval tv_start, tv_end;
+
+	fwq_init();
+
+	while ((opt = getopt_long(argc, argv, "+n:", options, NULL)) != -1) {
+		switch (opt) {
+			case 'n':
+				nsamples = atoi(optarg);
+				break;
+			default: /* '?' */
+				printf("unknown option %c\n", optopt);
+				exit(1);
+		}
+	}
+
+	if (nsamples <= 0) { /* also rejects 0, which would divide by zero when averaging below */
+		printf("specify a positive nsamples with -n\n");
+		exit(1);
+	}
+
+	MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &actual);
+	if (actual != MPI_THREAD_MULTIPLE) {
+		printf("ERROR: MPI_THREAD_MULTIPLE not available (level was set to %d)\n", actual);
+		exit(1);
+	}
+
+	MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &nproc);
+
+	if (my_rank == 0) {
+		printf("nsamples=%d,nproc=%d\n", nsamples, nproc);
+	}
+
+	/* accumulate-to buffer */
+	wbuf = malloc(sizeof(double) * szbuf);
+	if (!wbuf) { printf("malloc failed\n"); goto fn_fail; }
+	memset(wbuf, 0, sizeof(double) * szbuf);
+
+	/* read-from buffer */
+	rbuf = malloc(sizeof(double) * szbuf);
+	if (!rbuf) { printf("malloc failed\n"); goto fn_fail; }
+	memset(rbuf, 0, sizeof(double) * szbuf);
+
+	/* fetch-to buffer */
+	result = malloc(sizeof(double) * szbuf);
+	if (!result) { printf("malloc failed\n"); goto fn_fail; }
+	memset(result, 0, sizeof(double) * szbuf);
+
+	/* Expose accumulate-to buffer; without a window nothing below can run, so bail out on failure */
+	if ((rc = MPI_Win_create(wbuf, sizeof(double) * szbuf, sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &win))) {
+		printf("MPI_Win_create failed,rc=%d\n", rc);
+		goto fn_fail;
+	}
+	for (j = 0; j < szbuf; j++) {
+		wbuf[j] = j + 1;
+		rbuf[j] = 10000 + j + 1;
+		result[j] = 100000 + j + 1;
+	}
+
+#if 0
+	for (j = 0; j < szbuf; j++) {
+		printf("wbuf,j=%d,val=%f\n", j, wbuf[j]);
+		printf("rbuf,j=%d,val=%f\n", j, rbuf[j]);
+		printf("result,j=%d,val=%f\n", j, result[j]);
+	}
+
+#endif
+
+	for (k = 0; k < 2; k++) {
+
+		if (k == 1) {
+			/* NOTE(review): INIT_ASYNC_THREAD_() comes from async_progress.h; presumably it starts the progress thread */
+			print_cpu_last_executed_on("main");
+
+			INIT_ASYNC_THREAD_();
+
+			if ((rc = getrusage(RUSAGE_THREAD, &ru_start))) {
+				printf("%s: ERROR: getrusage failed (%d)\n", __FUNCTION__, rc);
+			}
+
+			if ((rc = gettimeofday(&tv_start, NULL))) {
+				printf("%s: ERROR: gettimeofday failed (%d)\n", __FUNCTION__, rc);
+			}
+
+			syscall(701, 1 | 2 | 0x80000000); /* NOTE(review): raw syscall number; meaning of the flags is not visible here */
+		}
+
+		for (m = 0; m < 3; m++) { /* three trials */
+
+			for (l = 0; l <= 10; l++) {
+				long calc_cyc = t_pure0 / 10 * l; /* busy-wait per sample: 0/10 .. 10/10 of t_pure0 */
+
+				MPI_Barrier(MPI_COMM_WORLD);
+				MPI_Win_lock_all(0, win);
+				//clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
+
+				start = rdtsc_light();
+				for (j = 0; j < nsamples; j++) {
+					for (i = 0; i < nproc; i++) {
+						int target = j % nproc; /* NOTE(review): skip test uses j % nproc while the operation below targets rank i; confirm intended */
+						if (target == my_rank) {
+							continue;
+						}
+#if 0
+						MPI_Get_accumulate(rbuf + j % szbuf, 1, MPI_DOUBLE,
+										   result + j % szbuf, 1, MPI_DOUBLE,
+										   i,
+										   j % szbuf, 1, MPI_DOUBLE,
+										   MPI_SUM, win);
+#endif
+#if 1
+						MPI_Get_accumulate(rbuf, szbuf, MPI_DOUBLE,
+										   result, szbuf, MPI_DOUBLE,
+										   i,
+										   0, szbuf, MPI_DOUBLE,
+										   MPI_SUM, win);
+#endif
+#if 0
+						MPI_Accumulate(rbuf, szbuf, MPI_DOUBLE,
+								i,
+								0, szbuf, MPI_DOUBLE,
+								MPI_SUM, win);
+#endif
+#if 0
+						MPI_Get(rbuf + j % szbuf, 1, MPI_DOUBLE,
+								i,
+								j % szbuf, 1, MPI_DOUBLE,
+								win);
+#endif
+					}
+				}
+				fwq(calc_cyc * nsamples);
+				MPI_Win_flush_local_all(win);
+				end = rdtsc_light();
+
+				//clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
+				MPI_Win_unlock_all(win);
+				MPI_Barrier(MPI_COMM_WORLD);
+				t_pure_l = (end - start) / nsamples;
+				//t_pure_l = DIFFNSEC(end, start) / nsamples;
+
+				if (1 || m == 2) { /* always true: every trial is reported */
+					MPI_Allreduce(&t_pure_l, &t_pure, 1, MPI_LONG, MPI_MAX, MPI_COMM_WORLD);
+					if (my_rank == 0) {
+						if (l == 0) {
+							printf("async: %d, trial: %d\n", k, m);
+						}
+						if (k == 0) {
+							printf("%ld\t%ld\n", calc_cyc, t_pure);
+						} else {
+							printf("%ld\n", t_pure);
+						}
+					}
+				}
+
+				if (k == 0 && l == 0) {
+					t_pure0 = t_pure; /* baseline for the calc_cyc sweep */
+				}
+#if 0
+				for (i = 0; i < nproc; i++) {
+					for (j = 0; j < szbuf; j++) {
+						printf("wbuf,j=%d,val=%f\n", j, wbuf[j]);
+						printf("rbuf,j=%d,val=%f\n", j, rbuf[j]);
+						printf("result,j=%d,val=%f\n", j, result[j]);
+					}
+				}
+#endif
+			}
+		}
+
+		if (k == 1) {
+			FINALIZE_ASYNC_THREAD_();
+
+#if 0
+			if ((rc = getrusage(RUSAGE_THREAD, &ru_end))) {
+				printf("%s: ERROR: getrusage failed (%d)\n", __FUNCTION__, rc);
+			}
+
+			if ((rc = gettimeofday(&tv_end, NULL))) {
+				printf("%s: ERROR: gettimeofday failed (%d)\n", __FUNCTION__, rc);
+			}
+
+			printf("%s: wall: %ld, user: %ld, sys: %ld\n", __FUNCTION__,
+				   DIFFUSEC(tv_end, tv_start),
+				   DIFFUSEC(ru_end.ru_utime, ru_start.ru_utime),
+				   DIFFUSEC(ru_end.ru_stime, ru_start.ru_stime));
+			syscall(701, 4 | 8 | 0x80000000);
+#endif
+		}
+	}
+
+ fn_exit:
+	MPI_Finalize();
+	return 0;
+ fn_fail:
+	goto fn_exit;
+}
diff --git a/test/uti/mpi/014.sh b/test/uti/mpi/014.sh
new file mode 100755
index 00000000..371e3e21
--- /dev/null
+++ b/test/uti/mpi/014.sh
@@ -0,0 +1,191 @@
+#!/usr/bin/bash
+
+#!/usr/bin/bash -x
+
+MYHOME=/work/gg10/e29005
+UTI_MPI_TOP=${MYHOME}/project/os/mckernel/test/uti/mpi
+
+mck_dir=${MYHOME}/project/os/install
+
+exe=`basename $0 | sed 's/\.sh//'`
+
+stop=0
+reboot=0
+go=0
+
+async=0
+mck=0
+nnodes=2
+LASTNODE=8200
+nsamples=100 #2^12-15
+use_hfi=0
+omp_num_threads=1
+ppn=4
+lpp=4 # logical-per-physical
+ncpu_mt=256 # number of CPUs for main-thread
+myasync=1
+async_in_mck=0
+
+while getopts srga:c:n:ml:N:P:ho:A:M: OPT
+do
+        case ${OPT} in
+            s) stop=1
+                ;;
+            r) reboot=1
+                ;;
+	    g) go=1
+		;;
+	    a) async=$OPTARG
+		;;
+	    n) nsamples=$OPTARG
+		;;
+            m) mck=1
+                ;;
+	    N) nnodes=$OPTARG
+		;;
+	    P) ppn=$OPTARG
+		;;
+	    h) use_hfi=1
+		;;
+	    o) omp_num_threads=$OPTARG
+		;;
+	    A) myasync=$OPTARG
+		;;
+	    M) async_in_mck=$OPTARG
+		;;
+            *) echo "invalid option -${OPT}" >&2
+                exit 1
+        esac
+done
+
+nprocs=$((ppn * nnodes))
+nodes=`echo $(seq -s ",c" $(($LASTNODE + 1 - $nnodes)) $LASTNODE) | sed 's/^/c/'`
+echo nprocs=$nprocs nnodes=$nnodes ppn=$ppn nodes=$nodes
+
+PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes bash -c \'if \[ \"\`cat /etc/mtab \| while read line\; do cut -d\" \" -f 2\; done \| grep /work\`\" == \"\" \]\; then sudo mount /work\; fi\'
+
+if [ $mck -eq 0 ] || [ $async_in_mck -eq 1 ]; then
+    export DISABLE_UTI=1
+else
+    unset DISABLE_UTI
+fi
+
+if [ $mck -eq 0 ]; then
+    async_progress_pin=64,65,66,67,132,133,134,135,200,201,202,203,268,269,270,271
+else
+    async_progress_pin=`(for ((i=0;i<ppn;i++)) do printf "%d," $((i * (ncpu_mt / ppn) +1)); done) | sed 's/,$//'`
+    # same tile, different physical core
+fi
+echo async_progress_pin=$async_progress_pin
+
+if [ ${mck} -eq 1 ]; then
+    mcexec="${mck_dir}/bin/mcexec"
+    nmcexecthr=$((omp_num_threads + 4))
+    mcexecopt="--uti-thread-rank=$uti_thread_rank"
+    if [ ${use_hfi} -eq 1 ]; then
+	mcexecopt="--enable-hfi1 $mcexecopt"
+    fi
+    mcexecopt="-n $ppn -t $nmcexecthr $mcexecopt"
+else
+    mcexec=
+    mcexecopt=
+fi
+
+if [ ${mck} -eq 1 ]; then
+    i_mpi_pin=off
+    i_mpi_pin_domain=
+    i_mpi_pin_order=
+else
+    # Let each domain have all logical cores and use KMP_AFFINITY=scatter if you want to use only physical cores
+    i_mpi_pin=on
+    if [ $((omp_num_threads * lpp * ppn)) -le $ncpu_mt ]; then
+	domain=$((omp_num_threads * lpp)) # Prefer physical but adjacent physicals share L1
+    else
+	domain=$((ncpu_mt / ppn)) # Use logical as well
+    fi 
+    i_mpi_pin_domain="export I_MPI_PIN_DOMAIN=$domain"
+    i_mpi_pin_order="export I_MPI_PIN_ORDER=compact"
+fi
+
+if [[ ($async -eq 1  && "$async_progress_pin" != "" ) || $myasync -eq 1 ]]; then
+    i_mpi_async_progress_pin="export I_MPI_ASYNC_PROGRESS_PIN=$async_progress_pin"
+else
+    i_mpi_async_progress_pin=
+fi
+
+if [ ${stop} -eq 1 ]; then
+    if [ ${mck} -eq 1 ]; then
+	PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \
+	    sudo ${mck_dir}/sbin/mcstop+release.sh
+    else
+	:
+    fi
+fi
+
+if [ ${reboot} -eq 1 ]; then
+    if [ ${mck} -eq 1 ]; then
+	if hostname  | grep ofp &>/dev/null; then
+	    PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \
+	    sudo ${mck_dir}/sbin/mcreboot.sh -s -c 2-17,70-85,138-153,206-221,20-35,88-103,156-171,224-239,36-51,104-119,172-187,240-255,52-67,120-135,188-203,256-271 -r 2-5,70-73,138-141,206-209:0+6-9,74-77,142-145,210-213:1+10-13,78-81,146-149,214-217:68+14-17,82-85,150-153,218-221:69+20-23,88-91,156-159,224-227:136+24-27,92-95,160-163,228-231:137+28-31,96-99,164-167,232-235:204+32-35,100-103,168-171,236-239:205+36-39,104-107,172-175,240-243:18+40-43,108-111,176-179,244-247:19+44-47,112-115,180-183,248-251:86+48-51,116-119,184-187,252-255:87+52-55,120-123,188-191,256-259:154+56-59,124-127,192-195,260-263:155+60-63,128-131,196-199,264-267:222+64-67,132-135,200-203,268-271:223 -m 32G@0,12G@1
+	else
+	    PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \
+	    sudo ${mck_dir}/sbin/mcreboot.sh -s -c 1-15,65-79,129-143,193-207,17-31,81-95,145-159,209-223,33-47,97-111,161-175,225-239,49-63,113-127,177-191,241-255 -r 1-15:0+65-79:64+129-143:128+193-207:192+17-31:16+81-95:80+145-159:144+209-223:208+33-47:32+97-111:96+161-175:160+225-239:224+49-63:48+113-127:112+177-191:176+241-255:240 -m 12G@0,12G@1,12G@2,12G@3,3920M@4,3920M@5,3920M@6,3920M@7
+	fi
+    else
+	:
+    fi
+fi
+
+cd ${UTI_MPI_TOP}
+(
+cat <<EOF
+#!/bin/sh
+
+export I_MPI_HYDRA_BOOTSTRAP_EXEC=/usr/bin/ssh
+export I_MPI_HYDRA_BOOTSTRAP=ssh
+
+export OMP_NUM_THREADS=$omp_num_threads
+#export OMP_STACKSIZE=64M
+export KMP_BLOCKTIME=1
+export PSM2_RCVTHREAD=0
+
+export I_MPI_PIN=$i_mpi_pin
+$i_mpi_pin_domain
+$i_mpi_pin_order
+
+export HFI_NO_CPUAFFINITY=1
+export I_MPI_COLL_INTRANODE_SHM_THRESHOLD=4194304
+export I_MPI_FABRICS=shm:tmi
+export PSM2_RCVTHREAD=0
+export I_MPI_TMI_PROVIDER=psm2
+export I_MPI_FALLBACK=0
+export PSM2_MQ_RNDV_HFI_WINDOW=4194304
+export PSM2_MQ_EAGER_SDMA_SZ=65536
+export PSM2_MQ_RNDV_HFI_THRESH=200000
+
+export MCKERNEL_RLIMIT_STACK=32M,16G
+export KMP_STACKSIZE=64m
+export KMP_AFFINITY=granularity=thread,scatter
+#export KMP_HW_SUBSET=64c,1t
+
+export I_MPI_ASYNC_PROGRESS=$async
+$i_mpi_async_progress_pin
+export MY_ASYNC_PROGRESS=$myasync
+
+#export I_MPI_STATS=native:20,ipm
+#export I_MPI_STATS=ipm
+#export I_MPI_DEBUG=4
+#export I_MPI_HYDRA_DEBUG=on
+
+mpiexec.hydra -l -n $nprocs -ppn $ppn -hosts $nodes $ilpopt $mcexec $mcexecopt ./$exe -n $nsamples
+
+EOF
+) > ./job.sh
+chmod u+x ./job.sh
+
+if [ ${go} -eq 1 ]; then
+    . /home/opt/local/cores/intel/compilers_and_libraries_2018.1.163/linux/bin/compilervars.sh intel64
+    cd ${UTI_MPI_TOP}
+    make ./$exe
+    bash -c '. /home/opt/local/cores/intel/compilers_and_libraries_2018.1.163/linux/bin/compilervars.sh intel64; ./job.sh'
+fi
diff --git a/test/uti/mpi/015.c b/test/uti/mpi/015.c
new file mode 100755
index 00000000..cde43202
--- /dev/null
+++ b/test/uti/mpi/015.c
@@ -0,0 +1,346 @@
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <time.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <sys/mman.h>
+#include <mpi.h>
+#include <unistd.h>
+#include <getopt.h>
+#include <sys/syscall.h>   /* For SYS_xxx definitions */
+#include <sched.h>
+#include "async_progress.h"
+
+//#define DEBUG
+#ifdef DEBUG
+#define dprintf printf
+#else
+#define dprintf(...) do { } while (0) /* swallows the whole call (arguments are not evaluated); safe inside unbraced if/else */
+#endif
+
+#define DIFFNSEC(end, start) (((end).tv_sec - (start).tv_sec) * 1000000000UL + ((end).tv_nsec - (start).tv_nsec)) /* nanoseconds between two struct timespec values; arguments parenthesized so expressions such as *p work */
+
+#define BEGIN_EPOCH(win) do { MPI_Win_lock_all(0, win); } while(0)
+#define END_EPOCH(win) do { MPI_Win_unlock_all(win); } while(0)
+
+static inline uint64_t rdtsc_light(void ) /* returns the 64-bit counter value read by rdtscp */
+{
+    uint64_t x;
+    __asm__ __volatile__("rdtscp;" /* rdtscp does not execute ahead of earlier instructions */
+                         "shl $32, %%rdx;" /* move the high half into bits 63..32 */
+                         "or %%rdx, %%rax" : /* merge high and low halves in rax */
+                         "=a"(x) : /* output: rax -> x */
+                         :    
+                         "%rcx", "%rdx", "memory"); /* rcx and rdx are overwritten; "memory" keeps the compiler from moving memory accesses across the read */
+    return x;
+}
+
+static inline void fixed_size_work() { /* fixed unit of busy work: a register-only counting loop */
+	asm volatile(
+	    "movq $0, %%rcx\n\t" /* counter = 0 */
+		"1:\t"
+		"addq $1, %%rcx\n\t" /* counter++ */
+		"cmpq $99, %%rcx\n\t"
+		"jle 1b\n\t" /* repeat while counter <= 99, i.e. 100 passes in total */
+		:
+		: 
+		: "rcx", "cc"); /* only rcx and the flags are modified */
+}
+
+static inline void bulk_fsw(unsigned long n) { /* run fixed_size_work() n times back to back */
+	unsigned long j; /* same type as n: an int counter would overflow for n > INT_MAX */
+	for (j = 0; j < n; j++) {
+		fixed_size_work();
+	}
+}
+
+long cyc, cycpw; /* cycles per work */
+
+/* Calibration: time N_INIT calls of fixed_size_work() with rdtsc_light(); store the total in cyc and the per-call average in cycpw. */
+void fwq_init() {
+	long start, end;
+	start = rdtsc_light();
+#define N_INIT 10000000
+	bulk_fsw(N_INIT);
+	end = rdtsc_light();
+	cyc = end - start;
+	cycpw = cyc / (double)N_INIT; /* truncated to whole cycles by the assignment to long */
+}
+
+#if 0
+void fwq(long delay_cyc) { /* disabled variant: converts the delay into a number of fixed_size_work() calls using cycpw */
+	if (delay_cyc < 0) { 
+        return;
+		//printf("%s: delay_cyc < 0\n", __FUNCTION__);
+	}
+	bulk_fsw(delay_cyc / cycpw);
+}
+#else /* For machines with large core-to-core performance variation (e.g. OFP) */
+void fwq(long delay_cyc) { /* busy-wait until at least delay_cyc counter ticks (rdtsc_light() units) have elapsed */
+	long start, end;
+	/* a negative delay is treated as "no wait" */
+	if (delay_cyc < 0) { return; }
+	start = rdtsc_light();
+
+	while (1) {
+		end = rdtsc_light(); /* re-read the counter before every chunk of work */
+		if (end - start >= delay_cyc) {
+			break;
+		}
+		bulk_fsw(2); /* ~150 ns per iteration on OFP */
+	}
+}
+#endif
+
+
+/* Print the CPU recorded in field 39 of /proc/<pid>/task/<tid>/stat next to sched_getcpu() for the calling thread. Returns 0, or -1 on failure. */
+static int print_cpu_last_executed_on() {
+	enum { STAT_BUFSZ = 65536 };
+	char fn[256];
+	char *result = NULL; /* NULL so that free() on the early-failure paths is a no-op */
+	const char *rank_str = getenv("PMI_RANK"); /* may be unset; -1 is printed in that case */
+	pid_t tid = syscall(SYS_gettid);
+	int fd = -1, offset = 0;
+	int mpi_errno = 0;
+
+	snprintf(fn, sizeof(fn), "/proc/%d/task/%d/stat", getpid(), (int)tid);
+	fd = open(fn, O_RDONLY);
+	if (fd == -1) {
+		printf("open() failed\n");
+		goto fn_fail;
+	}
+
+	result = malloc(STAT_BUFSZ);
+	if (result == NULL) {
+		printf("malloc() failed\n");
+		goto fn_fail;
+	}
+
+	/* Read until EOF, never past the buffer and always leaving room for the terminating NUL */
+	while (offset < STAT_BUFSZ - 1) {
+		int amount = read(fd, result + offset, STAT_BUFSZ - 1 - offset);
+		if (amount == -1) {
+			printf("read() failed\n");
+			goto fn_fail;
+		}
+		if (amount == 0) {
+			break;
+		}
+		offset += amount;
+	}
+	result[offset] = '\0'; /* strsep() needs a NUL-terminated string */
+
+	char *next_delim = result;
+	char *field = NULL;
+	int i;
+	for (i = 0; i < 39; i++) { /* after the loop, field is the 39th space-separated token */
+		field = strsep(&next_delim, " ");
+	}
+	if (field == NULL) { printf("stat has fewer than 39 fields\n"); goto fn_fail; }
+
+	int cpu = sched_getcpu();
+	if (cpu == -1) {
+		printf("sched_getcpu() failed\n");
+		goto fn_fail;
+	}
+
+	printf("compute thread,pmi_rank=%02d,stat-cpu=%02d,sched_getcpu=%02d,tid=%d\n", rank_str ? atoi(rank_str) : -1, atoi(field), cpu, tid); fflush(stdout);
+ fn_exit:
+	if (fd != -1) { close(fd); }
+	free(result);
+	return mpi_errno;
+ fn_fail:
+	mpi_errno = -1;
+	goto fn_exit;
+}
+
+/* ga_acc per rank:ga_sync=40:1 */
+void rma(int nproc, int my_rank, double *wbuf, double *rbuf, int ndoubles, MPI_Win win, long calc_nsec) {
+	int i, j;
+	/* For every other rank i: add rbuf[i * ndoubles + j] into rank i's window at displacement i * ndoubles + j, one double per call. */
+	/* wbuf is not referenced here. calc_nsec is handed to fwq(), which compares it against rdtsc_light() differences, so it is a tick count despite its name. */
+	for (i = 0; i < nproc; i++) {
+		if (i != my_rank) {
+			for (j = 0; j < ndoubles; j++) {
+				MPI_Accumulate(rbuf + i * ndoubles + j, 1, MPI_DOUBLE,
+							   i, i * ndoubles + j, 1, MPI_DOUBLE,
+							   MPI_SUM, win);
+				MPI_Win_flush_local(i, win); /* ga_acc() calls flush_local() immediately */
+			}
+		}
+	}
+	fwq(calc_nsec);
+}
+
+static struct option options[] = {
+	{
+		.name =		"ppn",
+		.has_arg =	required_argument,
+		.flag =		NULL,
+		.val =		'P',
+	},
+	/* end */
+	{ NULL, 0, NULL, 0, },
+};
+
+/* Benchmark: times NPURE epochs of rma() with no delay, then NOVERALL epochs for each of ten increasing delays; done for k == 0 and again for k == 1 (between INIT_/FINALIZE_ASYNC_THREAD_()). */
+int main(int argc, char **argv) {
+	int rc, actual;
+	int ppn = -1;
+	int nproc;
+	int ndoubles = -1;
+	double add_rate = 1.0; /* parsed from -R and echoed by rank 0; not otherwise used in this function */
+	int my_rank = -1;
+	int i, j, k, l;
+	double *wbuf, *rbuf, *result; /* result is referenced only by the disabled dump at the end and is never allocated */
+	MPI_Win win;
+	long start, end;
+	//struct timespec start, end;
+	long t_pure_l, t_overall_l;
+	long t_pure, t_overall;
+	int opt;
+
+	fwq_init();
+
+	while ((opt = getopt_long(argc, argv, "+d:P:R:", options, NULL)) != -1) {
+		switch (opt) {
+			case 'd':
+				ndoubles = atoi(optarg);
+				break;
+			case 'P':
+				ppn = atoi(optarg);
+				break;
+			case 'R':
+				add_rate = atof(optarg);
+				break;
+			default: /* '?' */
+				printf("unknown option %c\n", optopt);
+				exit(1);
+		}
+	}
+
+	if (ndoubles <= 0 || ppn <= 0) { /* rejects missing as well as zero/negative values (a negative size would reach malloc) */
+		printf("specify ndoubles with -d and ppn with --ppn\n");
+		exit(1);
+	}
+
+	MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &actual);
+	if (actual != MPI_THREAD_MULTIPLE) {
+		printf("ERROR: MPI_THREAD_MULTIPLE not available (level was set to %d)\n", actual);
+		exit(1);
+	}
+
+	MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &nproc);
+
+	if (my_rank == 0) {
+		printf("ndoubles=%d,nproc=%d,add_rate=%f\n", ndoubles, nproc, add_rate);
+		printf("cyc=%ld, cycpw=%ld\n", cyc, cycpw);
+	}
+
+	/* accumulate-to buffer */
+	wbuf = malloc(sizeof(double) * ndoubles * nproc);
+	if (!wbuf) { printf("malloc failed\n"); goto fn_fail; }
+	memset(wbuf, 0, sizeof(double) * ndoubles * nproc);
+
+	/* read-from buffer */
+	rbuf = malloc(sizeof(double) * ndoubles * nproc);
+	if (!rbuf) { printf("malloc failed\n"); goto fn_fail; }
+	memset(rbuf, 0, sizeof(double) * ndoubles * nproc);
+
+	/* Expose accumulate-to buffer; without a window nothing below can run, so bail out on failure */
+	if ((rc = MPI_Win_create(wbuf, sizeof(double) * ndoubles * nproc, sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &win))) {
+		printf("MPI_Win_create failed,rc=%d\n", rc);
+		goto fn_fail;
+	}
+	//print_cpu_last_executed_on();
+
+	for (i = 0; i < nproc; i++) {
+		for (j = 0; j < ndoubles; j++) {
+			wbuf[i * ndoubles + j] = (i + 1) * 1000 + (j + 1);
+			rbuf[i * ndoubles + j] = (i + 1) * 10000 + (j + 1);
+		}
+	}
+
+#if 0
+	for (i = 0; i < nproc; i++) {
+		for (j = 0; j < ndoubles; j++) {
+			printf("wbuf,proc=%d,j=%d,val=%f\n", i, j, wbuf[i * ndoubles + j]);
+			printf("rbuf,proc=%d,j=%d,val=%f\n", i, j, rbuf[i * ndoubles + j]);
+		}
+	}
+#endif
+
+	for (k = 0; k < 2; k++) {
+		if (k == 1) {
+			INIT_ASYNC_THREAD_();
+		}
+
+		/* Measure accumulate-flush time with no delay phase */
+		MPI_Barrier(MPI_COMM_WORLD);
+#define NPURE 10
+		//clock_gettime(CLOCK_THREAD_CPUTIME_ID, &start);
+		start = rdtsc_light();
+		MPI_Pcontrol(1, "rma");
+		syscall(701, 1); /* NOTE(review): raw syscall number; meaning of the arguments is not visible here */
+		syscall(701, 2);
+		for (i = 0; i < NPURE; i++) {
+			BEGIN_EPOCH(win);
+			rma(nproc, my_rank, wbuf, rbuf, ndoubles, win, 0);
+			END_EPOCH(win);
+		}
+		MPI_Pcontrol(-1, "rma");
+		syscall(701, 4);
+		syscall(701, 8);
+		end = rdtsc_light();
+		//clock_gettime(CLOCK_THREAD_CPUTIME_ID, &end);
+		MPI_Barrier(MPI_COMM_WORLD);
+		t_pure_l = (end - start) / NPURE;
+		//t_pure_l = DIFFNSEC(end, start) / NPURE;
+		//printf("t_pure (local): %ld usec\n", t_pure_l / 1000UL);
+		MPI_Allreduce(&t_pure_l, &t_pure, 1, MPI_LONG, MPI_MAX, MPI_COMM_WORLD);
+		if (my_rank == 0) printf("t_pure (max): %ld cycles\n", t_pure);
+
+
+#if 1
+		for (l = 1; l <= 10; l++) { /* delay handed to rma(): l * 100,000,000 */
+			MPI_Barrier(MPI_COMM_WORLD);
+#define NOVERALL 10
+			start = rdtsc_light();
+			for (i = 0; i < NOVERALL; i++) {
+				BEGIN_EPOCH(win);
+				rma(nproc, my_rank, wbuf, rbuf, ndoubles, win, 100UL * 1000000 * l);
+				END_EPOCH(win);
+			}
+			end = rdtsc_light();
+			MPI_Barrier(MPI_COMM_WORLD);
+			t_overall_l = (end - start) / NOVERALL;
+			MPI_Allreduce(&t_overall_l, &t_overall, 1, MPI_LONG, MPI_MAX, MPI_COMM_WORLD);
+			if (my_rank == 0) printf("t_overall (max): %ld cycle\n", t_overall);
+		}
+#endif
+
+		if (k == 1) {
+			FINALIZE_ASYNC_THREAD_();
+		}
+
+#if 0
+		for (i = 0; i < nproc; i++) {
+			for (j = 0; j < ndoubles; j++) {
+				printf("wbuf,proc=%d,j=%d,val=%f\n", i, j, wbuf[i * ndoubles + j]);
+				printf("rbuf,proc=%d,j=%d,val=%f\n", i, j, rbuf[i * ndoubles + j]);
+				printf("result,proc=%d,j=%d,val=%f\n", i, j, result[i * ndoubles + j]);
+			}
+		}
+#endif
+	}
+
+ fn_exit:
+	MPI_Finalize();
+	return 0;
+ fn_fail:
+	goto fn_exit;
+}
diff --git a/test/uti/mpi/015.sh b/test/uti/mpi/015.sh
new file mode 100755
index 00000000..719cd6ba
--- /dev/null
+++ b/test/uti/mpi/015.sh
@@ -0,0 +1,189 @@
+#!/usr/bin/bash
+
+#!/usr/bin/bash -x
+
+MYHOME=/work/gg10/e29005
+UTI_MPI_TOP=${MYHOME}/project/os/mckernel/test/uti/mpi
+
+mck_dir=${MYHOME}/project/os/install
+
+exe=`basename $0 | sed 's/\.sh//'`
+
+stop=0
+reboot=0
+go=0
+
+async=0
+mck=0
+nnodes=2
+LASTNODE=8200
+ndoubles=16 #2^12-15
+add_rate="1.0"
+disable_uti=0
+omp_num_threads=1
+ppn=16 #16
+async_progress_pin=64,132,200,268,65,133,201,269,66,134,202,270,67,135,203,271
+lpp=4 # logical-per-physical
+ncpu_mt=256 # number of CPUs for main-thread
+myasync=1
+use_hfi=0
+
+while getopts srga:c:n:md:l:N:P:o:A:R: OPT
+do
+        case ${OPT} in
+            s) stop=1
+                ;;
+            r) reboot=1
+                ;;
+	    g) go=1
+		;;
+	    a) async=$OPTARG
+		;;
+	    n) ndoubles=$OPTARG
+		;;
+            m) mck=1
+                ;;
+            d) disable_uti=$OPTARG
+                ;;
+	    N) nnodes=$OPTARG
+		;;
+	    P) ppn=$OPTARG
+		;;
+	    o) omp_num_threads=$OPTARG
+		;;
+	    A) myasync=$OPTARG
+		;;
+	    R) add_rate=$OPTARG
+		;;
+            *) echo "invalid option -${OPT}" >&2
+                exit 1
+        esac
+done
+
+nprocs=$((ppn * nnodes))
+nodes=`echo $(seq -s ",c" $(($LASTNODE + 1 - $nnodes)) $LASTNODE) | sed 's/^/c/'`
+echo nprocs=$nprocs nnodes=$nnodes ppn=$ppn nodes=$nodes
+
+PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes bash -c \'if \[ \"\`cat /etc/mtab \| while read line\; do cut -d\" \" -f 2\; done \| grep /work\`\" == \"\" \]\; then sudo mount /work\; fi\'
+
+if [ $disable_uti -eq 1 ]; then
+    export DISABLE_UTI=1
+else
+    unset DISABLE_UTI
+fi
+
+if [ ${mck} -eq 1 ]; then
+    mcexec="${mck_dir}/bin/mcexec"
+    nmcexecthr=$((omp_num_threads + 4))
+    mcexecopt="--uti-thread-rank=$uti_thread_rank"
+    if [ ${use_hfi} -eq 1 ]; then
+	mcexecopt="--enable-hfi1 $mcexecopt"
+    fi
+    mcexecopt="-n $ppn -t $nmcexecthr $mcexecopt"
+else
+    mcexec=
+    mcexecopt=
+fi
+
+if [ ${mck} -eq 1 ]; then
+    i_mpi_pin=off
+    i_mpi_pin_domain=
+    i_mpi_pin_order=
+else
+    # Let each domain have all logical cores and use KMP_AFFINITY=scatter if you want to use only physical cores
+    i_mpi_pin=on
+    if [ $((omp_num_threads * lpp * ppn)) -le $ncpu_mt ]; then
+	domain=$((omp_num_threads * lpp)) # Prefer physical but adjacent physicals share L1
+    else
+	domain=$((ncpu_mt / ppn)) # Use logical as well
+    fi 
+    i_mpi_pin_domain="export I_MPI_PIN_DOMAIN=$domain"
+    i_mpi_pin_order="export I_MPI_PIN_ORDER=compact"
+fi
+
+if [[ ($async -eq 1  && "$async_progress_pin" != "" ) || $myasync -eq 1 ]]; then
+    i_mpi_async_progress_pin="export I_MPI_ASYNC_PROGRESS_PIN=$async_progress_pin"
+else
+    i_mpi_async_progress_pin=
+fi
+
+if [ ${stop} -eq 1 ]; then
+    if [ ${mck} -eq 1 ]; then
+	PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes sudo ${mck_dir}/sbin/mcstop+release.sh
+    else
+	:
+    fi
+fi
+
+if [ ${reboot} -eq 1 ]; then
+    if [ ${mck} -eq 1 ]; then
+	if hostname  | grep ofp &>/dev/null; then
+	    PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes sudo ${mck_dir}/sbin/mcreboot.sh -s -c 2-17,70-85,138-153,206-221,20-35,88-103,156-171,224-239,36-51,104-119,172-187,240-255,52-67,120-135,188-203,256-271 -r 2-5,70-73,138-141,206-209:0+6-9,74-77,142-145,210-213:1+10-13,78-81,146-149,214-217:68+14-17,82-85,150-153,218-221:69+20-23,88-91,156-159,224-227:136+24-27,92-95,160-163,228-231:137+28-31,96-99,164-167,232-235:204+32-35,100-103,168-171,236-239:205+36-39,104-107,172-175,240-243:18+40-43,108-111,176-179,244-247:19+44-47,112-115,180-183,248-251:86+48-51,116-119,184-187,252-255:87+52-55,120-123,188-191,256-259:154+56-59,124-127,192-195,260-263:155+60-63,128-131,196-199,264-267:222+64-67,132-135,200-203,268-271:223 -m 32G@0,12G@1
+	else
+	    PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes sudo ${mck_dir}/sbin/mcreboot.sh -s -c 1-15,65-79,129-143,193-207,17-31,81-95,145-159,209-223,33-47,97-111,161-175,225-239,49-63,113-127,177-191,241-255 -r 1-15:0+65-79:64+129-143:128+193-207:192+17-31:16+81-95:80+145-159:144+209-223:208+33-47:32+97-111:96+161-175:160+225-239:224+49-63:48+113-127:112+177-191:176+241-255:240 -m 12G@0,12G@1,12G@2,12G@3,3920M@4,3920M@5,3920M@6,3920M@7
+	fi
+    else
+	:
+    fi
+fi
+
+cd ${UTI_MPI_TOP}
+(
+cat <<EOF
+#!/bin/sh
+
+export I_MPI_HYDRA_BOOTSTRAP_EXEC=/usr/bin/ssh
+export I_MPI_HYDRA_BOOTSTRAP=ssh
+
+export OMP_NUM_THREADS=$omp_num_threads
+#export OMP_STACKSIZE=64M
+export KMP_BLOCKTIME=1
+export PSM2_RCVTHREAD=0
+
+export I_MPI_PIN=$i_mpi_pin
+$i_mpi_pin_domain
+$i_mpi_pin_order
+
+export HFI_NO_CPUAFFINITY=1
+export I_MPI_COLL_INTRANODE_SHM_THRESHOLD=4194304
+export I_MPI_FABRICS=shm:tmi
+export PSM2_RCVTHREAD=0
+export I_MPI_TMI_PROVIDER=psm2
+export I_MPI_FALLBACK=0
+export PSM2_MQ_RNDV_HFI_WINDOW=4194304
+export PSM2_MQ_EAGER_SDMA_SZ=65536
+export PSM2_MQ_RNDV_HFI_THRESH=200000
+
+export MCKERNEL_RLIMIT_STACK=32M,16G
+export KMP_STACKSIZE=64m
+export KMP_AFFINITY=granularity=thread,scatter
+#export KMP_HW_SUBSET=64c,1t
+
+export I_MPI_ASYNC_PROGRESS=$async
+$i_mpi_async_progress_pin
+export MY_ASYNC_PROGRESS=$myasync
+
+#export I_MPI_STATS=native:20,ipm
+#export I_MPI_STATS=ipm
+#export I_MPI_DEBUG=4
+#export I_MPI_HYDRA_DEBUG=on
+
+mpiexec.hydra -l -n $nprocs -ppn $ppn -hosts $nodes $ilpopt $mcexec $mcexecopt ./$exe --ppn $ppn -d $ndoubles -R $add_rate
+
+EOF
+) > ./job.sh
+chmod u+x ./job.sh
+
+if [ ${go} -eq 1 ]; then
+    cd ${UTI_MPI_TOP}
+    if [ $mck -eq 1 ]; then
+	make $exe
+    else
+	. /home/opt/local/cores/intel/compilers_and_libraries_2018.1.163/linux/bin/compilervars.sh intel64
+	make CC=mpiicc $exe
+    fi
+    ./job.sh
+fi
+
+
+
diff --git a/test/uti/mpi/016.c b/test/uti/mpi/016.c
new file mode 100755
index 00000000..fc83c198
--- /dev/null
+++ b/test/uti/mpi/016.c
@@ -0,0 +1,349 @@
+#define _GNU_SOURCE         /* See feature_test_macros(7) */
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <time.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <mpi.h>
+#include <unistd.h>
+#include <sys/syscall.h>   /* For SYS_xxx definitions */
+#include <getopt.h>
+#include <sched.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include "async_progress.h"
+#include "util.h"
+
+#define MYTIME_UNIT "usec"
+#define MYTIME_TOUSEC 1000000
+#define MYTIME_TONSEC 1000000000
+
+#define NROW 16 /* 0%, 10%, ..., 140% */
+#define NCOL 4
+
+#define NSAMPLES_DROP 5/*10*/
+#define NSAMPLES_COMM 10/*20*/
+#define NSAMPLES_TOTAL 10/*20*/
+#define NSAMPLES_INNER 5
+
+#define PROGRESS_CALC_PHASE_ONLY
+
+static inline double mytime() { /* Current MPI_Wtime() reading; callers subtract two readings to get an elapsed time */
+	return /*rdtsc_light()*/MPI_Wtime();
+}
+
+static int ppn = -1;
+
+void init_buf(double *origin_buf, double *result, double *target_buf, int szbuf, int rank, int id) {
+	int pos = 0, owner = rank + 1; /* owner: 1-based rank tag used by all three patterns */
+	for (; pos < szbuf; pos++) { /* (pos + 1) is the 1-based element number */
+		origin_buf[pos] = owner * 100.0 + (pos + 1);
+		result[pos] = (id + 1) * 100000000.0 + owner * 10000.0 + (pos + 1);
+		target_buf[pos] = owner * 1000000.0 + (pos + 1);
+	}
+}
+
+void pr_buf(double *origin_buf, double *result, double *target_buf, int szbuf, int rank, int nproc) {
+	int turn, idx;
+	/* One barrier-delimited round per rank; presumably this keeps the dumps of different ranks apart. */
+	for (turn = 0; turn < nproc; turn++) {
+		MPI_Barrier(MPI_COMM_WORLD);
+		if (turn == rank) {
+			/* This rank's round: print all three buffers element by element. */
+			for (idx = 0; idx < szbuf; idx++) {
+				pr_debug("[%d] origin_buf,j=%d,val=%f\n", rank, idx, origin_buf[idx]);
+				pr_debug("[%d] result,j=%d,val=%f\n", rank, idx, result[idx]);
+				pr_debug("[%d] target_buf,j=%d,val=%f\n", rank, idx, target_buf[idx]);
+			}
+		} else {
+			usleep(100000); /* another rank's round: idle for 100 ms */
+		}
+	}
+}
+
+void rma(int rank, int nproc, MPI_Win win, double *origin_buf, double *result, int szbuf, long nsec_calc, int async_progress, int sync_progress, double pct_calc) {
+	int i, j, target_rank;
+	int completed, ret;
+	/* One timed sample: post all MPI_Get_accumulate calls, delay for nsec_calc, then poll and flush. */
+	for (j = 0; j < NSAMPLES_INNER; j++) {
+		for (i = 1; i < nproc; i++) {
+			target_rank = (rank + i) % nproc;
+			/* Every other rank is targeted once per round, starting with rank + 1 (mod nproc) */
+			MPI_Get_accumulate(origin_buf, szbuf, MPI_DOUBLE,
+					   result, szbuf, MPI_DOUBLE,
+					   target_rank,
+					   0, szbuf, MPI_DOUBLE,
+					   MPI_NO_OP, win);
+#if 0
+			if (sync_progress) {
+				if ((ret = MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &completed, MPI_STATUS_IGNORE)) != MPI_SUCCESS) {
+					pr_err("%s: error: MPI_Iprobe: %d\n", __func__, ret);
+				}
+			}
+#endif
+		}
+	}
+	/* The progress thread is started only around the delay below, and only when PROGRESS_CALC_PHASE_ONLY is defined */
+	if (async_progress) {
+#ifdef PROGRESS_CALC_PHASE_ONLY
+		progress_start();
+#endif
+	}
+	/* Simulated computation phase; ndelay() is defined outside this file (presumably a delay of nsec_calc ns - confirm) */
+	ndelay(nsec_calc);
+	/* Stop the progress thread again as soon as the delay is over */
+	if (async_progress) {
+#ifdef PROGRESS_CALC_PHASE_ONLY
+		progress_stop();
+#endif
+	}
+
+#define MAX2(x,y) ((x) > (y) ? (x) : (y))
+	/* Iprobe count: NSAMPLES_INNER * (nproc - 1) without async progress; with it, scaled by (1 - pct_calc) but at least nproc - 1 */
+#if 1
+	/* iprobe is 10 times faster than win_flush_local_all,
+	   20679 usec / (8*63*5) messages for 8-ppn 8-node case */
+	if (1/*!sync_progress*/)
+		for (j = 0; j < (async_progress ? MAX2(NSAMPLES_INNER * (nproc - 1) * (1.0 - pct_calc),  nproc - 1) : NSAMPLES_INNER * (nproc - 1)); j++) {
+			//for (j = 0; j < MAX2(NSAMPLES_INNER * (nproc - 1) * (1.0 - pct_calc),  nproc - 1); j++) {
+			if ((ret = MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &completed, MPI_STATUS_IGNORE)) != MPI_SUCCESS) {
+				pr_err("%s: error: MPI_Iprobe: %d\n", __func__, ret);
+			}
+		}
+#endif
+	/* Wait until every operation this rank issued on win has completed locally */
+	MPI_Win_flush_local_all(win);
+}
+
+double measure(int rank, int nproc, MPI_Win win, double *origin_buf, double* result, double *target_buf, int szbuf, long nsec_calc, int async_progress, int sync_progress, int nsamples, int nsamples_drop, double pct_calc) {
+	/* Returns the mean, over the nsamples kept iterations, of the slowest rank's rma() time. */
+	const int total_iters = nsamples + nsamples_drop;
+	double accumulated = 0;
+	int iter = 0;
+
+	/* The first nsamples_drop iterations run identically but are left out of the sum. */
+	while (iter < total_iters) {
+		double began, finished, local_elapsed, slowest_elapsed;
+
+		MPI_Barrier(MPI_COMM_WORLD);
+		MPI_Win_lock_all(0, win);
+
+		/* Set parameter based on current IPC and frequency */
+		ndelay_init(0);
+
+		began = mytime();
+		rma(rank, nproc, win, origin_buf, result, szbuf, nsec_calc, async_progress, sync_progress, pct_calc);
+		finished = mytime();
+
+		MPI_Win_unlock_all(win);
+		MPI_Barrier(MPI_COMM_WORLD);
+
+		/* Every rank contributes its own elapsed time; MPI_MAX keeps the largest one. */
+		local_elapsed = finished - began;
+		MPI_Allreduce(&local_elapsed, &slowest_elapsed, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+		if (iter++ >= nsamples_drop) accumulated += slowest_elapsed;
+	}
+	return accumulated / nsamples;
+}
+
+int main(int argc, char **argv)
+{
+	/* Benchmark driver. Measures the RMA-only time of an all-to-all MPI_Get_accumulate exchange, then
+	 * repeats the exchange with increasing amounts of simulated computation for up to three progress
+	 * modes and prints the resulting table on rank 0. Returns 0 on success, -1 on a setup error. */
+	int ret;
+	int actual;
+	int rank = -1;
+	int nproc;
+	int i, progress, l;
+	double *target_buf, *origin_buf, *result;
+	MPI_Win win;
+	double t_comm_ave, t_total_ave;
+	/* Zero-filled: with -I set only the first mode runs, yet every column is printed below */
+	double t_table[NROW][NCOL] = {{0}};
+	int opt;
+	int szbuf = 1; /* Number of doubles to send */
+	int disable_syscall_intercept = 0;
+
+	//test_set_loglevel(TEST_LOGLEVEL_WARN);
+	ndelay_init(1);
+
+	while ((opt = getopt(argc, argv, "+p:I:")) != -1) {
+		switch (opt) {
+		case 'p':
+			ppn = atoi(optarg);
+			break;
+		case 'I':
+			disable_syscall_intercept = atoi(optarg);
+			break;
+		default: /* '?' */
+			printf("unknown option %c\n", optopt);
+			ret = -1;
+			goto out;
+		}
+	}
+
+	if (ppn == -1) {
+		pr_err("Error: Specify processes-per-rank with -p\n");
+		ret = -1;
+		goto out;
+	}
+
+	MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &actual);
+	if (actual != MPI_THREAD_MULTIPLE) {
+		pr_err("Error: MPI_THREAD_MULTIPLE is not available\n");
+		ret = -1;
+		goto out;
+	}
+
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &nproc);
+
+	if (rank == 0) {
+		printf("ndoubles=%d,nproc=%d\n", szbuf, nproc);
+
+#pragma omp parallel
+		{
+			//printf("%d cpu\n", sched_getcpu());
+			if (omp_get_thread_num() == 0) {
+				printf("#threads=%d\n", omp_get_num_threads());
+			}
+		}
+	}
+
+	/* accumulate-to buffer */
+	target_buf = malloc(sizeof(double) * szbuf);
+	if (!target_buf) {
+		pr_err("Error: allocating target_buf\n");
+		ret = -1;
+		goto out;
+	}
+	memset(target_buf, 0, sizeof(double) * szbuf);
+
+	/* read-from buffer */
+	origin_buf = malloc(sizeof(double) * szbuf);
+	if (!origin_buf) {
+		pr_err("Error: alloacting origin_buf\n");
+		ret = -1;
+		goto out;
+	}
+	memset(origin_buf, 0, sizeof(double) * szbuf);
+
+	/* fetch-to buffer */
+	result = malloc(sizeof(double) * szbuf);
+	if (!result) {
+		pr_err("Error: allocating result\n");
+		ret = -1;
+		goto out;
+	}
+	memset(result, 0, sizeof(double) * szbuf);
+
+	/* Expose accumulate-to buffer*/
+	ret = MPI_Win_create(target_buf, sizeof(double) * szbuf, sizeof(double), MPI_INFO_NULL, MPI_COMM_WORLD, &win);
+	if (ret != 0) {
+		pr_err("Error: MPI_Win_create returned %d\n", ret);
+		ret = -1;
+		goto out;
+	}
+
+	/* Measure RMA-only time */
+	init_buf(origin_buf, result, target_buf, szbuf, rank, 99);
+	t_comm_ave = measure(rank, nproc, win, origin_buf, result, target_buf, szbuf, 0, 0, 1, NSAMPLES_COMM, NSAMPLES_DROP, 0);
+
+	if (rank == 0) {
+		printf("t_comm_ave: %.0f %s\n", t_comm_ave * MYTIME_TOUSEC, MYTIME_UNIT);
+	}
+
+#ifdef PROFILE
+	syscall(701, 1 | 2 | 0x80000000); /* syscall profile start */
+#endif
+
+	/* 0: no progress, 1: progress, no uti, 2: progress, uti */
+	for (progress = 0; progress <= (disable_syscall_intercept ? 0 : 2); progress += 1) {
+		if (progress == 1) {
+			setenv("DISABLE_UTI", "1", 1); /* Don't use uti_attr and pin to Linux/McKernel CPUs */
+			progress_init();
+		} else if (progress == 2) {
+			progress_finalize();
+			unsetenv("DISABLE_UTI");
+			progress_init();
+		}
+
+		if (progress == 1 || progress == 2) {
+#ifndef PROGRESS_CALC_PHASE_ONLY
+			//progress_start();
+#endif
+		}
+
+		/* RMA-start, compute for T_{RMA} * l / 10, RMA-flush */
+		for (l = 0; l <= NROW - 1; l += 1) {
+			long nsec_calc = (t_comm_ave * MYTIME_TONSEC * l) / 10;
+
+			init_buf(origin_buf, result, target_buf, szbuf, rank, l);
+			//pr_buf(origin_buf, result, target_buf, szbuf, rank, nproc);
+			t_total_ave = measure(rank, nproc, win, origin_buf, result, target_buf, szbuf, nsec_calc, progress, 0, NSAMPLES_TOTAL, NSAMPLES_DROP, l / 10.0);
+			//pr_buf(origin_buf, result, target_buf, szbuf, rank, nproc);
+
+			if (rank == 0) {
+				if (l == 0) {
+					pr_debug("progress=%d\n", progress);
+					if (progress == 0) {
+						pr_debug("calc\ttotal\n");
+					} else {
+						pr_debug("total\n");
+					}
+				}
+
+				t_table[l][0] = nsec_calc * (MYTIME_TOUSEC / (double)MYTIME_TONSEC);
+				t_table[l][progress + 1] = t_total_ave * MYTIME_TOUSEC;
+				if (progress == 0) {
+					pr_debug("%.0f\t%.0f\n", nsec_calc * (MYTIME_TOUSEC / (double)MYTIME_TONSEC), t_total_ave * MYTIME_TOUSEC);
+				} else {
+					pr_debug("%.0f\n", t_total_ave * MYTIME_TOUSEC);
+				}
+			}
+		}
+
+		if (progress == 1 || progress == 2) {
+#ifndef PROGRESS_CALC_PHASE_ONLY
+			//progress_stop();
+#endif
+		}
+	}
+
+#ifdef PROFILE
+	syscall(701, 4 | 8 | 0x80000000); /* syscall profile report */
+#endif
+
+	if (rank == 0) {
+		printf("calc,no prog,prog and no uti, prog and uti\n");
+		for (l = 0; l <= NROW - 1; l++) {
+			for (i = 0; i < NCOL; i++) {
+				if (i > 0) {
+					printf(",");
+				}
+				printf("%.0f", t_table[l][i]);
+			}
+			printf("\n");
+		}
+	}
+
+	MPI_Barrier(MPI_COMM_WORLD);
+
+	if (progress >= 1) {
+		progress_finalize();
+	}
+
+	/* Free the window before MPI_Finalize(), and only then the memory it exposed */
+	MPI_Win_free(&win);
+	free(result);
+	free(origin_buf);
+	free(target_buf);
+	MPI_Finalize();
+	ret = 0;
+out:
+	return ret;
+}
diff --git a/test/uti/mpi/016.sh b/test/uti/mpi/016.sh
new file mode 100755
index 00000000..90d87107
--- /dev/null
+++ b/test/uti/mpi/016.sh
@@ -0,0 +1,272 @@
+#!/usr/bin/bash
+
+#!/usr/bin/bash -x
+
+MYHOME=/home/e29005
+test_dir=`pwd -P`
+mck_dir=${MYHOME}/project/os/install
+uti_dir_lin=${MYHOME}/project/uti/install_linux
+uti_dir_mck=${MYHOME}/project/uti/install_mckernel
+
+exe=`basename $0 | sed 's/\.sh//'`
+
+stop=0
+reboot=0
+go=0
+
+interactive=0
+pjsub=0
+gdb=0
+disable_syscall_intercept=0
+mck=0
+nnodes=2
+LASTNODE=8196
+use_hfi=0
+omp_num_threads=32
+ppn=4
+
+while getopts srgc:ml:N:P:o:hGI:ipL: OPT
+do
+        case ${OPT} in
+            s) stop=1
+                ;;
+            r) reboot=1
+                ;;
+	    g) go=1
+		;;
+            m) mck=1
+                ;;
+	    N) nnodes=$OPTARG
+		;;
+	    P) ppn=$OPTARG
+		;;
+	    o) omp_num_threads=$OPTARG
+		;;
+	    h) use_hfi=1
+		;;
+	    G) gdb=1
+		;;
+	    I) disable_syscall_intercept=$OPTARG
+		;;
+	    i) interactive=1
+		;;
+	    p) pjsub=1
+		;;
+	    L) LASTNODE=$OPTARG
+		;;
+            *) echo "invalid option -${OPT}" >&2
+                exit 1
+        esac
+done
+
+nprocs=$((ppn * nnodes))
+nodes=`echo $(seq -s ",c" $(($LASTNODE + 1 - $nnodes)) $LASTNODE) | sed 's/^/c/'`
+
+# vertical cut, excluding phys loaded with Linux tasks
+uti_cpu_set_lin=1,69,137,205,18-19,86-87,154-155,222-223
+exclude_list=0-1,68-69,136-137,204-205,18-19,86-87,154-155,222-223
+#64-67,132-135,200-203,268-271 
+
+uti_cpu_set_mck=1,69,137,205,18-19,86-87,154-155,222-223
+
+# horizontal cut, excluding phys loaded with Linux tasks for mckernel
+#uti_cpu_set_lin=204-271 
+#uti_cpu_set_mck=1-67
+
+if [ $mck -eq 0 ]; then
+    uti_cpu_set_str="export UTI_CPU_SET=$uti_cpu_set_lin"
+    i_mpi_pin_processor_exclude_list="export I_MPI_PIN_PROCESSOR_EXCLUDE_LIST=$exclude_list"
+else
+    uti_cpu_set_str="export UTI_CPU_SET=$uti_cpu_set_mck"
+    i_mpi_pin_processor_exclude_list=
+fi
+
+if [ ${mck} -eq 1 ]; then
+    i_mpi_pin=off
+    i_mpi_pin_domain=
+    i_mpi_pin_order=
+#    if [ $omp_num_threads -eq 1 ]; then
+#	# Avoid binding main thread and uti thread to one CPU
+	kmp_affinity="export KMP_AFFINITY=disabled" 
+#    else
+#	# Bind rank to OMP_NUM_THREAD-sized CPU-domain
+#	kmp_affinity="export KMP_AFFINITY=granularity=thread,scatter"
+#    fi
+else
+    i_mpi_pin=on
+    domain=$omp_num_threads # Use 32 when you want to match mck's -n division
+    i_mpi_pin_domain="export I_MPI_PIN_DOMAIN=$domain"
+    i_mpi_pin_order="export I_MPI_PIN_ORDER=compact"
+    kmp_affinity="export KMP_AFFINITY=granularity=thread,scatter"
+fi
+
+echo nprocs=$nprocs nnodes=$nnodes ppn=$ppn nodes=$nodes domain=$domain
+
+if [ ${mck} -eq 1 ]; then
+    makeopt="UTI_DIR=$uti_dir_mck"
+    use_mck="#PJM -x MCK=$mck_dir"
+    mck_mem="#PJM -x MCK_MEM=32G@0,8G@1"
+    mcexec="${mck_dir}/bin/mcexec"
+    nmcexecthr=$((omp_num_threads + 4))
+    mcexecopt="-n $ppn --uti-use-last-cpu" # -t $nmcexecthr
+
+    if [ ${use_hfi} -eq 1 ]; then
+	mcexecopt="--enable-hfi1 $mcexecopt"
+    fi
+
+    if [ $disable_syscall_intercept -eq 0 ]; then
+	mcexecopt="--enable-uti $mcexecopt"
+    fi
+
+else
+    offline=`PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes lscpu \| grep Off 2>&1 | dshbak -c | grep Off`
+    if [ "$offline" != "" ]; then
+	echo "Error: Some CPUs are offline: $offline" >&2
+	exit 1 # a bare "exit" would return echo's status (0) and report success
+    fi
+
+    makeopt="UTI_DIR=$uti_dir_lin"
+    use_mck=
+    mck_mem=
+    mcexec=
+    mcexecopt=
+fi
+
+if [ $gdb -eq 1 ]; then
+    enable_x="-enable-x"
+    gdbcmd="xterm -display localhost:11 -hold -e gdb -ex run --args"
+fi
+
+if [ $interactive -eq 1 ]; then
+    i_mpi_hydra_bootstrap_exec=
+    i_mpi_hydra_bootstrap=
+    hosts=
+    opt_dir=/opt/intel
+    ssh=
+else
+#    PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes bash -c \'if \[ \"\`cat /etc/mtab \| while read line\; do cut -d\" \" -f 2\; done \| grep /work\`\" == \"\" \]\; then sudo mount /work\; fi\'
+    i_mpi_hydra_bootstrap_exec="export I_MPI_HYDRA_BOOTSTRAP_EXEC=/usr/bin/ssh"
+    i_mpi_hydra_bootstrap="export I_MPI_HYDRA_BOOTSTRAP=ssh"
+    hosts="-hosts $nodes"
+    opt_dir=/home/opt/local/cores/intel
+    ssh="ssh -A c$LASTNODE"
+fi
+
+# If using ssh
+# Latest versions are: 1.163, 2.199, 3.222
+if [ $pjsub -eq 0 ] && [ $interactive -eq 0 ]; then
+    compilervars=". ${opt_dir}/compilers_and_libraries_2018.2.199/linux/bin/compilervars.sh intel64"
+else
+    compilervars=
+fi
+
+if [ ${stop} -eq 1 ]; then
+    if [ ${mck} -eq 1 ]; then
+	PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \
+	    /usr/sbin/pidof mcexec \| xargs -r sudo kill -9
+	PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \
+	    /usr/sbin/pidof $exe \| xargs -r sudo kill -9
+	PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \
+	    sudo ${mck_dir}/sbin/mcstop+release.sh
+    else
+	PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \
+	    /usr/sbin/pidof $exe \| xargs -r sudo kill -9
+    fi
+fi
+
+if [ ${reboot} -eq 1 ]; then
+    if [ ${mck} -eq 1 ]; then
+	if hostname  | grep ofp &>/dev/null; then
+
+	    # -h: Hide idle thread to prevent KNL CPU from mux-ing resource and halving throughput 
+	    PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \
+	    sudo ${mck_dir}/sbin/mcreboot.sh -h -O -c 2-17,70-85,138-153,206-221,20-35,88-103,156-171,224-239,36-51,104-119,172-187,240-255,52-67,120-135,188-203,256-271 -r 2-5,70-73,138-141,206-209:0+6-9,74-77,142-145,210-213:1+10-13,78-81,146-149,214-217:68+14-17,82-85,150-153,218-221:69+20-23,88-91,156-159,224-227:136+24-27,92-95,160-163,228-231:137+28-31,96-99,164-167,232-235:204+32-35,100-103,168-171,236-239:205+36-39,104-107,172-175,240-243:18+40-43,108-111,176-179,244-247:19+44-47,112-115,180-183,248-251:86+48-51,116-119,184-187,252-255:87+52-55,120-123,188-191,256-259:154+56-59,124-127,192-195,260-263:155+60-63,128-131,196-199,264-267:222+64-67,132-135,200-203,268-271:223 -m 32G@0,12G@1
+
+# perl -e 'for ($i=0;$i<68;$i++){if($i>0){print "+";}printf("%d,%d,%d:%d", $i+68,$i+136,$i+204,$i);}'
+
+#	    PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \
+#	    sudo ${mck_dir}/sbin/mcreboot.sh -O -c 68-271 -r 68,136,204:0+69,137,205:1+70,138,206:2+71,139,207:3+72,140,208:4+73,141,209:5+74,142,210:6+75,143,211:7+76,144,212:8+77,145,213:9+78,146,214:10+79,147,215:11+80,148,216:12+81,149,217:13+82,150,218:14+83,151,219:15+84,152,220:16+85,153,221:17+86,154,222:18+87,155,223:19+88,156,224:20+89,157,225:21+90,158,226:22+91,159,227:23+92,160,228:24+93,161,229:25+94,162,230:26+95,163,231:27+96,164,232:28+97,165,233:29+98,166,234:30+99,167,235:31+100,168,236:32+101,169,237:33+102,170,238:34+103,171,239:35+104,172,240:36+105,173,241:37+106,174,242:38+107,175,243:39+108,176,244:40+109,177,245:41+110,178,246:42+111,179,247:43+112,180,248:44+113,181,249:45+114,182,250:46+115,183,251:47+116,184,252:48+117,185,253:49+118,186,254:50+119,187,255:51+120,188,256:52+121,189,257:53+122,190,258:54+123,191,259:55+124,192,260:56+125,193,261:57+126,194,262:58+127,195,263:59+128,196,264:60+129,197,265:61+130,198,266:62+131,199,267:63+132,200,268:64+133,201,269:65+134,202,270:66+135,203,271:67 -m 32G@0,12G@1
+	else
+	    echo "unkwon host type"
+	    exit 1
+	fi
+    else
+	:
+    fi
+fi
+
+(
+cat <<EOF
+#!/bin/sh
+
+#PJM -L rscgrp=$rg
+#PJM -L node=$nnodes
+#PJM --mpi proc=$nprocs
+#PJM -L elapse=$elapse
+#PJM -L proc-crproc=16384 
+#PJM -g gg10
+#PJM -j
+#PJM -s
+$use_mck
+$mck_mem
+
+$i_mpi_hydra_bootstrap_exec
+$i_mpi_hydra_bootstrap
+
+export OMP_NUM_THREADS=$omp_num_threads
+#export OMP_STACKSIZE=64M
+export KMP_BLOCKTIME=1
+export PSM2_RCVTHREAD=0
+
+$uti_cpu_set_str
+export I_MPI_PIN=$i_mpi_pin
+$i_mpi_pin_processor_exclude_list
+$i_mpi_pin_domain
+$i_mpi_pin_order
+$kmp_affinity
+
+export HFI_NO_CPUAFFINITY=1
+export I_MPI_COLL_INTRANODE_SHM_THRESHOLD=4194304
+export I_MPI_FABRICS=shm:tmi
+export PSM2_RCVTHREAD=0
+export I_MPI_TMI_PROVIDER=psm2
+export I_MPI_FALLBACK=0
+export PSM2_MQ_RNDV_HFI_WINDOW=4194304
+export PSM2_MQ_EAGER_SDMA_SZ=65536
+export PSM2_MQ_RNDV_HFI_THRESH=200000
+
+export MCKERNEL_RLIMIT_STACK=32M,16G
+export KMP_STACKSIZE=64m
+#export KMP_HW_SUBSET=64c,1t
+
+export I_MPI_ASYNC_PROGRESS=off
+
+#export I_MPI_STATS=native:20,ipm
+#export I_MPI_STATS=ipm
+#export I_MPI_DEBUG=4
+#export I_MPI_HYDRA_DEBUG=on
+
+ulimit -c unlimited 
+
+$compilervars
+mpiexec.hydra -n $nprocs -ppn $ppn $hosts $ilpopt $enable_x $gdbcmd $mcexec $mcexecopt ${test_dir}/$exe -p $ppn -I $disable_syscall_intercept
+#-l
+
+EOF
+) > ./job.sh
+chmod u+x ./job.sh
+
+if [ ${go} -eq 1 ]; then
+    if [ $pjsub -eq 1 ]; then
+	pjsub ./job.sh
+    else
+	if [ $interactive -eq 0 ]; then
+	    . ${opt_dir}/compilers_and_libraries_2018.2.199/linux/bin/compilervars.sh intel64
+	fi
+	rm -f ./$exe # always rebuild; -f keeps the first run (no binary yet) from printing an error
+	make $makeopt ./$exe || exit 1 # the binary was just removed: do not launch the job if the build failed
+	PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \
+	    /usr/sbin/pidof $exe \| xargs -r sudo kill -9
+	$ssh ${test_dir}/job.sh
+    fi
+fi
diff --git a/test/uti/mpi/Makefile b/test/uti/mpi/Makefile
new file mode 100755
index 00000000..43418165
--- /dev/null
+++ b/test/uti/mpi/Makefile
@@ -0,0 +1,56 @@
+.SUFFIXES:	# Clear suffixes
+
+MYHOME=/home/e29005
+
+# Default install prefix of the uti library; 016.sh overrides it by passing UTI_DIR=... to make
+UTI_DIR=${MYHOME}/project/uti/install_linux
+
+CC=mpiicc 
+LD=$(CC)
+
+CFLAGS = -g -O0 -Wall
+LDFLAGS = -lpthread -lpsm2 -L$(UTI_DIR)/lib -Wl,-rpath -Wl,$(UTI_DIR)/lib -luti
+SRCS = $(shell ls *.c)
+OBJS = $(SRCS:.c=.o)
+EXES = $(SRCS:.c=)
+TMPFILES = $(shell ls psm2-demo-* 2>/dev/null)
+
+all: $(EXES) file
+
+file: $(TMPFILES)
+	rm -f $(TMPFILES)
+	dd if=/dev/zero of=./file  bs=1M  count=1
+
+async_progress.o:: async_progress.c util.h
+	$(CC) $(CFLAGS) -I$(UTI_DIR)/include -c $<
+
+util.o:: util.c util.h
+	$(CC) $(CFLAGS) -qopenmp -c $<
+
+014: 014.o async_progress.o util.o
+	$(LD) -o $@ $^ $(LDFLAGS)
+
+015: 015.o async_progress.o
+	$(LD) -o $@ $^ $(LDFLAGS)
+
+016: 016.o async_progress.o util.o
+	$(LD) -o $@ $^ $(LDFLAGS) -qopenmp
+
+016.o::016.c
+	$(CC) $(CFLAGS) -qopenmp -c $<
+
+011: 011.o
+	$(LD) -o $@ $^ $(LDFLAGS) -qopenmp
+
+011.o::011.c
+	$(CC) $(CFLAGS) -qopenmp -c $<
+
+%: %.o
+	$(LD) -o $@ $^ $(LDFLAGS)
+
+%.o::%.c
+	$(CC) $(CFLAGS) -c $<
+
+clean:
+	rm -f core $(EXES) $(OBJS) $(DSRCS)
+
diff --git a/test/uti/mpi/README b/test/uti/mpi/README
new file mode 100644
index 00000000..592b6d56
--- /dev/null
+++ b/test/uti/mpi/README
@@ -0,0 +1,25 @@
+001 isend 送受信に使用するバッファは毎回異なる
+002 barrier
+003 isend 送受信に使用するバッファは一つ、waitの前にsleepしない
+004 isend-calc-wait, all-to-all
+005 lockall-accumulate-calc-unlockall, all-to-all
+006 parent isend-calc-wait, child does nothing --> crash
+007 parent isend-calc-wait, child psm2 send/recv --> one ep per process
+008 parent psm2-init and psm2-connect, child psm2-send/recv --> receiver side crash
+009 parent does nothing, child psm2-init, psm2-connect, psm2-send/recv --> receiver side crash
+010 parent psm2-init, psm2-connect, psm2-send/recv, child does nothing
+011 001にopenmpスレッドを追加
+012 get_acc-calc-flush_local_all, all-to-all. Execute ./012.sh
+013 acc-flush_local-calc, all-to-all, acc:flush_local=1:1
+014 012 + async progress thread. 
+015 013 + async progress thread
+
+016 MPI_Get_accumulate()のオーバーラップ
+
+* 通信パターンは全対全、
+* CPUはいくつかをprogress thread専用に割く
+* ステップは以下の通り
+  (1) MPI_Get_accumulate()
+  (2) MPI_Get_accumulate()とMPI_Win_flush_local_all()だけを行った場合の
+  　　時間の0.i倍の計算を実行
+  (3) MPI_Win_flush_local_all()
diff --git a/test/uti/mpi/async_progress.c b/test/uti/mpi/async_progress.c
new file mode 100644
index 00000000..3034ee28
--- /dev/null
+++ b/test/uti/mpi/async_progress.c
@@ -0,0 +1,530 @@
+#define _GNU_SOURCE             /* See feature_test_macros(7) */
+#include <unistd.h>
+#include <sys/syscall.h>   /* For SYS_xxx definitions */
+#include <sched.h>
+#include <pthread.h>
+#include <mpi.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <uti.h>
+#include "util.h"
+#include "async_progress.h"
+
+//#define PROFILE
+
+#define STOP_BY_MPI 0
+#define STOP_BY_MEM 1
+#define STOP_TYPE STOP_BY_MEM/*STOP_BY_MPI*/
+
+#define POLL_BY_PROBE 0
+#define POLL_BY_WAIT 1
+#define POLL_BY_TEST 2
+#define POLL_TYPE POLL_BY_PROBE/*POLL_BY_WAIT*/
+
+static int progress_rank, progress_world_rank, progress_world_nproc;
+static pthread_t progress_thr;
+static pthread_mutex_t progress_mutex;
+static pthread_cond_t progress_cond_down;
+static volatile int progress_flag_up, progress_flag_down;
+
+static enum progress_state progress_state;
+static int progress_stop_flag;
+static MPI_Comm progress_comm;
+static int progress_refc;
+#define WAKE_TAG 100
+
+#define NROW_STAT 10
+#define NRANK_STAT 1
+#define RECORD_STAT(count, array, end, start) do { \
+	if (count < NROW_STAT) { \
+		array[count++] += (end - start);	\
+	} \
+} while(0)
+
+static int cyc_prog1_count, cyc_prog2_count, cyc_init1_count, cyc_init2_count, cyc_start_count, cyc_stop1_count, cyc_stop2_count, cyc_stop3_count, cyc_finalize_count;
+static unsigned long cyc_prog1[NROW_STAT];
+static unsigned long cyc_prog2[NROW_STAT];
+static unsigned long cyc_init1[NROW_STAT];
+static unsigned long cyc_init2[NROW_STAT];
+static unsigned long cyc_start[NROW_STAT];
+static unsigned long cyc_stop1[NROW_STAT];
+static unsigned long cyc_stop2[NROW_STAT];
+static unsigned long cyc_stop3[NROW_STAT];
+static unsigned long cyc_finalize[NROW_STAT];
+
+#define MIN2(x,y) ((x) < (y) ? (x) : (y))
+
+void pr_stat(char *name, int count, unsigned long *array) {
+	int i;
+	/* Prints "[world rank] name: v0,v1,..." for the first min(count, NROW_STAT) entries of array. */
+	pr_debug("[%d] %s: ", progress_world_rank, name);
+	for (i = 0; i < MIN2(count, NROW_STAT); i++) {
+		if (i > 0) pr_debug(",");
+		pr_debug("%lu", array[i]); /* entries are unsigned long, hence %lu rather than %ld */
+	}
+	pr_debug("\n");
+}
+
+static void *progress_fn(void* data)
+{
+	int ret;
+	MPI_Request req;
+	struct rusage ru_start, ru_end;
+	struct timeval tv_start, tv_end;
+	unsigned long start, end;
+	/* Thread body: wait for a start/finalize request, poll MPI until told to stop, report back, repeat. */
+#if 0
+	ret = syscall(732);
+	if (ret == -1) {
+		pr_debug("Progress is running on Linux\n");
+	} else {
+		pr_debug("Progress is running on McKernel\n");
+	}
+
+	if ((ret = getrusage(RUSAGE_THREAD, &ru_start))) {
+		pr_err("%s: error: getrusage failed (%d)\n", __func__, ret);
+	}
+
+	if ((ret = gettimeofday(&tv_start, NULL))) {
+		pr_err("%s: error: gettimeofday failed (%d)\n", __func__, ret);
+	}
+
+#endif
+
+#if STOP_TYPE == STOP_BY_MEM && POLL_TYPE == POLL_BY_TEST
+	/* In this configuration the receive tested by MPI_Test below is posted once, before the first cycle */
+	if ((ret = MPI_Irecv(NULL, 0, MPI_CHAR, progress_rank, WAKE_TAG, progress_comm, &req)) != MPI_SUCCESS) {
+		pr_err("%s: error: MPI_Irecv: %d\n", __func__, ret);
+	}
+
+#endif
+
+init:
+#ifdef PROFILE
+	start = rdtsc_light();
+#endif
+
+	/* Sleep until progress_start() or progress_finalize() raises progress_flag_down */
+	pthread_mutex_lock(&progress_mutex);
+	while (!progress_flag_down) {
+		pthread_cond_wait(&progress_cond_down, &progress_mutex);
+	}
+	progress_flag_down = 0;
+	/* A finalize request ends the thread; any state other than START is treated the same way */
+	if (progress_state == PROGRESS_FINALIZE) {
+		pthread_mutex_unlock(&progress_mutex);
+		goto finalize;
+	}
+
+	if (progress_state != PROGRESS_START) {
+		pr_err("%s: error: unexpected state: %d\n", __func__, progress_state);
+		pthread_mutex_unlock(&progress_mutex);
+		goto finalize;
+	}
+
+	pthread_mutex_unlock(&progress_mutex);
+
+#ifdef PROFILE
+	end = rdtsc_light();
+	RECORD_STAT(cyc_prog1_count, cyc_prog1, end, start);
+#endif
+
+	//if (progress_world_rank < 2) pr_debug("[%d] poll,cpu=%d\n", progress_world_rank, sched_getcpu());
+
+#ifdef PROFILE
+	start = rdtsc_light();
+#endif
+
+#if STOP_TYPE == STOP_BY_MEM
+	/* STOP_BY_MEM: the loops below run until do_progress_stop() sets progress_stop_flag */
+#if POLL_TYPE == POLL_BY_PROBE
+	/* Configuration selected by the macros above; presumably each MPI_Iprobe call lets the MPI library make progress */
+	int completed = 0;
+	while (!progress_stop_flag) {
+		if ((ret = MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &completed, MPI_STATUS_IGNORE)) != MPI_SUCCESS) {
+			pr_err("%s: error: MPI_Iprobe: %d\n", __func__, ret);
+			break;
+		}
+		//usleep(1);
+	}
+
+#elif POLL_TYPE == POLL_BY_TEST
+
+	int completed = 0;
+	while (!completed && !progress_stop_flag) {
+		if ((ret = MPI_Test(&req, &completed, MPI_STATUS_IGNORE)) != MPI_SUCCESS) {
+			pr_err("%s: error: MPI_Test: %d\n", __func__, ret);
+			break;
+		}
+		//usleep(1);
+	}
+
+#endif /* POLL_TYPE */
+
+#elif STOP_TYPE == STOP_BY_MPI
+	/* STOP_BY_MPI: the stop request is the zero-byte WAKE_TAG message sent by do_progress_stop() */
+
+#if POLL_TYPE == POLL_BY_WAIT
+
+	if ((ret = MPI_Irecv(NULL, 0, MPI_CHAR, progress_rank, WAKE_TAG, progress_comm, &req)) != MPI_SUCCESS) {
+		pr_err("%s: error: MPI_Irecv: %d\n", __func__, ret);
+	}
+
+	if ((ret = MPI_Wait(&req, MPI_STATUS_IGNORE)) != MPI_SUCCESS) {
+		pr_err("%s: error: MPI_Wait failed (%d)\n", __func__, ret);
+	}
+
+#elif POLL_TYPE == POLL_BY_PROBE
+
+	int completed = 0;
+	while (!completed) {
+		if ((ret = MPI_Iprobe(progress_rank, WAKE_TAG, progress_comm, &completed, MPI_STATUS_IGNORE)) != MPI_SUCCESS) {
+			pr_err("%s: error: MPI_Iprobe: %d\n", __func__, ret);
+			break;
+		}
+		usleep(1);
+	}
+
+	if ((ret = MPI_Recv(NULL, 0, MPI_CHAR, progress_rank, WAKE_TAG, progress_comm, MPI_STATUS_IGNORE)) != MPI_SUCCESS) {
+		pr_err("%s: error: MPI_Recv: %d\n", __func__, ret);
+	}
+
+#endif /* POLL_TYPE */
+#endif /* STOP_TYPE */
+	/* Report back: the state is INIT again before progress_flag_up releases the spin in do_progress_stop() */
+	progress_state = PROGRESS_INIT;
+	__sync_synchronize(); /* st-st barrier */
+	progress_flag_up = 1;
+
+#ifdef PROFILE
+	end = rdtsc_light();
+	RECORD_STAT(cyc_prog2_count, cyc_prog2, end, start);
+#endif
+	goto init;
+
+ finalize:
+	/* ru_end and tv_end are only consumed by the disabled report further down */
+	if ((ret = getrusage(RUSAGE_THREAD, &ru_end))) {
+		pr_err("%s: error: getrusage failed (%d)\n", __func__, ret);
+	}
+
+	if ((ret = gettimeofday(&tv_end, NULL))) {
+		pr_err("%s: error: gettimeofday failed (%d)\n", __func__, ret);
+	}
+
+#if 0
+	pr_debug("%s: wall: %ld, user: %ld, sys: %ld\n", __func__,
+		   DIFFUSEC(tv_end, tv_start),
+		   DIFFUSEC(ru_end.ru_utime, ru_start.ru_utime),
+		   DIFFUSEC(ru_end.ru_stime, ru_start.ru_stime));
+#endif
+	/* Same handshake as above; progress_finalize() spins on progress_flag_up before joining this thread */
+	progress_state = PROGRESS_INIT;
+	__sync_synchronize(); /* st-st barrier */
+	progress_flag_up = 1;
+
+	return NULL;
+}
+
+void progress_init()
+{
+	int ret = 0;
+	int attr_ready = 0; /* nonzero once pthread_attr has been initialized */
+	pthread_attr_t pthread_attr;
+	uti_attr_t uti_attr;
+	unsigned long start, end;
+	/* Creates the private communicator, the mutex/condvar pair and the progress thread (progress_fn). */
+#ifdef PROFILE
+	start = rdtsc_light();
+#endif
+	MPI_Comm_rank(MPI_COMM_WORLD, &progress_world_rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &progress_world_nproc);
+
+	/* Claim the single reference; a caller that finds it already taken returns without doing anything */
+	if (__sync_val_compare_and_swap(&progress_refc, 0, 1) == 1) {
+		return;
+	}
+
+	/* printf costs much in MPI */
+	uti_set_loglevel(UTI_LOGLEVEL_ERR);
+
+	if ((ret = MPI_Comm_dup(MPI_COMM_SELF, &progress_comm))) {
+		pr_err("%s: error: MPI_Comm_dup failed (%d)\n", __func__, ret);
+		goto out;
+	}
+
+	MPI_Comm_rank(progress_comm, &progress_rank);
+	if ((ret = pthread_mutex_init(&progress_mutex, NULL))) {
+		pr_err("%s: error: pthread_mutex_init failed (%d)\n", __func__, ret);
+		goto out;
+	}
+
+	if ((ret = pthread_cond_init(&progress_cond_down, NULL))) {
+		pr_err("%s: error: pthread_cond_init failed (%d)\n", __func__, ret);
+		goto out;
+	}
+
+	if ((ret = pthread_attr_init(&pthread_attr))) {
+		pr_err("%s: error: pthread_attr_init failed (%d)\n", __func__, ret);
+		goto out;
+	}
+	attr_ready = 1;
+
+	if ((ret = uti_attr_init(&uti_attr))) {
+		pr_err("%s: error: uti_attr_init failed (%d)\n", __func__, ret);
+		goto out;
+	}
+
+#if 0
+	if ((ret = UTI_ATTR_SAME_L1(&uti_attr))) {
+		pr_err("%s: error: UTI_ATTR_SAME_L1 failed\n", __func__);
+	}
+#endif
+#if 1 /* Expecting round-robin binding */
+	if ((ret = UTI_ATTR_CPU_INTENSIVE(&uti_attr))) {
+		pr_err("%s: error: UTI_ATTR_CPU_INTENSIVE failed\n", __func__);
+	}
+#endif
+
+#ifdef PROFILE
+	end = rdtsc_light();
+	RECORD_STAT(cyc_init1_count, cyc_init1, end, start);
+	start = rdtsc_light();
+#endif
+
+	if ((ret = uti_pthread_create(&progress_thr, &pthread_attr, progress_fn, NULL, &uti_attr))) {
+		pr_err("%s: error: uti_pthread_create failed (%d)\n", __func__, ret);
+		goto out;
+	}
+
+	ret = 0;
+ out:
+	/* The attribute object is only needed while the thread is being created */
+	if (attr_ready) {
+		pthread_attr_destroy(&pthread_attr);
+	}
+	/* On failure give the reference back so that a later call can try again */
+	if (ret) {
+		__sync_fetch_and_sub(&progress_refc, 1);
+	}
+#ifdef PROFILE
+	end = rdtsc_light();
+	RECORD_STAT(cyc_init2_count, cyc_init2, end, start);
+#endif
+}
+
+void progress_start()
+{
+	unsigned long start, end;
+	/* Moves the progress thread from PROGRESS_INIT to PROGRESS_START, creating it on first use. */
+	if (progress_refc == 0) {
+		progress_init();
+	}
+	if (progress_refc == 0) { /* still zero: progress_init() failed, so the mutex below may not exist */
+		return;
+	}
+
+#ifdef PROFILE
+	start = rdtsc_light();
+#endif
+	pthread_mutex_lock(&progress_mutex);
+	if (progress_state == PROGRESS_FINALIZE) {
+		pr_warn("%s: warning: FINALIZE\n", __func__);
+		pthread_mutex_unlock(&progress_mutex);
+		return;
+	}
+
+	if (progress_state == PROGRESS_START) {
+		/* Already started: nothing to do */
+		pthread_mutex_unlock(&progress_mutex);
+		return;
+	}
+
+	if (progress_state != PROGRESS_INIT) {
+		pr_err("%s: error: unexpected state: %d\n", __func__, progress_state);
+		pthread_mutex_unlock(&progress_mutex);
+		return;
+	}
+	progress_state = PROGRESS_START;
+#if STOP_TYPE == STOP_BY_MEM
+	progress_stop_flag = 0;
+#endif
+	__sync_synchronize(); /* memory barrier instruction */
+	progress_flag_down = 1;
+	pthread_cond_signal(&progress_cond_down);
+	pthread_mutex_unlock(&progress_mutex);
+#ifdef PROFILE
+	end = rdtsc_light();
+	RECORD_STAT(cyc_start_count, cyc_start, end, start);
+#endif
+}
+
+void do_progress_stop()
+{
+	int ret;
+	unsigned long start, end;
+	/* Tells the progress thread to leave its polling loop, then spins until it has acknowledged. */
+	//if (progress_world_rank < 2) pr_debug("[%d] stop,cpu=%d\n", progress_world_rank, sched_getcpu());
+
+#ifdef PROFILE
+	start = rdtsc_light();
+#endif
+
+#if STOP_TYPE == STOP_BY_MEM
+	/* progress_fn() tests this flag in the condition of its polling loop */
+	progress_stop_flag = 1;
+        __sync_synchronize(); /* st-st barrier */
+
+#elif STOP_TYPE == STOP_BY_MPI
+	/* Zero-byte WAKE_TAG message on the private communicator; progress_fn() waits or probes for it */
+	if ((ret = MPI_Send(NULL, 0, MPI_CHAR, progress_rank, WAKE_TAG, progress_comm)) != MPI_SUCCESS) {
+		pr_err("%s: error: MPI_Send failed (%d)\n", __func__, ret);
+		return;
+	}
+
+
+#endif /* STOP_TYPE */
+
+#ifdef PROFILE
+	end = rdtsc_light();
+	RECORD_STAT(cyc_stop2_count, cyc_stop2, end, start);
+	start = rdtsc_light();
+#endif
+
+	/* Busy-wait for progress_fn() to raise progress_flag_up, which it does right after setting the state back to INIT */
+	while (!progress_flag_up) {
+	}
+	progress_flag_up = 0;
+
+#ifdef PROFILE
+	end = rdtsc_light();
+	RECORD_STAT(cyc_stop3_count, cyc_stop3, end, start);
+#endif
+}
+
+void progress_stop()
+{
+	/* Asks the progress thread to leave its polling loop; a no-op unless the state is PROGRESS_START. */
+	unsigned long start, end;
+	enum progress_state observed;
+
+#ifdef PROFILE
+	start = rdtsc_light();
+#endif
+
+	if (progress_refc == 0) {
+		return;
+	}
+
+	/* Read the state once while holding the mutex; only an unknown value is reported */
+	pthread_mutex_lock(&progress_mutex);
+	observed = progress_state;
+	switch (observed) {
+	case PROGRESS_INIT:
+	case PROGRESS_FINALIZE:
+	case PROGRESS_START:
+		break;
+	default:
+		pr_err("%s: error: unexpected state: %d\n", __func__, observed);
+		break;
+	}
+	pthread_mutex_unlock(&progress_mutex);
+
+	if (observed != PROGRESS_START) {
+		return;
+	}
+
+#ifdef PROFILE
+	end = rdtsc_light();
+	RECORD_STAT(cyc_stop1_count, cyc_stop1, end, start);
+#endif
+
+	do_progress_stop();
+}
+
+void progress_finalize()
+{
+	/* Stops the progress thread if it is polling, joins it and releases what progress_init() created. */
+	int ret;
+	int j;
+	unsigned long start, end;
+
+#ifdef PROFILE
+	start = rdtsc_light();
+#endif
+
+	if (progress_refc == 0) {
+		return;
+	}
+
+ retry:
+	pthread_mutex_lock(&progress_mutex);
+	if (progress_state == PROGRESS_START) {
+		pthread_mutex_unlock(&progress_mutex);
+		do_progress_stop();
+		goto retry;
+	}
+
+	if (progress_state == PROGRESS_FINALIZE) {
+		pthread_mutex_unlock(&progress_mutex);
+		return;
+	}
+
+	if (progress_state != PROGRESS_INIT) {
+		pr_err("%s: error: unexpected state: %d\n", __func__, progress_state);
+		pthread_mutex_unlock(&progress_mutex);
+		return;
+	}
+
+	progress_state = PROGRESS_FINALIZE;
+	__sync_synchronize(); /* st-st barrier */
+	progress_flag_down = 1;
+	pthread_cond_signal(&progress_cond_down);
+	pthread_mutex_unlock(&progress_mutex);
+
+	/* Make sure the following command will observe INIT */
+	while (!progress_flag_up) {
+	}
+	progress_flag_up = 0;
+
+	pthread_join(progress_thr, NULL);
+
+	/* A later progress_init() initializes both objects again, which POSIX leaves undefined for
+	 * objects that are still initialized; the thread has been joined, so nothing can be using them. */
+	if ((ret = pthread_cond_destroy(&progress_cond_down))) {
+		pr_err("%s: error: pthread_cond_destroy failed (%d)\n", __func__, ret);
+	}
+	if ((ret = pthread_mutex_destroy(&progress_mutex))) {
+		pr_err("%s: error: pthread_mutex_destroy failed (%d)\n", __func__, ret);
+	}
+	/* Keep going on failure: the thread is gone, so the reference has to be dropped regardless */
+	if ((ret = MPI_Comm_free(&progress_comm)) != MPI_SUCCESS) {
+		pr_err("%s: error: MPI_Comm_free failed (%d)\n", __func__, ret);
+	}
+
+	progress_refc = 0;
+
+#ifdef PROFILE
+	end = rdtsc_light();
+	RECORD_STAT(cyc_finalize_count, cyc_finalize, end, start);
+	for (j = 0; j < NRANK_STAT; j++) {
+		MPI_Barrier(MPI_COMM_WORLD);
+		if (j != progress_world_rank) {
+			usleep(1000000);
+			continue;
+		}
+		pr_stat("cyc_prog1", cyc_prog1_count, cyc_prog1);
+		pr_stat("cyc_prog2", cyc_prog2_count, cyc_prog2);
+		pr_stat("cyc_init1", cyc_init1_count, cyc_init1);
+		pr_stat("cyc_init2", cyc_init2_count, cyc_init2);
+		pr_stat("cyc_start", cyc_start_count, cyc_start);
+		pr_stat("cyc_stop1", cyc_stop1_count, cyc_stop1);
+		pr_stat("cyc_stop2", cyc_stop2_count, cyc_stop2);
+		pr_stat("cyc_stop3", cyc_stop3_count, cyc_stop3);
+		pr_stat("cyc_finalize", cyc_finalize_count, cyc_finalize);
+	}
+#endif
+}
diff --git a/test/uti/mpi/async_progress.h b/test/uti/mpi/async_progress.h
new file mode 100644
index 00000000..bd0d39a2
--- /dev/null
+++ b/test/uti/mpi/async_progress.h
@@ -0,0 +1,15 @@
+#ifndef _ASYNC_PROGRESS_INCLUDED_
+#define _ASYNC_PROGRESS_INCLUDED_
+/* States of the progress machinery and the entry points that drive it. */
+enum progress_state {
+	PROGRESS_INIT = 0,	/* the only state from which the finalize path goes on to tear down */
+	PROGRESS_START,		/* the finalize path first stops progress, then re-checks the state */
+	PROGRESS_FINALIZE	/* set by the finalize path; seeing it again makes that path return */
+};
+/* NOTE(review): "()" leaves the parameters unspecified in C -- confirm the definitions take none. */
+void progress_init();
+void progress_start();
+void progress_stop();
+void progress_finalize();
+
+#endif
diff --git a/test/uti/mpi/env_intel.sh b/test/uti/mpi/env_intel.sh
new file mode 100644
index 00000000..5455e20f
--- /dev/null
+++ b/test/uti/mpi/env_intel.sh
@@ -0,0 +1,17 @@
+export HYDRA_BOOTSTRAP_EXEC=/bin/pjrsh
+export HYDRA_BOOTSTRAP=rsh
+export HYDRA_PROXY_RETRY_COUNT=30
+
+#export HYDRA_BRANCH_COUNT=4
+
+export I_MPI_PIN=off
+export HFI_NO_CPUAFFINITY=1
+export KMP_AFFINITY=granularity=thread,scatter
+export I_MPI_COLL_INTRANODE_SHM_THRESHOLD=4194304
+export I_MPI_FABRICS=shm:tmi
+export PSM2_RCVTHREAD=0
+export I_MPI_TMI_PROVIDER=psm2
+export I_MPI_FALLBACK=0
+export PSM2_MQ_RNDV_HFI_WINDOW=4194304
+export PSM2_MQ_EAGER_SDMA_SZ=65536
+export PSM2_MQ_RNDV_HFI_THRESH=200000
diff --git a/test/uti/mpi/env_mpich.sh b/test/uti/mpi/env_mpich.sh
new file mode 100644
index 00000000..e61e71d8
--- /dev/null
+++ b/test/uti/mpi/env_mpich.sh
@@ -0,0 +1,5 @@
+export HYDRA_BOOTSTRAP_EXEC=/bin/pjrsh
+export HYDRA_BOOTSTRAP=rsh
+export HYDRA_PROXY_RETRY_COUNT=30
+export MPIR_CVAR_OFI_USE_PROVIDER=psm2
+
diff --git a/test/uti/mpi/filter.pl b/test/uti/mpi/filter.pl
new file mode 100755
index 00000000..e61c66ef
--- /dev/null
+++ b/test/uti/mpi/filter.pl
@@ -0,0 +1,22 @@
+#!/usr/bin/perl
+# Tallies, into %freq, the first word that follows a progress thread's tid at the start of a line.
+while(<>) { # For each line of hostfile
+    open(); # NOTE(review): no filehandle or path is given -- presumably a per-host log was meant; confirm
+    $found = 0;
+    while(<>) { # reads from the same <> stream as the outer loop
+        if($_ =~ /progress_fn,enter,tid=(\d+)/) {
+            $tid = $1; # remember the tid announced by the "progress_fn,enter" line
+            $found = 1;
+            #	print 'tid='.$tid."\n"
+        }
+        if($found == 1 && $_ =~ /^$tid/) {
+            if($_ =~ /^$tid\s(\w+)/) {
+                #	    print $1."\n";
+                $freq{$1}{$hostname}++; # NOTE(review): $hostname is never assigned in this script
+            }
+        }
+    }
+}
+foreach $key (sort(keys(%freq))) {
+    print $key.",".$freq{$key}."\n"; # NOTE(review): $freq{$key} is a hash reference here, not a count
+}
diff --git a/test/uti/mpi/mpi_progress_thread.pl b/test/uti/mpi/mpi_progress_thread.pl
new file mode 100755
index 00000000..273f1d84
--- /dev/null
+++ b/test/uti/mpi/mpi_progress_thread.pl
@@ -0,0 +1,100 @@
+#!/usr/bin/perl
+
+# Usage: ./mpi_progress_thread.pl <#procs> <#nnodes> (mck|lin) (mpich|intel)
+# Selects the resource group, elapse limit, OS launcher and MPI settings used to fill in the job template.
+use File::Basename;
+use File::Copy "cp";
+
+($nprocs, $nnodes, $os, $mpi) = @ARGV;
+die "Usage: $0 <#procs> <#nnodes> (mck|lin) (mpich|intel)\n" if @ARGV < 4 || $nnodes < 1;
+$ppn = $nprocs / $nnodes;
+@command = split /\s+/, basename($0);
+@fn = split /\./, $command[0];
+
+if($nnodes <= 16) {
+    $rg = 'MCK-FLAT-QUADRANT';
+} elsif($nnodes <= 128) {
+    $rg = 'debug-flat';
+} else {
+    $rg = 'regular-flat';
+}
+
+%elapse = (
+'1', '00:10:00',
+'2', '00:10:00',
+'4', '00:10:00',
+'8', '00:10:00',
+'16', '00:10:00',
+'32', '00:10:00',
+'64', '00:05:00',
+'128', '00:05:00',
+'256', '00:10:00',
+'512', '00:15:00',
+'1024', '00:15:00',
+'2048', '00:30:00',
+    );
+warn "no elapse limit is defined for $nnodes nodes; \@elapse\@ will be empty\n" unless exists $elapse{$nnodes};
+if ($os eq 'lin') {
+    $use_mck =  '';
+    $mck_mem = '';
+    $mcexec = '';
+    $mcexecopt = '';
+} else {
+    $path_to_mck = '/work/gg10/e29005/project/os/install';
+    $use_mck = '#PJM -x MCK='.$path_to_mck;
+    $mck_mem = '#PJM -x MCK_MEM=32G@0,8G@1';
+    $mcexec = $path_to_mck.'/bin/mcexec';
+    $mcexecopt = '-n '.$ppn;
+}
+
+if ($mpi eq 'intel') {
+    $cc = 'mpiicc';
+    $mpiexec = 'mpiexec';
+    $genv = '';
+    $progress = '-genv I_MPI_ASYNC_PROGRESS 1'; # -genv I_MPI_ASYNC_PROGRESS_PIN 1
+} else {
+    $mpi_lib = '/work/gg10/e29005/project/mpich/install';
+    $cc = $mpi_lib.'/bin/mpicc';
+    $mpiexec = $mpi_lib.'/bin/mpiexec';
+    $genv = '-genv LD_LIBRARY_PATH '.$mpi_lib.'/lib:$LD_LIBRARY_PATH';
+    $progress = '-genv MPIR_CVAR_ASYNC_PROGRESS 1';
+}
+
+# Build first so that the ../001 copied below is current; stop if the build fails.
+system("make clean; make CC=$cc") == 0 or die "make failed\n";
+$dir=$ARGV[2].'_'.$ARGV[0].'_'.$ARGV[1].'_'.`date +%Y%m%d_%H%M%S`;
+chomp($dir);
+print 'less '.$dir.'/job.sh.o*'."\n";
+# Every relative path below assumes the new result directory is the working directory.
+mkdir $dir or die "mkdir $dir failed: $!\n";
+chdir $dir or die "chdir $dir failed: $!\n";
+cp('../001', './001') or die 'copy failed';
+open(IN, "../$fn[0].sh.in") or die "cannot open ../$fn[0].sh.in: $!\n";
+open(OUT, ">./job.sh") or die "cannot create ./job.sh: $!\n";
+while(<IN>) {
+    s/\@rg@/$rg/g;
+    s/\@nnodes@/$nnodes/g;
+    s/\@nprocs@/$nprocs/g;
+    s/\@elapse@/$elapse{$nnodes}/g;
+    s/\@use_mck@/$use_mck/g;
+    s/\@mck_mem@/$mck_mem/g;
+    s/\@progress@/$progress/g;
+    s/\@genv@/$genv/g;
+    s/\@mpiexec@/$mpiexec/g;
+    s/\@mcexec@/$mcexec/g;
+    s/\@mcexecopt@/$mcexecopt/g;
+    if(/\@env@/) {
+	# The placeholder line is replaced by the whole MPI-specific environment file.
+	open(INCL, "../env_$mpi.sh") or die "cannot open ../env_$mpi.sh: $!\n";
+	print OUT <INCL>;
+	close(INCL);
+	next;
+    }
+    print OUT $_;
+}
+close(IN);
+close(OUT) or die "cannot finish writing ./job.sh: $!\n";
+
+$cmd = 'PJM_MCK_AVAILABLE=1 pjsub ./job.sh';
+#print $cmd."\n";
+exec($cmd) or die "cannot run pjsub: $!\n";
diff --git a/test/uti/mpi/mpi_progress_thread.sh.in b/test/uti/mpi/mpi_progress_thread.sh.in
new file mode 100644
index 00000000..f149a1f8
--- /dev/null
+++ b/test/uti/mpi/mpi_progress_thread.sh.in
@@ -0,0 +1,16 @@
+#!/bin/sh
+
+#PJM -L rscgrp=@rg@
+#PJM -L node=@nnodes@
+#PJM --mpi proc=@nprocs@
+#PJM -L elapse=@elapse@
+#PJM -L proc-crproc=16384 
+#PJM -g gg10
+#PJM -j
+#PJM -s
+@use_mck@
+@mck_mem@
+
+@env@
+
+@mpiexec@ @genv@ @progress@ -np @nprocs@ -machinefile ${PJM_O_NODEINF} @mcexec@ @mcexecopt@ ./001 1048576 1000
diff --git a/test/uti/mpi/util.c b/test/uti/mpi/util.c
new file mode 100644
index 00000000..cdf51140
--- /dev/null
+++ b/test/uti/mpi/util.c
@@ -0,0 +1,186 @@
+#define _GNU_SOURCE         /* See feature_test_macros(7) */
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <sys/syscall.h>   /* For SYS_xxx definitions */
+#include <sched.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <string.h>
+#include <time.h>
+#include "util.h"
+
+/* Messaging */
+enum test_loglevel test_loglevel = TEST_LOGLEVEL_DEBUG;
+
+/* Calculation */
+static inline void asmloop(unsigned long n) {
+	unsigned long j; /* same type as n: an int counter overflows once n exceeds INT_MAX */
+	/* One unit of work per outer iteration: a counting loop in rcx that runs while rcx <= 99 */
+	for (j = 0; j < n; j++) {
+	asm volatile(
+	    "movq $0, %%rcx\n\t"
+		"1:\t"
+		"addq $1, %%rcx\n\t"
+		"cmpq $99, %%rcx\n\t"
+		"jle 1b\n\t"
+		:
+		: 
+		: "rcx", "cc");
+	} 
+}
+
+#define N_INIT 10000000
+double nspw; /* nsec per work */
+
+void ndelay_init(int verbose) {
+	struct timespec start, end;
+	/* Calibrates nspw, the wall-clock nanoseconds taken per asmloop() unit, from N_INIT units. */
+	/* clock_gettime() needs only <time.h>, which this file includes (gettimeofday() would need <sys/time.h>). */
+	clock_gettime(TIMER_KIND, &start);
+
+#pragma omp parallel
+	{
+		asmloop(N_INIT);
+	}
+
+	/* Second sample from the same clock */
+	clock_gettime(TIMER_KIND, &end);
+
+	nspw = DIFFNSEC(end, start) / (double)N_INIT;
+	if (verbose) {
+		pr_debug("nspw=%f\n", nspw);
+	}
+}
+
+#if 1 /* active variant: delay by a calibrated number of asmloop() units */
+void ndelay(long delay_nsec) {
+	if (delay_nsec < 0) { 
+		printf("delay_nsec < 0\n");
+		return;
+	}
+#pragma omp parallel
+	{
+		asmloop(delay_nsec / nspw); /* nspw is written by ndelay_init() and starts at 0; NOTE(review): confirm init always runs first */
+	}
+}
+#else /* For machines with large core-to-core performance variation (e.g. OFP) */
+void ndelay(long delay_nsec) {
+	struct timespec start, end;
+	
+	if (delay_nsec < 0) { return; }
+	clock_gettime(TIMER_KIND, &start);
+
+	while (1) { /* poll the clock until delay_nsec has elapsed since start */
+		clock_gettime(TIMER_KIND, &end);
+		if (DIFFNSEC(end, start) >= delay_nsec) {
+			break;
+		}
+		asmloop(2); /* ~150 ns per iteration on FOP */
+	}
+}
+#endif
+
+
+double cycpw; /* cyc per work */
+
+void cdelay_init() { /* spelled as in util.h's prototype; calibrates cycpw, the TSC cycles per asmloop() unit */
+	unsigned long start, end;
+
+	start = rdtsc_light();
+	asmloop(N_INIT); /* N_INIT is already defined above ndelay_init(); no need to define it again here */
+	end = rdtsc_light();
+	cycpw = (end - start) / (double)N_INIT;
+}
+void cdlay_init() { cdelay_init(); } /* original (misspelled) name, kept so existing callers still link */
+
+#if 0
+void cdelay(long delay_cyc) {
+	if (delay_cyc < 0) { 
+		return;
+	}
+	asmloop(delay_cyc / cycpw);
+}
+#else /* For machines with large core-to-core performance variation (e.g. OFP) */
+/* Busy-waits until at least delay_cyc cycles, as counted by rdtsc_light(),
+ * have passed since entry. A negative delay returns immediately.
+ * Between two counter reads the loop performs asmloop(2). */
+void cdelay(long delay_cyc) {
+	unsigned long t0; /* counter value at entry */
+
+	if (delay_cyc < 0) { return; }
+	t0 = rdtsc_light();
+
+	/* Re-read the counter each time round; stop once the difference reaches delay_cyc */
+	while (rdtsc_light() - t0 < delay_cyc) {
+		asmloop(2);
+	}
+}
+#endif
+
+
+/* Prints "<name>: pmi_rank=..,os=..,stat-cpu=..,sched_getcpu=..,tid=.." for the calling thread;
+ * stat-cpu is field 39 of /proc/<pid>/task/<tid>/stat and os is "lin" when syscall(732) returns -1,
+ * "mck" otherwise. Returns 0 on success and -1 on failure. */
+int print_cpu_last_executed_on(const char *name) {
+	enum { STAT_BUFSZ = 65536, STAT_CPU_FIELD = 39 };
+	char fn[256];
+	char *result = NULL; /* NULL keeps free() at fn_exit valid when we fail before malloc() */
+	char *next_delim, *field = NULL;
+	const char *pmi_rank = getenv("PMI_RANK"); /* may be unset; reported as -1 in that case */
+	pid_t tid = syscall(SYS_gettid);
+	int fd = -1, offset = 0, mpi_errno = 0;
+	int amount, i, cpu, rc;
+
+	snprintf(fn, sizeof(fn), "/proc/%d/task/%d/stat", getpid(), (int)tid);
+	fd = open(fn, O_RDONLY);
+	if (fd == -1) {
+		printf("open() failed\n");
+		goto fn_fail;
+	}
+
+	result = malloc(STAT_BUFSZ);
+	if (result == NULL) {
+		printf("malloc() failed");
+		goto fn_fail;
+	}
+
+	/* Read until EOF, never past the buffer and always leaving room for the NUL */
+	while (offset < STAT_BUFSZ - 1) {
+		amount = read(fd, result + offset, STAT_BUFSZ - 1 - offset);
+		if (amount == -1) {
+			printf("read() failed");
+			goto fn_fail;
+		}
+		if (amount == 0) {
+			break;
+		}
+		offset += amount;
+	}
+	result[offset] = '\0'; /* strsep() works on a C string */
+
+	next_delim = result;
+	for (i = 0; i < STAT_CPU_FIELD; i++) {
+		field = strsep(&next_delim, " ");
+	}
+	if (field == NULL) {
+		printf("stat has fewer than %d fields\n", STAT_CPU_FIELD);
+		goto fn_fail;
+	}
+	cpu = sched_getcpu();
+	if (cpu == -1) {
+		printf("sched_getcpu() failed\n");
+		goto fn_fail;
+	}
+	rc = syscall(732);
+	printf("%s: pmi_rank=%02d,os=%s,stat-cpu=%02d,sched_getcpu=%02d,tid=%d\n", name, pmi_rank ? atoi(pmi_rank) : -1, rc == -1 ? "lin" : "mck", atoi(field), cpu, (int)tid); fflush(stdout);
+ fn_exit:
+	if (fd != -1) close(fd);
+	free(result);
+	return mpi_errno;
+ fn_fail:
+	mpi_errno = -1;
+	goto fn_exit;
+}
diff --git a/test/uti/mpi/util.h b/test/uti/mpi/util.h
new file mode 100644
index 00000000..3482aae3
--- /dev/null
+++ b/test/uti/mpi/util.h
@@ -0,0 +1,73 @@
+#ifndef __UTIL_H_INCLUDED__
+#define __UTIL_H_INCLUDED__
+
+#include <stdint.h>
+
+/* Messaging */
+
+enum test_loglevel {
+	TEST_LOGLEVEL_ERR = 0,	/* pr_err() output only */
+	TEST_LOGLEVEL_WARN,	/* adds pr_warn() output */
+	TEST_LOGLEVEL_DEBUG	/* adds pr_debug() output */
+};
+/* Current threshold; pr_level() prints a message when test_loglevel >= the message's level. */
+extern enum test_loglevel test_loglevel;
+static inline void test_set_loglevel(enum test_loglevel level)
+{
+	test_loglevel = level;
+}
+
+#define pr_level(level, fmt, args...) do {	\
+	if (test_loglevel >= level) {	\
+		fprintf(stdout, fmt, ##args);	\
+	}					\
+} while (0)
+/* pr_level prints fmt to stdout when test_loglevel >= level; the wrappers below fix the level (pr_err also goes to stdout). */
+#define pr_err(fmt, args...) pr_level(TEST_LOGLEVEL_ERR, fmt, ##args)
+#define pr_warn(fmt, args...) pr_level(TEST_LOGLEVEL_WARN, fmt, ##args)
+#define pr_debug(fmt, args...) pr_level(TEST_LOGLEVEL_DEBUG, fmt, ##args)
+
+#define _OKNG(verb, jump, cond, fmt, args...) do {	\
+	if (cond) {					\
+		if (verb)				\
+			printf("[ OK ] " fmt, ##args);	\
+	} else {					\
+		printf("[ NG ] " fmt, ##args);		\
+		if (jump) {				\
+			ret = -1;			\
+			goto out;			\
+		}					\
+	}						\
+} while (0)
+/* With jump set, a failed cond assigns ret = -1 and does "goto out", so the calling function must provide both. */
+#define OKNG(args...) _OKNG(1, 1, ##args)	/* report both outcomes, jump on failure */
+#define NG(args...) _OKNG(0, 1, ##args)	/* report failures only, jump on failure */
+#define OKNGNOJUMP(args...) _OKNG(1, 0, ##args)	/* report both outcomes, never jump */
+
+/* Time */
+/* Reads the time-stamp counter with rdtscp and returns it as one 64-bit value (rdx:rax combined).
+ * static inline: a plain "inline" definition in a header provides no external definition to link against. */
+static inline uint64_t rdtsc_light(void)
+{
+    uint64_t x;
+    __asm__ __volatile__("rdtscp;" /* rdtscp don't jump over earlier instructions */
+                         "shl $32, %%rdx;"
+                         "or %%rdx, %%rax"
+                         : "=a"(x) : : "%rcx", "%rdx", "memory");
+    return x;
+}
+
+#define DIFFUSEC(end, start) (((end).tv_sec - (start).tv_sec) * 1000000UL + ((end).tv_usec - (start).tv_usec)) /* microseconds between two values with tv_sec/tv_usec */
+#define DIFFNSEC(end, start) (((end).tv_sec - (start).tv_sec) * 1000000000UL + ((end).tv_nsec - (start).tv_nsec)) /* nanoseconds between two values with tv_sec/tv_nsec */
+#define TIMER_KIND CLOCK_MONOTONIC_RAW /* CLOCK_THREAD_CPUTIME_ID */
+
+/* Calculation emulation */
+void ndelay_init(); /* NOTE(review): util.c defines this with an int verbose parameter -- callers should pass it */
+void ndelay(long delay_nsec);
+void cdelay_init();
+void cdelay(long delay_cyc);
+
+/* CPU location */
+int print_cpu_last_executed_on(); /* util.c defines this with a const char *name parameter, printed as the line prefix */
+
+#endif
diff --git a/test/uti/posix_aio/001.c b/test/uti/posix_aio/001.c
new file mode 100644
index 00000000..2e09b9cf
--- /dev/null
+++ b/test/uti/posix_aio/001.c
@@ -0,0 +1,517 @@
+#include <fcntl.h>
+#include <limits.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <aio.h>
+#include <signal.h>
+#include <libgen.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <mpi.h>
+#include <linux/limits.h>
+#include "util.h"
+
+#define NREQS 1 /* # of parallel I/O requests per process */
+#define SZBUF (1ULL<<23)
+
+#define MYTIME_TOUSEC 1000000
+#define MYTIME_TONSEC 1000000000
+
+#define NROW 11
+#define NCOL 4
+
+#define NSAMPLES_DROP 0/*10*/
+#define NSAMPLES_IO 2/*20*/
+#define NSAMPLES_TOTAL 2/*20*/
+#define NSAMPLES_INNER 1
+
+#define Q(x) #x
+#define QUOTE(x) Q(x)
+
+char test_srcdir[PATH_MAX];
+
+static inline double mytime() {
+	return /*rdtsc_light()*/MPI_Wtime(); /* timestamp used by measure(); the TSC reader is the disabled alternative */
+}
+
+struct aioreq {
+	int rank;	/* index of this request; my_aio_init() stores the loop index here */
+	int status;	/* EINPROGRESS after submission, then the latest aio_error() result */
+	struct aiocb *aiocbp;	/* control block handed to aio_write()/aio_error()/aio_return() */
+};
+
+static void aio_sighandler(int sig, siginfo_t *si, void *ucontext)
+{
+	if (si->si_code == SI_ASYNCIO) { /* body is empty: completion is detected by polling aio_error() in my_aio() */
+		//struct aioreq *aioreq = si->si_value.sival_ptr;
+		//pr_debug("I/O completion signal received\n");
+	}
+}
+
+int my_aio_init(int nreqs, struct aioreq *iolist, struct aiocb *aiocblist, char *aiobufs[NREQS]) {
+	int idx; /* request index */
+	for (idx = 0; idx < nreqs; idx++) { /* pair request idx with control block idx and buffer idx; always returns 0 */
+		struct aioreq *req = &iolist[idx];
+		struct aiocb *cb = &aiocblist[idx];
+		req->rank = idx;
+		req->aiocbp = cb;
+		cb->aio_buf = aiobufs[idx];
+		cb->aio_nbytes = SZBUF;
+		cb->aio_reqprio = 0;
+		cb->aio_offset = 0;
+		cb->aio_sigevent.sigev_notify = SIGEV_SIGNAL;
+		cb->aio_sigevent.sigev_signo = SIGUSR1; /* handled by the SIGUSR1 handler installed in main() */
+		cb->aio_sigevent.sigev_value.sival_ptr = req;
+	}
+	return 0;
+}
+
+
+/* Runs "<VMTOUCH> -e <file> > /dev/null" for each of the nreqs files in fn[] (used to evict file cache).
+ * Returns 0 on success, -1 if a command cannot be built or started, else the command's exit status. */
+int my_aio_evict(int nreqs, char **fn) {
+	int ret, i, len;
+	char cmd[PATH_MAX];
+	for (i = 0; i < nreqs; i++) { /* honour nreqs instead of the compile-time NREQS */
+		len = snprintf(cmd, sizeof(cmd), "%s -e %s > /dev/null", QUOTE(VMTOUCH), fn[i]);
+		if (len < 0 || (size_t)len >= sizeof(cmd)) {
+			pr_err("%s: error: command too long\n", __func__);
+			ret = -1;
+			goto out;
+		}
+		ret = system(cmd);
+		if (ret == -1) {
+			pr_err("%s: error: system\n", __func__);
+			goto out;
+		}
+		if (WEXITSTATUS(ret)) {
+			ret = WEXITSTATUS(ret);
+			pr_err("%s: error: system returned %d\n", __func__, ret);
+			goto out;
+		}
+	}
+	ret = 0;
+ out:
+	return ret;
+}
+/* Opens (creating and truncating) fn[j] for each of the nreqs requests and stores the descriptor
+ * in that request's aiocb. Returns 0 on success and 1 as soon as one open() fails. */
+int my_aio_open(int nreqs, struct aioreq *iolist, char **fn) {
+	int ret;
+	int j;
+
+	for (j = 0; j < nreqs; j++) { /* honour nreqs instead of the compile-time NREQS */
+		iolist[j].aiocbp->aio_fildes = open(fn[j], O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH);
+		if (iolist[j].aiocbp->aio_fildes == -1) {
+			pr_err("%s: error: open %s: %s\n", __func__, fn[j], strerror(errno));
+			ret = 1;
+			goto out;
+		}
+	}
+	ret = 0;
+ out:
+	return ret;
+}
+
+/* Verifies that each file fn[i] starts with exactly the SZBUF bytes held in mem_data[i].
+ * Returns 0 when all nreqs files match and a negative value otherwise. */
+int my_aio_check(int nreqs, char **fn, char **mem_data) {
+	int ret;
+	int i;
+	FILE *fp = NULL;
+	char *file_data;
+
+	/* One scratch buffer is enough because the files are checked one at a time */
+	if (!(file_data = malloc(SZBUF))) {
+		pr_err("error: allocating data\n");
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	for (i = 0; i < nreqs; i++) {
+		if (!(fp = fopen(fn[i], "r+"))) {
+			ret = -errno; /* capture before pr_err() can overwrite errno */
+			pr_err("error: fopen %s: %s\n", fn[i], strerror(-ret));
+			goto out;
+		}
+		if (fread(file_data, sizeof(char), SZBUF, fp) != SZBUF) {
+			pr_err("error: fread: %s\n", strerror(errno));
+			ret = -1;
+			goto out;
+		}
+		fclose(fp);
+		fp = NULL; /* nothing left for the cleanup at out to close */
+
+		if (memcmp(file_data, mem_data[i], SZBUF)) {
+			pr_err("%s: file_data[%d] and mem_data[%d] doesn't match\n",
+			       __func__, i, i);
+			ret = -1;
+			goto out;
+		}
+	}
+	ret = 0;
+ out:
+	if (fp) {
+		fclose(fp);
+	}
+	free(file_data);
+	return ret;
+}
+
+void my_aio_close(int nreqs, struct aioreq *iolist) {
+	int j;
+	/* Close the file of each of the nreqs requests and mark its descriptor as invalid */
+	for (j = 0; j < nreqs; j++) { /* honour nreqs instead of the compile-time NREQS */
+		close(iolist[j].aiocbp->aio_fildes);
+		iolist[j].aiocbp->aio_fildes = -1;
+	}
+}
+
+/* Runs NSAMPLES_INNER rounds. Each round opens the files, submits one aio_write() per request,
+ * emulates computation with ndelay(nsec_calc), polls aio_error() until no request is EINPROGRESS,
+ * checks that every request wrote SZBUF bytes and closes the files.
+ * Returns 0 on success and a non-zero value at the first failure. */
+int my_aio(int nreqs, struct aioreq *iolist, char **fn, long nsec_calc) {
+	int ret;
+	int i, j;
+
+	/* i counts rounds and j indexes requests; sharing one counter would let the inner loops rewrite the round count */
+	for (i = 0; i < NSAMPLES_INNER; i++) {
+		int completion_count = 0;
+
+		if ((ret = my_aio_open(nreqs, iolist, fn))) { /* my_aio_open() signals failure with a non-zero value, not -1 */
+			pr_err("%s: error: my_aio_open returned %d\n",
+			       __func__, ret);
+			goto out;
+		}
+
+		for (j = 0; j < nreqs; j++) {
+
+			/* Reset completion notice */
+			iolist[j].status = EINPROGRESS;
+
+			if ((ret = aio_write(iolist[j].aiocbp)) == -1) {
+				ret = -errno; /* capture before pr_err() can overwrite errno */
+				pr_err("%s: error: aio_write: %s\n",
+				       __func__, strerror(-ret));
+				goto out;
+			}
+		}
+
+		/* Emulate calculation phase */
+		ndelay(nsec_calc);
+
+		/* Wait for completion of async IO */
+		while (completion_count != nreqs) {
+			for (j = 0; j < nreqs; j++) {
+				if (iolist[j].status != EINPROGRESS) {
+					continue;
+				}
+
+				iolist[j].status = aio_error(iolist[j].aiocbp);
+
+				switch (iolist[j].status) {
+				case 0: /* Succeeded */
+					goto completed;
+				case EINPROGRESS:
+					break;
+				case ECANCELED:
+					pr_err("%s: error: aio is cancelled\n",
+					       __func__);
+					goto completed;
+				default:
+					pr_err("%s: error: unexpected status: %d\n",
+					       __func__, iolist[j].status);
+					goto completed;
+				completed:
+					completion_count++;
+					break;
+				}
+			}
+		}
+
+		/* Check write amount */
+		for (j = 0; j < nreqs; j++) {
+			ssize_t size;
+
+			if ((size = aio_return(iolist[j].aiocbp)) != SZBUF) {
+				pr_err("%s: Expected to have written %llu B but reported to have written %zd B\n",
+				       __func__, (unsigned long long)SZBUF, size);
+				ret = -1;
+				goto out;
+			}
+		}
+
+		my_aio_close(nreqs, iolist);
+	}
+	ret = 0;
+ out:
+	return ret;
+}
+
+/* Times my_aio() nsamples + nsamples_drop times between barriers, checks the file contents after
+ * each run and stores in *result the sum, over the samples with index >= nsamples_drop, of the
+ * per-sample maximum elapsed time across ranks, divided by nsamples.
+ * Failures of my_aio() and my_aio_check() are logged only; the function always returns 0. */
+int measure(double *result, int nsamples, int nsamples_drop, int nreqs, struct aioreq *iolist, char **fn, char **aiobufs, long nsec_calc) {
+	int ret;
+	int sample;
+	double elapsed_local, elapsed_max, total = 0;
+	double t_begin, t_end;
+
+	for (sample = 0; sample < nsamples + nsamples_drop; sample++) {
+#if 0
+		pr_debug("debug: evicting file cache\n");
+		if ((ret = my_aio_evict(nreqs, fn))) {
+			pr_err("%s: error: my_aio_evict returned %d\n",
+			       __func__, ret);
+		}
+#endif
+		MPI_Barrier(MPI_COMM_WORLD);
+
+		t_begin = mytime();
+		ret = my_aio(nreqs, iolist, fn, nsec_calc);
+		if (ret) {
+			pr_err("%s: error: my_aio_read returned %d\n",
+			       __func__, ret);
+		}
+		t_end = mytime();
+
+		MPI_Barrier(MPI_COMM_WORLD);
+
+		/* Compare what is on disk with what was written */
+		ret = my_aio_check(nreqs, fn, aiobufs);
+		if (ret) {
+			pr_err("%s: error: my_aio_check returned %d\n",
+			       __func__, ret);
+		}
+
+		if (sample >= nsamples_drop) {
+			/* The slowest rank decides the time of this sample */
+			elapsed_local = t_end - t_begin;
+			MPI_Allreduce(&elapsed_local, &elapsed_max, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+			total += elapsed_max;
+		}
+	}
+	*result = total / nsamples;
+	return 0;
+}
+
+int main(int argc, char **argv) /* parses -I/-p, prepares files and buffers, then measures I/O-only and I/O-plus-calculation times */
+{
+	int ret;
+	int i, j, progress, l;
+	int rank, nproc;
+	int disable_syscall_intercept = 0, ppn = -1;
+	struct aioreq *iolist;
+	struct aiocb *aiocblist;
+	struct sigaction sa;
+	double t_io_ave, t_total_ave;
+	double t_table[NROW][NCOL] = { 0 };
+	int opt;
+	char *aiobufs[NREQS] = { 0 };
+	char **fn;
+
+	opterr = 0; /* Don't print out error when not recognizing option character */
+	
+	while ((opt = getopt(argc, argv, ":I:p:")) != -1) {
+		switch (opt) {
+		case 'I':
+			disable_syscall_intercept = atoi(optarg);
+			break;
+		case 'p':
+			ppn = atoi(optarg);
+			break;
+		case '?':
+			pr_err("error: invalid option: -%c\n",
+			       optopt);
+			ret = 1;
+			goto out;
+		case ':':
+			pr_err("error: option -%c requires an argument\n",
+			       optopt);
+			ret = 1;
+			goto out;
+		}
+	}
+
+	if (ppn == -1) {
+		pr_err("error: specify -p <PPN>\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* Initialize MPI */
+	MPI_Init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &nproc);
+
+	/* Show parameters */
+	if (rank == 0) {
+#pragma omp parallel
+		{
+			if (omp_get_thread_num() == 0) {
+				printf("nproc: %d, ppn: %d, #threads: %d\n", nproc, ppn, omp_get_num_threads());
+			}
+		}
+	}
+
+	/* Set verbosity */
+	//test_set_loglevel(TEST_LOGLEVEL_WARN);	
+
+	/* Initialize delay function */
+	ndelay_init();
+
+	/* Prepare file names */
+
+#define TEST_SRCDIR "/work/gg10/e29005"
+	sprintf(test_srcdir, "%s", /*TEST_SRCDIR*/dirname(argv[0]));
+
+	if (!(fn = malloc(sizeof(char *) * NREQS))) {
+		pr_err("error: allocating fn\n");
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	for (i = 0; i < NREQS; i++) {
+		if (!(fn[i] = malloc(PATH_MAX))) {
+			pr_err("error: allocating fn\n");
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		sprintf(fn[i], "%s/rank%d-number%d", test_srcdir, rank, i);
+		if (rank == 0) pr_debug("debug: rank: %d, fn[%d]: %s\n",
+			 rank, i, fn[i]);
+	}
+
+	/* Allocate aio commands */
+	if (!(iolist = calloc(NREQS, sizeof(struct aioreq)))) {
+		pr_err("%s: error: allocating iolist\n",
+		       __func__);
+		ret = 1;
+		goto out;
+	}
+
+	if (!(aiocblist = calloc(NREQS, sizeof(struct aiocb)))) {
+		pr_err("%s: error: allocating aiocblist\n",
+		       __func__);
+		ret = 1;
+		goto out;
+	}
+
+	/* Prepare contents to be written */
+	for (i = 0; i < NREQS; i++) {
+		aiobufs[i] = malloc(SZBUF);
+		if (!aiobufs[i]) {
+			pr_err("%s: error: allocating aiobufs\n",
+			       __func__);
+			ret = 1;
+			goto out;
+		}
+
+		for (j = 0; j < SZBUF; j++) {
+			*(aiobufs[i] + j) = i + j + rank;
+		}
+	}
+	/* Set signal handlers */
+	sigemptyset(&sa.sa_mask); /* sa is an uninitialised local: give the mask a defined (empty) value */
+	sa.sa_flags = SA_RESTART | SA_SIGINFO;
+	sa.sa_sigaction = aio_sighandler;
+	if (sigaction(SIGUSR1, &sa, NULL) == -1) {
+		pr_err("%s: error: sigaction: %s\n",
+		       __func__, strerror(errno));
+		ret = 1;
+		goto out;
+	}
+
+	/* Set aio parameters except fd and status */
+	if ((ret = my_aio_init(NREQS, iolist, aiocblist, aiobufs))) {
+		pr_err("%s: error: my_aio_init returned %d\n",
+		       __func__, ret);
+		goto out;
+	}
+
+	/* Measure IO only time */
+	//pr_debug("debug: measuring IO only time\n");
+	if ((ret = measure(&t_io_ave, NSAMPLES_IO, NSAMPLES_DROP, NREQS, iolist, fn, aiobufs, 0))) {
+		pr_err("error: measure returned %d\n", ret);
+		goto out;
+	}
+
+	if (rank == 0) {
+		printf("t_io_ave: %.0f usec, %.0f MB/s per node\n",
+		       t_io_ave * MYTIME_TOUSEC,
+		       SZBUF * ppn / t_io_ave / 1000000);
+	}
+
+	/* Measure time with no progress, progress and no uti, progress and uti */
+	for (progress = 0; progress <= (disable_syscall_intercept ? 0 : 0); progress += 1) {
+
+		/* Spawn helper thread onto compute CPUs with ignoring uti_attr */
+		if (progress == 1) {
+			setenv("DISABLE_UTI", "1", 1); 
+		}
+		/* Spawn helper thread onto dedicated CPUs with respecting uti_attr */
+		else if (progress == 2) {
+			unsetenv("DISABLE_UTI");
+		}
+
+		/* Measure with various calculation time */
+		for (l = 0; l <= 10; l += 2) {
+			long nsec_calc = (t_io_ave * MYTIME_TONSEC * l) / 10;
+			
+			if ((ret = measure(&t_total_ave, NSAMPLES_TOTAL, NSAMPLES_DROP, NREQS, iolist, fn, aiobufs, nsec_calc))) {
+				pr_err("error: measure returned %d\n", ret);
+				goto out;
+			}
+
+			if (rank == 0) {
+				if (l == 0) {
+					pr_debug("progress=%d\n", progress);
+					if (progress == 0) { 
+						pr_debug("calc\ttotal\n");
+					} else {
+						pr_debug("total\n");
+					}
+				}
+
+				t_table[l][0] = nsec_calc * (MYTIME_TOUSEC / (double)MYTIME_TONSEC);
+				if (progress == 0) { 
+					pr_debug("%.0f\t%.0f\n", nsec_calc * (MYTIME_TOUSEC / (double)MYTIME_TONSEC), t_total_ave * MYTIME_TOUSEC);
+					t_table[l][progress + 1] = t_total_ave * MYTIME_TOUSEC;
+				} else {
+					pr_debug("%.0f\n", t_total_ave * MYTIME_TOUSEC);
+					t_table[l][progress + 1] = t_total_ave * MYTIME_TOUSEC;
+				}
+			}
+		}
+	}
+
+	if (rank == 0) {
+		printf("calc,no prog,prog and no uti, prog and uti\n");
+		for (l = 0; l <= 10; l++) {
+			for (i = 0; i < NCOL; i++) {
+				if (i > 0) {
+					printf(",");
+				}
+				printf("%.0f", t_table[l][i]);
+			}
+			printf("\n");
+		}
+	}
+
+	MPI_Barrier(MPI_COMM_WORLD);
+
+	MPI_Finalize();
+
+	ret = 0;
+out:
+	for (i = 0; i < NREQS; i++) {
+		free(aiobufs[i]);
+	}
+	return ret;
+}
diff --git a/test/uti/posix_aio/001.sh b/test/uti/posix_aio/001.sh
new file mode 100755
index 00000000..6d3289d8
--- /dev/null
+++ b/test/uti/posix_aio/001.sh
@@ -0,0 +1,270 @@
+#!/usr/bin/bash
+
+#!/usr/bin/bash -x
+
+MYHOME=/home/e29005
+test_dir=`pwd -P`
+mck_dir=${MYHOME}/project/os/install
+uti_dir_lin=${MYHOME}/project/uti/install_linux
+uti_dir_mck=${MYHOME}/project/uti/install_mckernel
+
+exe=`basename $0 | sed 's/\.sh//'`
+
+stop=0
+reboot=0
+go=0
+
+interactive=0
+pjsub=0
+gdb=0
+disable_syscall_intercept=0
+mck=0
+nnodes=2
+LASTNODE=8196
+use_hfi=0
+omp_num_threads=1
+ppn=4
+
+while getopts srgc:ml:N:P:o:hGI:ipL: OPT
+do
+        case ${OPT} in
+            s) stop=1
+                ;;
+            r) reboot=1
+                ;;
+	    g) go=1
+		;;
+            m) mck=1
+                ;;
+	    N) nnodes=$OPTARG
+		;;
+	    P) ppn=$OPTARG
+		;;
+	    o) omp_num_threads=$OPTARG
+		;;
+	    h) use_hfi=1
+		;;
+	    G) gdb=1
+		;;
+	    I) disable_syscall_intercept=$OPTARG
+		;;
+	    i) interactive=1
+		;;
+	    p) pjsub=1
+		;;
+	    L) LASTNODE=$OPTARG
+		;;
+            *) echo "invalid option -${OPT}" >&2
+                exit 1
+        esac
+done
+
+nprocs=$((ppn * nnodes))
+nodes=`echo $(seq -s ",c" $(($LASTNODE + 1 - $nnodes)) $LASTNODE) | sed 's/^/c/'`
+
+# vertical cut, excluding phys loaded with Linux tasks
+uti_cpu_set_lin=1,69,137,205,18-19,86-87,154-155,222-223
+exclude_list=0-1,68-69,136-137,204-205,18-19,86-87,154-155,222-223
+#64-67,132-135,200-203,268-271 
+
+uti_cpu_set_mck=1,69,137,205,18-19,86-87,154-155,222-223
+
+# horizontal cut, excluding phys loaded with Linux tasks for mckernel
+#uti_cpu_set_lin=204-271 
+#uti_cpu_set_mck=1-67
+
+if [ $mck -eq 0 ]; then
+    uti_cpu_set_str="export UTI_CPU_SET=$uti_cpu_set_lin"
+    i_mpi_pin_processor_exclude_list="export I_MPI_PIN_PROCESSOR_EXCLUDE_LIST=$exclude_list"
+else
+    uti_cpu_set_str="export UTI_CPU_SET=$uti_cpu_set_mck"
+    i_mpi_pin_processor_exclude_list=
+fi
+
+if [ ${mck} -eq 1 ]; then
+    i_mpi_pin=off
+    i_mpi_pin_domain=
+    i_mpi_pin_order=
+#    if [ $omp_num_threads -eq 1 ]; then
+#	# Avoid binding main thread and uti thread to one CPU
+	kmp_affinity="export KMP_AFFINITY=disabled" 
+#    else
+#	# Bind rank to OMP_NUM_THREAD-sized CPU-domain
+#	kmp_affinity="export KMP_AFFINITY=granularity=thread,scatter"
+#    fi
+else
+    i_mpi_pin=on
+    domain=$omp_num_threads # Use 32 when you want to match mck's -n division
+    i_mpi_pin_domain="export I_MPI_PIN_DOMAIN=$domain"
+    i_mpi_pin_order="export I_MPI_PIN_ORDER=compact"
+    kmp_affinity="export KMP_AFFINITY=granularity=thread,scatter"
+fi
+
+echo nprocs=$nprocs nnodes=$nnodes ppn=$ppn nodes=$nodes omp_num_threads=$omp_num_threads
+
+if [ ${mck} -eq 1 ]; then
+    makeopt="UTI_DIR=$uti_dir_mck"
+    use_mck="#PJM -x MCK=$mck_dir"
+    mck_mem="#PJM -x MCK_MEM=32G@0,8G@1"
+    mcexec="${mck_dir}/bin/mcexec"
+    nmcexecthr=$((omp_num_threads + 4))
+    mcexecopt="-n $ppn --uti-use-last-cpu" # -t $nmcexecthr
+    # Switches are prepended, so the base options above stay last on the mcexec command line
+    if [ ${use_hfi} -eq 1 ]; then
+	mcexecopt="--enable-hfi1 $mcexecopt"
+    fi
+
+    if [ $disable_syscall_intercept -eq 0 ]; then
+	mcexecopt="--enable-uti $mcexecopt"
+    fi
+
+else
+    offline=`PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes lscpu \| grep Off 2>&1 | dshbak -c | grep Off`
+    if [ "$offline" != "" ]; then
+	echo "Error: Some CPUs are offline: $offline"
+	exit 1 # a bare exit would return the status of the echo above, i.e. success
+    fi
+
+    makeopt="UTI_DIR=$uti_dir_lin"
+    use_mck=
+    mck_mem=
+    mcexec=
+    mcexecopt=
+fi
+
+if [ $gdb -eq 1 ]; then
+    enable_x="-enable-x"
+    gdbcmd="xterm -display localhost:11 -hold -e gdb -ex run --args"
+fi
+
+if [ $interactive -eq 1 ]; then
+    i_mpi_hydra_bootstrap_exec=
+    i_mpi_hydra_bootstrap=
+    hosts=
+    opt_dir=/opt/intel
+    ssh=
+else
+#    PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes bash -c \'if \[ \"\`cat /etc/mtab \| while read line\; do cut -d\" \" -f 2\; done \| grep /work\`\" == \"\" \]\; then sudo mount /work\; fi\'
+    i_mpi_hydra_bootstrap_exec="export I_MPI_HYDRA_BOOTSTRAP_EXEC=/usr/bin/ssh"
+    i_mpi_hydra_bootstrap="export I_MPI_HYDRA_BOOTSTRAP=ssh"
+    hosts="-hosts $nodes"
+    opt_dir=/home/opt/local/cores/intel
+    ssh="ssh -A c$LASTNODE"
+fi
+
+# If using ssh
+# Latest versions are: 1.163, 2.199, 3.222
+if [ $pjsub -eq 0 ] && [ $interactive -eq 0 ]; then
+    compilervars=". ${opt_dir}/compilers_and_libraries_2018.2.199/linux/bin/compilervars.sh intel64"
+else
+    compilervars=
+fi
+
+if [ ${stop} -eq 1 ]; then
+    if [ ${mck} -eq 1 ]; then
+	PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \
+	    /usr/sbin/pidof mcexec \| xargs -r sudo kill -9
+	PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \
+	    /usr/sbin/pidof $exe \| xargs -r sudo kill -9
+	PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \
+	    sudo ${mck_dir}/sbin/mcstop+release.sh
+    else
+	:
+    fi
+fi
+
+if [ ${reboot} -eq 1 ]; then
+    if [ ${mck} -eq 1 ]; then
+	if hostname  | grep ofp &>/dev/null; then
+
+	    # -h: Hide idle thread to prevent KNL CPU from mux-ing resource and halving throughput 
+	    PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \
+	    sudo ${mck_dir}/sbin/mcreboot.sh -h -O -c 2-17,70-85,138-153,206-221,20-35,88-103,156-171,224-239,36-51,104-119,172-187,240-255,52-67,120-135,188-203,256-271 -r 2-5,70-73,138-141,206-209:0+6-9,74-77,142-145,210-213:1+10-13,78-81,146-149,214-217:68+14-17,82-85,150-153,218-221:69+20-23,88-91,156-159,224-227:136+24-27,92-95,160-163,228-231:137+28-31,96-99,164-167,232-235:204+32-35,100-103,168-171,236-239:205+36-39,104-107,172-175,240-243:18+40-43,108-111,176-179,244-247:19+44-47,112-115,180-183,248-251:86+48-51,116-119,184-187,252-255:87+52-55,120-123,188-191,256-259:154+56-59,124-127,192-195,260-263:155+60-63,128-131,196-199,264-267:222+64-67,132-135,200-203,268-271:223 -m 32G@0,12G@1
+
+# perl -e 'for ($i=0;$i<68;$i++){if($i>0){print "+";}printf("%d,%d,%d:%d", $i+68,$i+136,$i+204,$i);}'
+
+#	    PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \
+#	    sudo ${mck_dir}/sbin/mcreboot.sh -O -c 68-271 -r 68,136,204:0+69,137,205:1+70,138,206:2+71,139,207:3+72,140,208:4+73,141,209:5+74,142,210:6+75,143,211:7+76,144,212:8+77,145,213:9+78,146,214:10+79,147,215:11+80,148,216:12+81,149,217:13+82,150,218:14+83,151,219:15+84,152,220:16+85,153,221:17+86,154,222:18+87,155,223:19+88,156,224:20+89,157,225:21+90,158,226:22+91,159,227:23+92,160,228:24+93,161,229:25+94,162,230:26+95,163,231:27+96,164,232:28+97,165,233:29+98,166,234:30+99,167,235:31+100,168,236:32+101,169,237:33+102,170,238:34+103,171,239:35+104,172,240:36+105,173,241:37+106,174,242:38+107,175,243:39+108,176,244:40+109,177,245:41+110,178,246:42+111,179,247:43+112,180,248:44+113,181,249:45+114,182,250:46+115,183,251:47+116,184,252:48+117,185,253:49+118,186,254:50+119,187,255:51+120,188,256:52+121,189,257:53+122,190,258:54+123,191,259:55+124,192,260:56+125,193,261:57+126,194,262:58+127,195,263:59+128,196,264:60+129,197,265:61+130,198,266:62+131,199,267:63+132,200,268:64+133,201,269:65+134,202,270:66+135,203,271:67 -m 32G@0,12G@1
+	else
+	    echo "unkwon host type"
+	    exit 1
+	fi
+    else
+	:
+    fi
+fi
+
+(
+cat <<EOF
+#!/bin/sh
+
+#PJM -L rscgrp=$rg
+#PJM -L node=$nnodes
+#PJM --mpi proc=$nprocs
+#PJM -L elapse=$elapse
+#PJM -L proc-crproc=16384 
+#PJM -g gg10
+#PJM -j
+#PJM -s
+$use_mck
+$mck_mem
+
+$i_mpi_hydra_bootstrap_exec
+$i_mpi_hydra_bootstrap
+
+export OMP_NUM_THREADS=$omp_num_threads
+#export OMP_STACKSIZE=64M
+export KMP_BLOCKTIME=1
+export PSM2_RCVTHREAD=0
+
+$uti_cpu_set_str
+export I_MPI_PIN=$i_mpi_pin
+$i_mpi_pin_processor_exclude_list
+$i_mpi_pin_domain
+$i_mpi_pin_order
+$kmp_affinity
+
+export HFI_NO_CPUAFFINITY=1
+export I_MPI_COLL_INTRANODE_SHM_THRESHOLD=4194304
+export I_MPI_FABRICS=shm:tmi
+export PSM2_RCVTHREAD=0
+export I_MPI_TMI_PROVIDER=psm2
+export I_MPI_FALLBACK=0
+export PSM2_MQ_RNDV_HFI_WINDOW=4194304
+export PSM2_MQ_EAGER_SDMA_SZ=65536
+export PSM2_MQ_RNDV_HFI_THRESH=200000
+
+export MCKERNEL_RLIMIT_STACK=32M,16G
+export KMP_STACKSIZE=64m
+#export KMP_HW_SUBSET=64c,1t
+
+export I_MPI_ASYNC_PROGRESS=off
+
+#export I_MPI_STATS=native:20,ipm
+#export I_MPI_STATS=ipm
+#export I_MPI_DEBUG=4
+#export I_MPI_HYDRA_DEBUG=on
+
+ulimit -c unlimited 
+
+$compilervars
+mpiexec.hydra -n $nprocs -ppn $ppn $hosts $ilpopt $enable_x $gdbcmd $mcexec $mcexecopt ${test_dir}/$exe -I $disable_syscall_intercept -p $ppn
+#-l
+
+EOF
+) > ./job.sh
+chmod u+x ./job.sh
+
+if [ ${go} -eq 1 ]; then
+    if [ $pjsub -eq 1 ]; then
+	pjsub ./job.sh
+    else
+	if [ $interactive -eq 0 ]; then
+	    . ${opt_dir}/compilers_and_libraries_2018.2.199/linux/bin/compilervars.sh intel64
+	fi
+	# Do not launch the job with a missing or stale binary when the build fails
+	make $makeopt ./$exe || exit 1
+
+	$ssh ${test_dir}/job.sh
+    fi
+fi
diff --git a/test/uti/posix_aio/002.c b/test/uti/posix_aio/002.c
new file mode 100644
index 00000000..f36ee18a
--- /dev/null
+++ b/test/uti/posix_aio/002.c
@@ -0,0 +1,658 @@
+#define _GNU_SOURCE
+#include <limits.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <aio.h>
+#include <signal.h>
+#include <libgen.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <fcntl.h>
+#include <mpi.h>
+#include <linux/limits.h>
+#include <semaphore.h>
+#include "util.h"
+
+#define SZBUF (1ULL << 23)/*23*/ /* bytes written by each aio request */
+
+#define MYTIME_TOUSEC 1000000 /* seconds -> microseconds */
+#define MYTIME_TONSEC 1000000000 /* seconds -> nanoseconds */
+
+#define NROW 16 /* t_table rows: one per calculation-time step */
+#define NCOL 4 /* t_table columns: calc time plus one per progress mode */
+
+#define NSAMPLES_PROFILE 3 /* runs of the profiling pass */
+#define NSAMPLES_DROP 1/*10*/ /* warm-up runs excluded from the averages */
+#define NSAMPLES_IO 5/*20*/ /* recorded runs of the IO-only measurement */
+#define NSAMPLES_TOTAL 5/*20*/ /* recorded runs per calculation-time step */
+#define NSAMPLES_INNER 1 /* submit/compute/wait rounds per my_aio() call */
+
+#define WAIT_TYPE_BUSY_LOOP 0 /* poll aio_error() until all requests finish */
+#define WAIT_TYPE_SEM 1 /* sleep on aio_sem until the callback posts it */
+#define WAIT_TYPE WAIT_TYPE_SEM /* completion wait method compiled in */
+
+static sem_t aio_sem; /* posted when a whole batch has completed */
+volatile int completion_count; /* requests of the current batch completed so far */
+
+/* Timestamp source for every measurement: the MPI wall clock, in seconds. */
+static inline double mytime()
+{ return MPI_Wtime(); }
+
+struct aioreq { /* per-request bookkeeping handed to the completion callbacks */
+	int rank, aio_num_threads; /* request index; number of requests per batch */
+	int status; /* EINPROGRESS at submit, later an aio_error() result */
+	struct aiocb *aiocbp; /* control block submitted with aio_write() */
+};
+
+/*
+ * SIGEV_THREAD completion callback for one aio request.
+ * Bumps the shared completion counter; the call that sees the counter reach
+ * the batch size posts aio_sem to release the waiter in my_aio().
+ */
+static void aio_handler(sigval_t sigval)
+{
+	const struct aioreq *req = sigval.sival_ptr;
+	int done = __sync_add_and_fetch(&completion_count, 1);
+
+	if (done != req->aio_num_threads)
+		return; /* other requests of this batch are still pending */
+
+	if (sem_post(&aio_sem))
+		pr_err("%s: error: sem_post: %s\n", __func__, strerror(errno));
+}
+
+static void aio_sighandler(int sig, siginfo_t *si, void *ucontext)
+{ /* siginfo-style completion handler, the signal counterpart of aio_handler() */
+	pr_debug("%s: debug: enter\n", __func__);
+#if WAIT_TYPE == WAIT_TYPE_SEM
+	struct aioreq *aioreq = si->si_value.sival_ptr;
+	/* The signal is expected to originate from an asynchronous I/O completion. */
+	if (si->si_code != SI_ASYNCIO) {
+		pr_err("%s: error: unexpected si_code: %d\n",
+	       __func__, si->si_code);
+	}
+	/* Record the request's final state; anything but 0 is reported. */
+	aioreq->status = aio_error(aioreq->aiocbp);
+	if (aioreq->status != 0) {
+		pr_err("%s: error: unexpected status: %d\n",
+	       __func__, aioreq->status);
+	}
+	/* The invocation that completes the batch wakes up the waiter. */
+	if (__sync_add_and_fetch(&completion_count, 1) == aioreq->aio_num_threads) {
+		if (sem_post(&aio_sem)) {
+			pr_err("%s: error: sem_post: %s\n",
+			       __func__, strerror(errno));
+		}
+	}
+
+	/* With any other WAIT_TYPE the handler only logs its entry. */
+#endif /* WAIT_TYPE */
+}
+
+/* Fill in every request except its fd and status: request idx is bound to
+ * aiocblist[idx] and aiobufs[idx], and its completion runs aio_handler() in
+ * a new thread with the request as argument.  Always returns 0. */
+int my_aio_init(int nreqs, struct aioreq *iolist, struct aiocb *aiocblist, char **aiobufs) {
+	int idx;
+	for (idx = 0; idx < nreqs; idx++) {
+		struct aioreq *req = &iolist[idx];
+		struct aiocb *cb = req->aiocbp = &aiocblist[idx];
+		req->rank = idx;
+		req->aio_num_threads = nreqs;
+		cb->aio_fildes = -1;
+		cb->aio_buf = aiobufs[idx];
+		cb->aio_nbytes = SZBUF;
+		cb->aio_reqprio = 0;
+		cb->aio_offset = 0;
+#if 0
+		cb->aio_sigevent.sigev_notify = SIGEV_SIGNAL;
+		cb->aio_sigevent.sigev_signo = SIGUSR1;
+		cb->aio_sigevent.sigev_value.sival_ptr = req;
+#else
+		cb->aio_sigevent.sigev_notify = SIGEV_THREAD;
+		cb->aio_sigevent.sigev_notify_function = aio_handler;
+		cb->aio_sigevent.sigev_notify_attributes = NULL;
+		cb->aio_sigevent.sigev_value.sival_ptr = req;
+#endif
+	}
+	return 0;
+}
+
+/* Open (create/truncate, O_DIRECT) one file per request, storing the fd in its
+ * aiocb.  Returns 0, or 1 at the first open() failure (earlier fds stay open). */
+int my_aio_open(int aio_num_threads, struct aioreq *iolist, char **fn) {
+	const int oflags = O_RDWR | O_CREAT | O_TRUNC | O_DIRECT;
+	const mode_t perm = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH;
+	int k;
+
+	for (k = 0; k < aio_num_threads; k++) {
+		int fd = open(fn[k], oflags, perm);
+		iolist[k].aiocbp->aio_fildes = fd;
+		if (fd == -1) {
+			pr_err("%s: error: open %s: %s\n", __func__, fn[k], strerror(errno));
+			return 1;
+		}
+	}
+	return 0;
+}
+
+/*
+ * Verify that every file holds exactly the bytes of its request buffer.
+ *
+ * Reads fn[i] back through stdio and compares SZBUF bytes against
+ * iolist[i].aiocbp->aio_buf for each i in [0, aio_num_threads), stopping at
+ * the first failure.  One scratch buffer and one stream are live at a time
+ * and both are always released.
+ *
+ * Returns 0 when all files match, -ENOMEM if the scratch buffer cannot be
+ * allocated, -errno if a file cannot be opened, -1 on a short read or mismatch.
+ */
+int my_aio_check(struct aioreq *iolist, int aio_num_threads, char **fn) {
+	int ret = 0;
+	int i;
+	char *data;
+
+	if (!(data = malloc(SZBUF))) {
+		pr_err("error: allocating data\n");
+		return -ENOMEM;
+	}
+
+	/* Check contents */
+	for (i = 0; i < aio_num_threads; i++) {
+		FILE *fp = fopen(fn[i], "r+");
+
+		if (!fp) {
+			ret = -errno; /* save before pr_err() can clobber errno */
+			pr_err("%s: error: fopen %s: %s\n",
+			       __func__, fn[i], strerror(-ret));
+			break;
+		}
+
+		if (fread(data, sizeof(char), SZBUF, fp) != SZBUF) {
+			pr_err("%s: error: fread\n", __func__);
+			ret = -1;
+		} else if (memcmp((const void *)iolist[i].aiocbp->aio_buf, data, SZBUF)) {
+			pr_err("%s: Data written to file %s differs from data in memory\n",
+			       __func__, fn[i]);
+			ret = -1;
+		}
+
+		fclose(fp);
+		if (ret)
+			break;
+	}
+
+	free(data);
+	return ret;
+}
+
+/* Close every descriptor still open in iolist and mark it closed (-1). */
+void my_aio_close(int aio_num_threads, struct aioreq *iolist) {
+	int k;
+	for (k = 0; k < aio_num_threads; k++) {
+		int *fdp = &iolist[k].aiocbp->aio_fildes;
+		if (*fdp == -1)
+			continue; /* never opened, or closed already */
+		close(*fdp);
+		*fdp = -1;
+	}
+}
+
+/*
+ * Run NSAMPLES_INNER rounds of "submit writes, compute, wait for completion".
+ *
+ * Each round opens the files, submits one aio_write() per request, spends
+ * nsec_calc nanoseconds in ndelay() to emulate computation, waits until every
+ * request has completed, checks the byte counts and closes the files.  With
+ * no_aio set, only the computation phase is executed.
+ *
+ * Returns 0 on success and a negative value when opening the files,
+ * submitting a request or waiting on the semaphore fails.  A short write is
+ * reported with pr_err() but does not change the return value.  Descriptors
+ * that are still open are closed on every exit path (my_aio_close() skips
+ * descriptors that are already -1).
+ */
+int my_aio(int aio_num_threads, struct aioreq *iolist, char **fn, long nsec_calc, int no_aio) {
+	int ret;
+	int i, j;
+
+	for (i = 0; i < NSAMPLES_INNER; i++) {
+		if (no_aio) goto skip1;
+
+		/*
+		 * my_aio_open() signals failure with a non-zero value (1, never
+		 * -1) and has already printed the reason.
+		 */
+		if (my_aio_open(aio_num_threads, iolist, fn)) {
+			pr_err("%s: error: my_aio_open failed\n", __func__);
+			ret = -1;
+			goto out;
+		}
+
+		/* Reset completion */
+		completion_count = 0;
+		__sync_synchronize();
+
+		/* Start async IO */
+		for (j = 0; j < aio_num_threads; j++) {
+			iolist[j].status = EINPROGRESS;
+
+			if (aio_write(iolist[j].aiocbp) == -1) {
+				ret = -errno; /* save before pr_err() can clobber errno */
+				pr_err("%s: error: aio_write: %s\n",
+				       __func__, strerror(-ret));
+				goto out;
+			}
+		}
+	skip1:
+		/* Emulate calculation phase */
+		ndelay(nsec_calc);
+		if (no_aio) goto skip2;
+
+		/* Wait for completion of async IO */
+#if WAIT_TYPE == WAIT_TYPE_SEM
+
+		/* aio_sem is posted once per batch; restart the wait on EINTR. */
+		while (sem_wait(&aio_sem) == -1) {
+			if (errno == EINTR) {
+				pr_warn("%s: warning: sem_wait interrupted\n",
+				       __func__);
+				continue;
+			}
+			/* Batch state is unknown: skip aio_return() and report. */
+			ret = -errno;
+			pr_err("%s: error: sem_wait: %s\n",
+			       __func__, strerror(-ret));
+			goto out;
+		}
+
+#elif WAIT_TYPE == WAIT_TYPE_BUSY_LOOP
+
+		while (completion_count != aio_num_threads) {
+			for (j = 0; j < aio_num_threads; j++) {
+				if (iolist[j].status != EINPROGRESS) {
+					continue;
+				}
+
+				iolist[j].status = aio_error(iolist[j].aiocbp);
+
+				switch (iolist[j].status) {
+				case 0: /* Completed */
+					goto completed;
+				case EINPROGRESS:
+					break;
+				case ECANCELED:
+					pr_err("%s: error: aio is cancelled\n",
+					       __func__);
+					goto completed;
+				default:
+					pr_err("%s: error: aio_error: %s\n",
+					       __func__, strerror(iolist[j].status));
+					goto completed;
+				completed:
+					completion_count++;
+					break;
+				}
+			}
+		}
+#endif /* WAIT_TYPE */
+		/* Check amount written */
+		for (j = 0; j < aio_num_threads; j++) {
+			ssize_t size = aio_return(iolist[j].aiocbp);
+
+			if (size != SZBUF) {
+				/* SZBUF is unsigned long long, size is ssize_t */
+				pr_err("%s: Expected to write %llu B but #%d has written %zd B\n",
+				       __func__, SZBUF, j, size);
+			}
+		}
+
+		my_aio_close(aio_num_threads, iolist);
+	skip2:;
+	}
+	ret = 0;
+ out:
+	my_aio_close(aio_num_threads, iolist);
+	return ret;
+}
+
+/* Time nsamples runs of my_aio() after nsamples_drop unrecorded warm-up runs;
+ * *result receives the mean over the recorded runs of the slowest rank's
+ * elapsed mytime() interval.  Always returns 0; failures are only logged. */
+int measure(double *result, int nsamples, int nsamples_drop, int aio_num_threads, struct aioreq *iolist, char **fn, long nsec_calc, int rank, int profile, int no_aio) {
+	int ret;
+	int i;
+	double t_l, t_g, t_sum = 0;
+	double start, end;
+
+	for (i = 0; i < nsamples + nsamples_drop; i++) {
+		MPI_Barrier(MPI_COMM_WORLD);
+
+		/* Set parameter based on current IPC and frequency */
+		ndelay_init(0);
+
+		start = mytime();
+		struct rusage ru_start, ru_end;
+		struct timeval tv_start, tv_end;
+
+		if (profile) {
+			if ((ret = getrusage(RUSAGE_SELF, &ru_start))) {
+				pr_err("%s: error: getrusage failed (%d)\n", __func__, ret);
+			}
+
+			if ((ret = gettimeofday(&tv_start, NULL))) {
+				pr_err("%s: error: gettimeofday failed (%d)\n", __func__, ret);
+			}
+		}
+
+		if ((ret = my_aio(aio_num_threads, iolist, fn, nsec_calc, no_aio))) {
+			pr_err("%s: error: my_aio returned %d\n",
+			       __func__, ret);
+		}
+
+		if (profile) {
+			if ((ret = getrusage(RUSAGE_SELF, &ru_end))) {
+				pr_err("%s: error: getrusage failed (%d)\n", __func__, ret);
+			}
+
+			if ((ret = gettimeofday(&tv_end, NULL))) {
+				pr_err("%s: error: gettimeofday failed (%d)\n", __func__, ret);
+			}
+
+			if (rank == 0) pr_debug("%s: wall: %ld, user: %ld, sys: %ld\n", __func__,
+						DIFFUSEC(tv_end, tv_start),
+						DIFFUSEC(ru_end.ru_utime, ru_start.ru_utime),
+						DIFFUSEC(ru_end.ru_stime, ru_start.ru_stime));
+		}		
+
+		end = mytime(); /* taken after the profile report, so it is included */
+		MPI_Barrier(MPI_COMM_WORLD);
+
+		/* Check contents */
+		if ((ret = my_aio_check(iolist, aio_num_threads, fn))) {
+			pr_err("%s: error: my_aio_check returned %d\n",
+			       __func__, ret);
+		}
+
+		if (i < nsamples_drop) { /* warm-up run: executed but not recorded */
+			continue;
+		}
+
+		/* Take max over all ranks: the slowest rank defines the sample */
+		t_l = end - start;
+		MPI_Allreduce(&t_l, &t_g, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
+		t_sum += t_g;
+	}
+
+	*result = t_sum / nsamples;
+	ret = 0;
+
+	return ret;
+}
+
+/*
+ * Benchmark driver: overlaps batches of aio_write() with emulated computation.
+ *
+ * Options: -p <ppn> and -t <aio_num_threads> are mandatory; -I <n> sets
+ * disable_syscall_intercept.  Each rank writes aio_num_threads files named
+ * rank<r>-number<i> into the directory of the executable.
+ * Returns 0 on success and a non-zero value on any setup or measurement error.
+ */
+int main(int argc, char **argv)
+{
+	int ret;
+	int i, j, progress, l;
+	int rank, nproc;
+	int ppn = -1;
+	int aio_num_threads = -1;
+	int disable_syscall_intercept = 0;
+	int sem_ready = 0; /* set once aio_sem has been initialized */
+	struct aioreq *iolist;
+	struct aiocb *aiocblist;
+	struct sigaction sa;
+	double t_io_ave, t_total_ave;
+	double t_table[NROW][NCOL] = { 0 };
+	int opt;
+	char **aiobufs;
+	char **fn;
+	char src_dir[PATH_MAX];
+	char *argv0 = NULL; /* out: frees it, also on paths taken before strdup() */
+	void *aligned;
+
+	opterr = 0; /* Don't print out error when not recognizing option character */
+
+	while ((opt = getopt(argc, argv, ":I:p:t:")) != -1) {
+		switch (opt) {
+		case 'I':
+			disable_syscall_intercept = atoi(optarg);
+			break;
+		case 'p':
+			ppn = atoi(optarg);
+			break;
+		case 't':
+			aio_num_threads = atoi(optarg);
+			break;
+		case '?':
+			pr_err("error: invalid option: -%c\n",
+			       optopt);
+			ret = 1;
+			goto out;
+		case ':':
+			pr_err("error: option -%c requires an argument\n",
+			       optopt);
+			ret = 1;
+			goto out;
+		}
+	}
+
+	if (ppn == -1) {
+		pr_err("error: specify ppn with -p <ppn>\n");
+		ret = 1;
+		goto out;
+	}
+
+	/* A batch of zero requests would never post aio_sem. */
+	if (aio_num_threads < 1) {
+		pr_err("error: specify aio_num_threads with -t <aio_num_threads>\n");
+		ret = 1;
+		goto out;
+	}
+
+	/* Initialize MPI */
+	MPI_Init(&argc, &argv);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &nproc);
+
+	/* Show parameters */
+	if (rank == 0) {
+#pragma omp parallel
+		{
+			if (omp_get_thread_num() == 0) {
+				printf("nproc=%d,#threads=%d\n", nproc, omp_get_num_threads());
+			}
+		}
+	}
+
+	/* Set parameter based on current IPC and frequency */
+	ndelay_init(1);
+
+	/* Initialize files */
+	if (!(fn = malloc(sizeof(char *) * aio_num_threads))) {
+		pr_err("error: allocating fn\n");
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	if (!(argv0 = strdup(argv[0]))) {
+		pr_err("error: allocating argv0\n");
+		ret = -ENOMEM;
+		goto out;
+	}
+	snprintf(src_dir, sizeof(src_dir), "%s", dirname(argv0));
+	for (i = 0; i < aio_num_threads; i++) {
+		if (!(fn[i] = malloc(SZBUF))) {
+			pr_err("error: allocating data\n");
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		sprintf(fn[i], "%s/rank%d-number%d", src_dir, rank, i);
+		if (rank < 2 && i < 2) {
+			pr_debug("debug: rank: %d, fn[%d]: %s\n",
+				 rank, i, fn[i]);
+		}
+	}
+
+	/* Allocate aio arrays */
+	if (!(iolist = calloc(aio_num_threads, sizeof(struct aioreq)))) {
+		pr_err("%s: error: allocating iolist\n",
+		       __func__);
+		ret = 1;
+		goto out;
+	}
+
+	if (!(aiocblist = calloc(aio_num_threads, sizeof(struct aiocb)))) {
+		pr_err("%s: error: allocating aiocblist\n",
+		       __func__);
+		ret = 1;
+		goto out;
+	}
+
+	/* Prepare data to be written */
+	if (!(aiobufs = malloc(sizeof(char *) * aio_num_threads))) {
+		pr_err("error: allocating aiobufs\n");
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	for (i = 0; i < aio_num_threads; i++) {
+		/* The files are opened with O_DIRECT, which wants aligned buffers. */
+		if (posix_memalign(&aligned, 4096, SZBUF)) {
+			pr_err("%s: error: allocating aiobufs\n",
+			       __func__);
+			ret = 1;
+			goto out;
+		}
+		aiobufs[i] = aligned;
+		for (j = 0; j < SZBUF; j++) {
+			*(aiobufs[i] + j) = i + j + rank;
+		}
+	}
+
+	/* Initialize aio parameters except fd and status */
+	if ((ret = my_aio_init(aio_num_threads, iolist, aiocblist, aiobufs))) {
+		pr_err("%s: error: my_aio_init returned %d\n",
+		       __func__, ret);
+		goto out;
+	}
+
+#if 0
+	/* Set signal handlers */
+	sa.sa_flags = SA_RESTART | SA_SIGINFO;
+	sa.sa_sigaction = aio_sighandler;
+	if (sigaction(SIGUSR1, &sa, NULL) == -1) {
+		pr_err("%s: error: sigaction: %s\n",
+		       __func__, strerror(errno));
+		ret = 1;
+		goto out;
+	}
+#endif
+
+	/* Initialize semaphore */
+	if (sem_init(&aio_sem, 0, 0)) {
+		ret = -errno; /* save before pr_err() can clobber errno */
+		pr_err("%s: error: sem_init: %s\n", __func__, strerror(-ret));
+		goto out;
+	}
+	sem_ready = 1;
+
+	/* Take profile */
+	if ((ret = measure(&t_io_ave, NSAMPLES_PROFILE, 0, aio_num_threads, iolist, fn, 0, rank, 1, 0))) {
+		pr_err("error: measure returned %d\n", ret);
+		goto out;
+	}
+
+	/* Measure IO only time */
+	if ((ret = measure(&t_io_ave, NSAMPLES_IO, NSAMPLES_DROP, aio_num_threads, iolist, fn, 0, rank, 0, 0))) {
+		pr_err("error: measure returned %d\n", ret);
+		goto out;
+	}
+
+	if (rank == 0) {
+		printf("t_io_ave: %.0f usec, %.0f MB/s per node\n",
+		       t_io_ave * MYTIME_TOUSEC,
+		       SZBUF * ppn * aio_num_threads / t_io_ave / 1000000);
+	}
+
+	/* Measure time with no progress, progress and no uti, progress and uti */
+	for (progress = 0; progress <= (disable_syscall_intercept ? 0 : -1); progress += 1) {
+		/* NOTE(review): the bound -1 skips this loop unless disable_syscall_intercept is set, yet modes 1 and 2 are handled below - confirm */
+		if (progress == 1) {
+			/* Ignore uti_attr, spawn a thread onto compute CPUs */
+			setenv("DISABLE_UTI", "1", 1);
+		} else if (progress == 2) {
+			unsetenv("DISABLE_UTI");
+		}
+
+		/* Increasing calculation time up to 150% of IO time ((NROW - 1) / 10) */
+		for (l = 0; l <= NROW - 1; l += 1) {
+			long nsec_calc = (t_io_ave * MYTIME_TONSEC * l) / 10;
+
+			if ((ret = measure(&t_total_ave, NSAMPLES_TOTAL, NSAMPLES_DROP, aio_num_threads, iolist, fn, nsec_calc, rank, 0, 0))) {
+				pr_err("error: measure returned %d\n", ret);
+				goto out;
+			}
+
+			if (rank == 0) {
+				if (l == 0) {
+					pr_debug("progress=%d\n", progress);
+					if (progress == 0) {
+						pr_debug("calc\ttotal\n");
+					} else {
+						pr_debug("total\n");
+					}
+				}
+
+				t_table[l][0] = nsec_calc * (MYTIME_TOUSEC / (double)MYTIME_TONSEC);
+				if (progress == 0) {
+					pr_debug("%.0f\t%.0f\n", nsec_calc * (MYTIME_TOUSEC / (double)MYTIME_TONSEC), t_total_ave * MYTIME_TOUSEC);
+					t_table[l][progress + 1] = t_total_ave * MYTIME_TOUSEC;
+				} else {
+					pr_debug("%.0f\n", t_total_ave * MYTIME_TOUSEC);
+					t_table[l][progress + 1] = t_total_ave * MYTIME_TOUSEC;
+				}
+			}
+		}
+	}
+
+	if (rank == 0) {
+		printf("calc,no prog,prog and no uti, prog and uti\n");
+		for (l = 0; l <= NROW - 1; l++) {
+			for (i = 0; i < NCOL; i++) {
+				if (i > 0) {
+					printf(",");
+				}
+				printf("%.0f", t_table[l][i]);
+			}
+			printf("\n");
+		}
+	}
+
+	MPI_Barrier(MPI_COMM_WORLD);
+	MPI_Finalize();
+
+	ret = 0;
+out:
+	/* Destroy aio_sem only if it was initialized; keep ret as the exit status. */
+	if (sem_ready && sem_destroy(&aio_sem)) {
+		pr_err("%s: error: sem_destroy: %s\n", __func__, strerror(errno));
+	}
+
+	free(argv0);
+	return ret;
+}
diff --git a/test/uti/posix_aio/002.sh b/test/uti/posix_aio/002.sh
new file mode 100755
index 00000000..6e09a7ab
--- /dev/null
+++ b/test/uti/posix_aio/002.sh
@@ -0,0 +1,308 @@
+#!/usr/bin/bash
+
+test_dir=`pwd -P` # job.sh runs the test binary from this directory
+mck_dir=${HOME}/project/os/install # McKernel installation (mcexec, mcreboot.sh)
+uti_dir_lin=${HOME}/project/uti/install_linux # UTI_DIR for Linux builds
+uti_dir_mck=${HOME}/project/uti/install_mckernel # UTI_DIR for McKernel builds
+# Test binary name: this script's name without the .sh suffix
+exe=`basename $0 | sed 's/\.sh//'`
+# Actions, all off by default
+stop=0 # -s: with -m, kill leftover processes and stop McKernel
+reboot=0 # -r: with -m, reboot McKernel on the nodes
+go=0 # -g: submit job.sh, or build the test and run job.sh
+# Run configuration
+interactive=0 # -i
+pjsub=0 # -p: submit job.sh with pjsub instead of running it over ssh
+gdb=0 # -G: run each rank under gdb in an xterm
+disable_syscall_intercept=0 # -I <n>: passed to the test binary as -I
+mck=0 # -m: run on McKernel through mcexec
+nnodes=2 # -N
+host_type=wallaby # wallaby or ofp; no command-line option sets it
+LASTNODE=15 # -L: number of the last node in the node list
+use_hfi=0 # -h: add --enable-hfi1 to the mcexec options
+omp_num_threads=4 # -o
+ppn=4 # -P: processes per node
+aio_num_threads=1 # passed to the test binary as -t; no command-line option sets it
+
+while getopts srgc:ml:N:P:o:hGI:ipL: OPT
+do
+        case ${OPT} in
+            s) stop=1
+                ;;
+            r) reboot=1
+                ;;
+	    g) go=1
+		;;
+            m) mck=1
+                ;;
+	    N) nnodes=$OPTARG
+		;;
+	    P) ppn=$OPTARG
+		;;
+	    o) omp_num_threads=$OPTARG
+		;;
+	    h) use_hfi=1
+		;;
+	    G) gdb=1
+		;;
+	    I) disable_syscall_intercept=$OPTARG
+		;;
+	    i) interactive=1
+		;;
+	    p) pjsub=1
+		;;
+	    L) LASTNODE=$OPTARG
+		;;
+            *) echo "invalid option -${OPT}" >&2
+                exit 1
+        esac
+done
+
+case $host_type in
+    wallaby) hnprefix=wallaby
+	;;
+    ofp) hnprefix=c
+	;;
+    *) echo "invalid host_type $host_type"
+	exit 1
+esac
+
+nprocs=$((ppn * nnodes))
+nodes="$hnprefix`echo $(seq -s ",$hnprefix" $(($LASTNODE + 1 - $nnodes)) $LASTNODE)`"
+
+case $host_type in
+    wallaby)
+	uti_cpu_set_lin=0,16,8,24
+	exclude_list=0,16,8,24
+	uti_cpu_set_mck=0,16,8,24
+	;;
+    ofp)
+	# vertical cut, excluding phys loaded with Linux tasks
+	uti_cpu_set_lin=1,69,137,205,18-19,86-87,154-155,222-223
+	exclude_list=0-1,68-69,136-137,204-205,18-19,86-87,154-155,222-223
+	#64-67,132-135,200-203,268-271 
+	
+	uti_cpu_set_mck=1,69,137,205,18-19,86-87,154-155,222-223
+	
+	# horizontal cut, excluding phys loaded with Linux tasks for mckernel
+	#uti_cpu_set_lin=204-271 
+	#uti_cpu_set_mck=1-67
+	;;
+    *) echo "invalid host_type $host_type"
+	exit 1
+esac
+
+if [ $mck -eq 0 ]; then
+    uti_cpu_set_str="export UTI_CPU_SET=$uti_cpu_set_lin"
+    i_mpi_pin_processor_exclude_list="export I_MPI_PIN_PROCESSOR_EXCLUDE_LIST=$exclude_list"
+else
+    uti_cpu_set_str="export UTI_CPU_SET=$uti_cpu_set_mck"
+    i_mpi_pin_processor_exclude_list=
+fi
+
+if [ ${mck} -eq 1 ]; then
+    i_mpi_pin=off
+    i_mpi_pin_domain=
+    i_mpi_pin_order=
+#    if [ $omp_num_threads -eq 1 ]; then
+#	# Avoid binding main thread and uti thread to one CPU
+	kmp_affinity="export KMP_AFFINITY=disabled" 
+#    else
+#	# Bind rank to OMP_NUM_THREAD-sized CPU-domain
+#	kmp_affinity="export KMP_AFFINITY=granularity=thread,scatter"
+#    fi
+else
+    i_mpi_pin=on
+    domain=$omp_num_threads # Use 32 when you want to match mck's -n division
+    i_mpi_pin_domain="export I_MPI_PIN_DOMAIN=$domain"
+    i_mpi_pin_order="export I_MPI_PIN_ORDER=compact"
+    kmp_affinity="export KMP_AFFINITY=granularity=thread,scatter"
+fi
+
+echo nprocs=$nprocs nnodes=$nnodes ppn=$ppn nodes=$nodes domain=$domain
+
+if [ ${mck} -eq 1 ]; then # McKernel run: build against the McKernel UTI, wrap the test in mcexec
+    makeopt="UTI_DIR=$uti_dir_mck"
+    use_mck="#PJM -x MCK=$mck_dir"
+    mck_mem="#PJM -x MCK_MEM=32G@0,8G@1"
+    mcexec="${mck_dir}/bin/mcexec"
+    nmcexecthr=$((omp_num_threads + 1 + aio_num_threads * 2 + 2))
+    mcexecopt="-n $ppn -t $nmcexecthr" # --uti-use-last-cpu
+
+    if [ ${use_hfi} -eq 1 ]; then
+	mcexecopt="--enable-hfi1 $mcexecopt"
+    fi
+
+    if [ $disable_syscall_intercept -eq 0 ]; then
+	mcexecopt="--enable-uti $mcexecopt"
+    fi
+else
+    # Plain Linux run: refuse to start when any node reports off-line CPUs.
+    offline=`PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes lscpu 2>&1 | dshbak -c | grep Off-line`
+    if [ "$offline" != "" ]; then
+	echo "Error: Some CPUs are offline: $offline" >&2
+	exit 1 # a bare "exit" would report echo's status, i.e. success
+    fi
+
+    makeopt="UTI_DIR=$uti_dir_lin"
+    use_mck=
+    mck_mem=
+    mcexec=
+    mcexecopt=
+fi
+
+if [ $gdb -eq 1 ]; then
+    enable_x="-enable-x"
+    gdbcmd="xterm -display localhost:11 -hold -e gdb -ex run --args"
+fi
+
+if [ $interactive -eq 1 ]; then
+    i_mpi_hydra_bootstrap_exec=
+    i_mpi_hydra_bootstrap=
+    hosts=
+    ssh=
+else
+#    PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes bash -c \'if \[ \"\`cat /etc/mtab \| while read line\; do cut -d\" \" -f 2\; done \| grep /work\`\" == \"\" \]\; then sudo mount /work\; fi\'
+    i_mpi_hydra_bootstrap_exec="export I_MPI_HYDRA_BOOTSTRAP_EXEC=/usr/bin/ssh"
+    i_mpi_hydra_bootstrap="export I_MPI_HYDRA_BOOTSTRAP=ssh"
+    hosts="-hosts $nodes"
+    ssh="ssh -A $(echo $nodes | cut -d',' -f1)"
+fi
+
+case $host_type in
+    wallaby)
+	i_mpi_fabrics="export I_MPI_FABRICS=shm:dapl"
+	i_mpi_tmi_provider=
+
+	opt_dir=/opt/intel	
+	impiver=2018.3.222 # 1.163, 2.199, 3.222
+	;;
+    ofp)
+	i_mpi_fabrics="export I_MPI_FABRICS=shm:tmi"
+	i_mpi_tmi_provider="export I_MPI_TMI_PROVIDER=psm2"
+
+	if [ $interactive -eq 1 ]; then
+	    opt_dir=/opt/intel
+	else
+	    opt_dir=/home/opt/local/cores/intel
+	fi
+	impiver=2018.1.163 # 1.163, 2.199, 3.222
+	;;
+    *) echo "invalid host_type $host_type"
+	exit 1
+esac
+
+# If using ssh
+if [ $pjsub -eq 0 ] && [ $interactive -eq 0 ]; then
+    compilervars=". ${opt_dir}/compilers_and_libraries_${impiver}/linux/bin/compilervars.sh intel64"
+else
+    compilervars=
+fi
+
+if [ ${stop} -eq 1 ]; then
+    if [ ${mck} -eq 1 ]; then
+	PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \
+	    /usr/sbin/pidof mcexec \| xargs -r sudo kill -9
+	PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \
+	    /usr/sbin/pidof $exe \| xargs -r sudo kill -9
+	PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \
+	    sudo ${mck_dir}/sbin/mcstop+release.sh
+    else
+	:
+    fi
+fi
+
+if [ ${reboot} -eq 1 ]; then
+    if [ ${mck} -eq 1 ]; then
+	case $host_type in
+	    wallaby) hnprefix=wallaby
+		PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes sudo ${mck_dir}/sbin/mcreboot.sh -h -O -c 1-7,17-23,9-15,25-31 -r 1-7:0+17-23:16+9-15:8+25-31:24 -m 10G@0,10G@1
+		#PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes sudo ${mck_dir}/sbin/mcreboot.sh -h -O -c 1-4 -r 1-4:0 -m 10G@0,10G@1
+		;;
+	    ofp)
+		# -h: Prevent unnessary CPU resource division for KNL 
+		PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \
+		    sudo ${mck_dir}/sbin/mcreboot.sh -h -O -c 2-17,70-85,138-153,206-221,20-35,88-103,156-171,224-239,36-51,104-119,172-187,240-255,52-67,120-135,188-203,256-271 -r 2-5,70-73,138-141,206-209:0+6-9,74-77,142-145,210-213:1+10-13,78-81,146-149,214-217:68+14-17,82-85,150-153,218-221:69+20-23,88-91,156-159,224-227:136+24-27,92-95,160-163,228-231:137+28-31,96-99,164-167,232-235:204+32-35,100-103,168-171,236-239:205+36-39,104-107,172-175,240-243:18+40-43,108-111,176-179,244-247:19+44-47,112-115,180-183,248-251:86+48-51,116-119,184-187,252-255:87+52-55,120-123,188-191,256-259:154+56-59,124-127,192-195,260-263:155+60-63,128-131,196-199,264-267:222+64-67,132-135,200-203,268-271:223 -m 32G@0,12G@1
+		;;
+	    *) echo "invalid host_type $host_type"
+		exit 1
+	esac
+    else
+	:
+    fi
+fi
+
+(
+cat <<EOF
+#!/bin/sh
+
+#PJM -L rscgrp=$rg
+#PJM -L node=$nnodes
+#PJM --mpi proc=$nprocs
+#PJM -L elapse=$elapse
+#PJM -L proc-crproc=16384 
+#PJM -g gg10
+#PJM -j
+#PJM -s
+$use_mck
+$mck_mem
+
+$i_mpi_hydra_bootstrap_exec
+$i_mpi_hydra_bootstrap
+
+export OMP_NUM_THREADS=$omp_num_threads
+#export OMP_STACKSIZE=64M
+export KMP_BLOCKTIME=1
+
+$uti_cpu_set_str
+export I_MPI_PIN=$i_mpi_pin
+$i_mpi_pin_processor_exclude_list
+$i_mpi_pin_domain
+$i_mpi_pin_order
+$kmp_affinity
+
+export HFI_NO_CPUAFFINITY=1
+export I_MPI_COLL_INTRANODE_SHM_THRESHOLD=4194304
+$i_mpi_fabrics
+$i_mpi_tmi_provider
+export I_MPI_FALLBACK=0
+export PSM2_RCVTHREAD=0
+export PSM2_MQ_RNDV_HFI_WINDOW=4194304
+export PSM2_MQ_EAGER_SDMA_SZ=65536
+export PSM2_MQ_RNDV_HFI_THRESH=200000
+
+export MCKERNEL_RLIMIT_STACK=32M,16G
+export KMP_STACKSIZE=64m
+#export KMP_HW_SUBSET=64c,1t
+
+export I_MPI_ASYNC_PROGRESS=off
+
+#export I_MPI_STATS=native:20,ipm
+#export I_MPI_STATS=ipm
+#export I_MPI_DEBUG=4
+#export I_MPI_HYDRA_DEBUG=on
+
+ulimit -c unlimited 
+
+$compilervars
+mpiexec.hydra -n $nprocs -ppn $ppn $hosts $ilpopt $enable_x $gdbcmd $mcexec $mcexecopt ${test_dir}/$exe -I $disable_syscall_intercept -p $ppn -t $aio_num_threads
+#$gdbcmd $mcexec $mcexecopt ${test_dir}/$exe -I $disable_syscall_intercept -p $ppn -t $aio_num_threads
+#-l
+
+EOF
+) > ./job.sh
+chmod u+x ./job.sh
+
+if [ ${go} -eq 1 ]; then
+    if [ $pjsub -eq 1 ]; then
+	pjsub ./job.sh
+    else
+	if [ $interactive -eq 0 ]; then
+	    eval $compilervars
+	fi
+	make $makeopt ./$exe
+	PDSH_SSH_ARGS_APPEND="-tt -q" pdsh -t 2 -w $nodes \
+	    /usr/sbin/pidof $exe \| xargs -r sudo kill -9
+	$ssh ${test_dir}/job.sh
+    fi
+fi
diff --git a/test/uti/posix_aio/Makefile b/test/uti/posix_aio/Makefile
new file mode 100755
index 00000000..4f027e77
--- /dev/null
+++ b/test/uti/posix_aio/Makefile
@@ -0,0 +1,51 @@
+.SUFFIXES:	# Clear suffixes
+.ONESHELL:	# Pack all the lines and pass it to shell
+.PHONY: all file clean	# Never treat these targets as up-to-date files
+VMTOUCH=$(HOME)/project/src/vmtouch/install/bin/vmtouch
+
+# Specify it via *.sh
+UTI_DIR=${HOME}/project/uti/install_linux
+
+CC=mpiicc
+LD=$(CC)
+
+CFLAGS = -g -O0 -Wall -DVMTOUCH=$(VMTOUCH)
+LDFLAGS = -lpthread -L$(UTI_DIR)/lib -Wl,-rpath -Wl,$(UTI_DIR)/lib -luti -lrt
+SRCS = $(shell ls 0*.c)
+OBJS = $(SRCS:.c=.o) util.o
+EXES = $(SRCS:.c=)
+
+define create_files =
+	for i in {1..2}; do
+		dd if=/dev/zero of=./data/$i bs=1M count=1
+	done
+endef
+
+all: $(EXES)
+
+file::
+	$(value create_files)
+
+util.o:: util.c util.h
+	$(CC) $(CFLAGS) -qopenmp -c $<
+
+001: 001.o util.o
+	$(LD) -o $@ $^ $(LDFLAGS) -qopenmp
+
+# util.h is listed so that a header change rebuilds the objects ($< stays the .c)
+001.o:: 001.c util.h
+	$(CC) $(CFLAGS) -qopenmp -c $<
+
+002: 002.o util.o
+	$(LD) -o $@ $^ $(LDFLAGS) -qopenmp
+
+002.o:: 002.c util.h
+	$(CC) $(CFLAGS) -qopenmp -c $<
+%: %.o
+	$(LD) -o $@ $^ $(LDFLAGS)
+
+%.o::%.c
+	$(CC) $(CFLAGS) -c $<
+
+clean:
+	rm -f core $(EXES) $(OBJS) $(DSRCS)
diff --git a/test/uti/posix_aio/README b/test/uti/posix_aio/README
new file mode 100644
index 00000000..097b1b40
--- /dev/null
+++ b/test/uti/posix_aio/README
@@ -0,0 +1,15 @@
+=============================================
+Benchmarks of asynchronous I/O with busy CPUs
+=============================================
+
+The purpose is to show the benefit of spawning the asynchronous threads onto dedicated CPUs.
+
+---
+001
+---
+Write
+
+---
+002
+---
+Write; I/O completion is notified by spawning a thread
diff --git a/test/uti/posix_aio/util.c b/test/uti/posix_aio/util.c
new file mode 100644
index 00000000..673639ab
--- /dev/null
+++ b/test/uti/posix_aio/util.c
@@ -0,0 +1,133 @@
+#define _GNU_SOURCE         /* See feature_test_macros(7) */
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <sys/syscall.h>   /* For SYS_xxx definitions */
+#include <sched.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <string.h>
+#include <time.h>
+#include <mpi.h>
+#include "util.h"
+
+/* Messaging */
+enum test_loglevel test_loglevel = TEST_LOGLEVEL_DEBUG;
+
+/* Calculation */
+/* Burn CPU: n work units, each a 100-iteration register-only counting loop. */
+static inline void asmloop(unsigned long n) {
+	unsigned long j; /* same width as n: an int index would overflow first */
+	for (j = 0; j < n; j++) {
+		asm volatile(
+			"movq $0, %%rcx\n\t"
+			"1:\t"
+			"addq $1, %%rcx\n\t"
+			"cmpq $99, %%rcx\n\t"
+			"jle 1b\n\t"
+			: /* no outputs */
+			: /* no inputs */
+			: "rcx", "cc");
+	}
+}
+
+#define N_INIT 10000000
+double nspw; /* nsec per work */
+
+/*
+ * Calibrate nspw: time N_INIT asmloop() work units, executed inside an OpenMP
+ * parallel region, with gettimeofday() and store the nanoseconds per unit.
+ * If verbose, rank 0 prints the min/average/max of nspw across all ranks.
+ */
+void ndelay_init(int verbose) {
+	struct timeval start, end;
+	int rank, nproc;
+	double min, sum, max;
+
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &nproc);
+
+	gettimeofday(&start, NULL);
+#pragma omp parallel
+	{
+		asmloop(N_INIT);
+	}
+	gettimeofday(&end, NULL);
+
+	nspw = DIFFUSEC(end, start) * 1000 / (double)N_INIT;
+	if (verbose) {
+		MPI_Reduce(&nspw, &min, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
+		MPI_Reduce(&nspw, &sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
+		MPI_Reduce(&nspw, &max, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
+		if (rank == 0) {
+			pr_debug("nspw: min=%.0f, ave=%.0f, max=%.0f\n", min, sum / nproc, max);
+		}
+	}
+}
+
+#if 1 /* Active variant: delay by executing a calibrated number of work units */
+void ndelay(long delay_nsec) {
+	if (delay_nsec < 0) { 
+		printf("delay_nsec < 0\n");
+		return;
+	}
+#pragma omp parallel
+	{
+		asmloop(delay_nsec / nspw); /* nspw: nsec per work unit, set by ndelay_init() */
+	}
+}
+#else /* For machines with large core-to-core performance variation (e.g. OFP) */
+void ndelay(long delay_nsec) {
+	struct timespec start, end;
+	/* Disabled variant: poll the clock until delay_nsec has elapsed. */
+	if (delay_nsec < 0) { return; }
+	clock_gettime(TIMER_KIND, &start);
+
+	while (1) {
+		clock_gettime(TIMER_KIND, &end);
+		if (DIFFNSEC(end, start) >= delay_nsec) {
+			break;
+		}
+		asmloop(2); /* ~150 ns per iteration on OFP */
+	}
+}
+#endif
+
+
+double cycpw; /* cyc per work */
+
+/* Calibrate cycpw: time N_INIT asmloop() work units with the time-stamp counter. */
+void cdlay_init() {
+	unsigned long start = rdtsc_light();
+
+	asmloop(N_INIT);
+	cycpw = (rdtsc_light() - start) / (double)N_INIT;
+}
+/* util.h declares cdelay_init(); the misspelt name above is kept for callers. */
+void cdelay_init() { cdlay_init(); }
+
+#if 0 /* Disabled variant: delay by executing delay_cyc / cycpw work units */
+void cdelay(long delay_cyc) {
+	if (delay_cyc < 0) { 
+		return;
+	}
+	asmloop(delay_cyc / cycpw);
+}
+#else /* For machines with large core-to-core performance variation (e.g. OFP) */
+void cdelay(long delay_cyc) {
+	unsigned long start, end;
+	/* Active variant: poll the time-stamp counter until delay_cyc cycles passed. */
+	if (delay_cyc < 0) { return; }
+	start = rdtsc_light();
+
+	while (1) {
+		end = rdtsc_light();
+		if (end - start >= delay_cyc) {
+			break;
+		}
+		asmloop(2); /* short burst of work between two counter reads */
+	}
+}
+#endif
diff --git a/test/uti/posix_aio/util.h b/test/uti/posix_aio/util.h
new file mode 100644
index 00000000..48b53fcd
--- /dev/null
+++ b/test/uti/posix_aio/util.h
@@ -0,0 +1,70 @@
+#ifndef __UTIL_H_INCLUDED__
+#define __UTIL_H_INCLUDED__
+
+#include <stdint.h>
+
+/* Messaging */
+/* Verbosity levels: a message is printed when test_loglevel >= its level. */
+enum test_loglevel {
+	TEST_LOGLEVEL_ERR = 0,
+	TEST_LOGLEVEL_WARN,
+	TEST_LOGLEVEL_DEBUG
+};
+/* Current verbosity; the variable itself is defined in util.c. */
+extern enum test_loglevel test_loglevel;
+static inline void test_set_loglevel(enum test_loglevel level)
+{
+	test_loglevel = level;
+}
+/* printf-style logging to stdout, emitted only if test_loglevel >= level. */
+#define pr_level(level, fmt, args...) do {	\
+	if (test_loglevel >= level) {	\
+		fprintf(stdout, fmt, ##args);	\
+	}					\
+} while (0)
+/* Shorthands for the three levels; all of them write to stdout. */
+#define pr_err(fmt, args...) pr_level(TEST_LOGLEVEL_ERR, fmt, ##args)
+#define pr_warn(fmt, args...) pr_level(TEST_LOGLEVEL_WARN, fmt, ##args)
+#define pr_debug(fmt, args...) pr_level(TEST_LOGLEVEL_DEBUG, fmt, ##args)
+/* Check cond: print "[ OK ]" (if verb) or "[ NG ]"; on NG with jump set, do ret = -1; goto out. The caller must provide `ret` and an `out:` label. */
+#define _OKNG(verb, jump, cond, fmt, args...) do {	\
+	if (cond) {					\
+		if (verb)				\
+			printf("[ OK ] " fmt, ##args);	\
+	} else {					\
+		printf("[ NG ] " fmt, ##args);		\
+		if (jump) {				\
+			ret = -1;			\
+			goto out;			\
+		}					\
+	}						\
+} while (0)
+/* OKNG: report both outcomes, jump on NG. NG: report only NG, jump. OKNGNOJUMP: report both, never jump. */
+#define OKNG(args...) _OKNG(1, 1, ##args)
+#define NG(args...) _OKNG(0, 1, ##args)
+#define OKNGNOJUMP(args...) _OKNG(1, 0, ##args)
+
+/* Time: rdtsc_light() returns the time-stamp counter read with RDTSCP as one 64-bit value */
+static inline uint64_t rdtsc_light(void) /* static: this header is included by several .c files */
+{
+    uint64_t x;
+    __asm__ __volatile__("rdtscp;" /* rdtscp doesn't jump over earlier instructions */
+                         "shl $32, %%rdx;"
+                         "or %%rdx, %%rax"
+                         : "=a"(x)
+                         : /* no inputs */
+                         : "%rcx", "%rdx", "memory");
+    return x;
+}
+/* end - start of two struct timeval (usec) or struct timespec (nsec); arguments are parenthesized so any expression can be passed */
+#define DIFFUSEC(end, start) (((end).tv_sec - (start).tv_sec) * 1000000UL + ((end).tv_usec - (start).tv_usec))
+#define DIFFNSEC(end, start) (((end).tv_sec - (start).tv_sec) * 1000000000UL + ((end).tv_nsec - (start).tv_nsec))
+#define TIMER_KIND CLOCK_MONOTONIC_RAW /* CLOCK_THREAD_CPUTIME_ID */
+
+/* Calculation emulation */
+void ndelay_init(int verbose); /* calibrate ndelay(); verbose != 0 prints statistics */
+void ndelay(long delay_nsec); /* busy-wait for about delay_nsec nanoseconds */
+void cdelay_init(); /* calibrate the cycles-per-work-unit value */
+void cdelay(long delay_cyc); /* busy-wait for about delay_cyc TSC cycles */
+
+#endif
diff --git a/test/uti/preloadlib.c b/test/uti/preloadlib.c
new file mode 100644
index 00000000..7b8ba350
--- /dev/null
+++ b/test/uti/preloadlib.c
@@ -0,0 +1,40 @@
+#include <libsyscall_intercept_hook_point.h>
+#include <syscall.h>
+#include <errno.h>
+#define __USE_GNU
+#include <dlfcn.h>
+
+/*
+ * Syscall interception callback.
+ *
+ * For SYS_getdents it stores -ENOTSUP in *result and returns 0, so that
+ * from the point of view of the calling process it is as if the kernel
+ * had returned the ENOTSUP error code.  For every other syscall number it
+ * returns 1, which passes the call on to the kernel as would normally
+ * happen.  arg0..arg5 are not examined.
+ */
+static int
+hook(long syscall_number,
+	 long arg0, long arg1,
+	 long arg2, long arg3,
+	 long arg4, long arg5,
+	 long *result)
+{
+	/* Anything other than getdents is left to the kernel. */
+	if (syscall_number != SYS_getdents)
+		return 1;
+
+	/* Prevent the application from using the getdents syscall. */
+	*result = -ENOTSUP;
+	return 0;
+}
+
+/*
+ * Library constructor (runs via __attribute__((constructor))): installs
+ * hook() as the syscall interception callback.
+ */
+static __attribute__((constructor)) void
+init(void)
+{
+	intercept_hook_point = hook;
+}
diff --git a/test/uti/psm2/Makefile b/test/uti/psm2/Makefile
new file mode 100755
index 00000000..4fcb442c
--- /dev/null
+++ b/test/uti/psm2/Makefile
@@ -0,0 +1,27 @@
+.SUFFIXES:	# Clear suffixes
+
+# Builds one executable per C source file in this directory, linked
+# against pthread and PSM2.
+
+CC=gcc
+
+LD=$(CC)
+
+CFLAGS = -g -O2
+# Libraries; placed after the object files on the link line below.
+LDFLAGS = -lpthread -lpsm2
+# $(wildcard) instead of $(shell ls ...): no subshell, and no error message
+# from ls when nothing matches.
+SRCS = $(wildcard *.c)
+OBJS = $(SRCS:.c=.o)
+EXES = $(SRCS:.c=)
+# Files whose names start with "psm2-demo-" (the demo writes
+# psm2-demo-server-epid at run time).
+TMPFILES = $(wildcard psm2-demo-*)
+
+# all and clean name actions, not files.
+.PHONY: all clean
+
+all: $(EXES) file
+
+# Remove run-time files left behind by an earlier run.
+file::$(TMPFILES)
+	rm -f $(TMPFILES)
+
+%: %.o
+	$(LD) -o $@ $^ $(LDFLAGS)
+
+%.o::%.c
+	$(CC) $(CFLAGS) -c $<
+
+# NOTE(review): DSRCS is not defined in this Makefile, so it expands to
+# nothing unless it is supplied from outside.
+clean:
+	rm -f core $(EXES) $(OBJS) $(DSRCS)
+
diff --git a/test/uti/psm2/psm2-demo.c b/test/uti/psm2/psm2-demo.c
new file mode 100644
index 00000000..955d76bd
--- /dev/null
+++ b/test/uti/psm2/psm2-demo.c
@@ -0,0 +1,212 @@
+/*
+ * PSM2 example program.
+ * Start two instances of this program from the same working directory.
+ * These processes can execute on the same host, or on two hosts connected
+ * with OPA.
+ * Compile with: gcc psm2-demo.c -o psm2-demo -lpsm2
+ * Run as: ./psm2-demo -s   # this is the server process
+ *    and: ./psm2-demo      # this is the client process
+ * Copyright(c) 2015 Intel Corporation.
+ */
+#include <stdio.h>
+#include <psm2.h>     /* required for core PSM2 functions */
+#include <psm2_mq.h>  /* required for PSM2 MQ functions (send, recv, etc) */
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <fcntl.h>
+
+#define BUFFER_LENGTH 8000000  /* size in bytes of the message buffer */
+#define CONNECT_ARRAY_SIZE 8   /* number of slots in the connect arrays */
+
+/*
+ * Print "<msg>: <rc>" to stderr and terminate the process with exit
+ * status 1.  Does not return.  msg is const-qualified because callers
+ * pass string literals.
+ */
+void die(const char *msg, int rc) {
+  fprintf(stderr, "%s: %d\n", msg, rc);
+  exit(1);
+}
+
+/*
+ * Helper function to find the server's PSM2 endpoint identifier (epid).
+ * Polls once per second for the file psm2-demo-server-epid in the current
+ * directory and returns the hexadecimal epid read from it.  Polling
+ * continues until a value has actually been parsed: the file can be seen
+ * after the server has created it but before the epid has been written,
+ * in which case the read is retried instead of returning 0.
+ */
+psm2_epid_t find_server() {
+  psm2_epid_t server_epid = 0;
+  int parsed = 0;
+  printf("PSM2 client waiting for epid mapping file to appear...\n");
+  while (!parsed) {
+    FILE *fp;
+    sleep(1);
+    fp = fopen("psm2-demo-server-epid", "r");
+    if (!fp) {
+      continue; /* not there yet */
+    }
+    /* fscanf() returns 1 only when one value was converted. */
+    parsed = (fscanf(fp, "%lx", &server_epid) == 1);
+    fclose(fp);
+  }
+  printf("PSM2 client found server epid = 0x%lx\n", server_epid);
+  return server_epid;
+}
+
+/*
+ * Write the given endpoint id, formatted as "0x<hex>", to the file
+ * psm2-demo-server-epid in the current directory.  If the file cannot be
+ * opened, reports the reason and exits through die().
+ */
+void write_epid_to_file(psm2_epid_t myepid) {
+  FILE *fp;
+  fp = fopen("psm2-demo-server-epid", "w");
+  if (!fp) {
+    /* Save errno first: the fprintf() below may overwrite it. */
+    int saved_errno = errno;
+    fprintf(stderr,
+            "Exiting, couldn't write server's epid mapping file: ");
+    die(strerror(saved_errno), saved_errno);
+  }
+  fprintf(fp, "0x%lx", myepid);
+  fclose(fp);
+  printf("PSM2 server wrote epid = 0x%lx to file.\n", myepid);
+}
+
+/*
+ * Entry point.  With one command-line argument (assumed to be -s) the
+ * process acts as the server, otherwise as the client.
+ *
+ * Both roles initialize PSM2, open an endpoint and create a message queue.
+ * The server publishes its endpoint id through a file, posts a receive,
+ * waits for it and prints the message.  The client reads the server's
+ * endpoint id from that file, connects to it and sends a greeting.  Both
+ * then finalize the queue, close the endpoint and shut PSM2 down.  Any
+ * failure terminates the process through die().
+ */
+int main(int argc, char **argv) {
+  struct psm2_ep_open_opts o;
+  psm2_uuid_t uuid;
+  psm2_ep_t myep;
+  psm2_epid_t myepid;
+  psm2_epid_t server_epid = 0;
+  psm2_epid_t epid_array[CONNECT_ARRAY_SIZE];
+  int epid_array_mask[CONNECT_ARRAY_SIZE];
+  psm2_error_t epid_connect_errors[CONNECT_ARRAY_SIZE];
+  psm2_epaddr_t epaddr_array[CONNECT_ARRAY_SIZE];
+  int rc;
+  int ver_major = PSM2_VERNO_MAJOR;
+  int ver_minor = PSM2_VERNO_MINOR;
+  /* static: BUFFER_LENGTH bytes is too much to place safely on the stack.
+   * Static storage is also zero-initialized, so the bytes that the client
+   * sends after the greeting text are defined. */
+  static char msgbuf[BUFFER_LENGTH];
+  psm2_mq_t q;
+  psm2_mq_req_t req_mq;
+  int is_server = 0;
+  if (argc > 2) {
+    die("To run in server mode, invoke as ./psm2-demo -s\n" \
+        "or run in client mode, invoke as ./psm2-demo\n" \
+        "Wrong number of args", argc);
+  }
+  is_server = argc - 1; /* Assume any command line argument is -s */
+  memset(uuid, 0, sizeof(psm2_uuid_t)); /* Use a UUID of zero */
+  /* Try to initialize PSM2 with the requested library version.
+   * In this example, given the use of the PSM2_VERNO_MAJOR and MINOR
+   * as defined in the PSM2 headers, ensure that we are linking with
+   * the same version of PSM2 as we compiled against. */
+
+  if ((rc = psm2_init(&ver_major, &ver_minor)) != PSM2_OK) {
+    die("couldn't init", rc);
+  }
+  printf("PSM2 init done.\n");
+  /* Setup the endpoint options struct */
+  if ((rc = psm2_ep_open_opts_get_defaults(&o)) != PSM2_OK) {
+    die("couldn't set default opts", rc);
+  }
+  printf("PSM2 opts_get_defaults done.\n");
+  /* Attempt to open a PSM2 endpoint. This allocates hardware resources. */
+  if ((rc = psm2_ep_open(uuid, &o, &myep, &myepid)) != PSM2_OK) {
+    die("couldn't psm2_ep_open()", rc);
+  }
+  printf("PSM2 endpoint open done.\n");
+  if (is_server) {
+    write_epid_to_file(myepid);
+  } else {
+    server_epid = find_server();
+  }
+  if (is_server) {
+    /* Server does nothing here. A connection does not have to be
+     * established to receive messages. */
+    printf("PSM2 server up.\n");
+  } else {
+    /* Setup connection request info */
+    /* PSM2 can connect to a single epid per request,
+     * or an arbitrary number of epids in a single connect call.
+     * For this example, use part of an array of
+     * connection requests. */
+    memset(epid_array_mask, 0, sizeof(int) * CONNECT_ARRAY_SIZE);
+    epid_array[0] = server_epid;
+    epid_array_mask[0] = 1;
+    /* Begin the connection process.
+     * Note that if a requested epid is not responding,
+     * the connect call will still return OK.
+     * The errors array will contain the state of individual
+     * connection requests. */
+    if ((rc = psm2_ep_connect(myep,
+                              CONNECT_ARRAY_SIZE,
+                              epid_array,
+                              epid_array_mask,
+                              epid_connect_errors,
+                              epaddr_array,
+                              0 /* no timeout */
+    )) != PSM2_OK) {
+      die("couldn't ep_connect", rc);
+    }
+    printf("PSM2 connect request processed.\n");
+    /* Now check if our connection to the server is ready */
+    if (epid_connect_errors[0] != PSM2_OK) {
+      die("couldn't connect to server",
+          epid_connect_errors[0]);
+    }
+    printf("PSM2 client-server connection established.\n");
+  }
+  /* Setup our PSM2 message queue */
+  if ((rc = psm2_mq_init(myep, PSM2_MQ_ORDERMASK_NONE, NULL, 0, &q))
+      != PSM2_OK) {
+    die("couldn't initialize PSM2 MQ", rc);
+  }
+  printf("PSM2 MQ init done.\n");
+  if (is_server) {
+    psm2_mq_tag_t t = {0xABCD};
+    psm2_mq_tag_t tm = {-1};
+    /* Post the receive request */
+    if ((rc = psm2_mq_irecv2(q, PSM2_MQ_ANY_ADDR,
+                            &t, /* message tag */
+                            &tm, /* message tag mask */
+                            0, /* no flags */
+                            msgbuf, BUFFER_LENGTH,
+                            NULL, /* no context to add */
+                            &req_mq /* track irecv status */
+    )) != PSM2_OK) {
+      die("couldn't post psm2_mq_irecv()", rc);
+    }
+    printf("PSM2 MQ irecv() posted\n");
+    /* Wait until the message arrives */
+    if ((rc = psm2_mq_wait(&req_mq, NULL)) != PSM2_OK) {
+      die("couldn't wait for the irecv", rc);
+    }
+    printf("PSM2 MQ wait() done.\n");
+    printf("Message from client:\n");
+    printf("%s", msgbuf);
+    unlink("psm2-demo-server-epid");
+  } else {
+    /* Say hello */
+    snprintf(msgbuf, BUFFER_LENGTH,
+             "Hello world from epid=0x%lx, pid=%d.\n",
+             myepid, getpid());
+    psm2_mq_tag_t t = {0xABCD};
+    if ((rc = psm2_mq_send2(q,
+                           epaddr_array[0], /* destination epaddr */
+                           PSM2_MQ_FLAG_SENDSYNC, /* synchronous send flag */
+                           &t, /* tag */
+                           msgbuf, BUFFER_LENGTH
+    )) != PSM2_OK) {
+      /* The call made above is psm2_mq_send2(), so name it in the message. */
+      die("couldn't psm2_mq_send2()", rc);
+    }
+    printf("PSM2 MQ send() done.\n");
+  }
+  /* Close down the MQ */
+  if ((rc = psm2_mq_finalize(q)) != PSM2_OK) {
+    die("couldn't psm2_mq_finalize()", rc);
+  }
+  printf("PSM2 MQ finalized.\n");
+  /* Close our ep, releasing all hardware resources.
+   * Try to close all connections properly */
+  if ((rc = psm2_ep_close(myep, PSM2_EP_CLOSE_GRACEFUL,
+                          0 /* no timeout */)) != PSM2_OK) {
+    die("couldn't psm2_ep_close()", rc);
+  }
+  printf("PSM2 ep closed.\n");
+  /* Release all local PSM2 resources */
+  if ((rc = psm2_finalize()) != PSM2_OK) {
+    die("couldn't psm2_finalize()", rc);
+  }
+  printf("PSM2 shut down, exiting.\n");
+  return 0;
+}
+    
+
+
+
+
+
+
+
+
diff --git a/test/uti/util.c b/test/uti/util.c
new file mode 100644
index 00000000..7e1965d5
--- /dev/null
+++ b/test/uti/util.c
@@ -0,0 +1,130 @@
+#define _GNU_SOURCE         /* See feature_test_macros(7) */
+#include <unistd.h>
+#include <sys/syscall.h>   /* For SYS_xxx definitions */
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sched.h>          /* For sched_getcpu() */
+#include <time.h>
+#include "util.h"
+
+/*
+ * One fixed unit of busy work: sets %rcx to 0, then adds 1 and loops back
+ * while the counter is <= 99, i.e. 100 passes of add/compare/branch.
+ * The asm has no inputs or outputs and declares rcx and the condition
+ * codes as clobbered.
+ */
+static inline void fixed_size_work() {
+	asm volatile(
+	    "movq $0, %%rcx\n\t"
+		"1:\t"
+		"addq $1, %%rcx\n\t"
+		"cmpq $99, %%rcx\n\t"
+		"jle 1b\n\t"
+		:
+		: 
+		: "rcx", "cc");
+}
+
+/*
+ * Run fixed_size_work() n times back to back.
+ * The loop counter has the same type as n; an int counter would overflow
+ * before reaching values of n above INT_MAX.
+ */
+static inline void bulk_fsw(unsigned long n) {
+	unsigned long j;
+
+	for (j = 0; j < n; j++) {
+		fixed_size_work();
+	}
+}
+
+double nspw; /* nsec per work */
+unsigned long nsec; /* nsec measured for the whole calibration run */
+
+/* Number of work units executed by the calibration run in fwq_init(). */
+#define N_INIT 10000000
+
+/*
+ * Calibrate the fixed-work delay: time N_INIT units of fixed-size work
+ * with the TIMER_KIND clock, store the elapsed time in nsec and the
+ * per-unit cost in nspw.
+ */
+void fwq_init(void) {
+	struct timespec start, end;
+
+	clock_gettime(TIMER_KIND, &start);
+	bulk_fsw(N_INIT);
+	clock_gettime(TIMER_KIND, &end);
+	nsec = DIFFNSEC(end, start);
+	nspw = nsec / (double)N_INIT;
+}
+
+#if 1
+/*
+ * Busy-work for roughly delay_nsec nanoseconds by running
+ * delay_nsec / nspw units of fixed-size work.  Negative delays are
+ * ignored.  Also returns at once while nspw is not positive (fwq_init()
+ * has not produced a calibration yet): dividing by zero would give a
+ * count that cannot be converted to an integer.
+ */
+void fwq(long delay_nsec) {
+	if (delay_nsec < 0) {
+		return;
+	}
+	if (nspw <= 0.0) {
+		return;
+	}
+	bulk_fsw(delay_nsec / nspw);
+}
+#else /* For machines with large core-to-core performance variation (e.g. OFP) */
+/*
+ * Busy-work until at least delay_nsec nanoseconds have elapsed on the
+ * TIMER_KIND clock, checking the clock between small batches of work.
+ */
+void fwq(long delay_nsec) {
+	struct timespec start, end;
+
+	if (delay_nsec < 0) { return; }
+	clock_gettime(TIMER_KIND, &start);
+
+	while (1) {
+		clock_gettime(TIMER_KIND, &end);
+		if (DIFFNSEC(end, start) >= delay_nsec) {
+			break;
+		}
+		bulk_fsw(2); /* ~150 ns per iteration on OFP */
+	}
+}
+#endif
+
+/*
+ * Print which CPU the calling thread is on.
+ *
+ * Reads /proc/<pid>/task/<tid>/stat for the calling thread, takes the
+ * 39th space-separated field and prints it together with the value
+ * returned by sched_getcpu():
+ *   [INFO] <name> (tid: <tid>) is running on <field 39>,<sched_getcpu>
+ *
+ * NOTE(review): the fields are split on single spaces, so a command name
+ * containing a space would shift the field positions - confirm whether
+ * that can happen for the programs using this helper.
+ *
+ * name: label printed in the message.
+ * Returns 0 on success and -1 on failure (open, malloc, read, a stat line
+ * with fewer than 39 fields, or sched_getcpu failing).
+ */
+int print_cpu_last_executed_on(const char *name) {
+	enum { STAT_BUF_SIZE = 65536, STAT_FIELD_INDEX = 39 };
+	char fn[256];
+	char *result = NULL; /* NULL so the common exit path can always free() it */
+	char *next_delim;
+	char *field = NULL;
+	pid_t tid = syscall(SYS_gettid);
+	int fd = -1;
+	size_t offset = 0;
+	int mpi_errno = 0;
+	int cpu;
+	int i;
+
+	snprintf(fn, sizeof(fn), "/proc/%d/task/%d/stat", (int)getpid(), (int)tid);
+	fd = open(fn, O_RDONLY);
+	if (fd == -1) {
+		printf("open() failed\n");
+		goto fn_fail;
+	}
+
+	result = malloc(STAT_BUF_SIZE);
+	if (result == NULL) {
+		printf("malloc() failed");
+		goto fn_fail;
+	}
+
+	/* Read until EOF, never past the buffer; one byte is kept for the NUL. */
+	while (offset < (size_t)STAT_BUF_SIZE - 1) {
+		ssize_t amount = read(fd, result + offset,
+				      (size_t)STAT_BUF_SIZE - 1 - offset);
+
+		if (amount == -1) {
+			printf("read() failed");
+			goto fn_fail;
+		}
+		if (amount == 0) {
+			break;
+		}
+		offset += (size_t)amount;
+	}
+	result[offset] = '\0'; /* strsep() needs a terminated string */
+
+	/* Walk to the wanted field; strsep() yields NULL once input runs out. */
+	next_delim = result;
+	for (i = 0; i < STAT_FIELD_INDEX; i++) {
+		field = strsep(&next_delim, " ");
+	}
+	if (field == NULL) {
+		printf("unexpected stat format\n");
+		goto fn_fail;
+	}
+
+	cpu = sched_getcpu();
+	if (cpu == -1) {
+		printf("getcpu() failed\n");
+		goto fn_fail;
+	}
+
+	printf("[INFO] %s (tid: %d) is running on %02d,%02d\n", name, (int)tid, atoi(field), cpu);
+ fn_exit:
+	if (fd != -1) {
+		close(fd);
+	}
+	free(result);
+	return mpi_errno;
+ fn_fail:
+	mpi_errno = -1;
+	goto fn_exit;
+}
+
diff --git a/test/uti/util.h b/test/uti/util.h
new file mode 100644
index 00000000..396f5183
--- /dev/null
+++ b/test/uti/util.h
@@ -0,0 +1,70 @@
+#ifndef __UTIL_H_INCLUDED__
+#define __UTIL_H_INCLUDED__
+
+#include <stdint.h>
+
+#define DEBUG
+
+/*
+ * dprintf(...) / eprintf(...): printf-style messages written to stderr as
+ * "<calling function>,<message>".  dprintf() expands to nothing unless
+ * DEBUG is defined; eprintf() is always active.
+ *
+ * The message is formatted with snprintf(), so text longer than the
+ * 1024-byte buffer is truncated instead of overflowing it.  The buffer
+ * has a macro-specific name so that it cannot shadow a caller's variable
+ * that appears in the argument list.
+ */
+#ifdef DEBUG
+#define dprintf(...) do {						\
+	char dprintf_msg_[1024];					\
+	snprintf(dprintf_msg_, sizeof(dprintf_msg_), __VA_ARGS__);	\
+	fprintf(stderr, "%s,%s", __func__, dprintf_msg_);		\
+} while (0)
+#else
+#define dprintf(...) do {  } while (0)
+#endif
+
+#define eprintf(...) do {						\
+	char eprintf_msg_[1024];					\
+	snprintf(eprintf_msg_, sizeof(eprintf_msg_), __VA_ARGS__);	\
+	fprintf(stderr, "%s,%s", __func__, eprintf_msg_);		\
+} while (0)
+
+/*
+ * CHKANDJUMP(cond, err, ...): when cond is true, print the message with
+ * eprintf(), set ret to err and go to the label fn_fail.  The calling
+ * function must provide a variable named ret and a label named fn_fail.
+ */
+#define CHKANDJUMP(cond, err, ...) do { \
+	if (cond) {			\
+		eprintf(__VA_ARGS__);   \
+		ret = err;		\
+		goto fn_fail;		\
+	}				\
+} while (0)
+
+/*
+ * _OKNG(verb, jump, cond, fmt, ...) - report the outcome of a test check.
+ *   cond false: print "[ NG ] " followed by the message; when jump is
+ *               non-zero, also go to the label fn_fail.
+ *   cond true:  print "[ OK ] " followed by the message, but only when
+ *               verb is non-zero.
+ * The calling function must provide a label named fn_fail.
+ */
+#define _OKNG(verb, jump, cond, fmt, args...) do {	\
+	if (!(cond)) {					\
+		printf("[ NG ] " fmt, ##args);		\
+		if (jump)				\
+			goto fn_fail;			\
+	} else if (verb) {				\
+		printf("[ OK ] " fmt, ##args);		\
+	}						\
+} while (0)
+
+/* OKNG: report both outcomes, jump on failure. */
+#define OKNG(args...) _OKNG(1, 1, ##args)
+/* NG: report failures only, jump on failure. */
+#define NG(args...) _OKNG(0, 1, ##args)
+/* OKNGNOJUMP: report both outcomes, never jump. */
+#define OKNGNOJUMP(args...) _OKNG(1, 0, ##args)
+
+/*
+ * Elapsed nanoseconds from start to end, for objects with tv_sec and
+ * tv_nsec members.  The arguments are parenthesized so that expressions
+ * such as *ptr can be passed.
+ */
+#define DIFFNSEC(end, start) (((end).tv_sec - (start).tv_sec) * 1000000000UL + ((end).tv_nsec - (start).tv_nsec))
+/* Clock id used for the timing measurements. */
+#define TIMER_KIND CLOCK_MONOTONIC_RAW /* CLOCK_THREAD_CPUTIME_ID */
+
+/*
+ * Read the x86-64 time-stamp counter with rdtscp and return it as one
+ * 64-bit value.  rdtscp delivers the low half in %eax and the high half in
+ * %edx; the two halves are combined in C.  %rcx is declared clobbered and
+ * the "memory" clobber keeps the compiler from moving memory accesses
+ * across the read.
+ */
+static inline uint64_t rdtsc_light(void )
+{
+    uint32_t lo;
+    uint32_t hi;
+
+    __asm__ __volatile__("rdtscp" /* rdtscp works as instruction execution barrier */
+                         : "=a"(lo), "=d"(hi)
+                         :
+                         : "%rcx", "memory");
+    return ((uint64_t)hi << 32) | (uint64_t)lo;
+}
+
+extern double nspw; /* nsec per work */
+extern unsigned long nsec;
+
+/*
+ * fwq_init() takes no arguments and is declared with (void) so that it is
+ * a real prototype.  fwq() takes a delay in nanoseconds.
+ * print_cpu_last_executed_on() takes a label and returns an int status.
+ */
+void fwq_init(void);
+void fwq(long delay_nsec);
+int print_cpu_last_executed_on(const char *name);
+
+#endif
+