From 09d0a59e2241566645ff0900927a85e249bec827 Mon Sep 17 00:00:00 2001
From: Masamichi Takagi
Date: Wed, 30 May 2018 18:06:07 +0900
Subject: [PATCH] Detect hang of McKernel in mcexec

mcexec spawns a thread which detects hang of McKernel by using
ihk_os_get_eventfd().

Change-Id: I6cf0ee0c1f0c2c31a8422224b2105f64a9b9ab93
---
 executer/user/Makefile.in |  4 +-
 executer/user/mcexec.c    | 92 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 94 insertions(+), 2 deletions(-)

diff --git a/executer/user/Makefile.in b/executer/user/Makefile.in
index 5cbf3c1b..3e02a1b9 100644
--- a/executer/user/Makefile.in
+++ b/executer/user/Makefile.in
@@ -11,7 +11,7 @@ MCKERNEL_INCDIR=@MCKERNEL_INCDIR@
 MCKERNEL_LIBDIR=@MCKERNEL_LIBDIR@
 KDIR ?= @KDIR@
 ARCH=@ARCH@
-CFLAGS=-Wall -O -I. -I$(VPATH)/arch/${ARCH} -I${IHKDIR}
+CFLAGS=-Wall -O -I. -I$(VPATH)/arch/${ARCH} -I${IHKDIR} -I@abs_builddir@/../../../ihk/linux/include
 LDFLAGS=@LDFLAGS@
 RPATH=$(shell echo $(LDFLAGS)|awk '{for(i=1;i<=NF;i++){if($$i~/^-L/){w=$$i;sub(/^-L/,"-Wl,-rpath,",w);print w}}}')
 VPATH=@abs_srcdir@
@@ -19,7 +19,7 @@ TARGET=mcexec libsched_yield ldump2mcdump.so
 @uncomment_if_ENABLE_MEMDUMP@TARGET+=eclair
 LIBS=@LIBS@
 IHKDIR ?= $(VPATH)/../../../ihk/linux/include/
-MCEXEC_LIBS=-lmcexec -lrt -lnuma -pthread
+MCEXEC_LIBS=-lmcexec -lrt -lnuma -pthread -L@abs_builddir@/../../../ihk/linux/user -lihk -Wl,-rpath,$(MCKERNEL_LIBDIR)
 ENABLE_QLMPI=@ENABLE_QLMPI@
 
 ifeq ($(ENABLE_QLMPI),yes)
diff --git a/executer/user/mcexec.c b/executer/user/mcexec.c
index 4877cd06..31a5f298 100644
--- a/executer/user/mcexec.c
+++ b/executer/user/mcexec.c
@@ -90,5 +90,7 @@
 #include "../include/pmi.h"
 #include "../include/qlmpi.h"
+#include <ihk/ihklib.h>
+#include <sys/epoll.h>
 
 //#define DEBUG
 #define ADD_ENVS_OPTION
@@ -1021,6 +1023,76 @@ pid_t master_tid;
 pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
 pthread_barrier_t init_ready;
 
+pthread_attr_t watchdog_thread_attr;
+pthread_t watchdog_thread;
+
+/* Detects hang of McKernel: waits on the IHK status eventfd and reports when it fires */
+static void *watchdog_thread_func(void *arg) {
+	int ret = 0;
+	int evfd = -1;
+	int epfd = -1;
+	struct epoll_event event;
+	struct epoll_event events[1];
+
+	if ((evfd = ihk_os_get_eventfd(0, IHK_OS_EVENTFD_TYPE_STATUS)) < 0) {
+		fprintf(stderr, "%s: Error: geteventfd failed (%d)\n", __FUNCTION__, evfd);
+		goto out;
+	}
+
+	if ((epfd = epoll_create(1)) == -1) {
+		fprintf(stderr, "%s: Error: epoll_create failed (%s)\n", __FUNCTION__, strerror(errno));
+		goto out;
+	}
+
+	memset(&event, 0, sizeof(struct epoll_event));
+	event.events = EPOLLIN;
+	event.data.fd = evfd;
+	if ((ret = epoll_ctl(epfd, EPOLL_CTL_ADD, evfd, &event)) != 0) {
+		fprintf(stderr, "%s: Error: epoll_ctl failed (%s)\n", __FUNCTION__, strerror(errno));
+		goto out;
+	}
+
+	do {
+		int nfd = epoll_wait(epfd, events, 1, -1);
+		if (nfd < 0) {
+			if (errno == EINTR) {
+				continue;
+			} else {
+				fprintf(stderr, "%s: Error: epoll_wait failed (%s)\n", __FUNCTION__, strerror(errno));
+				goto out;
+			}
+		} else if (nfd > 1 || nfd == 0) {
+			fprintf(stderr, "%s: Error: Invalid number (%d) of events\n", __FUNCTION__, nfd);
+			goto out;
+		} else {
+			if (events[0].data.fd == evfd) {
+				uint64_t counter;
+				ssize_t nread = read(evfd, &counter, sizeof(counter));
+				if (nread == 0) {
+					fprintf(stderr, "%s: Error: read got EOF\n", __FUNCTION__);
+					goto out;
+				} else if (nread == -1) {
+					fprintf(stderr, "%s: Error: read failed (%s)\n", __FUNCTION__, strerror(errno));
+					goto out;
+				} else {
+					fprintf(stderr, "mcexec detected hang of McKernel\n");
+					//syscall(SYS_exit_group, 99);
+					goto out;
+				}
+			}
+		}
+	} while (1);
+
+ out:
+	if (evfd >= 0) {
+		close(evfd);
+	}
+	if (epfd >= 0) {
+		close(epfd);
+	}
+	return NULL;
+}
+
 static void *main_loop_thread_func(void *arg)
 {
 	struct thread_data_s *td = (struct thread_data_s *)arg;
@@ -2479,6 +2551,26 @@ int main(int argc, char **argv)
 
 	init_sigaction();
 
+	/* Initialize watchdog thread which detects hang of McKernel */
+
+	if ((error = pthread_attr_init(&watchdog_thread_attr))) {
+		fprintf(stderr, "Error: pthread_attr_init failed (%d)\n", error);
+		close(fd);
+		return 1;
+	}
+
+	if ((error = pthread_attr_setdetachstate(&watchdog_thread_attr, PTHREAD_CREATE_DETACHED))) {
+		fprintf(stderr, "Error: pthread_attr_setdetachstate failed (%d)\n", error);
+		close(fd);
+		return 1;
+	}
+
+	if ((error = pthread_create(&watchdog_thread, &watchdog_thread_attr, watchdog_thread_func, NULL))) {
+		fprintf(stderr, "Error: pthread_create failed (%d)\n", error);
+		close(fd);
+		return 1;
+	}
+
 	if (init_worker_threads(fd) < 0) {
 		perror("worker threads: ");
 		close(fd);