Detect hang of McKernel in mcexec

mcexec spawns a thread that detects a hang of McKernel by using
ihk_os_get_eventfd().

Change-Id: I6cf0ee0c1f0c2c31a8422224b2105f64a9b9ab93
This commit is contained in:
Masamichi Takagi
2018-05-30 18:06:07 +09:00
parent 511555c8cb
commit 09d0a59e22
2 changed files with 94 additions and 2 deletions

View File

@ -11,7 +11,7 @@ MCKERNEL_INCDIR=@MCKERNEL_INCDIR@
MCKERNEL_LIBDIR=@MCKERNEL_LIBDIR@ MCKERNEL_LIBDIR=@MCKERNEL_LIBDIR@
KDIR ?= @KDIR@ KDIR ?= @KDIR@
ARCH=@ARCH@ ARCH=@ARCH@
CFLAGS=-Wall -O -I. -I$(VPATH)/arch/${ARCH} -I${IHKDIR} CFLAGS=-Wall -O -I. -I$(VPATH)/arch/${ARCH} -I${IHKDIR} -I@abs_builddir@/../../../ihk/linux/include
LDFLAGS=@LDFLAGS@ LDFLAGS=@LDFLAGS@
RPATH=$(shell echo $(LDFLAGS)|awk '{for(i=1;i<=NF;i++){if($$i~/^-L/){w=$$i;sub(/^-L/,"-Wl,-rpath,",w);print w}}}') RPATH=$(shell echo $(LDFLAGS)|awk '{for(i=1;i<=NF;i++){if($$i~/^-L/){w=$$i;sub(/^-L/,"-Wl,-rpath,",w);print w}}}')
VPATH=@abs_srcdir@ VPATH=@abs_srcdir@
@ -19,7 +19,7 @@ TARGET=mcexec libsched_yield ldump2mcdump.so
@uncomment_if_ENABLE_MEMDUMP@TARGET+=eclair @uncomment_if_ENABLE_MEMDUMP@TARGET+=eclair
LIBS=@LIBS@ LIBS=@LIBS@
IHKDIR ?= $(VPATH)/../../../ihk/linux/include/ IHKDIR ?= $(VPATH)/../../../ihk/linux/include/
MCEXEC_LIBS=-lmcexec -lrt -lnuma -pthread MCEXEC_LIBS=-lmcexec -lrt -lnuma -pthread -L@abs_builddir@/../../../ihk/linux/user -lihk -Wl,-rpath,$(MCKERNEL_LIBDIR)
ENABLE_QLMPI=@ENABLE_QLMPI@ ENABLE_QLMPI=@ENABLE_QLMPI@
ifeq ($(ENABLE_QLMPI),yes) ifeq ($(ENABLE_QLMPI),yes)

View File

@ -89,6 +89,8 @@
#include <sys/un.h> #include <sys/un.h>
#include "../include/pmi.h" #include "../include/pmi.h"
#include "../include/qlmpi.h" #include "../include/qlmpi.h"
#include <ihk/ihklib.h>
#include <sys/epoll.h>
//#define DEBUG //#define DEBUG
#define ADD_ENVS_OPTION #define ADD_ENVS_OPTION
@ -1021,6 +1023,76 @@ pid_t master_tid;
pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
pthread_barrier_t init_ready; pthread_barrier_t init_ready;
/* Attributes of the watchdog thread; main() initializes them and requests PTHREAD_CREATE_DETACHED */
pthread_attr_t watchdog_thread_attr;
/* Handle of the watchdog thread that main() creates with watchdog_thread_func() as its entry point */
pthread_t watchdog_thread;
/*
 * Watchdog thread body: detects hang of McKernel.
 *
 * Requests an event file descriptor with
 * ihk_os_get_eventfd(0, IHK_OS_EVENTFD_TYPE_STATUS), registers it for
 * EPOLLIN on a private epoll instance and blocks in epoll_wait() without
 * a timeout.  When the descriptor becomes readable and its counter can be
 * read, a hang of McKernel is reported on stderr and the thread finishes.
 * Any unrecoverable error is reported on stderr and also ends the thread.
 * Both descriptors are closed before returning.
 *
 * @param arg  unused
 * @return     always NULL
 */
static void *watchdog_thread_func(void *arg) {
	int evfd = -1;
	int epfd = -1;
	struct epoll_event event;
	struct epoll_event events[1];

	if ((evfd = ihk_os_get_eventfd(0, IHK_OS_EVENTFD_TYPE_STATUS)) < 0) {
		fprintf(stderr, "%s: Error: geteventfd failed (%d)\n", __FUNCTION__, evfd);
		goto out;
	}

	if ((epfd = epoll_create(1)) == -1) {
		/* The return value is always -1 here; errno carries the reason */
		fprintf(stderr, "%s: Error: epoll_create failed (%s)\n", __FUNCTION__, strerror(errno));
		goto out;
	}

	memset(&event, 0, sizeof(struct epoll_event));
	event.events = EPOLLIN;
	event.data.fd = evfd;

	if (epoll_ctl(epfd, EPOLL_CTL_ADD, evfd, &event) != 0) {
		fprintf(stderr, "%s: Error: epoll_ctl failed (%s)\n", __FUNCTION__, strerror(errno));
		goto out;
	}

	for (;;) {
		uint64_t counter;
		ssize_t nread;
		int nfd = epoll_wait(epfd, events, 1, -1);

		if (nfd < 0) {
			if (errno == EINTR) {
				/* Interrupted by a signal: wait again */
				continue;
			}
			fprintf(stderr, "%s: Error: epoll_wait failed (%s)\n", __FUNCTION__, strerror(errno));
			goto out;
		}

		/* Exactly one descriptor is registered and no timeout is used */
		if (nfd != 1) {
			fprintf(stderr, "%s: Error: Invalid number (%d) of events\n", __FUNCTION__, nfd);
			goto out;
		}

		if (events[0].data.fd != evfd) {
			continue;
		}

		nread = read(evfd, &counter, sizeof(counter));
		if (nread == 0) {
			fprintf(stderr, "%s: Error: read got EOF\n", __FUNCTION__);
			goto out;
		}
		if (nread == -1) {
			if (errno == EINTR) {
				/* Same policy as for epoll_wait(): retry instead of giving up */
				continue;
			}
			fprintf(stderr, "%s: Error: read failed (%s)\n", __FUNCTION__, strerror(errno));
			goto out;
		}

		fprintf(stderr, "mcexec detected hang of McKernel\n");
		/*
		 * NOTE(review): the hang is only reported; terminating the
		 * process is currently disabled:
		 */
		//syscall(SYS_exit_group, 99);
		goto out;
	}

out:
	/*
	 * ihk_os_get_eventfd() failure is detected with "< 0", so any
	 * negative value (not only -1) means there is nothing to close.
	 */
	if (evfd >= 0) {
		close(evfd);
	}
	if (epfd >= 0) {
		close(epfd);
	}
	return NULL;
}
static void *main_loop_thread_func(void *arg) static void *main_loop_thread_func(void *arg)
{ {
struct thread_data_s *td = (struct thread_data_s *)arg; struct thread_data_s *td = (struct thread_data_s *)arg;
@ -2479,6 +2551,26 @@ int main(int argc, char **argv)
init_sigaction(); init_sigaction();
/*
 * Initialize watchdog thread which detects hang of McKernel.
 * The thread is created detached, so it is never joined.
 */
if ((error = pthread_attr_init(&watchdog_thread_attr))) {
	fprintf(stderr, "Error: pthread_attr_init failed (%d)\n", error);
	close(fd);
	return 1;
}
if ((error = pthread_attr_setdetachstate(&watchdog_thread_attr, PTHREAD_CREATE_DETACHED))) {
	/* Report the function that actually failed (set, not get) */
	fprintf(stderr, "Error: pthread_attr_setdetachstate failed (%d)\n", error);
	close(fd);
	return 1;
}
if ((error = pthread_create(&watchdog_thread, &watchdog_thread_attr, watchdog_thread_func, NULL))) {
	fprintf(stderr, "Error: pthread_create failed (%d)\n", error);
	close(fd);
	return 1;
}
if (init_worker_threads(fd) < 0) { if (init_worker_threads(fd) < 0) {
perror("worker threads: "); perror("worker threads: ");
close(fd); close(fd);