Detect hang of McKernel in mcexec
mcexec spawns a thread which detects hang of McKernel by using ihk_os_get_eventfd(). Change-Id: I6cf0ee0c1f0c2c31a8422224b2105f64a9b9ab93
This commit is contained in:
@ -11,7 +11,7 @@ MCKERNEL_INCDIR=@MCKERNEL_INCDIR@
|
|||||||
MCKERNEL_LIBDIR=@MCKERNEL_LIBDIR@
|
MCKERNEL_LIBDIR=@MCKERNEL_LIBDIR@
|
||||||
KDIR ?= @KDIR@
|
KDIR ?= @KDIR@
|
||||||
ARCH=@ARCH@
|
ARCH=@ARCH@
|
||||||
CFLAGS=-Wall -O -I. -I$(VPATH)/arch/${ARCH} -I${IHKDIR}
|
CFLAGS=-Wall -O -I. -I$(VPATH)/arch/${ARCH} -I${IHKDIR} -I@abs_builddir@/../../../ihk/linux/include
|
||||||
LDFLAGS=@LDFLAGS@
|
LDFLAGS=@LDFLAGS@
|
||||||
RPATH=$(shell echo $(LDFLAGS)|awk '{for(i=1;i<=NF;i++){if($$i~/^-L/){w=$$i;sub(/^-L/,"-Wl,-rpath,",w);print w}}}')
|
RPATH=$(shell echo $(LDFLAGS)|awk '{for(i=1;i<=NF;i++){if($$i~/^-L/){w=$$i;sub(/^-L/,"-Wl,-rpath,",w);print w}}}')
|
||||||
VPATH=@abs_srcdir@
|
VPATH=@abs_srcdir@
|
||||||
@ -19,7 +19,7 @@ TARGET=mcexec libsched_yield ldump2mcdump.so
|
|||||||
@uncomment_if_ENABLE_MEMDUMP@TARGET+=eclair
|
@uncomment_if_ENABLE_MEMDUMP@TARGET+=eclair
|
||||||
LIBS=@LIBS@
|
LIBS=@LIBS@
|
||||||
IHKDIR ?= $(VPATH)/../../../ihk/linux/include/
|
IHKDIR ?= $(VPATH)/../../../ihk/linux/include/
|
||||||
MCEXEC_LIBS=-lmcexec -lrt -lnuma -pthread
|
MCEXEC_LIBS=-lmcexec -lrt -lnuma -pthread -L@abs_builddir@/../../../ihk/linux/user -lihk -Wl,-rpath,$(MCKERNEL_LIBDIR)
|
||||||
ENABLE_QLMPI=@ENABLE_QLMPI@
|
ENABLE_QLMPI=@ENABLE_QLMPI@
|
||||||
|
|
||||||
ifeq ($(ENABLE_QLMPI),yes)
|
ifeq ($(ENABLE_QLMPI),yes)
|
||||||
|
|||||||
@ -89,6 +89,8 @@
|
|||||||
#include <sys/un.h>
|
#include <sys/un.h>
|
||||||
#include "../include/pmi.h"
|
#include "../include/pmi.h"
|
||||||
#include "../include/qlmpi.h"
|
#include "../include/qlmpi.h"
|
||||||
|
#include <ihk/ihklib.h>
|
||||||
|
#include <sys/epoll.h>
|
||||||
|
|
||||||
//#define DEBUG
|
//#define DEBUG
|
||||||
#define ADD_ENVS_OPTION
|
#define ADD_ENVS_OPTION
|
||||||
@ -1021,6 +1023,76 @@ pid_t master_tid;
|
|||||||
pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
|
pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
|
||||||
pthread_barrier_t init_ready;
|
pthread_barrier_t init_ready;
|
||||||
|
|
||||||
|
pthread_attr_t watchdog_thread_attr;
|
||||||
|
pthread_t watchdog_thread;
|
||||||
|
|
||||||
|
/* Detects hang of McKernel */
|
||||||
|
static void *watchdog_thread_func(void *arg) {
|
||||||
|
int ret = 0;
|
||||||
|
int evfd = -1;
|
||||||
|
int epfd = -1;
|
||||||
|
struct epoll_event event;
|
||||||
|
struct epoll_event events[1];
|
||||||
|
|
||||||
|
if ((evfd = ihk_os_get_eventfd(0, IHK_OS_EVENTFD_TYPE_STATUS)) < 0) {
|
||||||
|
fprintf(stderr, "%s: Error: geteventfd failed (%d)\n", __FUNCTION__, evfd);
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ((epfd = epoll_create(1)) == -1) {
|
||||||
|
fprintf(stderr, "%s: Error: epoll_create failed (%d)\n", __FUNCTION__, epfd);
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
|
memset(&event, 0, sizeof(struct epoll_event));
|
||||||
|
event.events = EPOLLIN;
|
||||||
|
event.data.fd = evfd;
|
||||||
|
if ((ret = epoll_ctl(epfd, EPOLL_CTL_ADD, evfd, &event)) != 0) {
|
||||||
|
fprintf(stderr, "%s: Error: epoll_ctl failed (%d)\n", __FUNCTION__, ret);
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
|
do {
|
||||||
|
int nfd = epoll_wait(epfd, events, 1, -1);
|
||||||
|
if (nfd < 0) {
|
||||||
|
if (errno == EINTR) {
|
||||||
|
continue;
|
||||||
|
} else {
|
||||||
|
fprintf(stderr, "%s: Error: epoll_wait failed (%s)\n", __FUNCTION__, strerror(errno));
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
} else if (nfd > 1 || nfd == 0) {
|
||||||
|
fprintf(stderr, "%s: Error: Invalid number (%d) of events\n", __FUNCTION__, nfd);
|
||||||
|
goto out;
|
||||||
|
} else {
|
||||||
|
if (events[0].data.fd == evfd) {
|
||||||
|
uint64_t counter;
|
||||||
|
ssize_t nread = read(evfd, &counter, sizeof(counter));
|
||||||
|
if (nread == 0) {
|
||||||
|
fprintf(stderr, "%s: Error: read got EOF\n", __FUNCTION__);
|
||||||
|
goto out;
|
||||||
|
} else if (nread == -1) {
|
||||||
|
fprintf(stderr, "%s: Error: read failed (%s)\n", __FUNCTION__, strerror(errno));
|
||||||
|
goto out;
|
||||||
|
} else {
|
||||||
|
fprintf(stderr, "mcexec detected hang of McKernel\n");
|
||||||
|
//syscall(SYS_exit_group, 99);
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} while (1);
|
||||||
|
|
||||||
|
out:
|
||||||
|
if (evfd != -1) {
|
||||||
|
close(evfd);
|
||||||
|
}
|
||||||
|
if (epfd != -1) {
|
||||||
|
close(epfd);
|
||||||
|
}
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
static void *main_loop_thread_func(void *arg)
|
static void *main_loop_thread_func(void *arg)
|
||||||
{
|
{
|
||||||
struct thread_data_s *td = (struct thread_data_s *)arg;
|
struct thread_data_s *td = (struct thread_data_s *)arg;
|
||||||
@ -2479,6 +2551,26 @@ int main(int argc, char **argv)
|
|||||||
|
|
||||||
init_sigaction();
|
init_sigaction();
|
||||||
|
|
||||||
|
/* Initialize watchdog thread which detects hang of McKernel */
|
||||||
|
|
||||||
|
if ((error = pthread_attr_init(&watchdog_thread_attr))) {
|
||||||
|
fprintf(stderr, "Error: pthread_attr_init failed (%d)\n", error);
|
||||||
|
close(fd);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Request the detached state for the watchdog thread's attributes. */
if ((error = pthread_attr_setdetachstate(&watchdog_thread_attr, PTHREAD_CREATE_DETACHED))) {
	/* Report the function that actually failed (was "getdetachstate"). */
	fprintf(stderr, "Error: pthread_attr_setdetachstate failed (%d)\n", error);
	close(fd);
	return 1;
}
|
||||||
|
|
||||||
|
if ((error = pthread_create(&watchdog_thread, &watchdog_thread_attr, watchdog_thread_func, NULL))) {
|
||||||
|
fprintf(stderr, "Error: pthread_create failed (%d)\n", error);
|
||||||
|
close(fd);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
if (init_worker_threads(fd) < 0) {
|
if (init_worker_threads(fd) < 0) {
|
||||||
perror("worker threads: ");
|
perror("worker threads: ");
|
||||||
close(fd);
|
close(fd);
|
||||||
|
|||||||
Reference in New Issue
Block a user