diff --git a/arch/x86_64/kernel/include/syscall_list.h b/arch/x86_64/kernel/include/syscall_list.h index 48b1ea0a..60f01476 100644 --- a/arch/x86_64/kernel/include/syscall_list.h +++ b/arch/x86_64/kernel/include/syscall_list.h @@ -161,6 +161,7 @@ SYSCALL_HANDLED(__NR_profile, profile) SYSCALL_HANDLED(730, util_migrate_inter_kernel) SYSCALL_HANDLED(731, util_indicate_clone) SYSCALL_HANDLED(732, get_system) +SYSCALL_HANDLED(733, util_register_desc) /* McKernel Specific */ SYSCALL_HANDLED(801, swapout) diff --git a/arch/x86_64/kernel/syscall.c b/arch/x86_64/kernel/syscall.c index 5f70c193..c2ca64d1 100644 --- a/arch/x86_64/kernel/syscall.c +++ b/arch/x86_64/kernel/syscall.c @@ -1369,6 +1369,7 @@ done: return 0; } + /* Forward signal to Linux by interrupt_syscall mechanism */ if (tthread->thread_offloaded) { if (!tthread->proc->nohost) { interrupt_syscall(tthread, sig); diff --git a/configure b/configure index ff74762a..93fcc281 100755 --- a/configure +++ b/configure @@ -631,6 +631,8 @@ IHK_VERSION ENABLE_QLMPI ENABLE_RUSAGE ENABLE_MCOVERLAYFS +LDFLAGS_SYSCALL_INTERCEPT +CPPFLAGS_SYSCALL_INTERCEPT MANDIR KERNDIR KMODDIR @@ -702,6 +704,9 @@ enable_option_checking with_mpi with_mpi_include with_mpi_lib +with_syscall_intercept +with_syscall_intercept_include +with_syscall_intercept_lib with_kernelsrc with_target with_system_map @@ -1346,6 +1351,15 @@ Optional Packages: --with-mpi-include=PATH specify path where mpi include directory can be found --with-mpi-lib=PATH specify path where mpi lib directory can be found + --with-syscall_intercept=PATH + specify path where syscall_intercept include + directory and lib directory can be found + --with-syscall_intercept-include=PATH + specify path where syscall_intercept include + directory can be found + --with-syscall_intercept-lib=PATH + specify path where syscall_intercept lib directory + can be found --with-kernelsrc=path Path to 'kernel src', default is /lib/modules/uname_r/build --with-target={attached-mic | builtin-mic | 
builtin-x86 | smp-x86} @@ -2082,6 +2096,8 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu + + IHK_VERSION=1.5.0 MCKERNEL_VERSION=1.5.0 DCFA_VERSION=DCFA_VERSION_m4 @@ -3513,6 +3529,119 @@ fi +# Check whether --with-syscall_intercept was given. +if test "${with_syscall_intercept+set}" = set; then : + withval=$with_syscall_intercept; case "$withval" in #( + yes|no|'') : + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: --without-syscall_intercept=PATH expects a valid PATH" >&5 +$as_echo "$as_me: WARNING: --without-syscall_intercept=PATH expects a valid PATH" >&2;} + with_syscall_intercept="" ;; #( + *) : + ;; +esac +else + with_syscall_intercept= +fi + + +# Check whether --with-syscall_intercept-include was given. +if test "${with_syscall_intercept_include+set}" = set; then : + withval=$with_syscall_intercept_include; case "$withval" in #( + yes|no|'') : + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: --without-syscall_intercept-include=PATH expects a valid PATH" >&5 +$as_echo "$as_me: WARNING: --without-syscall_intercept-include=PATH expects a valid PATH" >&2;} + with_syscall_intercept_include="" ;; #( + *) : + ;; +esac +fi + + +# Check whether --with-syscall_intercept-lib was given. +if test "${with_syscall_intercept_lib+set}" = set; then : + withval=$with_syscall_intercept_lib; case "$withval" in #( + yes|no|'') : + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: --without-syscall_intercept-lib=PATH expects a valid PATH" >&5 +$as_echo "$as_me: WARNING: --without-syscall_intercept-lib=PATH expects a valid PATH" >&2;} + with_syscall_intercept_lib="" ;; #( + *) : + ;; +esac +fi + + + # The args have been sanitized into empty/non-empty values above. 
+ # Now append -I/-L args to CPPFLAGS/LDFLAGS, with more specific options + # taking priority + + if test -n "${with_syscall_intercept_include}"; then : + + + if echo "$CPPFLAGS_SYSCALL_INTERCEPT" | $FGREP -e "\<-I${with_syscall_intercept_include}\>" >/dev/null 2>&1; then : + echo "CPPFLAGS_SYSCALL_INTERCEPT(='$CPPFLAGS_SYSCALL_INTERCEPT') contains '-I${with_syscall_intercept_include}', not appending" >&5 +else + echo "CPPFLAGS_SYSCALL_INTERCEPT(='$CPPFLAGS_SYSCALL_INTERCEPT') does not contain '-I${with_syscall_intercept_include}', appending" >&5 + CPPFLAGS_SYSCALL_INTERCEPT="$CPPFLAGS_SYSCALL_INTERCEPT -I${with_syscall_intercept_include}" + +fi + +else + if test -n "${with_syscall_intercept}"; then : + + + if echo "$CPPFLAGS_SYSCALL_INTERCEPT" | $FGREP -e "\<-I${with_syscall_intercept}/include\>" >/dev/null 2>&1; then : + echo "CPPFLAGS_SYSCALL_INTERCEPT(='$CPPFLAGS_SYSCALL_INTERCEPT') contains '-I${with_syscall_intercept}/include', not appending" >&5 +else + echo "CPPFLAGS_SYSCALL_INTERCEPT(='$CPPFLAGS_SYSCALL_INTERCEPT') does not contain '-I${with_syscall_intercept}/include', appending" >&5 + CPPFLAGS_SYSCALL_INTERCEPT="$CPPFLAGS_SYSCALL_INTERCEPT -I${with_syscall_intercept}/include" + +fi + +fi +fi + + if test -n "${with_syscall_intercept_lib}"; then : + + + if echo "$LDFLAGS_SYSCALL_INTERCEPT" | $FGREP -e "\<-L${with_syscall_intercept_lib} -Wl,-rpath,${with_syscall_intercept_lib}\>" >/dev/null 2>&1; then : + echo "LDFLAGS_SYSCALL_INTERCEPT(='$LDFLAGS_SYSCALL_INTERCEPT') contains '-L${with_syscall_intercept_lib} -Wl,-rpath,${with_syscall_intercept_lib}', not appending" >&5 +else + echo "LDFLAGS_SYSCALL_INTERCEPT(='$LDFLAGS_SYSCALL_INTERCEPT') does not contain '-L${with_syscall_intercept_lib} -Wl,-rpath,${with_syscall_intercept_lib}', appending" >&5 + LDFLAGS_SYSCALL_INTERCEPT="$LDFLAGS_SYSCALL_INTERCEPT -L${with_syscall_intercept_lib} -Wl,-rpath,${with_syscall_intercept_lib}" + +fi + +else + if test -n "${with_syscall_intercept}"; then : + + + if echo 
"$LDFLAGS_SYSCALL_INTERCEPT" | $FGREP -e "\<-L${with_syscall_intercept}/lib -Wl,-rpath,${with_syscall_intercept}/lib\>" >/dev/null 2>&1; then : + echo "LDFLAGS_SYSCALL_INTERCEPT(='$LDFLAGS_SYSCALL_INTERCEPT') contains '-L${with_syscall_intercept}/lib -Wl,-rpath,${with_syscall_intercept}/lib', not appending" >&5 +else + echo "LDFLAGS_SYSCALL_INTERCEPT(='$LDFLAGS_SYSCALL_INTERCEPT') does not contain '-L${with_syscall_intercept}/lib -Wl,-rpath,${with_syscall_intercept}/lib', appending" >&5 + LDFLAGS_SYSCALL_INTERCEPT="$LDFLAGS_SYSCALL_INTERCEPT -L${with_syscall_intercept}/lib -Wl,-rpath,${with_syscall_intercept}/lib" + +fi + + if test -d "${with_syscall_intercept}/lib64"; then : + + + if echo "$LDFLAGS_SYSCALL_INTERCEPT" | $FGREP -e "\<-L${with_syscall_intercept}/lib64 -Wl,-rpath,${with_syscall_intercept}/lib64\>" >/dev/null 2>&1; then : + echo "LDFLAGS_SYSCALL_INTERCEPT(='$LDFLAGS_SYSCALL_INTERCEPT') contains '-L${with_syscall_intercept}/lib64 -Wl,-rpath,${with_syscall_intercept}/lib64', not appending" >&5 +else + echo "LDFLAGS_SYSCALL_INTERCEPT(='$LDFLAGS_SYSCALL_INTERCEPT') does not contain '-L${with_syscall_intercept}/lib64 -Wl,-rpath,${with_syscall_intercept}/lib64', appending" >&5 + LDFLAGS_SYSCALL_INTERCEPT="$LDFLAGS_SYSCALL_INTERCEPT -L${with_syscall_intercept}/lib64 -Wl,-rpath,${with_syscall_intercept}/lib64" + +fi + +fi + +fi + +fi + + + # Check whether --with-kernelsrc was given. 
if test "${with_kernelsrc+set}" = set; then : withval=$with_kernelsrc; WITH_KERNELSRC=$withval @@ -4660,6 +4789,8 @@ fi + + diff --git a/configure.ac b/configure.ac index fdde02a9..ee122385 100644 --- a/configure.ac +++ b/configure.ac @@ -77,6 +77,54 @@ AC_DEFUN([PAC_SET_HEADER_LIB_PATH],[ ]) ]) +AC_DEFUN([PAC_SET_HEADER_LIB_PATH_SYSCALL_INTERCEPT],[ + AC_ARG_WITH([$1], + [AC_HELP_STRING([--with-$1=PATH], + [specify path where $1 include directory and lib directory can be found])], + + [AS_CASE(["$withval"], + [yes|no|''], + [AC_MSG_WARN([--with[out]-$1=PATH expects a valid PATH]) + with_$1=""])], + [with_$1=$2]) + AC_ARG_WITH([$1-include], + [AC_HELP_STRING([--with-$1-include=PATH], + [specify path where $1 include directory can be found])], + [AS_CASE(["$withval"], + [yes|no|''], + [AC_MSG_WARN([--with[out]-$1-include=PATH expects a valid PATH]) + with_$1_include=""])], + []) + AC_ARG_WITH([$1-lib], + [AC_HELP_STRING([--with-$1-lib=PATH], + [specify path where $1 lib directory can be found])], + [AS_CASE(["$withval"], + [yes|no|''], + [AC_MSG_WARN([--with[out]-$1-lib=PATH expects a valid PATH]) + with_$1_lib=""])], + []) + + # The args have been sanitized into empty/non-empty values above. + # Now append -I/-L args to CPPFLAGS/LDFLAGS, with more specific options + # taking priority + + AS_IF([test -n "${with_$1_include}"], + [PAC_APPEND_FLAG([-I${with_$1_include}],[CPPFLAGS_SYSCALL_INTERCEPT])], + [AS_IF([test -n "${with_$1}"], + [PAC_APPEND_FLAG([-I${with_$1}/include],[CPPFLAGS_SYSCALL_INTERCEPT])])]) + + AS_IF([test -n "${with_$1_lib}"], + [PAC_APPEND_FLAG([-L${with_$1_lib} -Wl,-rpath,${with_$1_lib}],[LDFLAGS_SYSCALL_INTERCEPT])], + [AS_IF([test -n "${with_$1}"], + dnl is adding lib64 by default really the right thing to do? What if + dnl we are on a 32-bit host that happens to have both lib dirs available? 
+ [PAC_APPEND_FLAG([-L${with_$1}/lib -Wl,-rpath,${with_$1}/lib],[LDFLAGS_SYSCALL_INTERCEPT]) + AS_IF([test -d "${with_$1}/lib64"], + [PAC_APPEND_FLAG([-L${with_$1}/lib64 -Wl,-rpath,${with_$1}/lib64],[LDFLAGS_SYSCALL_INTERCEPT])]) + ]) + ]) +]) + IHK_VERSION=IHK_VERSION_m4 MCKERNEL_VERSION=MCKERNEL_VERSION_m4 DCFA_VERSION=DCFA_VERSION_m4 @@ -94,6 +142,7 @@ AS_IF([test "x$numa_lib_found" != "xyes"], [AC_MSG_ERROR([Unable to find NUMA library, missing numactl-devel?])]) PAC_SET_HEADER_LIB_PATH([mpi]) +PAC_SET_HEADER_LIB_PATH_SYSCALL_INTERCEPT([syscall_intercept]) AC_ARG_WITH([kernelsrc], AC_HELP_STRING( @@ -454,6 +503,8 @@ AC_SUBST(KMODDIR) AC_SUBST(KERNDIR) AC_SUBST(MANDIR) AC_SUBST(CFLAGS) +AC_SUBST(CPPFLAGS_SYSCALL_INTERCEPT) +AC_SUBST(LDFLAGS_SYSCALL_INTERCEPT) AC_SUBST(ENABLE_MCOVERLAYFS) AC_SUBST(ENABLE_RUSAGE) AC_SUBST(ENABLE_QLMPI) diff --git a/executer/include/uti.h b/executer/include/uti.h new file mode 100644 index 00000000..e542082b --- /dev/null +++ b/executer/include/uti.h @@ -0,0 +1,28 @@ +#ifndef UTI_H_INCLUDED +#define UTI_H_INCLUDED + +struct syscall_struct { + int number; + unsigned long args[6]; + unsigned long ret; + unsigned long uti_clv; /* copy of a clv in McKernel */ +}; + +/* Variables accessed by mcexec.c and syscall_intercept.c */ +struct uti_desc { + void *wp; /* Syscall arguments list and record of McKernel context and Linux context */ + int mck_tid; /* TODO: Move this out for multiple migrated-to-Linux threads */ + unsigned long key; /* struct task_struct* of mcexec thread, used to search struct host_thread */ + int pid, tid; /* Used as the id of tracee when issuing MCEXEC_UP_TERMINATE_THREAD */ + unsigned long uti_clv; /* copy of McKernel clv */ + + int fd; /* /dev/mcosX */ + struct syscall_struct *syscall_param_top; /* stack-pointer of syscall arguments list */ + struct syscall_struct *syscall_param; /* TODO: make it auto variable */ + long syscalls[512], syscalls2[512]; /* Syscall profile counters */ + int start_syscall_intercept; /* 
Used to sync between mcexec.c and syscall_intercept.c */ +}; + + +#endif + diff --git a/executer/kernel/mcctrl/control.c b/executer/kernel/mcctrl/control.c index 815614e4..ac2710f4 100644 --- a/executer/kernel/mcctrl/control.c +++ b/executer/kernel/mcctrl/control.c @@ -305,6 +305,7 @@ struct mcos_handler_info; static LIST_HEAD(host_threads); /* Used for FS switch */ DEFINE_RWLOCK(host_thread_lock); +/* Info of Linux counterpart of migrated-to-Linux thread */ struct host_thread { struct list_head list; struct mcos_handler_info *handler; @@ -2474,7 +2475,7 @@ mcexec_util_thread2(ihk_os_t os, unsigned long arg, struct file *file) exiting release_handler() */ ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(current)); - + pr_ppd("get", task_pid_vnr(current), ppd); return 0; } @@ -2508,8 +2509,7 @@ mcexec_sig_thread(ihk_os_t os, unsigned long arg, struct file *file) return ret; } -static long -mcexec_terminate_thread_unsafe(ihk_os_t os, int pid, int tid, long sig, struct task_struct *tsk) +static long mcexec_terminate_thread_unsafe(ihk_os_t os, int pid, int tid, long sig, struct task_struct *tsk) { struct ikc_scd_packet *packet; struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os); @@ -2697,6 +2697,7 @@ long mcexec_syscall_thread(ihk_os_t os, unsigned long arg, struct file *file) (struct syscall_struct __user *)arg; long rc; + if (copy_from_user(&param, uparam, sizeof param)) { return -EFAULT; } diff --git a/executer/kernel/mcctrl/syscall.c b/executer/kernel/mcctrl/syscall.c index 5e2a64e9..21f1b549 100644 --- a/executer/kernel/mcctrl/syscall.c +++ b/executer/kernel/mcctrl/syscall.c @@ -299,10 +299,21 @@ static int __notify_syscall_requester(ihk_os_t os, struct ikc_scd_packet *packet IHK_SCD_REQ_THREAD_SPINNING, IHK_SCD_REQ_THREAD_TO_BE_WOKEN)) { dprintk("%s: no need to send IKC message for PID %d\n", - __FUNCTION__, packet->pid); + __FUNCTION__, packet->pid); return ret; } + /* Wait until the status goes back to IHK_SCD_REQ_THREAD_SPINNING or + 
IHK_SCD_REQ_THREAD_DESCHEDULED because two wake-up attempts are competing. + Note that mcexec_terminate_thread() and remote page fault and + returning EINTR would compete. */ + if (res->req_thread_status == IHK_SCD_REQ_THREAD_TO_BE_WOKEN) { + printk("%s: INFO: someone else is waking up the McKernel thread, " + "pid: %d, req status: %lu, syscall nr: %lu\n", + __FUNCTION__, packet->pid, + res->req_thread_status, packet->req.number); + } + /* The thread is not spinning any more, make sure it's descheduled */ if (!__sync_bool_compare_and_swap(&res->req_thread_status, IHK_SCD_REQ_THREAD_DESCHEDULED, @@ -522,6 +533,23 @@ out_put_ppd: return syscall_ret; } +#if 0 /* debug */ +/* Info of Linux counterpart of migrated-to-Linux thread */ +struct host_thread { + struct host_thread *next; + struct mcos_handler_info *handler; + int pid; + int tid; + unsigned long usp; + unsigned long lfs; + unsigned long rfs; + struct task_struct *task; +}; + +extern struct host_thread *host_threads; +extern rwlock_t host_thread_lock; +#endif + int remote_page_fault(struct mcctrl_usrdata *usrdata, void *fault_addr, uint64_t reason) { struct ikc_scd_packet *packet; @@ -791,6 +819,8 @@ static int rus_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) /* Look up per-process structure */ ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(current)); if (!ppd) { + kprintf("%s: INFO: no per-process structure for pid %d (tid %d), try to use pid %d\n", + __FUNCTION__, task_tgid_vnr(current), task_pid_vnr(current), vma->vm_mm->owner->pid); ppd = mcctrl_get_per_proc_data(usrdata, vma->vm_mm->owner->pid); } diff --git a/executer/user/Makefile.in b/executer/user/Makefile.in index 40851ab9..835349ab 100644 --- a/executer/user/Makefile.in +++ b/executer/user/Makefile.in @@ -13,9 +13,11 @@ KDIR ?= @KDIR@ ARCH=@ARCH@ CFLAGS=-Wall -O -I. 
-I$(VPATH)/arch/${ARCH} -I${IHKDIR} -I@abs_builddir@/../../../ihk/linux/include LDFLAGS=@LDFLAGS@ +CPPFLAGS_SYSCALL_INTERCEPT=@CPPFLAGS_SYSCALL_INTERCEPT@ +LDFLAGS_SYSCALL_INTERCEPT=@LDFLAGS_SYSCALL_INTERCEPT@ RPATH=$(shell echo $(LDFLAGS)|awk '{for(i=1;i<=NF;i++){if($$i~/^-L/){w=$$i;sub(/^-L/,"-Wl,-rpath,",w);print w}}}') VPATH=@abs_srcdir@ -TARGET=mcexec libsched_yield ldump2mcdump.so +TARGET=mcexec libsched_yield ldump2mcdump.so syscall_intercept.so @uncomment_if_ENABLE_MEMDUMP@TARGET+=eclair LIBS=@LIBS@ IHKDIR ?= $(VPATH)/../../../ihk/linux/include/ @@ -52,6 +54,12 @@ ldump2mcdump.so: ldump2mcdump.c libsched_yield: libsched_yield.c $(CC) -shared -fPIC -Wl,-soname,sched_yield.so.1 -o libsched_yield.so.1.0.0 $^ -lc -ldl +syscall_intercept.so: syscall_intercept.c libsyscall_intercept_arch.a + $(CC) $(CPPFLAGS_SYSCALL_INTERCEPT) -g -O2 $(LDFLAGS_SYSCALL_INTERCEPT) -lsyscall_intercept -fpic -shared -L. -lsyscall_intercept_arch $^ -o $@ + +libsyscall_intercept_arch.a:: + +(cd arch/${ARCH}; $(MAKE)) + libmcexec.a:: +(cd arch/${ARCH}; $(MAKE)) @@ -100,6 +108,7 @@ ifeq ($(ENABLE_QLMPI),yes) install -m 755 ql_mpiexec_finalize $(BINDIR) install -m 755 ql_talker $(SBINDIR) endif + install -m 755 syscall_intercept.so $(MCKERNEL_LIBDIR) @uncomment_if_ENABLE_MEMDUMP@install -m 755 eclair $(BINDIR) @uncomment_if_ENABLE_MEMDUMP@install -m 755 vmcore2mckdump $(BINDIR) diff --git a/executer/user/arch/x86_64/Makefile.in b/executer/user/arch/x86_64/Makefile.in index b913d94e..00da3154 100644 --- a/executer/user/arch/x86_64/Makefile.in +++ b/executer/user/arch/x86_64/Makefile.in @@ -4,7 +4,7 @@ BINDIR=@BINDIR@ KDIR ?= @KDIR@ CFLAGS=-Wall -O -I. 
VPATH=@abs_srcdir@ -TARGET=../../libmcexec.a +TARGET=../../libmcexec.a ../../libsyscall_intercept_arch.a LIBS=@LIBS@ all: $(TARGET) @@ -18,6 +18,12 @@ archdep.o: archdep.S arch_syscall.o: arch_syscall.c $(CC) -c -I${KDIR} $(CFLAGS) $(EXTRA_CFLAGS) -fPIE -pie -pthread $< +../../libsyscall_intercept_arch.a: archdep_c.o + $(AR) cr ../../libsyscall_intercept_arch.a archdep_c.o + +archdep_c.o: archdep_c.c + $(CC) -c -I${KDIR} $(CFLAGS) $(EXTRA_CFLAGS) -fPIE -pie -pthread $< + clean: $(RM) $(TARGET) *.o diff --git a/executer/user/arch/x86_64/archdep.S b/executer/user/arch/x86_64/archdep.S index c4da1ef6..89bd4021 100644 --- a/executer/user/arch/x86_64/archdep.S +++ b/executer/user/arch/x86_64/archdep.S @@ -146,4 +146,3 @@ compare_and_swap_int: lock cmpxchgl %edx,0(%rdi) retq - diff --git a/executer/user/arch/x86_64/archdep_c.c b/executer/user/arch/x86_64/archdep_c.c new file mode 100644 index 00000000..2d650f77 --- /dev/null +++ b/executer/user/arch/x86_64/archdep_c.c @@ -0,0 +1,52 @@ +/* +function call convention +rdi, rsi, rdx, rcx, r8, r9: IN arguments +rax: OUT return value + +syscall convention: +rax: IN syscall number +rdi, rsi, rdx, r10, r8, r9: IN arguments +rax: OUT return value +rcx, r11: CLOBBER +*/ +long uti_syscall6(long syscall_number, long arg0, long arg1, long arg2, long arg3, long arg4, long arg5) +{ + long ret; + asm volatile ("movq %[arg3],%%r10; movq %[arg4],%%r8; movq %[arg5],%%r9; syscall" + : "=a" (ret) + : "a" (syscall_number), + "D" (arg0), "S" (arg1), "d" (arg2), + [arg3] "g" (arg3), [arg4] "g" (arg4), [arg5] "g" (arg5) + : "rcx", "r11", "r10", "r8", "r9", "memory"); + return ret; +} + +long uti_syscall3(long syscall_number, long arg0, long arg1, long arg2) +{ + long ret; + asm volatile ("syscall" + : "=a" (ret) + : "a" (syscall_number), "D" (arg0), "S" (arg1), "d" (arg2) + : "rcx", "r11", "memory"); + return ret; +} + +long uti_syscall1(long syscall_number, long arg0) +{ + long ret; + asm volatile ("syscall" + : "=a" (ret) + : "a" 
(syscall_number), "D" (arg0) + : "rcx", "r11", "memory"); + return ret; +} + +long uti_syscall0(long syscall_number) +{ + long ret; + asm volatile ("syscall" + : "=a" (ret) + : "a" (syscall_number) + : "rcx", "r11", "memory"); + return ret; +} diff --git a/executer/user/archdep_uti.h b/executer/user/archdep_uti.h new file mode 100644 index 00000000..c3c33f31 --- /dev/null +++ b/executer/user/archdep_uti.h @@ -0,0 +1,5 @@ +extern long uti_syscall6(long syscall_number, long arg0, long arg1, long arg2, long arg3, long arg4, long arg5); +extern long uti_syscall3(long syscall_number, long arg0, long arg1, long arg2); +extern long uti_syscall1(long syscall_number, long arg0); +extern long uti_syscall0(long syscall_number); + diff --git a/executer/user/mcexec.c b/executer/user/mcexec.c index 8251cf96..738eab75 100644 --- a/executer/user/mcexec.c +++ b/executer/user/mcexec.c @@ -77,6 +77,7 @@ #endif /* !POSTK_DEBUG_ARCH_DEP_77 */ #include "../include/uprotocol.h" #include +#include "../include/uti.h" #include #include "archdep.h" #include "arch_args.h" @@ -187,14 +188,6 @@ struct sigfd { struct sigfd *sigfdtop; - -struct syscall_struct { - int number; - unsigned long args[6]; - unsigned long ret; - unsigned long uti_clv; /* copy of a clv in McKernel */ -}; - #ifdef NCCS #undef NCCS #endif @@ -235,6 +228,7 @@ static struct rlimit rlim_stack; static char *mpol_bind_nodes = NULL; static int uti_thread_rank = 0; static int uti_use_last_cpu = 0; +static int enable_uti = 0; /* Partitioned execution (e.g., for MPI) */ static int nr_processes = 0; @@ -1370,9 +1364,9 @@ static int reduce_stack(struct rlimit *orig_rlim, char *argv[]) void print_usage(char **argv) { #ifdef ADD_ENVS_OPTION - fprintf(stderr, "usage: %s [-c target_core] [-n nr_partitions] [<-e ENV_NAME=value>...] 
[--mpol-threshold=N] [--enable-straight-map] [--extend-heap-by=N] [-s (--stack-premap=)[premap_size][,max]] [--mpol-no-heap] [--mpol-no-bss] [--mpol-no-stack] [--mpol-shm-premap] [--disable-sched-yield] [--uti-thread-rank=N] [--uti-use-last-cpu] [] (program) [args...]\n", argv[0]); + fprintf(stderr, "usage: %s [-c target_core] [-n nr_partitions] [<-e ENV_NAME=value>...] [--mpol-threshold=N] [--enable-straight-map] [--extend-heap-by=N] [-s (--stack-premap=)[premap_size][,max]] [--mpol-no-heap] [--mpol-no-bss] [--mpol-no-stack] [--mpol-shm-premap] [--disable-sched-yield] [--enable-uti] [--uti-thread-rank=N] [--uti-use-last-cpu] [] (program) [args...]\n", argv[0]); #else /* ADD_ENVS_OPTION */ - fprintf(stderr, "usage: %s [-c target_core] [-n nr_partitions] [--mpol-threshold=N] [--enable-straight-map] [--extend-heap-by=N] [-s (--stack-premap=)[premap_size][,max]] [--mpol-no-heap] [--mpol-no-bss] [--mpol-no-stack] [--mpol-shm-premap] [--disable-sched-yield] [--uti-thread-rank=N] [--uti-use-last-cpu] [] (program) [args...]\n", argv[0]); + fprintf(stderr, "usage: %s [-c target_core] [-n nr_partitions] [--mpol-threshold=N] [--enable-straight-map] [--extend-heap-by=N] [-s (--stack-premap=)[premap_size][,max]] [--mpol-no-heap] [--mpol-no-bss] [--mpol-no-stack] [--mpol-shm-premap] [--disable-sched-yield] [--enable-uti] [--uti-thread-rank=N] [--uti-use-last-cpu] [] (program) [args...]\n", argv[0]); #endif /* ADD_ENVS_OPTION */ } @@ -1801,6 +1795,12 @@ static struct option mcexec_options[] = { .flag = &uti_use_last_cpu, .val = 1, }, + { + .name = "enable-uti", + .has_arg = no_argument, + .flag = &enable_uti, + .val = 1, + }, /* end */ { NULL, 0, NULL, 0, }, }; @@ -1892,7 +1892,7 @@ join_all_threads() do { live_thread = 0; for (tp = thread_data; tp; tp = tp->next) { - if (tp->joined || tp->detached) + if (tp->joined && tp->detached) continue; live_thread = 1; pthread_join(tp->thread_id, NULL); @@ -1933,57 +1933,69 @@ opendev() return fd; } +#define LD_PRELOAD_PREPARE(name) do { 
\ + sprintf(elembuf, "%s%s/" name, nelem > 0 ? ":" : "", MCKERNEL_LIBDIR); \ + } while (0) + +#define LD_PRELOAD_APPEND do { \ + if (strlen(elembuf) + 1 > remainder) { \ + fprintf(stderr, "%s: warning: LD_PRELOAD line is too long\n", __FUNCTION__); \ + return; \ + } \ + strncat(envbuf, elembuf, remainder); \ + remainder = PATH_MAX - (strlen(envbuf) + 1); \ + nelem++; \ + } while (0) + static void ld_preload_init() { char envbuf[PATH_MAX]; -#ifdef ENABLE_QLMPI - char *old_ld_preload; -#endif + char *ld_preload_str; + size_t remainder = PATH_MAX; + int nelem = 0; + char elembuf[PATH_MAX]; + + memset(envbuf, 0, PATH_MAX); + + if (enable_uti) { + LD_PRELOAD_PREPARE("syscall_intercept.so"); + LD_PRELOAD_APPEND; + } if (disable_sched_yield) { - sprintf(envbuf, "%s/libsched_yield.so.1.0.0", MCKERNEL_LIBDIR); - __dprintf("%s: preload library: %s\n", __FUNCTION__, envbuf); - if (setenv("LD_PRELOAD", envbuf, 1) < 0) { - printf("%s: warning: failed to set LD_PRELOAD for sched_yield\n", - __FUNCTION__); - } + LD_PRELOAD_PREPARE("libsched_yield.so.1.0.0"); + LD_PRELOAD_APPEND; } + +#ifdef ENABLE_QLMPI + LD_PRELOAD_PREPARE("libqlfort.so"); + LD_PRELOAD_APPEND; +#endif + /* Set LD_PRELOAD to McKernel specific value */ - else if (getenv(ld_preload_envname)) { - if (setenv("LD_PRELOAD", getenv(ld_preload_envname), 1) < 0) { + ld_preload_str = getenv(ld_preload_envname); + if (ld_preload_str) { + snprintf(elembuf, sizeof(elembuf), "%s%s", nelem > 0 ? 
":" : "", ld_preload_str); /* bounded: the value comes from the environment and may exceed PATH_MAX */ + LD_PRELOAD_APPEND; + } + + if (strlen(envbuf)) { + if (setenv("LD_PRELOAD", envbuf, 1) < 0) { printf("%s: warning: failed to set LD_PRELOAD environment variable\n", __FUNCTION__); } + __dprintf("%s: preload library: %s\n", __FUNCTION__, envbuf); + } + + if (getenv(ld_preload_envname)) { /* look up the name held in the variable, not the literal string "ld_preload_envname" */ unsetenv(ld_preload_envname); } - -#ifdef ENABLE_QLMPI - sprintf(envbuf, "%s/libqlfort.so", MCKERNEL_LIBDIR); - if ((old_ld_preload = getenv("LD_PRELOAD"))) { - sprintf(strchr(envbuf, '\0'), " %s", old_ld_preload); - } - setenv("LD_PRELOAD", envbuf, 1); -#endif } -struct uti_desc { - void *wp; - int mck_tid; - unsigned long key; - int pid, tid; /* Used as the id of tracee when issuing MCEXEC_UP_TERMINATE_THREAD */ - unsigned long uti_clv; - sem_t arg, attach; - int exit; /* Used to tell the tracer to exit */ -}; - -static int create_tracer(unsigned long user_start, unsigned long user_end); -int uti_pfd[2]; -struct uti_desc *uti_desc = (void*)-1; -static struct program_load_desc *desc; - int main(int argc, char **argv) { int ret = 0; + struct program_load_desc *desc; int envs_len; char *envs; char *args; @@ -2162,14 +2174,6 @@ int main(int argc, char **argv) if (opendev() == -1) exit(EXIT_FAILURE); -#if 0 - /* TODO: Remove this after memory corruption bug is fixed */ - if ((error = create_tracer(0, 0))) { - fprintf(stderr, "%s: create tracer returned %d\n", __FUNCTION__, error); - return error; - } -#endif - ld_preload_init(); #ifdef ADD_ENVS_OPTION @@ -2693,7 +2697,9 @@ int main(int argc, char **argv) return 1; } +#if 1 /* debug : thread killed by exit_group() are still joinable? 
*/ join_all_threads(); +#endif fn_fail: return ret; } @@ -2853,8 +2859,9 @@ out: return ret; } -static void -kill_thread(unsigned long tid, int sig) +static struct uti_desc *uti_desc; + +static void kill_thread(unsigned long tid, int sig) { struct thread_data_s *tp; @@ -2865,403 +2872,49 @@ kill_thread(unsigned long tid, int sig) if (tp->remote_tid == tid) { if (pthread_kill(tp->thread_id, sig) == ESRCH) { printf("%s: ERROR: Thread not found (tid=%ld,sig=%d)\n", __FUNCTION__, tid, sig); - } - } - } + } + } + } } -static int -samepage(void *a, void *b) -{ - unsigned long aa = (unsigned long)a; - unsigned long bb = (unsigned long)b; - -#ifdef POSTK_DEBUG_ARCH_DEP_35 - return (aa & page_mask) == (bb & page_mask); -#else /* POSTK_DEBUG_ARCH_DEP_35 */ - return (aa & PAGE_MASK) == (bb & PAGE_MASK); -#endif /* POSTK_DEBUG_ARCH_DEP_35 */ -} - -#ifdef DEBUG_UTI -long syscalls[512]; - -static void -debug_sig(int s) +static long util_thread(struct thread_data_s *my_thread, unsigned long uctx_pa, int remote_tid, unsigned long pattr, unsigned long uti_clv, unsigned long _uti_desc) { int i; - for (i = 0; i < 512; i++) - if (syscalls[i]) - fprintf(stderr, "syscall %d called %ld\n", i, - syscalls[i]); -} -#endif - -static int -create_tracer(unsigned long user_start, unsigned long user_end) -{ - int tpid; - int rc; - int st; - int sig = 0; - int i; - struct syscall_struct *param_top = NULL; - struct syscall_struct *param; - unsigned long code = 0; - int exited = 0; - int mode = 0; - unsigned long buf; - struct release_user_space_desc release_desc = { - .user_start = desc->user_start, - .user_end = desc->user_end - }; - - /* Perform mmap() before fork() in create_tracer() */ - uti_desc = mmap(NULL, sizeof(struct uti_desc), PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_ANONYMOUS, -1, 0); - if (uti_desc == (void *)-1) { - return -1; - } - memset(uti_desc, 0, sizeof(struct uti_desc)); - sem_init(&uti_desc->arg, 1, 0); - sem_init(&uti_desc->attach, 1, 0); - uti_desc->wp = mmap(NULL, 
PAGE_SIZE * 3, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_ANONYMOUS, -1, 0); - if (uti_desc->wp == (void *)-1) { - return -1; - } - - if (pipe(uti_pfd)) { - fprintf(stderr, "%s: pipe failed: %s\n", __FUNCTION__, strerror(errno)); - return -1; - } - - tpid = fork(); - if (tpid) { - if (tpid == -1) - return -1; - close(uti_pfd[1]); - while ((rc = waitpid(tpid, &st, 0)) == -1 && errno == EINTR); - if (rc == -1 || !WIFEXITED(st) || WEXITSTATUS(st)) { - fprintf(stderr, "waitpid rc=%d st=%08x\n", rc, st); - return -ENOMEM; - } -#if 0 - struct timeval tv; - fd_set rfd; - FD_ZERO(&rfd); - FD_SET(uti_pfd[0], &rfd); - tv.tv_sec = 1; - tv.tv_usec = 0; - while ((rc = select(uti_pfd[0] + 1, &rfd, NULL, NULL, &tv)) == -1 && - errno == EINTR); - if (rc == 0) { - fprintf(stderr, "%s: select timed out\n", __FUNCTION__); - close(uti_pfd[0]); - return -ETIMEDOUT; - } - if (rc == -1) { - fprintf(stderr, "%s: select errno=%d\n", __FUNCTION__, errno); - close(uti_pfd[0]); - return -errno; - } -#endif - return 0; - } - close(uti_pfd[0]); -#if 1 /* debug */ - if (ioctl(fd, MCEXEC_UP_RELEASE_USER_SPACE, &release_desc) != 0) { - fprintf(stderr, "%s: ERROR: MCEXEC_UP_RELEASE_USER_SPACE returned %d\n", __FUNCTION__, errno); - exit(1); - } -#endif - tpid = fork(); - if (tpid) { - if (tpid == -1) { - fprintf(stderr, "fork errno=%d\n", errno); - exit(1); - } - exit(0); - } - -#if 0 - /* Reopen device because one process must be managed by one opened-device */ - close(fd); - fd = opendev(); - if (fd < 0) { - fprintf(stderr, "%s: ERROR: opendev returned %d\n", __FUNCTION__, errno); - exit(1); - } - - if (ioctl(fd, MCEXEC_UP_CREATE_PPD) != 0) { - fprintf(stderr, "%s: ERROR: MCEXEC_UP_CREATE_PPD returned %d\n", __FUNCTION__, errno); - exit(1); - } -#endif - -#if 0 - if (ioctl(fd, MCEXEC_UP_RELEASE_USER_SPACE, &release_desc) != 0) { - fprintf(stderr, "%s: ERROR: MCEXEC_UP_RELEASE_USER_SPACE returned %d\n", __FUNCTION__, errno); - exit(1); - } -#endif - sem_wait(&uti_desc->arg); - if (uti_desc->exit) 
{ /* When uti is not used */ - exit(0); - } - - //close(uti_pfd[0]); - - if (ptrace(PTRACE_ATTACH, uti_desc->tid, 0, 0) == -1) { - fprintf(stderr, "PTRACE_ATTACH errno=%d\n", errno); - exit(1); - } - waitpid(-1, &st, __WALL); - if (ptrace(PTRACE_SETOPTIONS, uti_desc->tid, 0, PTRACE_O_TRACESYSGOOD) == -1) { - fprintf(stderr, "PTRACE_SETOPTIONS errno=%d\n", errno); - exit(1); - } - - /* Wake up tracee so that it can context-switch to McKernel code */ - rc = write(uti_pfd[1], &buf, sizeof(unsigned long)); - if (rc != sizeof(unsigned long)) { - fprintf(stderr, "%s: write returned %d\n", __FUNCTION__, rc); - exit(1); - } - close(uti_pfd[1]); - - for (i = 0; i < 4096; i++) - if (i != fd -#ifdef DEBUG_UTI - && i != 2 -#endif - ) - close(i); - open("/dev/null", O_RDONLY); - open("/dev/null", O_WRONLY); -#ifndef DEBUG_UTI - open("/dev/null", O_WRONLY); -#endif - - /* Initialize list of syscall arguments for syscall_intercept */ - if (sizeof(struct syscall_struct) * 11 > PAGE_SIZE) { - fprintf(stderr, "%s: ERROR: param is too large\n", __FUNCTION__); - exit(1); - } - for (i = 1; i <= 10; i++) { - param = (struct syscall_struct *)uti_desc->wp + i; - *(void **)param = param_top; - param_top = param; - } - memset(uti_desc->wp, '\0', sizeof(long)); - -#ifdef DEBUG_UTI - fprintf(stderr, "tracer PID=%d\n", getpid()); - signal(SIGINT, debug_sig); -#endif - for (;;) { - ptrace(PTRACE_SYSCALL, uti_desc->tid, 0, sig); - sig = 0; - waitpid(-1, &st, __WALL); - if (WIFEXITED(st) || WIFSIGNALED(st)) { - struct terminate_thread_desc term_desc; - - term_desc.pid = uti_desc->pid; - term_desc.tid = uti_desc->tid; - term_desc.tsk = uti_desc->key; - code = st; - - if (exited == 2) { /* exit_group */ - code |= 0x0000000100000000; - } - term_desc.sig = code; - - /* How return_syscall() is called depends on how utility thread exits: - exit: - create_tracer() - MCEXEC_UP_TERMINATE_THREAD - return_syscall() - exit_group: - create_tracer() - MCEXEC_UP_TERMINATE_THREAD - return_syscall() - killed by 
signal: - release_handler() - return_syscall() - */ - if (exited == 1 || exited == 2) { - __dprintf("calling MCEXEC_UP_TERMINATE_THREAD,exited=%d,code=%lx\n", exited, code); - if (ioctl(fd, MCEXEC_UP_TERMINATE_THREAD, &term_desc) != 0) { - fprintf(stderr, "%s: INFO: MCEXEC_UP_TERMINATE_THREAD returned %d\n", __FUNCTION__, errno); - } - } - __dprintf("%s: WIFEXITED=%d,WIFSIGNALED=%d,WTERMSIG=%d,exited=%d\n", __FUNCTION__, WIFEXITED(st), WIFSIGNALED(st), WTERMSIG(st), exited); -#if 0 - if (ptrace(PTRACE_DETACH, uti_desc->tid, 0, WIFSIGNALED(st) ? WTERMSIG(st) : 0) && errno != ESRCH) { - fprintf(stderr, "PTRACE_DETACH errno=%d\n", errno); - exit(1); - } -#endif - break; - } - if (!WIFSTOPPED(st)) { - continue; - } - if (WSTOPSIG(st) & 0x80) { // syscall - syscall_args args; - - get_syscall_args(uti_desc->tid, &args); - -#ifdef DEBUG_UTI - if (get_syscall_return(&args) == -ENOSYS) { - if (get_syscall_number(&args) >= 0 && - get_syscall_number(&args) < 512) { - syscalls[get_syscall_number(&args)]++; - } - } -#endif - - if (get_syscall_number(&args) == __NR_ioctl && - get_syscall_return(&args) == -ENOSYS && - get_syscall_arg1(&args) == fd && - get_syscall_arg2(&args) == MCEXEC_UP_SIG_THREAD) { - mode = get_syscall_arg3(&args); - } - - if (mode) { - continue; - } - - switch (get_syscall_number(&args)) { - case __NR_gettid: - set_syscall_number(&args, -1); - set_syscall_return(&args, uti_desc->mck_tid); - set_syscall_args(uti_desc->tid, &args); - continue; - case __NR_futex: - case __NR_brk: - case __NR_mmap: - case __NR_munmap: - case __NR_mprotect: - case __NR_mremap: - case __NR_msync: - break; - case __NR_exit_group: - exited++; - case __NR_exit: - exited++; - continue; - case __NR_clone: -#ifdef POSTK_DEBUG_ARCH_DEP_78 /* arch dep syscallno hide */ -#ifdef __NR_fork - case __NR_fork: -#endif -#ifdef __NR_vfork - case __NR_vfork: -#endif -#else /* POSTK_DEBUG_ARCH_DEP_78 */ - case __NR_fork: - case __NR_vfork: -#endif /* POSTK_DEBUG_ARCH_DEP_78 */ - case __NR_execve: - 
set_syscall_number(&args, -1); - set_syscall_args(uti_desc->tid, &args); - continue; -#if 1 /* debug */ - case __NR_set_robust_list: - set_syscall_number(&args, -1); - set_syscall_args(uti_desc->tid, &args); - continue; -#endif - case __NR_ioctl: - param = (struct syscall_struct *) - get_syscall_arg3(&args); - if (get_syscall_return(&args) != -ENOSYS && - get_syscall_arg1(&args) == fd && - get_syscall_arg2(&args) == - MCEXEC_UP_SYSCALL_THREAD && - samepage(uti_desc->wp, param)) { - set_syscall_arg1(&args, param->args[0]); - set_syscall_arg2(&args, param->args[1]); - set_syscall_arg3(&args, param->args[2]); - set_syscall_arg4(&args, param->args[3]); - set_syscall_arg5(&args, param->args[4]); - set_syscall_arg6(&args, param->args[5]); - set_syscall_return(&args, param->ret); - *(void **)param = param_top; - param_top = param; - set_syscall_args(uti_desc->tid, &args); - } - continue; - default: - continue; - } - param = param_top; - if (!param) { - set_syscall_number(&args, -1); - set_syscall_return(&args, -ENOMEM); - } - else { - param_top = *(void **)param; - param->number = get_syscall_number(&args); - param->args[0] = get_syscall_arg1(&args); - param->args[1] = get_syscall_arg2(&args); - param->args[2] = get_syscall_arg3(&args); - param->args[3] = get_syscall_arg4(&args); - param->args[4] = get_syscall_arg5(&args); - param->args[5] = get_syscall_arg6(&args); - param->uti_clv = uti_desc->uti_clv; - param->ret = -EINVAL; - set_syscall_number(&args, __NR_ioctl); - set_syscall_arg1(&args, fd); - set_syscall_arg2(&args, - MCEXEC_UP_SYSCALL_THREAD); - set_syscall_arg3(&args, (unsigned long)param); - } - set_syscall_args(uti_desc->tid, &args); - } - else { // signal - sig = WSTOPSIG(st) & 0x7f; - } - } - -#ifdef DEBUG_UTI - { - char *pmi_str = getenv("PMI_RANK"); - int pmi_rank = pmi_str ? 
atoi(pmi_str) : -1; - if (pmi_rank == 0 || pmi_rank == -1) { - fprintf(stderr, "offloaded thread called these syscalls\n"); - debug_sig(0); - } - } -#endif - - exit(0); -} - -static long -util_thread(struct thread_data_s *my_thread, unsigned long uctx_pa, int remote_tid, unsigned long pattr, unsigned long uti_clv) -{ void *lctx; void *rctx; void *param[6]; int rc = 0; - unsigned long buf; - -#if 1 - /* Create tracer */ - if ((rc = create_tracer(desc->user_start, desc->user_end))) { - fprintf(stderr, "%s: create_tracer returned %d\n", __FUNCTION__, rc); - goto out; + void *uti_wp = (void*)-1; + + uti_desc = (struct uti_desc *)_uti_desc; + if (!uti_desc) { + fprintf(stderr, "%s: ERROR: uti_desc isn't set. Use mcexec.sh instead of mcexec\n", __FUNCTION__); + exit(1); } -#endif + + /* Initialize uti related variables for syscall_intercept */ + + uti_wp = mmap(NULL, PAGE_SIZE * 3, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (uti_wp == (void *)-1) { + exit(1); + } + + uti_desc->fd = fd; + + rc = syscall(888); + if (rc != -1) { + fprintf(stderr, "%s: WARNING: syscall_intercept returned %x\n", __FUNCTION__, rc); + } + + /* Get the context of the thread migrating to Linux */ #ifdef POSTK_DEBUG_ARCH_DEP_35 - lctx = (char *)uti_desc->wp + page_size; + lctx = (char *)uti_wp + page_size; rctx = (char *)lctx + page_size; #else - lctx = (char *)uti_desc->wp + PAGE_SIZE; + lctx = (char *)uti_wp + PAGE_SIZE; rctx = (char *)lctx + PAGE_SIZE; -#endif /* POSTK_DEBUG_ARCH_DEP_35 */ +#endif /* POSTK_DEBUG_ARCH_DEP_35 */ param[0] = (void *)uctx_pa; param[1] = rctx; @@ -3277,39 +2930,28 @@ util_thread(struct thread_data_s *my_thread, unsigned long uctx_pa, int remote_t rc = -errno; goto out; } - create_worker_thread(NULL); - /* Pass info to the tracer so that it can masquerade as the tracee */ - uti_desc->wp = uti_desc->wp; + /* Record the info of the thread migrating to Linux */ + uti_desc->wp = uti_wp; uti_desc->mck_tid = remote_tid; - uti_desc->key = (unsigned 
long)param[3]; + uti_desc->key = (unsigned long)param[3]; /* key to find thread, i.e. struct task_struct * */ uti_desc->pid = getpid(); uti_desc->tid = gettid(); uti_desc->uti_clv = uti_clv; -#if 0 - //usleep(100000); - ssize_t nwritten; - char *cur; - for(cur = (char*)&uti_desc; (nwritten = write(uti_pfd[1], cur, sizeof(struct uti_desc) - (cur - (char*)&uti_desc))) > 0; cur += nwritten) { } - if (nwritten < 0) { - fprintf(stderr, "write returned %ld errno=%d\n", nwritten, errno); - rc = -errno; + /* Initialize list of syscall arguments for syscall_intercept */ + if (sizeof(struct syscall_struct) * 11 > PAGE_SIZE) { + fprintf(stderr, "%s: ERROR: param is too large\n", __FUNCTION__); + rc = -ENOMEM; goto out; } - close(uti_pfd[1]); -#endif - sem_post(&uti_desc->arg); - - /* Wait until tracer attaches me. We can't use - futex because it would be captured and redirected by tracer */ - rc = read(uti_pfd[0], &buf, sizeof(unsigned long)); - if (rc != sizeof(unsigned long)) { - fprintf(stderr, "%s: write returned %d\n", __FUNCTION__, rc); - exit(1); - } - close(uti_pfd[0]); + for (i = 1; i <= 10; i++) { + uti_desc->syscall_param = (struct syscall_struct *)uti_desc->wp + i; + *(void **)uti_desc->syscall_param = uti_desc->syscall_param_top; + uti_desc->syscall_param_top = uti_desc->syscall_param; + } + memset(uti_desc->wp, '\0', sizeof(long)); if (pattr) { struct uti_attr_desc desc; @@ -3325,9 +2967,12 @@ util_thread(struct thread_data_s *my_thread, unsigned long uctx_pa, int remote_t } my_thread->detached = 1; + /* Start intercepting syscalls. Note that it dereferences pointers in uti_desc. 
*/ + uti_desc->start_syscall_intercept = 1; + if ((rc = switch_ctx(fd, MCEXEC_UP_UTIL_THREAD2, param, lctx, rctx)) < 0) { - fprintf(stderr, "util_thread2: %d\n", rc); + fprintf(stderr, "%s: ERROR switch_ctx returned %d\n", __FUNCTION__, rc); } fprintf(stderr, "return from util_thread2 rc=%d\n", rc); pthread_exit(NULL); @@ -3535,7 +3180,6 @@ int main_loop(struct thread_data_s *my_thread) char pathbuf[PATH_MAX]; char tmpbuf[PATH_MAX]; int cpu = my_thread->cpu; - int sem_val; memset(&w, '\0', sizeof w); w.cpu = cpu; @@ -3612,7 +3256,7 @@ int main_loop(struct thread_data_s *my_thread) } else { } - __dprintf("openat: %s\n", pathbuf); + __dprintf("openat: %s,tid=%d\n", pathbuf, my_thread->remote_tid); fn = chgpath(pathbuf, tmpbuf); @@ -3666,6 +3310,7 @@ int main_loop(struct thread_data_s *my_thread) __dprintf("Exit status: %d\n", term); } } + } #ifdef USE_SYSCALL_MOD_CALL @@ -3685,18 +3330,7 @@ int main_loop(struct thread_data_s *my_thread) pause(); } - /* Make tracer exit when it is not used */ - if (uti_desc != (void*)-1) { - if (sem_getvalue(&uti_desc->arg, &sem_val)) { - fprintf(stderr, "%s: ERROR: sem_getvalue returned %d\n", __FUNCTION__, errno); - } - if (sem_val == 0) { - uti_desc->exit = 1; - sem_post(&uti_desc->arg); - } - } - - exit(term); + exit(term); /* Call release_handler() and proceed terminate() */ //pthread_mutex_unlock(lock); return w.sr.args[0]; @@ -3834,15 +3468,6 @@ gettid_out: goto fork_child_sync_pipe; } -#if 1 - /* Create tracer */ - if ((ret = create_tracer(desc->user_start, desc->user_end))) { - fs->status = ret; - fprintf(stderr, "%s: create tracer returned %d\n", __FUNCTION__, ret); - goto fork_child_sync_pipe; - } -#endif - if (ioctl(fd, MCEXEC_UP_CREATE_PPD) != 0) { fs->status = -errno; fprintf(stderr, "ERROR: creating PPD %s\n", dev); @@ -3912,8 +3537,9 @@ fork_child_sync_pipe: } munmap(fs, sizeof(struct fork_sync)); +#if 1 /* debug : thread killed by exit_group() are still joinable? 
*/ join_all_threads(); - +#endif return ret; } @@ -4430,7 +4056,7 @@ return_execve2: case __NR_sched_setaffinity: if (w.sr.args[0] == 0) { ret = util_thread(my_thread, w.sr.args[1], w.sr.rtid, - w.sr.args[2], w.sr.args[3]); + w.sr.args[2], w.sr.args[3], w.sr.args[4]); } else { ret = munmap((void *)w.sr.args[1], diff --git a/executer/user/syscall_intercept.c b/executer/user/syscall_intercept.c new file mode 100644 index 00000000..e97cdf6e --- /dev/null +++ b/executer/user/syscall_intercept.c @@ -0,0 +1,128 @@ +#include +#include +#include +#include +#include +#include +#include +#include "../include/uprotocol.h" +#include "../include/uti.h" +#include "./archdep_uti.h" + +static struct uti_desc uti_desc; + +#define DEBUG_UTI + +static int +hook(long syscall_number, + long arg0, long arg1, + long arg2, long arg3, + long arg4, long arg5, + long *result) +{ + //return 1; /* debug */ + int tid = uti_syscall0(__NR_gettid); + struct terminate_thread_desc term_desc; + unsigned long sig; + + if (!uti_desc.start_syscall_intercept) { + return 1; /* System call isn't taken over */ + } + if (tid != uti_desc.mck_tid) { + if (uti_desc.syscalls2 && syscall_number >= 0 && syscall_number < 512) { + uti_desc.syscalls2[syscall_number]++; + } + return 1; + } +#ifdef DEBUG_UTI + if (uti_desc.syscalls && syscall_number >= 0 && syscall_number < 512) { + uti_desc.syscalls[syscall_number]++; + } +#endif + + switch (syscall_number) { + case __NR_gettid: + *result = uti_desc.mck_tid; + return 0; + case __NR_futex: + case __NR_brk: + case __NR_mmap: + case __NR_munmap: + case __NR_mprotect: + case __NR_mremap: + if (!uti_desc.syscall_param_top) { + *result = -ENOMEM; + return 0; + } + else { + /* Pop syscall_struct list for reentrant safety */ + uti_desc.syscall_param = uti_desc.syscall_param_top; + uti_desc.syscall_param_top = *(void **)uti_desc.syscall_param; + + uti_desc.syscall_param->number = syscall_number; + uti_desc.syscall_param->args[0] = arg0; + uti_desc.syscall_param->args[1] = 
arg1; + uti_desc.syscall_param->args[2] = arg2; + uti_desc.syscall_param->args[3] = arg3; + uti_desc.syscall_param->args[4] = arg4; + uti_desc.syscall_param->args[5] = arg5; + uti_desc.syscall_param->uti_clv = uti_desc.uti_clv; + uti_desc.syscall_param->ret = -EINVAL; + uti_syscall3(__NR_ioctl, uti_desc.fd, MCEXEC_UP_SYSCALL_THREAD, (long)uti_desc.syscall_param); + *result = uti_desc.syscall_param->ret; + + /* push syscall_struct list */ + *(void **)uti_desc.syscall_param = uti_desc.syscall_param_top; + uti_desc.syscall_param_top = uti_desc.syscall_param; + + return 0; /* System call is taken over */ + } + break; + case __NR_exit_group: + sig = 0x100000000; + goto make_remote_thread_exit; + case __NR_exit: + sig = 0; + make_remote_thread_exit: + /* Make migrated-to-Linux thread on the McKernel side call do_exit() or terminate() */ + term_desc.pid = uti_desc.pid; + term_desc.tid = uti_desc.tid; /* tid of mcexec */ + term_desc.sig = sig | (arg0 << 8); + term_desc.tsk = uti_desc.key; + + uti_syscall3(__NR_ioctl, uti_desc.fd, MCEXEC_UP_TERMINATE_THREAD, (long)&term_desc); + return 1; + case __NR_clone: + case __NR_fork: + case __NR_vfork: + case __NR_execve: + *result = -ENOSYS; + return 0; +#if 0 /* debug */ + case __NR_set_robust_list: + *result = -ENOSYS; + return 0; +#endif + case 888: + *result = (long)&uti_desc; + return 0; + default: + return 1; + } + + return 0; +} + +static __attribute__((constructor)) void +init(void) +{ + // Set up the callback function + intercept_hook_point = hook; + + uti_syscall1(733, (unsigned long)&uti_desc); +} + +static __attribute__((destructor)) void +dtor(void) +{ +} diff --git a/kernel/syscall.c b/kernel/syscall.c index b5589c4c..c01f6222 100644 --- a/kernel/syscall.c +++ b/kernel/syscall.c @@ -60,6 +60,7 @@ #include #include #include +#include "../executer/include/uti.h" /* Headers taken from kitten LWK */ #include @@ -74,7 +75,7 @@ #define DDEBUG_DEFAULT DDEBUG_PRINT #endif -#define DEBUG_UTI +//#define DEBUG_UTI #ifdef 
DEBUG_UTI #define uti_dkprintf(...) do { ((uti_clv && linux_printk) ? (*linux_printk) : kprintf)(__VA_ARGS__); } while (0) #else @@ -112,6 +113,7 @@ char *syscall_name[] MCKERNEL_UNUSED = { }; static ihk_spinlock_t tod_data_lock = SPIN_LOCK_UNLOCKED; +static unsigned long uti_desc; /* Address of struct uti_desc object in syscall_intercept.c */ static void calculate_time_from_tsc(struct timespec *ts); void check_signal(unsigned long, void *, int); @@ -3131,6 +3133,8 @@ SYSCALL_DECLARE(setpgid) return rc; } +/* Ignore the registration by start_thread() (in pthread_create.c) + because McKernel doesn't unlock mutex-es held by the thread which has been killed. */ SYSCALL_DECLARE(set_robust_list) { // Palliative fix. wait for impl. @@ -9085,6 +9089,29 @@ SYSCALL_DECLARE(pmc_reset) extern void save_uctx(void *, void *); +/* TODO: use copy_from_user() */ +int util_show_syscall_profile() +{ + int i; + struct uti_desc *desc = (struct uti_desc *)uti_desc; + + kprintf("Syscall stats for offloaded thread:\n"); + for (i = 0; i < 512; i++) { + if (desc->syscalls[i]) { + kprintf("nr=%d #called=%ld\n", i, desc->syscalls[i]); + } + } + + kprintf("Syscall stats for other threads:\n"); + for (i = 0; i < 512; i++) { + if (desc->syscalls2[i]) { + kprintf("nr=%d #called=%ld\n", i, desc->syscalls2[i]); + } + } + + return 0; +} + int util_thread(struct uti_attr *arg) { @@ -9127,8 +9154,13 @@ util_thread(struct uti_attr *arg) request.args[2] = virt_to_phys(&kattr); } request.args[3] = (unsigned long)uti_clv; + request.args[4] = uti_desc; thread->thread_offloaded = 1; rc = do_syscall(&request, ihk_mc_get_processor_id(), 0); + dkprintf("%s: returned from do_syscall,tid=%d,rc=%lx\n", __FUNCTION__, thread->tid, rc); + + util_show_syscall_profile(); + thread->thread_offloaded = 0; free_address = context[0]; free_size = context[1]; @@ -9141,8 +9173,8 @@ util_thread(struct uti_attr *arg) thread->proc->nohost = 1; terminate((rc >> 8) & 255, rc & 255); } else { - /* tracer has detected exit or 
killed by signal */ - dkprintf("%s: exit, pid=%d,tid=%d,rc=%lx\n", __FUNCTION__, thread->proc->pid, thread->tid, rc); + /* exit or killed-by-signal detected */ + dkprintf("%s: exit or killed by signal, pid=%d,tid=%d,rc=%lx\n", __FUNCTION__, thread->proc->pid, thread->tid, rc); request.number = __NR_sched_setaffinity; request.args[0] = 1; request.args[1] = free_address; @@ -9316,6 +9348,14 @@ SYSCALL_DECLARE(resume_threads) return 0; } +SYSCALL_DECLARE(util_register_desc) +{ + struct thread *thread = cpu_local_var(current); + uti_desc = ihk_mc_syscall_arg0(ctx); + dkprintf("%s: tid=%d,uti_desc=%lx\n", __FUNCTION__, thread->tid, uti_desc); + return 0; +} + void reset_cputime() { @@ -9518,7 +9558,7 @@ long syscall(int num, ihk_mc_user_context_t *ctx) && (syscall_table[num] != NULL)) { l = syscall_table[num](num, ctx); - dkprintf("SC(%d)[%3d] ret: %d\n", + dkprintf("SC(%d)[%3d] ret: %lx\n", ihk_mc_get_processor_id(), num, l); } else { dkprintf("USC[%3d](%lx, %lx, %lx, %lx, %lx) @ %lx | %lx\n", num,