From 07efb3ab9a08e704741db687badcb22064116ee7 Mon Sep 17 00:00:00 2001 From: Tomoki Shirasawa Date: Tue, 27 Jun 2017 13:27:09 +0900 Subject: [PATCH] support to utility thread offloading --- arch/x86/kernel/include/syscall_list.h | 3 + arch/x86/kernel/interrupt.S | 1 + arch/x86/kernel/syscall.c | 69 +- configure | 3 +- configure.ac | 1 + executer/include/uprotocol.h | 10 + executer/kernel/mcctrl/arch/x86_64/archdeps.c | 62 ++ executer/kernel/mcctrl/control.c | 653 ++++++++++++++++- executer/kernel/mcctrl/driver.c | 8 + executer/kernel/mcctrl/syscall.c | 180 ++++- executer/user/Makefile.in | 19 +- executer/user/arch/x86_64/arch_args.h | 113 +++ executer/user/arch/x86_64/archdep.S | 149 ++++ executer/user/archdep.h | 3 + executer/user/mcexec.c | 658 +++++++++++++++--- kernel/include/process.h | 9 + kernel/include/syscall.h | 30 + kernel/process.c | 9 +- kernel/syscall.c | 231 +++++- 19 files changed, 2077 insertions(+), 134 deletions(-) create mode 100644 executer/user/arch/x86_64/arch_args.h create mode 100644 executer/user/arch/x86_64/archdep.S create mode 100644 executer/user/archdep.h diff --git a/arch/x86/kernel/include/syscall_list.h b/arch/x86/kernel/include/syscall_list.h index e868ee10..19affc00 100644 --- a/arch/x86/kernel/include/syscall_list.h +++ b/arch/x86/kernel/include/syscall_list.h @@ -153,5 +153,8 @@ SYSCALL_HANDLED(700, get_cpu_id) #ifdef PROFILE_ENABLE SYSCALL_HANDLED(__NR_profile, profile) #endif // PROFILE_ENABLE +SYSCALL_HANDLED(730, util_migrate_inter_kernel) +SYSCALL_HANDLED(731, util_indicate_clone) +SYSCALL_HANDLED(732, get_system) /**** End of File ****/ diff --git a/arch/x86/kernel/interrupt.S b/arch/x86/kernel/interrupt.S index d6765e80..60f280e1 100644 --- a/arch/x86/kernel/interrupt.S +++ b/arch/x86/kernel/interrupt.S @@ -239,6 +239,7 @@ enter_user_mode: movq $0, %rdi movq %rsp, %rsi call check_signal + call utilthr_migrate movq $0, %rdi call set_cputime POP_ALL_REGS diff --git a/arch/x86/kernel/syscall.c b/arch/x86/kernel/syscall.c 
index e4a42647..2260b665 100644 --- a/arch/x86/kernel/syscall.c +++ b/arch/x86/kernel/syscall.c @@ -259,7 +259,7 @@ SYSCALL_DECLARE(rt_sigreturn) extern struct cpu_local_var *clv; extern unsigned long do_kill(struct thread *thread, int pid, int tid, int sig, struct siginfo *info, int ptracecont); -extern void interrupt_syscall(int pid, int tid); +extern void interrupt_syscall(struct thread *, int sig); extern int num_processors; #define RFLAGS_MASK (RFLAGS_CF | RFLAGS_PF | RFLAGS_AF | RFLAGS_ZF | \ @@ -1230,6 +1230,12 @@ done: return 0; } + if (tthread->thread_offloaded) { + interrupt_syscall(tthread, sig); + release_thread(tthread); + return 0; + } + doint = 0; mcs_rwlock_writer_lock_noirq(savelock, &mcs_rw_node); @@ -1275,8 +1281,6 @@ done: cpu_restore_interrupt(irqstate); if (doint && !(mask & tthread->sigmask.__val[0])) { - int tid = tthread->tid; - int pid = tproc->pid; int status = tthread->status; if (thread != tthread) { @@ -1286,7 +1290,7 @@ done: } if(!tthread->proc->nohost) - interrupt_syscall(pid, tid); + interrupt_syscall(tthread, 0); if (status != PS_RUNNING) { if(sig == SIGKILL){ @@ -1826,4 +1830,61 @@ out: return error; } /* arch_map_vdso() */ +void +save_uctx(void *uctx, struct x86_user_context *regs) +{ + struct trans_uctx { + volatile int cond; + int fregsize; + + unsigned long rax; + unsigned long rbx; + unsigned long rcx; + unsigned long rdx; + unsigned long rsi; + unsigned long rdi; + unsigned long rbp; + unsigned long r8; + unsigned long r9; + unsigned long r10; + unsigned long r11; + unsigned long r12; + unsigned long r13; + unsigned long r14; + unsigned long r15; + unsigned long rflags; + unsigned long rip; + unsigned long rsp; + unsigned long fs; + } *ctx = uctx; + + if (!regs) { + asm ("movq %%gs:(%1),%0" : "=r"(regs) : + "r"(offsetof(struct x86_cpu_local_variables, tss.rsp0))); + regs--; + } + + ctx->cond = 0; + ctx->rax = regs->gpr.rax; + ctx->rbx = regs->gpr.rbx; + ctx->rcx = regs->gpr.rcx; + ctx->rdx = regs->gpr.rdx; + ctx->rsi = 
regs->gpr.rsi; + ctx->rdi = regs->gpr.rdi; + ctx->rbp = regs->gpr.rbp; + ctx->r8 = regs->gpr.r8; + ctx->r9 = regs->gpr.r9; + ctx->r10 = regs->gpr.r10; + ctx->r11 = regs->gpr.r11; + ctx->r12 = regs->gpr.r12; + ctx->r13 = regs->gpr.r13; + ctx->r14 = regs->gpr.r14; + ctx->r15 = regs->gpr.r15; + ctx->rflags = regs->gpr.rflags; + ctx->rsp = regs->gpr.rsp; + ctx->rip = regs->gpr.rip; + ihk_mc_arch_get_special_register(IHK_ASR_X86_FS, &ctx->fs); + ctx->fregsize = 0; +} + /*** End of File ***/ diff --git a/configure b/configure index 114db8b8..d66ff111 100755 --- a/configure +++ b/configure @@ -4583,7 +4583,7 @@ fi ac_config_headers="$ac_config_headers config.h" -ac_config_files="$ac_config_files Makefile executer/user/Makefile executer/kernel/mcctrl/Makefile executer/kernel/mcctrl/arch/x86_64/Makefile executer/kernel/mcoverlayfs/Makefile executer/kernel/mcoverlayfs/linux-3.10.0-327.36.1.el7/Makefile executer/kernel/mcoverlayfs/linux-4.0.9/Makefile executer/kernel/mcoverlayfs/linux-4.6.7/Makefile kernel/Makefile kernel/Makefile.build arch/x86/tools/mcreboot-attached-mic.sh arch/x86/tools/mcshutdown-attached-mic.sh arch/x86/tools/mcreboot-builtin-x86.sh arch/x86/tools/mcreboot-smp-x86.sh arch/x86/tools/mcstop+release-smp-x86.sh arch/x86/tools/eclair-dump-backtrace.exp arch/x86/tools/mcshutdown-builtin-x86.sh arch/x86/tools/mcreboot.1:arch/x86/tools/mcreboot.1in arch/x86/tools/irqbalance_mck.service arch/x86/tools/irqbalance_mck.in" +ac_config_files="$ac_config_files Makefile executer/user/Makefile executer/user/arch/x86_64/Makefile executer/kernel/mcctrl/Makefile executer/kernel/mcctrl/arch/x86_64/Makefile executer/kernel/mcoverlayfs/Makefile executer/kernel/mcoverlayfs/linux-3.10.0-327.36.1.el7/Makefile executer/kernel/mcoverlayfs/linux-4.0.9/Makefile executer/kernel/mcoverlayfs/linux-4.6.7/Makefile kernel/Makefile kernel/Makefile.build arch/x86/tools/mcreboot-attached-mic.sh arch/x86/tools/mcshutdown-attached-mic.sh arch/x86/tools/mcreboot-builtin-x86.sh 
arch/x86/tools/mcreboot-smp-x86.sh arch/x86/tools/mcstop+release-smp-x86.sh arch/x86/tools/eclair-dump-backtrace.exp arch/x86/tools/mcshutdown-builtin-x86.sh arch/x86/tools/mcreboot.1:arch/x86/tools/mcreboot.1in arch/x86/tools/irqbalance_mck.service arch/x86/tools/irqbalance_mck.in" if test "x$enable_dcfa" = xyes; then : @@ -5284,6 +5284,7 @@ do "config.h") CONFIG_HEADERS="$CONFIG_HEADERS config.h" ;; "Makefile") CONFIG_FILES="$CONFIG_FILES Makefile" ;; "executer/user/Makefile") CONFIG_FILES="$CONFIG_FILES executer/user/Makefile" ;; + "executer/user/arch/x86_64/Makefile") CONFIG_FILES="$CONFIG_FILES executer/user/arch/x86_64/Makefile" ;; "executer/kernel/mcctrl/Makefile") CONFIG_FILES="$CONFIG_FILES executer/kernel/mcctrl/Makefile" ;; "executer/kernel/mcctrl/arch/x86_64/Makefile") CONFIG_FILES="$CONFIG_FILES executer/kernel/mcctrl/arch/x86_64/Makefile" ;; "executer/kernel/mcoverlayfs/Makefile") CONFIG_FILES="$CONFIG_FILES executer/kernel/mcoverlayfs/Makefile" ;; diff --git a/configure.ac b/configure.ac index 842a371d..dde5f50b 100644 --- a/configure.ac +++ b/configure.ac @@ -356,6 +356,7 @@ AC_CONFIG_HEADERS([config.h]) AC_CONFIG_FILES([ Makefile executer/user/Makefile + executer/user/arch/x86_64/Makefile executer/kernel/mcctrl/Makefile executer/kernel/mcctrl/arch/x86_64/Makefile executer/kernel/mcoverlayfs/Makefile diff --git a/executer/include/uprotocol.h b/executer/include/uprotocol.h index 382e1220..b88e4750 100644 --- a/executer/include/uprotocol.h +++ b/executer/include/uprotocol.h @@ -55,6 +55,16 @@ #define MCEXEC_UP_SYS_UMOUNT 0x30a02915 #define MCEXEC_UP_SYS_UNSHARE 0x30a02916 +#define MCEXEC_UP_UTIL_THREAD1 0x30a02920 +#define MCEXEC_UP_UTIL_THREAD2 0x30a02921 +#define MCEXEC_UP_SIG_THREAD 0x30a02922 +#define MCEXEC_UP_SWITCH_THREAD 0x30a02923 +#define MCEXEC_UP_SYSCALL_THREAD 0x30a02924 +#define MCEXEC_UP_TERMINATE_THREAD 0x30a02925 + +#define MCEXEC_UP_COPY_FROM_MCK 0x30a03000 +#define MCEXEC_UP_COPY_TO_MCK 0x30a03001 + #define MCEXEC_UP_DEBUG_LOG 
0x40000000 #define MCEXEC_UP_TRANSFER_TO_REMOTE 0 diff --git a/executer/kernel/mcctrl/arch/x86_64/archdeps.c b/executer/kernel/mcctrl/arch/x86_64/archdeps.c index 81090411..caec9c15 100644 --- a/executer/kernel/mcctrl/arch/x86_64/archdeps.c +++ b/executer/kernel/mcctrl/arch/x86_64/archdeps.c @@ -196,3 +196,65 @@ out: ihk_device_unmap_memory(dev, vdso_pa, sizeof(*vdso)); return; } /* get_vdso_info() */ + +void * +get_user_sp(void) +{ + unsigned long usp; + + asm volatile("movq %%gs:0xaf80, %0" : "=r" (usp)); + return (void *)usp; +} + +void +set_user_sp(void *usp) +{ + asm volatile("movq %0, %%gs:0xaf80" :: "r" (usp)); +} + +struct trans_uctx { + volatile int cond; + int fregsize; + + unsigned long rax; + unsigned long rbx; + unsigned long rcx; + unsigned long rdx; + unsigned long rsi; + unsigned long rdi; + unsigned long rbp; + unsigned long r8; + unsigned long r9; + unsigned long r10; + unsigned long r11; + unsigned long r12; + unsigned long r13; + unsigned long r14; + unsigned long r15; + unsigned long rflags; + unsigned long rip; + unsigned long rsp; + unsigned long fs; +}; + +void +restore_fs(unsigned long fs) +{ + wrmsrl(MSR_FS_BASE, fs); +} + +void +save_fs_ctx(void *ctx) +{ + struct trans_uctx *tctx = ctx; + + rdmsrl(MSR_FS_BASE, tctx->fs); +} + +unsigned long +get_fs_ctx(void *ctx) +{ + struct trans_uctx *tctx = ctx; + + return tctx->fs; +} diff --git a/executer/kernel/mcctrl/control.c b/executer/kernel/mcctrl/control.c index 1132f1f2..569f1d62 100644 --- a/executer/kernel/mcctrl/control.c +++ b/executer/kernel/mcctrl/control.c @@ -38,6 +38,9 @@ #include #include #include +#include +#include +#include #include "../../../config.h" #include "mcctrl.h" #include @@ -85,6 +88,10 @@ int (*mcctrl_sys_umount)(char *dir_name, int flags) = sys_umount; //extern struct mcctrl_channel *channels; int mcctrl_ikc_set_recv_cpu(ihk_os_t os, int cpu); +int syscall_backward(struct mcctrl_usrdata *, int, unsigned long, unsigned long, + unsigned long, unsigned long, unsigned 
long,
+		unsigned long, unsigned long *);
+long mcexec_switch_thread(ihk_os_t os, unsigned long code, struct file *file);
 
 static long mcexec_prepare_image(ihk_os_t os,
                                  struct program_load_desc * __user udesc)
@@ -305,13 +312,44 @@ int mcexec_transfer_image(ihk_os_t os, struct remote_transfer *__user upt)
 #endif
 }
 
-//extern unsigned long last_thread_exec;
-
-struct release_handler_info {
+struct mcos_handler_info {
 	int pid;
 	int cpu;
+	struct mcctrl_usrdata *ud;
+	struct file *file;
 };
 
+struct mcos_handler_info;
+static struct host_thread *host_threads;
+DEFINE_RWLOCK(host_thread_lock);
+
+struct host_thread {
+	struct host_thread *next;
+	struct mcos_handler_info *handler;
+	int pid;
+	int tid;
+	unsigned long usp;
+	unsigned long lfs;
+	unsigned long rfs;
+};
+
+/*
+ * Allocate a zero-initialized mcos_handler_info bound to @os and @file.
+ * Returns NULL when the allocation fails (callers must check); pid and
+ * cpu are left at zero for the caller to fill in.
+ */
+struct mcos_handler_info *new_mcos_handler_info(ihk_os_t os, struct file *file)
+{
+	struct mcos_handler_info *info;
+
+	info = kzalloc(sizeof(*info), GFP_KERNEL);
+	if (!info)
+		return NULL;
+	info->ud = ihk_host_os_get_usrdata(os);
+	info->file = file;
+	return info;
+}
+
 static long mcexec_debug_log(ihk_os_t os, unsigned long arg)
 {
 	struct ikc_scd_packet isp;
@@ -326,11 +358,43 @@ static long mcexec_debug_log(ihk_os_t os, unsigned long arg)
 int mcexec_close_exec(ihk_os_t os);
 int mcexec_destroy_per_process_data(ihk_os_t os);
 
+#if 0
+static unsigned long mod_sys_call_table(int num, unsigned long func)
+{
+	static unsigned long *sys_call_table = NULL;
+	unsigned long oldval;
+
+	if (!sys_call_table) {
+		sys_call_table =
+			(unsigned long *)kallsyms_lookup_name("sys_call_table");
+		if (!sys_call_table) {
+			printk("sys_call_table not found\n");
+			return -ENOENT;
+		}
+	}
+	oldval = sys_call_table[num];
+	if (func && sys_call_table[num] != func) {
+		sys_call_table[num] = func;
+	}
+	return oldval;
+}
+#endif
+
 static void release_handler(ihk_os_t os, void *param)
 {
-	struct release_handler_info *info = param;
+	struct mcos_handler_info *info = param;
 	struct
ikc_scd_packet isp; int os_ind = ihk_host_os_get_index(os); + unsigned long flags; + struct host_thread *thread; + + write_lock_irqsave(&host_thread_lock, flags); + for (thread = host_threads; thread; thread = thread->next) { + if (thread->handler == info) { + thread->handler = NULL; + } + } + write_unlock_irqrestore(&host_thread_lock, flags); mcexec_close_exec(os); @@ -356,14 +420,15 @@ static long mcexec_newprocess(ihk_os_t os, struct file *file) { struct newprocess_desc desc; - struct release_handler_info *info; + struct mcos_handler_info *info; if (copy_from_user(&desc, udesc, sizeof(struct newprocess_desc))) { return -EFAULT; } - info = kmalloc(sizeof(struct release_handler_info), GFP_KERNEL); + info = new_mcos_handler_info(os, file); info->pid = desc.pid; ihk_os_register_release_handler(file, release_handler, info); + ihk_os_set_mcos_private_data(file, info); return 0; } @@ -375,7 +440,7 @@ static long mcexec_start_image(ihk_os_t os, struct ikc_scd_packet isp; struct mcctrl_channel *c; struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os); - struct release_handler_info *info; + struct mcos_handler_info *info; desc = kmalloc(sizeof(*desc), GFP_KERNEL); if (!desc) { @@ -390,10 +455,11 @@ static long mcexec_start_image(ihk_os_t os, return -EFAULT; } - info = kmalloc(sizeof(struct release_handler_info), GFP_KERNEL); + info = new_mcos_handler_info(os, file); info->pid = desc->pid; info->cpu = desc->cpu; ihk_os_register_release_handler(file, release_handler, info); + ihk_os_set_mcos_private_data(file, info); c = usrdata->channels + desc->cpu; @@ -937,7 +1003,6 @@ void mcctrl_put_per_proc_data(struct mcctrl_per_proc_data *ppd) for (i = 0; i < MCCTRL_PER_THREAD_DATA_HASH_SIZE; i++) { struct mcctrl_per_thread_data *ptd; struct mcctrl_per_thread_data *next; - struct ikc_scd_packet *packet; list_for_each_entry_safe(ptd, next, ppd->per_thread_data_hash + i, hash) { @@ -1207,6 +1272,7 @@ retry_alloc: ret = -EINVAL;; goto put_ppd_out; } + req->cpu = packet->ref; ret 
= 0; goto put_ppd_out; @@ -2015,6 +2081,551 @@ void mcctrl_perf_ack(ihk_os_t os, struct ikc_scd_packet *packet) } +extern void *get_user_sp(void); +extern void set_user_sp(unsigned long); +extern void restore_fs(unsigned long fs); +extern void save_fs_ctx(void *); +extern unsigned long get_fs_ctx(void *); + +long +mcexec_util_thread1(ihk_os_t os, unsigned long arg, struct file *file) +{ + void **__user uparam = (void ** __user)arg; + void *param[6]; + unsigned long p_rctx; + unsigned long phys; + void *__user u_rctx; + void *rctx; + int rc = 0; + unsigned long free_address; + unsigned long free_size; + unsigned long icurrent = (unsigned long)current; + + if(copy_from_user(param, uparam, sizeof(void *) * 6)) { + return -EFAULT; + } + p_rctx = (unsigned long)param[0]; + u_rctx = (void *__user)param[1]; + free_address = (unsigned long)param[4]; + free_size = (unsigned long)param[5]; + + phys = ihk_device_map_memory(ihk_os_to_dev(os), p_rctx, PAGE_SIZE); +#ifdef CONFIG_MIC + rctx = ioremap_wc(phys, PAGE_SIZE); +#else + rctx = ihk_device_map_virtual(ihk_os_to_dev(os), phys, PAGE_SIZE, NULL, 0); +#endif + if(copy_to_user(u_rctx, rctx, PAGE_SIZE) || + copy_to_user((unsigned long *)(uparam + 3), &icurrent, + sizeof(unsigned long))) + rc = -EFAULT; + + ((unsigned long *)rctx)[0] = free_address; + ((unsigned long *)rctx)[1] = free_size; + +#ifdef CONFIG_MIC + iounmap(rctx); +#else + ihk_device_unmap_virtual(ihk_os_to_dev(os), rctx, PAGE_SIZE); +#endif + ihk_device_unmap_memory(ihk_os_to_dev(os), phys, PAGE_SIZE); + + return rc; +} + +#if 0 +static struct { + unsigned long org_futex; + unsigned long org_brk; + unsigned long org_clone; + unsigned long org_fork; + unsigned long org_vfork; + unsigned long org_gettid; + unsigned long org_mmap; + unsigned long org_munmap; + unsigned long org_mprotect; + unsigned long org_mremap; + unsigned long org_execve; + unsigned long org_exit_group; + unsigned long org_exit; +} org_syscalls; +#endif + +static inline struct host_thread 
*get_host_thread(void) +{ + int pid = task_tgid_vnr(current); + int tid = task_pid_vnr(current); + unsigned long flags; + struct host_thread *thread; + + read_lock_irqsave(&host_thread_lock, flags); + for (thread = host_threads; thread; thread = thread->next) + if(thread->pid == pid && thread->tid == tid) + break; + read_unlock_irqrestore(&host_thread_lock, flags); + + return thread; +} + +#if 0 +#define DEF_SYSCALL(f, v, n) \ +static asmlinkage unsigned long f(unsigned long p1, unsigned long p2, \ + unsigned long p3, unsigned long p4, unsigned long p5, \ + unsigned long p6)\ +{\ + struct host_thread *thread = get_host_thread();\ +\ + if (thread) {\ + unsigned long ret;\ + int rc;\ +\ + rc = syscall_backward(thread->handler->ud, n, p1, p2, p3, p4, \ + p5, p6, &ret);\ + if (rc < 0)\ + return rc;\ + return ret;\ + }\ +\ + return ((asmlinkage unsigned long (*)(unsigned long, unsigned long,\ + unsigned long, unsigned long, unsigned long, unsigned long))\ + org_syscalls.v)(p1, p2, p3, p4, p5, p6);\ +} + +#define BAD_SYSCALL(f, v) \ +static asmlinkage unsigned long f(unsigned long p1, unsigned long p2, \ + unsigned long p3, unsigned long p4, unsigned long p5, \ + unsigned long p6)\ +{\ + struct host_thread *thread = get_host_thread();\ +\ + if (thread) {\ + return -ENOSYS;\ + }\ +\ + return ((asmlinkage unsigned long (*)(unsigned long, unsigned long,\ + unsigned long, unsigned long, unsigned long, unsigned long))\ + org_syscalls.v)(p1, p2, p3, p4, p5, p6);\ +} + +DEF_SYSCALL(mod_futex, org_futex, __NR_futex) +DEF_SYSCALL(mod_brk, org_brk, __NR_brk) +DEF_SYSCALL(mod_gettid, org_gettid, __NR_gettid) +DEF_SYSCALL(mod_mmap, org_mmap, __NR_mmap) +DEF_SYSCALL(mod_munmap, org_munmap, __NR_munmap) +DEF_SYSCALL(mod_mremap, org_mremap, __NR_mremap) +DEF_SYSCALL(mod_mprotect, org_mprotect, __NR_mprotect) +BAD_SYSCALL(mod_clone, org_clone) +BAD_SYSCALL(mod_fork, org_fork) +BAD_SYSCALL(mod_vfork, org_vfork) +BAD_SYSCALL(mod_execve, org_execve) + +static asmlinkage unsigned long 
mod_exit(int exit_status) +{ + struct host_thread *thread = get_host_thread(); + + if (thread) { + unsigned long code = (exit_status & 255) << 8; + ihk_os_t os = thread->handler->ud->os; + struct file *file = thread->handler->file; + + mcexec_switch_thread(os, code, file); + return 0; + } + + return ((asmlinkage unsigned long (*)(int)) + org_syscalls.org_exit)(exit_status); +} + +static asmlinkage unsigned long mod_exit_group(int exit_status) +{ + struct host_thread *thread = get_host_thread(); + + if (thread) { + unsigned long code = (exit_status & 255) << 8; + ihk_os_t os = thread->handler->ud->os; + struct file *file = thread->handler->file; + + code |= 0x100000000; + mcexec_switch_thread(os, code, file); + return 0; + } + + return ((asmlinkage unsigned long (*)(int)) + org_syscalls.org_exit_group)(exit_status); +} + +static void save_syscalls(void) +{ +#define SAVE_SYSCALL(v, f, n) \ +do { \ + unsigned long org; \ + if (org_syscalls.v == 0L && \ + (org = mod_sys_call_table(n, 0L)) != (unsigned long)f) \ + org_syscalls.v = org; \ +} while (0) + + SAVE_SYSCALL(org_futex, mod_futex, __NR_futex); + SAVE_SYSCALL(org_brk, mod_brk, __NR_brk); + SAVE_SYSCALL(org_clone, mod_clone, __NR_clone); + SAVE_SYSCALL(org_fork, mod_fork, __NR_fork); + SAVE_SYSCALL(org_vfork, mod_vfork, __NR_vfork); + SAVE_SYSCALL(org_gettid, mod_gettid, __NR_gettid); + SAVE_SYSCALL(org_mmap, mod_mmap, __NR_mmap); + SAVE_SYSCALL(org_munmap, mod_munmap, __NR_munmap); + SAVE_SYSCALL(org_mprotect, mod_mprotect, __NR_mprotect); + SAVE_SYSCALL(org_mremap, mod_mremap, __NR_mremap); + SAVE_SYSCALL(org_execve, mod_execve, __NR_execve); + SAVE_SYSCALL(org_exit_group, mod_exit_group, __NR_exit_group); + SAVE_SYSCALL(org_exit, mod_exit, __NR_exit); +} + +static void mod_syscalls(void) +{ +#define MOD_SYSCALL(f, n) \ +do { \ + mod_sys_call_table(n, (unsigned long)f); \ +} while (0) + + MOD_SYSCALL(mod_futex, __NR_futex); + MOD_SYSCALL(mod_brk, __NR_brk); + MOD_SYSCALL(mod_clone, __NR_clone); + 
MOD_SYSCALL(mod_fork, __NR_fork); + MOD_SYSCALL(mod_vfork, __NR_vfork); + MOD_SYSCALL(mod_gettid, __NR_gettid); + MOD_SYSCALL(mod_mmap, __NR_mmap); + MOD_SYSCALL(mod_munmap, __NR_munmap); + MOD_SYSCALL(mod_mprotect, __NR_mprotect); + MOD_SYSCALL(mod_mremap, __NR_mremap); + MOD_SYSCALL(mod_execve, __NR_execve); + MOD_SYSCALL(mod_exit_group, __NR_exit_group); + MOD_SYSCALL(mod_exit, __NR_exit); +} + +static void restore_syscalls(void) +{ +#define RESTORE_SYSCALL(v, n) \ +do { \ + mod_sys_call_table(n, org_syscalls.v); \ +} while (0) + + RESTORE_SYSCALL(org_futex, __NR_futex); + RESTORE_SYSCALL(org_brk, __NR_brk); + RESTORE_SYSCALL(org_clone, __NR_clone); + RESTORE_SYSCALL(org_fork, __NR_fork); + RESTORE_SYSCALL(org_vfork, __NR_vfork); + RESTORE_SYSCALL(org_gettid, __NR_gettid); + RESTORE_SYSCALL(org_mmap, __NR_mmap); + RESTORE_SYSCALL(org_munmap, __NR_munmap); + RESTORE_SYSCALL(org_mprotect, __NR_mprotect); + RESTORE_SYSCALL(org_mremap, __NR_mremap); + RESTORE_SYSCALL(org_execve, __NR_execve); + RESTORE_SYSCALL(org_exit_group, __NR_exit_group); + RESTORE_SYSCALL(org_exit, __NR_exit); +} + +static void process_exit_prober(void *data, struct task_struct *tsk) +{ + struct mcos_handler_info *info; + unsigned long flags; + struct host_thread *thread; + struct host_thread *prev; + int pid = task_tgid_vnr(tsk); + int tid = task_pid_vnr(tsk); + int code; + struct ikc_scd_packet *packet; + struct mcctrl_usrdata *usrdata = NULL; + struct mcctrl_per_proc_data *ppd = NULL; + + if (!host_threads) { + return; + } + write_lock_irqsave(&host_thread_lock, flags); + for (prev = NULL, thread = host_threads; thread; + prev = thread, thread = thread->next) + if(thread->pid == pid && thread->tid == tid) + break; + if (!thread) { + write_unlock_irqrestore(&host_thread_lock, flags); + return; + } + info = thread->handler; + if (!info) + goto err; + + usrdata = info->ud; + code = tsk->exit_code; + ppd = mcctrl_get_per_proc_data(usrdata, pid); + if (!ppd) { + kprintf("%s: ERROR: no packet 
registered for TID %d\n",
+			__FUNCTION__, task_pid_vnr(current));
+		goto err;
+	}
+	packet = (struct ikc_scd_packet *)mcctrl_get_per_thread_data(ppd, tsk);
+	if (!packet) {
+		goto err;
+	}
+	mcctrl_delete_per_thread_data(ppd, tsk);
+	__return_syscall(usrdata->os, packet, code, tid);
+	ihk_ikc_release_packet((struct ihk_ikc_free_packet *)packet,
+			(usrdata->channels + packet->ref)->c);
+err:
+	if (ppd)
+		mcctrl_put_per_proc_data(ppd);
+	if (prev)
+		prev->next = thread->next;
+	else
+		host_threads = thread->next;
+	write_unlock_irqrestore(&host_thread_lock, flags);
+	kfree(thread);
+	read_lock_irqsave(&host_thread_lock, flags);
+	if (!host_threads) {
+		restore_syscalls();
+		unregister_trace_sched_process_exit(process_exit_prober, NULL);
+	}
+	read_unlock_irqrestore(&host_thread_lock, flags);
+}
+#endif
+
+/*
+ * MCEXEC_UP_UTIL_THREAD2: register the calling task in host_threads.
+ *
+ * @arg is a user-space array of pointers; slot 1 is the remote context
+ * (rctx) and slot 2 the local context (lctx).  The new entry records the
+ * caller's pid/tid, its current user stack pointer, the FS base stored in
+ * each context and the mcos_handler_info attached to @file.
+ *
+ * Returns 0 on success, -EFAULT if @arg cannot be read, -ENOMEM if the
+ * entry cannot be allocated.
+ */
+long
+mcexec_util_thread2(ihk_os_t os, unsigned long arg, struct file *file)
+{
+	void *usp = get_user_sp();
+	struct mcos_handler_info *info;
+	struct host_thread *thread;
+	unsigned long flags;
+	void **__user param = (void **__user )arg;
+	void *kparam[3];
+	void *__user rctx;
+	void *__user lctx;
+
+	/* arg points into user space: copy the slots, never dereference it. */
+	if (copy_from_user(kparam, param, sizeof(kparam)))
+		return -EFAULT;
+	rctx = (void *__user)kparam[1];
+	lctx = (void *__user)kparam[2];
+
+	thread = kzalloc(sizeof(*thread), GFP_KERNEL);
+	if (!thread)
+		return -ENOMEM;
+
+	/*
+	 * NOTE(review): save_fs_ctx()/get_fs_ctx() still read and write the
+	 * user addresses lctx/rctx directly; they should go through
+	 * copy_{from,to}_user() as well -- confirm and fix in archdeps.c.
+	 */
+	save_fs_ctx(lctx);
+	info = ihk_os_get_mcos_private_data(file);
+	thread->pid = task_tgid_vnr(current);
+	thread->tid = task_pid_vnr(current);
+	thread->usp = (unsigned long)usp;
+	thread->lfs = get_fs_ctx(lctx);
+	thread->rfs = get_fs_ctx(rctx);
+	thread->handler = info;
+
+	write_lock_irqsave(&host_thread_lock, flags);
+#if 0
+	if (!host_threads) {
+		save_syscalls();
+		register_trace_sched_process_exit(process_exit_prober, NULL);
+	}
+#endif
+	thread->next = host_threads;
+	host_threads = thread;
+	write_unlock_irqrestore(&host_thread_lock, flags);
+#if 0
+	mod_syscalls();
+#endif
+
+	return 0;
+}
+
+long
+mcexec_sig_thread(ihk_os_t os, unsigned long arg, struct file *file)
+{
+	int tid = task_pid_vnr(current);
+	int pid =
task_tgid_vnr(current);
+	unsigned long flags;
+	struct host_thread *thread;
+
+	read_lock_irqsave(&host_thread_lock, flags);
+	for (thread = host_threads; thread; thread = thread->next)
+		if(thread->pid == pid && thread->tid == tid)
+			break;
+	read_unlock_irqrestore(&host_thread_lock, flags);
+	if (thread) {
+		if (arg)
+			restore_fs(thread->lfs);
+		else
+			restore_fs(thread->rfs);
+		return 0;
+	}
+	return -EINVAL;
+}
+
+/*
+ * MCEXEC_UP_SWITCH_THREAD: retire the calling task's host_threads entry.
+ *
+ * Looks the entry up by the caller's tid, answers the syscall packet
+ * registered for the current task with @code, releases that packet,
+ * unlinks the entry and finally restores the user stack pointer that was
+ * recorded in it.
+ *
+ * Returns 0 once the entry has been removed (even when no per-process
+ * data or packet was found), -EINVAL if the caller has no entry.
+ */
+long
+mcexec_switch_thread(ihk_os_t os, unsigned long code, struct file *file)
+{
+	int tid = task_pid_vnr(current);
+	int pid = task_tgid_vnr(current);
+	unsigned long flags;
+	struct host_thread *thread;
+	struct host_thread *prev;
+	struct ikc_scd_packet *packet;
+	struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
+	struct mcctrl_per_proc_data *ppd;
+
+	write_lock_irqsave(&host_thread_lock, flags);
+	for (prev = NULL, thread = host_threads; thread;
+	     prev = thread, thread = thread->next)
+		if(thread->tid == tid)
+			break;
+	if (!thread) {
+		write_unlock_irqrestore(&host_thread_lock, flags);
+		return -EINVAL;
+	}
+
+	ppd = mcctrl_get_per_proc_data(usrdata, pid);
+	if (!ppd) {
+		kprintf("%s: ERROR: no per-process structure for PID %d??\n",
+				__FUNCTION__, task_tgid_vnr(current));
+		goto err;
+	}
+	packet = (struct ikc_scd_packet *)mcctrl_get_per_thread_data(ppd,
+			current);
+	if (!packet) {
+		kprintf("%s: ERROR: no packet registered for TID %d\n",
+				__FUNCTION__, tid);
+		goto err;
+	}
+	mcctrl_delete_per_thread_data(ppd, current);
+	__return_syscall(usrdata->os, packet, code, tid);
+	ihk_ikc_release_packet((struct ihk_ikc_free_packet *)packet,
+			(usrdata->channels + packet->ref)->c);
+err:
+	if(ppd)
+		mcctrl_put_per_proc_data(ppd);
+
+	if (prev)
+		prev->next = thread->next;
+	else
+		host_threads = thread->next;
+	write_unlock_irqrestore(&host_thread_lock, flags);
+#if 0
+	read_lock_irqsave(&host_thread_lock, flags);
+	if (!host_threads) {
+		restore_syscalls();
+		unregister_trace_sched_process_exit(process_exit_prober, NULL);
+	}
+	read_unlock_irqrestore(&host_thread_lock, flags);
+#endif
+	/* Read thread->usp before the entry is freed (was a use-after-free). */
+	set_user_sp(thread->usp);
+	kfree(thread);
+	return 0;
+}
+
+/*
+ * MCEXEC_UP_TERMINATE_THREAD: retire another task's host_threads entry.
+ *
+ * @param is a user-space array of four unsigned longs: pid, tid, the
+ * value handed to __return_syscall() as the result, and the task_struct
+ * address used as the per-thread-data key.
+ *
+ * Returns 0 once the entry has been removed, -EFAULT if @param cannot be
+ * read, -EINVAL if no entry matches the tid.
+ */
+long
+mcexec_terminate_thread(ihk_os_t os, unsigned long *param, struct file *file)
+{
+	unsigned long kparam[4];
+	int pid;
+	int tid;
+	struct task_struct *tsk;
+	unsigned long flags;
+	struct host_thread *thread;
+	struct host_thread *prev;
+	struct ikc_scd_packet *packet;
+	struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
+	struct mcctrl_per_proc_data *ppd;
+
+	/* param comes straight from the ioctl argument: copy it in. */
+	if (copy_from_user(kparam, (void __user *)param, sizeof(kparam)))
+		return -EFAULT;
+	pid = kparam[0];
+	tid = kparam[1];
+	tsk = (struct task_struct *)kparam[3];
+
+printk("mcexec_terminate_thread\n");
+	write_lock_irqsave(&host_thread_lock, flags);
+	for (prev = NULL, thread = host_threads; thread;
+	     prev = thread, thread = thread->next) {
+printk("thread tid=%d\n", thread->tid);
+		if(thread->tid == tid)
+			break;
+	}
+	if (!thread) {
+		write_unlock_irqrestore(&host_thread_lock, flags);
+printk("mcexec_terminate_thread no thread pid=%d tid=%d\n", pid, tid);
+		return -EINVAL;
+	}
+
+	ppd = mcctrl_get_per_proc_data(usrdata, pid);
+	if (!ppd) {
+		kprintf("%s: ERROR: no per-process structure for PID %d??\n",
+				__FUNCTION__, pid);
+		goto err;
+	}
+	packet = (struct ikc_scd_packet *)mcctrl_get_per_thread_data(ppd, tsk);
+	if (!packet) {
+		kprintf("%s: ERROR: no packet registered for TID %d\n",
+				__FUNCTION__, tid);
+		goto err;
+	}
+	mcctrl_delete_per_thread_data(ppd, tsk);
+	__return_syscall(usrdata->os, packet, kparam[2], tid);
+	ihk_ikc_release_packet((struct ihk_ikc_free_packet *)packet,
+			(usrdata->channels + packet->ref)->c);
+err:
+	if(ppd)
+		mcctrl_put_per_proc_data(ppd);
+
+	if (prev)
+		prev->next = thread->next;
+	else
+		host_threads = thread->next;
+	write_unlock_irqrestore(&host_thread_lock, flags);
+	kfree(thread);
+	return 0;
+}
+
+long
+mcexec_syscall_thread(ihk_os_t os, unsigned long arg, struct file *file)
+{
+	struct syscall_struct {
+		int number;
+		unsigned long args[6];
+		unsigned long ret;
+	};
+	struct syscall_struct param;
+	struct syscall_struct __user *uparam =
+		(struct syscall_struct __user
*)arg;
+	int rc;
+
+	if (copy_from_user(&param, uparam, sizeof param)) {
+		return -EFAULT;
+	}
+	if (param.number == __NR_exit ||
+	    param.number == __NR_exit_group) {
+		unsigned long code = (param.args[0] & 255) << 8;
+		if (param.number == __NR_exit_group)
+			code |= 0x100000000L;
+		mcexec_switch_thread(os, code, file);
+		return 0;
+	}
+	rc = syscall_backward(ihk_host_os_get_usrdata(os), param.number,
+			param.args[0], param.args[1], param.args[2],
+			param.args[3], param.args[4], param.args[5],
+			&param.ret);
+
+	if (copy_to_user(&uparam->ret, &param.ret, sizeof(unsigned long))) {
+		return -EFAULT;
+	}
+	return rc;
+}
+
+/*
+ * MCEXEC_UP_COPY_FROM_MCK: copy arg[2] bytes from the physical address
+ * arg[1] to the user buffer arg[0].  @arg is a user-space array of three
+ * unsigned longs.
+ *
+ * Returns 0 on success, -EFAULT if @arg or the destination cannot be
+ * accessed, -EINVAL for a negative length.
+ *
+ * NOTE(review): the physical address is taken from the caller unchecked
+ * and turned into a kernel pointer with phys_to_virt(); it should be
+ * validated against the memory owned by this OS instance -- confirm.
+ */
+long
+mcexec_copy_from_mck(ihk_os_t os, unsigned long *arg)
+{
+	unsigned long karg[3];
+	void __user *to;
+	void *from;
+	long len;
+
+	/* arg comes straight from the ioctl argument: copy it in. */
+	if (copy_from_user(karg, (void __user *)arg, sizeof(karg)))
+		return -EFAULT;
+	to = (void __user *)karg[0];
+	from = phys_to_virt(karg[1]);
+	len = karg[2];
+	if (len < 0)
+		return -EINVAL;
+
+	if (copy_to_user(to, from, len)) {
+		return -EFAULT;
+	}
+	return 0;
+}
+
+/*
+ * MCEXEC_UP_COPY_TO_MCK: copy arg[2] bytes from the user buffer arg[1]
+ * to the physical address arg[0].  @arg is a user-space array of three
+ * unsigned longs.
+ *
+ * Returns 0 on success, -EFAULT if @arg or the source cannot be
+ * accessed, -EINVAL for a negative length.
+ *
+ * NOTE(review): the physical address is taken from the caller unchecked
+ * and turned into a kernel pointer with phys_to_virt(); it should be
+ * validated against the memory owned by this OS instance -- confirm.
+ */
+long
+mcexec_copy_to_mck(ihk_os_t os, unsigned long *arg)
+{
+	unsigned long karg[3];
+	void *to;
+	void __user *from;
+	long len;
+
+	/* arg comes straight from the ioctl argument: copy it in. */
+	if (copy_from_user(karg, (void __user *)arg, sizeof(karg)))
+		return -EFAULT;
+	to = phys_to_virt(karg[0]);
+	from = (void __user *)karg[1];
+	len = karg[2];
+	if (len < 0)
+		return -EINVAL;
+
+	if (copy_from_user(to, from, len)) {
+		return -EFAULT;
+	}
+	return 0;
+}
+
 long __mcctrl_control(ihk_os_t os, unsigned int req, unsigned long arg,
                       struct file *file)
 {
@@ -2087,6 +2698,30 @@ long __mcctrl_control(ihk_os_t os, unsigned int req, unsigned long arg,
 	case MCEXEC_UP_SYS_UNSHARE:
 		return mcexec_sys_unshare((struct sys_unshare_desc *)arg);
 
+	case MCEXEC_UP_UTIL_THREAD1:
+		return mcexec_util_thread1(os, arg, file);
+
+	case MCEXEC_UP_UTIL_THREAD2:
+		return mcexec_util_thread2(os, arg, file);
+
+	case MCEXEC_UP_SIG_THREAD:
+		return mcexec_sig_thread(os, arg, file);
+
+	case MCEXEC_UP_SWITCH_THREAD:
+		return mcexec_switch_thread(os, arg, file);
+
+	case MCEXEC_UP_SYSCALL_THREAD:
+		return mcexec_syscall_thread(os, arg, file);
+
+	case MCEXEC_UP_TERMINATE_THREAD:
+		return mcexec_terminate_thread(os, (unsigned long *)arg, file);
+
+	case MCEXEC_UP_COPY_FROM_MCK:
+		return mcexec_copy_from_mck(os, (unsigned long *)arg);
+
+	case MCEXEC_UP_COPY_TO_MCK:
+		return
mcexec_copy_to_mck(os, (unsigned long *)arg); + case MCEXEC_UP_DEBUG_LOG: return mcexec_debug_log(os, arg); diff --git a/executer/kernel/mcctrl/driver.c b/executer/kernel/mcctrl/driver.c index 513b0d22..59728847 100644 --- a/executer/kernel/mcctrl/driver.c +++ b/executer/kernel/mcctrl/driver.c @@ -81,7 +81,15 @@ static struct ihk_os_user_call_handler mcctrl_uchs[] = { { .request = MCEXEC_UP_SYS_MOUNT, .func = mcctrl_ioctl }, { .request = MCEXEC_UP_SYS_UMOUNT, .func = mcctrl_ioctl }, { .request = MCEXEC_UP_SYS_UNSHARE, .func = mcctrl_ioctl }, + { .request = MCEXEC_UP_UTIL_THREAD1, .func = mcctrl_ioctl }, + { .request = MCEXEC_UP_UTIL_THREAD2, .func = mcctrl_ioctl }, + { .request = MCEXEC_UP_SIG_THREAD, .func = mcctrl_ioctl }, + { .request = MCEXEC_UP_SWITCH_THREAD, .func = mcctrl_ioctl }, + { .request = MCEXEC_UP_SYSCALL_THREAD, .func = mcctrl_ioctl }, + { .request = MCEXEC_UP_TERMINATE_THREAD, .func = mcctrl_ioctl }, { .request = MCEXEC_UP_DEBUG_LOG, .func = mcctrl_ioctl }, + { .request = MCEXEC_UP_COPY_FROM_MCK, .func = mcctrl_ioctl }, + { .request = MCEXEC_UP_COPY_TO_MCK, .func = mcctrl_ioctl }, { .request = IHK_OS_AUX_PERF_NUM, .func = mcctrl_ioctl }, { .request = IHK_OS_AUX_PERF_SET, .func = mcctrl_ioctl }, { .request = IHK_OS_AUX_PERF_GET, .func = mcctrl_ioctl }, diff --git a/executer/kernel/mcctrl/syscall.c b/executer/kernel/mcctrl/syscall.c index b68ee89f..efabb0bc 100644 --- a/executer/kernel/mcctrl/syscall.c +++ b/executer/kernel/mcctrl/syscall.c @@ -278,6 +278,174 @@ static int __notify_syscall_requester(ihk_os_t os, struct ikc_scd_packet *packet return ret; } +long syscall_backward(struct mcctrl_usrdata *usrdata, int num, + unsigned long arg1, unsigned long arg2, + unsigned long arg3, unsigned long arg4, + unsigned long arg5, unsigned long arg6, + unsigned long *ret) +{ + struct ikc_scd_packet *packet; + struct syscall_request *req; + struct syscall_response *resp; + unsigned long syscall_ret; + struct wait_queue_head_list_node *wqhln; + unsigned long 
irqflags; + struct mcctrl_per_proc_data *ppd; + unsigned long phys; + struct syscall_request _request[2]; + struct syscall_request *request; + + if (((unsigned long)_request ^ (unsigned long)(_request + 1)) & + ~(PAGE_SIZE -1)) + request = _request + 1; + else + request = _request; + request->number = num; + request->args[0] = arg1; + request->args[1] = arg2; + request->args[2] = arg3; + request->args[3] = arg4; + request->args[4] = arg5; + request->args[5] = arg6; + + + /* Look up per-process structure */ + ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(current)); + + if (!ppd) { + kprintf("%s: ERROR: no per-process structure for PID %d??\n", + __FUNCTION__, task_tgid_vnr(current)); + return -EINVAL; + } + + packet = (struct ikc_scd_packet *)mcctrl_get_per_thread_data(ppd, current); + if (!packet) { + syscall_ret = -ENOENT; + printk("%s: no packet registered for TID %d\n", + __FUNCTION__, task_pid_vnr(current)); + goto out_put_ppd; + } + + req = &packet->req; + + /* Map response structure */ + phys = ihk_device_map_memory(ihk_os_to_dev(usrdata->os), + packet->resp_pa, sizeof(*resp)); + resp = ihk_device_map_virtual(ihk_os_to_dev(usrdata->os), + phys, sizeof(*resp), NULL, 0); + +retry_alloc: + wqhln = kmalloc(sizeof(*wqhln), GFP_ATOMIC); + if (!wqhln) { + printk("WARNING: coudln't alloc wait queue head, retrying..\n"); + goto retry_alloc; + } + + /* Prepare per-thread wait queue head */ + wqhln->task = current; + /* Save the TID explicitly, because mcexec_syscall(), where the request + * will be matched, is in IRQ context and can't call task_pid_vnr() */ + wqhln->rtid = task_pid_vnr(current); + wqhln->req = 0; + init_waitqueue_head(&wqhln->wq_syscall); + + irqflags = ihk_ikc_spinlock_lock(&ppd->wq_list_lock); + /* Add to exact list */ + list_add_tail(&wqhln->list, &ppd->wq_list_exact); + ihk_ikc_spinlock_unlock(&ppd->wq_list_lock, irqflags); + + resp->stid = task_pid_vnr(current); + resp->fault_address = virt_to_phys(request); + +#define STATUS_IN_PROGRESS 0 
+#define STATUS_SYSCALL 4 + req->valid = 0; + + if (__notify_syscall_requester(usrdata->os, packet, resp) < 0) { + printk("%s: WARNING: failed to notify PID %d\n", + __FUNCTION__, packet->pid); + } + + mb(); + resp->status = STATUS_SYSCALL; + + dprintk("%s: tid: %d, syscall: %d SLEEPING\n", + __FUNCTION__, task_pid_vnr(current), num); + /* wait for response */ + syscall_ret = wait_event_interruptible(wqhln->wq_syscall, wqhln->req); + + /* Remove per-thread wait queue head */ + irqflags = ihk_ikc_spinlock_lock(&ppd->wq_list_lock); + list_del(&wqhln->list); + ihk_ikc_spinlock_unlock(&ppd->wq_list_lock, irqflags); + + dprintk("%s: tid: %d, syscall: %d WOKEN UP\n", + __FUNCTION__, task_pid_vnr(current), num); + + if (syscall_ret) { + kfree(wqhln); + goto out; + } + else { + unsigned long phys2; + struct syscall_response *resp2; + + /* Update packet reference */ + packet = wqhln->packet; + req = &packet->req; + phys2 = ihk_device_map_memory(ihk_os_to_dev(usrdata->os), + packet->resp_pa, sizeof(*resp)); + resp2 = ihk_device_map_virtual(ihk_os_to_dev(usrdata->os), + phys2, sizeof(*resp), NULL, 0); + + if (resp != resp2) { + resp = resp2; + phys = phys2; + printk("%s: updated new remote PA for resp\n", __FUNCTION__); + } + } + + if (!req->valid) { + printk("%s:not valid\n", __FUNCTION__); + } + req->valid = 0; + + /* check result */ + if (req->number != __NR_mmap) { + printk("%s:unexpected response. 
%lx %lx\n", + __FUNCTION__, req->number, req->args[0]); + syscall_ret = -EIO; + goto out; + } +#define PAGER_REQ_RESUME 0x0101 + else if (req->args[0] != PAGER_REQ_RESUME) { + resp->ret = pager_call(usrdata->os, (void *)req); + + if (__notify_syscall_requester(usrdata->os, packet, resp) < 0) { + printk("%s: WARNING: failed to notify PID %d\n", + __FUNCTION__, packet->pid); + } + + mb(); + } + else { + *ret = req->args[1]; + } + + kfree(wqhln); + syscall_ret = 0; +out: + ihk_device_unmap_virtual(ihk_os_to_dev(usrdata->os), resp, sizeof(*resp)); + ihk_device_unmap_memory(ihk_os_to_dev(usrdata->os), phys, sizeof(*resp)); + +out_put_ppd: + dprintk("%s: tid: %d, syscall: %d, reason: %lu, syscall_ret: %d\n", + __FUNCTION__, task_pid_vnr(current), num, reason, syscall_ret); + + mcctrl_put_per_proc_data(ppd); + return syscall_ret; +} + static int remote_page_fault(struct mcctrl_usrdata *usrdata, void *fault_addr, uint64_t reason) { struct ikc_scd_packet *packet; @@ -598,7 +766,7 @@ static int rus_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) dprintk("mcctrl:page fault:flags %#x pgoff %#lx va %p page %p\n", vmf->flags, vmf->pgoff, vmf->virtual_address, vmf->page); - + /* Look up per-process structure */ ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(current)); if (!ppd) { @@ -608,6 +776,8 @@ static int rus_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if (!ppd) { kprintf("%s: ERROR: no per-process structure for PID %d??\n", __FUNCTION__, task_tgid_vnr(current)); +printk("mcctrl:page fault:flags %#x pgoff %#lx va %p page %p\n", +vmf->flags, vmf->pgoff, vmf->virtual_address, vmf->page); return -EINVAL; } @@ -759,11 +929,11 @@ reserve_user_space_common(struct mcctrl_usrdata *usrdata, unsigned long start, u original = override_creds(promoted); #if LINUX_VERSION_CODE < KERNEL_VERSION(3,5,0) - start = vm_mmap_pgoff(file, start, end, - PROT_READ|PROT_WRITE, MAP_FIXED|MAP_SHARED, 0); + start = vm_mmap_pgoff(file, start, end, 
PROT_READ|PROT_WRITE|PROT_EXEC, + MAP_FIXED|MAP_SHARED, 0); #else - start = vm_mmap(file, start, end, - PROT_READ|PROT_WRITE, MAP_FIXED|MAP_SHARED, 0); + start = vm_mmap(file, start, end, PROT_READ|PROT_WRITE|PROT_EXEC, + MAP_FIXED|MAP_SHARED, 0); #endif revert_creds(original); diff --git a/executer/user/Makefile.in b/executer/user/Makefile.in index 6d3b2b23..d805f924 100644 --- a/executer/user/Makefile.in +++ b/executer/user/Makefile.in @@ -1,18 +1,22 @@ CC=@CC@ BINDIR=@BINDIR@ +prefix=@prefix@ +exec_prefix=@exec_prefix@ +LIBDIR=@libdir@ MCKERNEL_LIBDIR=@MCKERNEL_LIBDIR@ KDIR ?= @KDIR@ -CFLAGS=-Wall -O -I. +CFLAGS=-Wall -O -I. -Iarch/${ARCH} VPATH=@abs_srcdir@ TARGET=mcexec libsched_yield @uncomment_if_ENABLE_MEMDUMP@TARGET+=eclair LIBS=@LIBS@ +ARCH=@ARCH@ IHKDIR ?= $(VPATH)/../../../ihk/linux/include/ all: $(TARGET) -mcexec: mcexec.c - $(CC) -I${KDIR} $(CFLAGS) $(EXTRA_CFLAGS) -fPIE -pie -lrt -lnuma -pthread -o $@ $^ $(EXTRA_OBJS) +mcexec: mcexec.c libmcexec.a + $(CC) -I${KDIR} $(CFLAGS) $(EXTRA_CFLAGS) -DLIBDIR=\"$(LIBDIR)\" -fPIE -pie -L. 
-lmcexec -lrt -lnuma -pthread -o $@ $^ $(EXTRA_OBJS) eclair: eclair.c $(CC) $(CFLAGS) -I${IHKDIR} -o $@ $^ $(LIBS) @@ -20,12 +24,17 @@ eclair: eclair.c libsched_yield: libsched_yield.c $(CC) -shared -fPIC -Wl,-soname,sched_yield.so.1 -o libsched_yield.so.1.0.0 $^ -lc -ldl -clean: +libmcexec.a:: # always delegate to the per-architecture Makefile + $(MAKE) -C arch/${ARCH} + +clean:: + $(MAKE) -C arch/${ARCH} clean $(RM) $(TARGET) *.o .PHONY: all clean install -install: +install:: + $(MAKE) -C arch/${ARCH} install mkdir -p -m 755 $(BINDIR) install -m 755 mcexec $(BINDIR) mkdir -p -m 755 $(MCKERNEL_LIBDIR) diff --git a/executer/user/arch/x86_64/arch_args.h b/executer/user/arch/x86_64/arch_args.h new file mode 100644 index 00000000..9cc99839 --- /dev/null +++ b/executer/user/arch/x86_64/arch_args.h @@ -0,0 +1,113 @@ +#ifndef ARCH_ARGS_H +#define ARCH_ARGS_H + +typedef struct user_regs_struct syscall_args; + +static inline int +get_syscall_args(int pid, syscall_args *args) +{ + return ptrace(PTRACE_GETREGS, pid, NULL, args); +} + +static inline int +set_syscall_args(int pid, syscall_args *args) +{ + return ptrace(PTRACE_SETREGS, pid, NULL, args); +} + +static inline unsigned long +get_syscall_number(syscall_args *args) +{ + return args->orig_rax; +} + +static inline unsigned long +get_syscall_return(syscall_args *args) +{ + return args->rax; +} + +static inline unsigned long +get_syscall_arg1(syscall_args *args) +{ + return args->rdi; +} + +static inline unsigned long +get_syscall_arg2(syscall_args *args) +{ + return args->rsi; +} + +static inline unsigned long +get_syscall_arg3(syscall_args *args) +{ + return args->rdx; +} + +static inline unsigned long +get_syscall_arg4(syscall_args *args) +{ + return args->r10; +} + +static inline unsigned long +get_syscall_arg5(syscall_args *args) +{ + return args->r8; +} + +static inline unsigned long +get_syscall_arg6(syscall_args *args) +{ + return args->r9; +} + +static inline void +set_syscall_number(syscall_args *args, unsigned long value) +{ + args->orig_rax = value; +} +
+static inline void +set_syscall_return(syscall_args *args, unsigned long value) +{ + args->rax = value; +} + +static inline void +set_syscall_arg1(syscall_args *args, unsigned long value) +{ + args->rdi = value; +} + +static inline void +set_syscall_arg2(syscall_args *args, unsigned long value) +{ + args->rsi = value; +} + +static inline void +set_syscall_arg3(syscall_args *args, unsigned long value) +{ + args->rdx = value; +} + +static inline void +set_syscall_arg4(syscall_args *args, unsigned long value) +{ + args->r10 = value; +} + +static inline void +set_syscall_arg5(syscall_args *args, unsigned long value) +{ + args->r8 = value; +} + +static inline void +set_syscall_arg6(syscall_args *args, unsigned long value) +{ + args->r9 = value; +} +#endif diff --git a/executer/user/arch/x86_64/archdep.S b/executer/user/arch/x86_64/archdep.S new file mode 100644 index 00000000..c4da1ef6 --- /dev/null +++ b/executer/user/arch/x86_64/archdep.S @@ -0,0 +1,149 @@ +/* +arg: rdi, rsi, rdx, rcx, r8, r9 +ret: rax + +rax syscall number +syscall: (rax:num) rdi rsi rdx r10 r8 r9 (rcx:ret addr) +fd, cmd, param +rdi: fd +rsi: cmd +rdx: param +rcx: save area +r8: new thread context +*/ + +.global switch_ctx +switch_ctx: + movq $0,0x00(%rcx) + movq %rax,0x8(%rcx) + movq %rbx,0x10(%rcx) + movq %rcx,0x18(%rcx) + movq %rdx,0x20(%rcx) + movq %rsi,0x28(%rcx) + movq %rdi,0x30(%rcx) + movq %rbp,0x38(%rcx) + movq %r8,0x40(%rcx) + movq %r9,0x48(%rcx) + movq %r10,0x50(%rcx) + movq %r11,0x58(%rcx) + movq %r12,0x60(%rcx) + movq %r13,0x68(%rcx) + movq %r14,0x70(%rcx) + movq %r15,0x78(%rcx) + pushfq + popq %rax + movq %rax,0x80(%rcx) + movq 0x00(%rsp),%rax + movq %rax,0x88(%rcx) + movq %rsp,0x90(%rcx) + movq %rcx,%r10 + + pushq %rcx + pushq %r8 + pushq %rax + + mov $0x10,%eax /* ioctl */ + syscall +3: + + popq %r8 + popq %r8 + popq %rcx + + movq %r10,%rcx + cmp $0xfffffffffffff001,%eax + jae 1f + + test %eax,%eax + jnz 2f + + pushq %rax + movq $158,%rax /* arch_prctl */ + movq $0x1002,%rdi /* 
ARCH_SET_FS */ + movq 0x98(%r8),%rsi + syscall + popq %rax + + movq 0x10(%r8),%rbx + movq 0x18(%r8),%rcx + movq 0x20(%r8),%rdx + movq 0x28(%r8),%rsi + movq 0x30(%r8),%rdi + movq 0x38(%r8),%rbp + movq 0x48(%r8),%r9 + movq 0x50(%r8),%r10 + movq 0x58(%r8),%r11 + movq 0x60(%r8),%r12 + movq 0x68(%r8),%r13 + movq 0x70(%r8),%r14 + movq 0x78(%r8),%r15 + movq 0x80(%r8),%rax + pushq %rax + popfq + movq 0x90(%r8),%rsp +// movq 0x8(%r8),%rax /* for interrupts */ + movq 0x40(%r8),%r8 + + movq $0,%rax /* ioctl return */ + + pushq %rcx + retq + +1: + mov $0xffffffffffffffff,%eax +2: + pushq %rax + movq $158,%rax /* arch_prctl */ + movq $0x1002,%rdi /* ARCH_SET_FS */ + movq 0x98(%rcx),%rsi + syscall + popq %rax + + movq 0x10(%rcx),%rbx + movq 0x28(%rcx),%rsi + movq 0x30(%rcx),%rdi + movq 0x38(%rcx),%rbp + movq 0x40(%rcx),%r8 + movq 0x48(%rcx),%r9 + movq 0x50(%rcx),%r10 + movq 0x58(%rcx),%r11 + movq 0x60(%rcx),%r12 + movq 0x68(%rcx),%r13 + movq 0x70(%rcx),%r14 + movq 0x78(%rcx),%r15 + movq 0x80(%rcx),%rdx + pushq %rdx + popfq + movq 0x20(%rcx),%rdx + movq 0x18(%rcx),%rcx + retq + +/* +arg: rdi, rsi, rdx, rcx, r8, r9 +ret: rax +unsigned long +compare_and_swap(unsigned long *addr, unsigned long old, unsigned long new); +rdi: addr +rsi: old +rdx: new +RET: old value + */ +.global compare_and_swap +compare_and_swap: + movq %rsi,%rax + lock + cmpxchgq %rdx,0(%rdi) + retq + +/* +unsigned int +compare_and_swap_int(unsigned int *addr, unsigned int old, unsigned int new); +ret: old value + */ +.global compare_and_swap_int +compare_and_swap_int: + movl %esi,%eax + lock + cmpxchgl %edx,0(%rdi) + retq + diff --git a/executer/user/archdep.h b/executer/user/archdep.h new file mode 100644 index 00000000..47f33142 --- /dev/null +++ b/executer/user/archdep.h @@ -0,0 +1,3 @@ +extern int switch_ctx(int fd, unsigned long cmd, void **param, void *lctx, void *rctx); +extern unsigned long compare_and_swap(unsigned long *addr, unsigned long old, unsigned long new); +extern unsigned int 
compare_and_swap_int(unsigned int *addr, unsigned int old, unsigned int new); diff --git a/executer/user/mcexec.c b/executer/user/mcexec.c index 3867a0d3..e8da54bb 100644 --- a/executer/user/mcexec.c +++ b/executer/user/mcexec.c @@ -56,6 +56,7 @@ #include #include #include +#include #include #include #include @@ -63,8 +64,12 @@ #include #include #include +#include +#include #include "../include/uprotocol.h" #include +#include "archdep.h" +#include "arch_args.h" #include "../../config.h" #include #include @@ -85,6 +90,8 @@ __VA_ARGS__);fflush(stderr);} #endif +#undef DEBUG_UTI + #ifdef USE_SYSCALL_MOD_CALL extern int mc_cmd_server_init(); extern void mc_cmd_server_exit(); @@ -131,6 +138,13 @@ struct sigfd { struct sigfd *sigfdtop; + +struct syscall_struct { + int number; + unsigned long args[6]; + unsigned long ret; +}; + #ifdef NCCS #undef NCCS #endif @@ -145,7 +159,42 @@ struct kernel_termios { cc_t c_cc[NCCS]; /* control characters */ }; -int main_loop(int fd, int cpu, pthread_mutex_t *lock); +#define UTI_FLAG_NUMA_SET (1ULL<<1) /* Indicates NUMA_SET is specified */ + +#define UTI_FLAG_SAME_NUMA_DOMAIN (1ULL<<2) +#define UTI_FLAG_DIFFERENT_NUMA_DOMAIN (1ULL<<3) + +#define UTI_FLAG_SAME_L1 (1ULL<<4) +#define UTI_FLAG_SAME_L2 (1ULL<<5) +#define UTI_FLAG_SAME_L3 (1ULL<<6) + +#define UTI_FLAG_DIFFERENT_L1 (1ULL<<7) +#define UTI_FLAG_DIFFERENT_L2 (1ULL<<8) +#define UTI_FLAG_DIFFERENT_L3 (1ULL<<9) + +#define UTI_FLAG_EXCLUSIVE_CPU (1ULL<<10) +#define UTI_FLAG_CPU_INTENSIVE (1ULL<<11) +#define UTI_FLAG_HIGH_PRIORITY (1ULL<<12) +#define UTI_FLAG_NON_COOPERATIVE (1ULL<<13) + +/* Linux default value is used */ +#define UTI_MAX_NUMA_DOMAINS (1024) + +typedef struct uti_attr { + /* UTI_CPU_SET environmental variable is used to denote the preferred + location of utility thread */ + uint64_t numa_set[(UTI_MAX_NUMA_DOMAINS + sizeof(uint64_t) * 8 - 1) / + (sizeof(uint64_t) * 8)]; + uint64_t flags; /* Representing location and behavior hints by bitmap */ +} uti_attr_t; + +struct 
kuti_attr { + long parent_cpuid; + struct uti_attr attr; +}; + +struct thread_data_s; +int main_loop(struct thread_data_s *); static int mcosid; static int fd; @@ -188,6 +237,11 @@ pid_t gettid(void) return syscall(SYS_gettid); } +int tgkill(int tgid, int tid, int sig) +{ + return syscall(SYS_tgkill, tgid, tid, sig); +} + struct program_load_desc *load_elf(FILE *fp, char **interp_pathp) { Elf64_Ehdr hdr; @@ -893,13 +947,15 @@ int flatten_strings(int nr_strings, char *first, char **strings, char **flat) //#define NUM_HANDLER_THREADS 248 struct thread_data_s { + struct thread_data_s *next; pthread_t thread_id; - int fd; int cpu; int ret; pid_t tid; int terminate; int remote_tid; + int remote_cpu; + int joined; pthread_mutex_t *lock; pthread_barrier_t *init_ready; } *thread_data; @@ -918,8 +974,9 @@ static void *main_loop_thread_func(void *arg) td->tid = gettid(); td->remote_tid = -1; - pthread_barrier_wait(&init_ready); - td->ret = main_loop(td->fd, td->cpu, td->lock); + if (td->init_ready) + pthread_barrier_wait(td->init_ready); + td->ret = main_loop(td); return NULL; } @@ -929,54 +986,91 @@ static void *main_loop_thread_func(void *arg) void sendsig(int sig, siginfo_t *siginfo, void *context) { - pid_t pid = getpid(); - pid_t tid = gettid(); + pid_t pid; + pid_t tid; int remote_tid; - int i; int cpu; struct signal_desc sigdesc; + struct thread_data_s *tp; + int localthread; - if(siginfo->si_pid == pid && - siginfo->si_signo == LOCALSIG) - return; + localthread = ioctl(fd, MCEXEC_UP_SIG_THREAD, 1); + pid = getpid(); + tid = gettid(); + if (siginfo->si_pid == pid && + siginfo->si_signo == LOCALSIG) + goto out; - if(siginfo->si_signo == SIGCHLD) - return; + if (siginfo->si_signo == SIGCHLD) + goto out; - for(i = 0; i < ncpu; i++){ - if(siginfo->si_pid == pid && - thread_data[i].tid == tid){ - if(thread_data[i].terminate) - return; + for (tp = thread_data; tp; tp = tp->next) { + if (siginfo->si_pid == pid && + tp->tid == tid) { + if (tp->terminate) + goto out; break; } 
- if(siginfo->si_pid != pid && - thread_data[i].remote_tid == tid){ - if(thread_data[i].terminate) - return; + if (siginfo->si_pid != pid && + tp->remote_tid == tid) { + if (tp->terminate) + goto out; break; } } - if(i != ncpu){ - remote_tid = thread_data[i].remote_tid; - cpu = thread_data[i].cpu; + if (tp) { + remote_tid = tp->remote_tid; + cpu = tp->remote_cpu; } - else{ + else { cpu = 0; remote_tid = -1; } - memset(&sigdesc, '\0', sizeof sigdesc); - sigdesc.cpu = cpu; - sigdesc.pid = (int)pid; - sigdesc.tid = remote_tid; - sigdesc.sig = sig; - memcpy(&sigdesc.info, siginfo, 128); - if (ioctl(fd, MCEXEC_UP_SEND_SIGNAL, &sigdesc) != 0) { - perror("send_signal"); - close(fd); - exit(1); + if (localthread) { + memset(&sigdesc, '\0', sizeof sigdesc); + sigdesc.cpu = cpu; + sigdesc.pid = (int)pid; + sigdesc.tid = remote_tid; + sigdesc.sig = sig; + memcpy(&sigdesc.info, siginfo, 128); + if (ioctl(fd, MCEXEC_UP_SEND_SIGNAL, &sigdesc) != 0) { + close(fd); + exit(1); + } } + else { + struct syscall_struct param; + int rc; + + param.number = SYS_rt_sigaction; + param.args[0] = sig; + rc = ioctl(fd, MCEXEC_UP_SYSCALL_THREAD, &param); /* result is read back from param.ret below */ + if (rc == -1); /* request failed: nothing to deliver */ + else if (param.ret == (unsigned long)SIG_IGN); /* ignored: drop the signal */ + else if (param.ret == (unsigned long)SIG_DFL) { + if (sig != SIGCHLD && sig != SIGURG && sig != SIGCONT) { + signal(sig, SIG_DFL); + kill(getpid(), sig); + for(;;) + sleep(1); +#if 0 + ioctl(fd, MCEXEC_UP_SWITCH_THREAD, + 0x100000000 | sig); + pthread_exit(NULL); +#endif + } + } + else { /* otherwise param.ret is called as a three-argument handler */ + ioctl(fd, MCEXEC_UP_SIG_THREAD, 0); + ((void (*)(int, siginfo_t *, void *))param.ret)(sig, + siginfo, context); + ioctl(fd, MCEXEC_UP_SIG_THREAD, 1); + } + } +out: + if (!localthread) + ioctl(fd, MCEXEC_UP_SIG_THREAD, 0); } long @@ -1137,7 +1231,29 @@ void init_sigaction(void) sigaction(i, &act, NULL); } } -} +} + +static int max_cpuid; + +static int +create_worker_thread(pthread_barrier_t *init_ready) +{ + struct thread_data_s *tp; + + tp = malloc(sizeof(struct thread_data_s)); + if (!tp) +
return ENOMEM; + memset(tp, '\0', sizeof(struct thread_data_s)); + tp->cpu = max_cpuid++; + tp->lock = &lock; + tp->init_ready = init_ready; + tp->terminate = 0; + tp->next = thread_data; + thread_data = tp; + + return pthread_create(&tp->thread_id, NULL, + &main_loop_thread_func, tp); +} void init_worker_threads(int fd) { @@ -1146,19 +1262,12 @@ void init_worker_threads(int fd) pthread_mutex_init(&lock, NULL); pthread_barrier_init(&init_ready, NULL, n_threads + 2); + max_cpuid = 0; for (i = 0; i <= n_threads; ++i) { - int ret; + int ret = create_worker_thread(&init_ready); - thread_data[i].fd = fd; - thread_data[i].cpu = i; - thread_data[i].lock = &lock; - thread_data[i].init_ready = &init_ready; - thread_data[i].terminate = 0; - ret = pthread_create(&thread_data[i].thread_id, NULL, - &main_loop_thread_func, &thread_data[i]); - - if (ret < 0) { - printf("ERROR: creating syscall threads\n"); + if (ret) { + printf("ERROR: creating syscall threads(%d)\n", ret); exit(1); } } @@ -1438,13 +1547,44 @@ void bind_mount_recursive(const char *root, char *prefix) } #endif +static void +join_all_threads() +{ + struct thread_data_s *tp; + int live_thread; + + do { + live_thread = 0; + for (tp = thread_data; tp; tp = tp->next) { + if (tp->joined) + continue; + live_thread = 1; + pthread_join(tp->thread_id, NULL); + tp->joined = 1; + } + } while (live_thread); +} + +static int +opendev() +{ + int f; + + sprintf(dev, "/dev/mcos%d", mcosid); + + /* Open OS chardev for ioctl() */ + f = open(dev, O_RDWR); + if (f < 0) { + fprintf(stderr, "Error: Failed to open %s.\n", dev); + return -1; + } + fd = f; + + return fd; +} + int main(int argc, char **argv) { -// int fd; -#if 0 - int fdm; - long r; -#endif struct program_load_desc *desc; int envs_len; char *envs; @@ -1460,6 +1600,7 @@ int main(int argc, char **argv) char path[1024]; char *shell = NULL; char shell_path[1024]; + int num = 0; #ifdef USE_SYSCALL_MOD_CALL __glob_argc = argc; @@ -1521,30 +1662,19 @@ int main(int argc, char 
**argv) /* Determine OS device */ if (isdigit(*argv[optind])) { - mcosid = atoi(argv[optind]); + num = atoi(argv[optind]); ++optind; } - sprintf(dev, "/dev/mcos%d", mcosid); - /* No more arguments? */ if (optind >= argc) { print_usage(argv); exit(EXIT_FAILURE); } - __dprintf("target_core: %d, device: %s, command: ", target_core, dev); - for (i = optind; i < argc; ++i) { - __dprintf("%s ", argv[i]); - } - __dprintf("%s", "\n"); - - /* Open OS chardev for ioctl() */ - fd = open(dev, O_RDWR); - if (fd < 0) { - fprintf(stderr, "Error: Failed to open %s.\n", dev); - return 1; - } + mcosid = num; + if (opendev() == -1) + exit(EXIT_FAILURE); if (disable_sched_yield) { char sched_yield_lib_path[PATH_MAX]; @@ -1567,7 +1697,6 @@ int main(int argc, char **argv) /* Collect environment variables */ envs_len = flatten_strings(-1, NULL, environ, &envs); - envs = envs; #ifdef ENABLE_MCOVERLAYFS __dprintf("mcoverlay enable\n"); @@ -1798,12 +1927,14 @@ int main(int argc, char **argv) * TODO: fix signaling code to be independent of TIDs. * TODO: implement dynaic thread pool resizing. 
*/ +#if 0 thread_data = (struct thread_data_s *)malloc(sizeof(struct thread_data_s) * (ncpu + 1)); if (!thread_data) { fprintf(stderr, "error: allocating thread pool data\n"); return 1; } memset(thread_data, '\0', sizeof(struct thread_data_s) * (ncpu + 1)); +#endif #if 0 fdm = open("/dev/fmem", O_RDWR); @@ -1991,9 +2122,7 @@ int main(int argc, char **argv) return 1; } - for (i = 0; i <= n_threads; ++i) { - pthread_join(thread_data[i].thread_id, NULL); - } + join_all_threads(); return 0; } @@ -2145,18 +2274,339 @@ out: } static void -kill_thread(unsigned long tid) +kill_thread(unsigned long tid, int sig) { - int i; + struct thread_data_s *tp; - for (i = 0; i <= n_threads; ++i) { - if(thread_data[i].remote_tid == tid){ - pthread_kill(thread_data[i].thread_id, LOCALSIG); + if (sig == 0) + sig = LOCALSIG; + + for (tp = thread_data; tp; tp = tp->next) { + if (tp->remote_tid == tid) { + pthread_kill(tp->thread_id, sig); break; } } } +static int +samepage(void *a, void *b) +{ + unsigned long aa = (unsigned long)a; + unsigned long bb = (unsigned long)b; + + return (aa & PAGE_MASK) == (bb & PAGE_MASK); +} + +#ifdef DEBUG_UTI +long syscalls[512]; + +static void +debug_sig(int s) +{ + int i; + for (i = 0; i < 512; i++) + if (syscalls[i]) + fprintf(stderr, "syscall %d called %ld\n", i, + syscalls[i]); +} +#endif + +static int +create_tracer(void *wp, int mck_tid, unsigned long key) +{ + int pid = getpid(); + int tid = gettid(); + int pfd[2]; + int tpid; + int rc; + int st; + int sig = 0; + int i; + struct syscall_struct *param_top = NULL; + struct syscall_struct *param; + unsigned long code = 0; + int exited = 0; + int mode = 0; + + if (pipe(pfd) == -1) + return -1; + tpid = fork(); + if (tpid) { + struct timeval tv; + fd_set rfd; + + if (tpid == -1) + return -1; + close(pfd[1]); + while ((rc = waitpid(tpid, &st, 0)) == -1 && errno == EINTR); + if (rc == -1 || !WIFEXITED(st) || WEXITSTATUS(st)) { + fprintf(stderr, "waitpid rc=%d st=%08x\n", rc, st); + return -ENOMEM; + } + 
FD_ZERO(&rfd); + FD_SET(pfd[0], &rfd); + tv.tv_sec = 1; + tv.tv_usec = 0; + while ((rc = select(pfd[0] + 1, &rfd, NULL, NULL, &tv)) == -1 && + errno == EINTR); + if (rc == 0) { + close(pfd[0]); + return -ETIMEDOUT; + } + if (rc == -1) { + close(pfd[0]); + return -errno; + } + rc = read(pfd[0], &st, 1); + close(pfd[0]); + if (rc != 1) { + return -EAGAIN; + } + return 0; + } + close(pfd[0]); + tpid = fork(); + if (tpid) { + if (tpid == -1) { + fprintf(stderr, "fork errno=%d\n", errno); + exit(1); + } + exit(0); + } + if (ptrace(PTRACE_ATTACH, tid, 0, 0) == -1) { + fprintf(stderr, "PTRACE_ATTACH errno=%d\n", errno); + exit(1); + } + waitpid(-1, &st, __WALL); + if (ptrace(PTRACE_SETOPTIONS, tid, 0, PTRACE_O_TRACESYSGOOD) == -1) { + fprintf(stderr, "PTRACE_SETOPTIONS errno=%d\n", errno); + exit(1); + } + write(pfd[1], " ", 1); + close(pfd[1]); + + for (i = 0; i < 4096; i++) + if (i != fd && i != 2) + close(i); + open("/dev/null", O_RDONLY); + open("/dev/null", O_WRONLY); +// open("/dev/null", O_WRONLY); + + for (i = 1; i <= 10; i++) { + param = (struct syscall_struct *)wp + i; + *(void **)param = param_top; + param_top = param; + } + memset(wp, '\0', sizeof(long)); + +fprintf(stderr, "tracer PID=%d\n", getpid()); +#ifdef DEBUG_UTI + fprintf(stderr, "tracer PID=%d\n", getpid()); + signal(SIGINT, debug_sig); +#endif + for (;;) { + ptrace(PTRACE_SYSCALL, tid, 0, sig); + sig = 0; + waitpid(-1, &st, __WALL); + if (WIFEXITED(st) || WIFSIGNALED(st)) { + unsigned long term_param[4]; + + term_param[0] = pid; + term_param[1] = tid; + term_param[3] = key; + code = st; + if (exited == 2 || // exit_group + WIFSIGNALED(st)) { + code |= 0x0000000100000000; + } + term_param[2] = code; + ioctl(fd, MCEXEC_UP_TERMINATE_THREAD, term_param); + break; + } + if (!WIFSTOPPED(st)) { + continue; + } + if (WSTOPSIG(st) & 0x80) { // syscall + syscall_args args; + + get_syscall_args(tid, &args); + +#ifdef DEBUG_UTI + if (get_syscall_return(&args) == -ENOSYS) { + if (get_syscall_number(&args) >= 0 
&& + get_syscall_number(&args) < 512) { + syscalls[get_syscall_number(&args)]++; + } + } +#endif + + if (get_syscall_number(&args) == __NR_ioctl && + get_syscall_return(&args) == -ENOSYS && + get_syscall_arg1(&args) == fd && + get_syscall_arg2(&args) == MCEXEC_UP_SIG_THREAD) { + mode = get_syscall_arg3(&args); + } + + if (mode) { + continue; + } + + switch (get_syscall_number(&args)) { + case __NR_gettid: + set_syscall_number(&args, -1); + set_syscall_return(&args, mck_tid); + set_syscall_args(tid, &args); + continue; + case __NR_futex: + case __NR_brk: + case __NR_mmap: + case __NR_munmap: + case __NR_mprotect: + case __NR_mremap: + break; + case __NR_exit_group: + exited++; + case __NR_exit: + exited++; + continue; + case __NR_clone: + case __NR_fork: + case __NR_vfork: + case __NR_execve: + set_syscall_number(&args, -1); + set_syscall_args(tid, &args); + continue; + case __NR_ioctl: + param = (struct syscall_struct *) + get_syscall_arg3(&args); + if (get_syscall_return(&args) != -ENOSYS && + get_syscall_arg1(&args) == fd && + get_syscall_arg2(&args) == + MCEXEC_UP_SYSCALL_THREAD && + samepage(wp, param)) { + set_syscall_arg1(&args, param->args[0]); + set_syscall_arg2(&args, param->args[1]); + set_syscall_arg3(&args, param->args[2]); + set_syscall_arg4(&args, param->args[3]); + set_syscall_arg5(&args, param->args[4]); + set_syscall_arg6(&args, param->args[5]); + set_syscall_return(&args, param->ret); + *(void **)param = param_top; + param_top = param; + set_syscall_args(tid, &args); + } + continue; + default: + continue; + } + param = param_top; + if (!param) { + set_syscall_number(&args, -1); + set_syscall_return(&args, -ENOMEM); + } + else { + param_top = *(void **)param; + param->number = get_syscall_number(&args); + param->args[0] = get_syscall_arg1(&args); + param->args[1] = get_syscall_arg2(&args); + param->args[2] = get_syscall_arg3(&args); + param->args[3] = get_syscall_arg4(&args); + param->args[4] = get_syscall_arg5(&args); + param->args[5] = 
get_syscall_arg6(&args); + param->ret = -EINVAL; + set_syscall_number(&args, __NR_ioctl); + set_syscall_arg1(&args, fd); + set_syscall_arg2(&args, + MCEXEC_UP_SYSCALL_THREAD); + set_syscall_arg3(&args, (unsigned long)param); + } + set_syscall_args(tid, &args); + } + else { // signal + sig = WSTOPSIG(st) & 0x7f; + } + } + +#ifdef DEBUG_UTI + fprintf(stderr, "offloaded thread called these syscalls\n"); + debug_sig(0); +#endif + + exit(0); +} + +static void +util_thread_setaffinity(unsigned long pattr) +{ + struct kuti_attr kattr; + unsigned long args[3]; + + args[0] = (unsigned long)&kattr; + args[1] = pattr; + args[2] = sizeof kattr; + if (ioctl(fd, MCEXEC_UP_COPY_FROM_MCK, args) == -1) { + return; + } + + + + +} +/* Enter an offloaded utility thread: map a 3-page area (wp, lctx, rctx), pass it to the MCEXEC_UP_UTIL_THREAD1 ioctl, start a worker thread and a tracer, then switch_ctx() with MCEXEC_UP_UTIL_THREAD2. Returns a negative error code on failure; the non-failing path ends in pthread_exit(). */ +static long +util_thread(unsigned long uctx_pa, int remote_tid, unsigned long pattr) +{ + void *lctx; + void *rctx; + void *wp; + void *param[6]; + int rc = 0; + + wp = mmap(NULL, PAGE_SIZE * 3, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (wp == (void *)-1) { + rc = -errno; + goto out; + } + lctx = (char *)wp + PAGE_SIZE; + rctx = (char *)lctx + PAGE_SIZE; + + param[0] = (void *)uctx_pa; + param[1] = rctx; + param[2] = lctx; + param[4] = wp; + param[5] = (void *)(PAGE_SIZE * 3); + if ((rc = ioctl(fd, MCEXEC_UP_UTIL_THREAD1, param)) == -1) { + rc = -errno; /* capture errno before stdio can overwrite it */ + fprintf(stderr, "util_thread1: %d errno=%d\n", -1, -rc); + goto out; + } + + create_worker_thread(NULL); + if ((rc = create_tracer(wp, remote_tid, (unsigned long)param[3]))) { /* NOTE(review): param[3] is presumably filled in by the UTIL_THREAD1 ioctl - confirm */ + fprintf(stderr, "create tracer %d\n", rc); + /* keep create_tracer()'s own error code; errno is unrelated (possibly 0) here */ + goto out; + } + + if (pattr) { + util_thread_setaffinity(pattr); + } + + if ((rc = switch_ctx(fd, MCEXEC_UP_UTIL_THREAD2, param, lctx, rctx)) + < 0) { + fprintf(stderr, "util_thread2: %d\n", rc); + } + fprintf(stderr, "return from util_thread2 rc=%d\n", rc); + pthread_exit(NULL); + +out: + if (wp != (void *)-1) /* mmap() reports failure as (void *)-1, never NULL */ + munmap(wp, PAGE_SIZE * 3); + return rc; +} + static long do_strncpy_from_user(int fd, void *dest, void *src, unsigned long n) { struct
strncpy_from_user_desc desc; @@ -2291,7 +2741,7 @@ chgpath(char *in, char *buf) return fn; } -int main_loop(int fd, int cpu, pthread_mutex_t *lock) +int main_loop(struct thread_data_s *my_thread) { struct syscall_wait_desc w; long ret; @@ -2301,6 +2751,7 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock) struct timespec tv; char pathbuf[PATH_MAX]; char tmpbuf[PATH_MAX]; + int cpu = my_thread->cpu; memset(&w, '\0', sizeof w); w.cpu = cpu; @@ -2318,7 +2769,8 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock) //pthread_mutex_lock(lock); - thread_data[cpu].remote_tid = w.sr.rtid; + my_thread->remote_tid = w.sr.rtid; + my_thread->remote_cpu = w.cpu; switch (w.sr.number) { case __NR_open: @@ -2350,7 +2802,7 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock) break; case __NR_kill: // interrupt syscall - kill_thread(w.sr.args[1]); + kill_thread(w.sr.args[1], w.sr.args[2]); do_syscall_return(fd, cpu, 0, 0, 0, 0, 0); break; case __NR_exit: @@ -2423,6 +2875,7 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock) */ if (w.sr.args[4] > 0) { struct remote_transfer trans; + struct thread_data_s *tp; int i = 0; int *tids = malloc(sizeof(int) * w.sr.args[4]); if (!tids) { @@ -2430,8 +2883,11 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock) goto gettid_out; } - for (i = 0; i < ncpu && i < w.sr.args[4]; ++i) { - tids[i] = thread_data[i].tid; + for (tp = thread_data; tp && i < w.sr.args[4]; + tp = tp->next) { + if (tp->joined || tp->terminate) + continue; + tids[i++] = tp->tid; } for (; i < ncpu; ++i) { @@ -2528,14 +2984,13 @@ gettid_out: /* Child process */ case 0: { - int i; int ret = 1; struct newprocess_desc npdesc; ischild = 1; /* Reopen device fd */ close(fd); - fd = open(dev, O_RDWR); + fd = opendev(); if (fd < 0) { fs->status = -errno; fprintf(stderr, "ERROR: opening %s\n", dev); @@ -2586,9 +3041,7 @@ fork_child_sync_pipe: ioctl(fd, MCEXEC_UP_NEW_PROCESS, &npdesc); /* TODO: does the forked thread run in a pthread context? 
*/ - for (i = 0; i <= ncpu; ++i) { - pthread_join(thread_data[i].thread_id, NULL); - } + join_all_threads(); return ret; } @@ -2622,11 +3075,11 @@ fork_child_sync_pipe: munmap(fs, sizeof(struct fork_sync)); fork_err: pthread_mutex_lock(&fork_sync_mutex); - for(fp = fork_sync_top, fb = NULL; fp; fb = fp, fp = fp->next) - if(fp == fsc) + for (fp = fork_sync_top, fb = NULL; fp; fb = fp, fp = fp->next) + if (fp == fsc) break; - if(fp){ - if(fb) + if (fp) { + if (fb) fb->next = fsc->next; else fork_sync_top = fsc->next; @@ -2645,13 +3098,13 @@ fork_err: opt = WEXITED | (options & WNOWAIT); memset(&info, '\0', sizeof info); - while((ret = waitid(P_PID, pid, &info, opt)) == -1 && - errno == EINTR); - if(ret == 0){ + while ((ret = waitid(P_PID, pid, &info, opt)) == -1 && + errno == EINTR); + if (ret == 0) { ret = info.si_pid; } - if(ret != pid) { + if (ret != pid) { fprintf(stderr, "ERROR: waiting for %lu rc=%d errno=%d\n", w.sr.args[0], ret, errno); } @@ -2934,6 +3387,21 @@ return_execve2: do_syscall_return(fd, cpu, ret, 0, 0, 0, 0); break; + case __NR_sched_setaffinity: + if (w.sr.args[0] == 0) { + ret = util_thread(w.sr.args[1], w.sr.rtid, + w.sr.args[2]); + } + else { + ret = munmap((void *)w.sr.args[1], + w.sr.args[2]); +if(ret == -1)fprintf(stderr, "munmap rc=%ld errno=%d addr=%p size=%d\n", ret, errno, (void *)w.sr.args[1], (int)w.sr.args[2]); + if (ret == -1) + ret = -errno; + } + do_syscall_return(fd, cpu, ret, 0, 0, 0, 0); + break; + default: ret = do_generic_syscall(&w); do_syscall_return(fd, cpu, ret, 0, 0, 0, 0); @@ -2941,7 +3409,7 @@ return_execve2: } - thread_data[cpu].remote_tid = -1; + my_thread->remote_tid = -1; //pthread_mutex_unlock(lock); } diff --git a/kernel/include/process.h b/kernel/include/process.h index 2642ae9e..67773fc4 100644 --- a/kernel/include/process.h +++ b/kernel/include/process.h @@ -231,6 +231,10 @@ enum mpol_rebind_step { #define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */ #define MPOL_F_MORON (1 << 4) /* Migrate On 
pte_numa Reference On Node */ +#define SPAWN_TO_LOCAL 0 +#define SPAWN_TO_REMOTE 1 +#define SPAWNING_TO_REMOTE 1001 + #include #include @@ -667,6 +671,11 @@ struct thread { /* Syscall offload wait queue head */ struct waitq scd_wq; + + int thread_offloaded; + int mod_clone; + struct uti_attr *mod_clone_arg; + int parent_cpuid; }; #define VM_RANGE_CACHE_SIZE 4 diff --git a/kernel/include/syscall.h b/kernel/include/syscall.h index 6a8d0190..8e2e8172 100644 --- a/kernel/include/syscall.h +++ b/kernel/include/syscall.h @@ -517,4 +517,34 @@ struct perf_ctrl_desc { }; }; }; + +#define UTI_FLAG_NUMA_SET (1ULL<<1) /* Indicates NUMA_SET is specified */ + +#define UTI_FLAG_SAME_NUMA_DOMAIN (1ULL<<2) +#define UTI_FLAG_DIFFERENT_NUMA_DOMAIN (1ULL<<3) + +#define UTI_FLAG_SAME_L1 (1ULL<<4) +#define UTI_FLAG_SAME_L2 (1ULL<<5) +#define UTI_FLAG_SAME_L3 (1ULL<<6) + +#define UTI_FLAG_DIFFERENT_L1 (1ULL<<7) +#define UTI_FLAG_DIFFERENT_L2 (1ULL<<8) +#define UTI_FLAG_DIFFERENT_L3 (1ULL<<9) + +#define UTI_FLAG_EXCLUSIVE_CPU (1ULL<<10) +#define UTI_FLAG_CPU_INTENSIVE (1ULL<<11) +#define UTI_FLAG_HIGH_PRIORITY (1ULL<<12) +#define UTI_FLAG_NON_COOPERATIVE (1ULL<<13) + +/* Linux default value is used */ +#define UTI_MAX_NUMA_DOMAINS (1024) + +typedef struct uti_attr { + /* UTI_CPU_SET environmental variable is used to denote the preferred + location of utility thread */ + uint64_t numa_set[(UTI_MAX_NUMA_DOMAINS + sizeof(uint64_t) * 8 - 1) / + (sizeof(uint64_t) * 8)]; + uint64_t flags; /* Representing location and behavior hints by bitmap */ +} uti_attr_t; + #endif diff --git a/kernel/process.c b/kernel/process.c index baa5bce1..cdf83bd9 100644 --- a/kernel/process.c +++ b/kernel/process.c @@ -2864,11 +2864,16 @@ redo: } else { /* Pick a new running process or one that has a pending signal */ list_for_each_entry_safe(thread, tmp, &(v->runq), sched_list) { - if (thread->status == PS_RUNNING || - (thread->status == PS_INTERRUPTIBLE && hassigpending(thread))) { + if (thread->status == 
PS_RUNNING && + thread->mod_clone == SPAWNING_TO_REMOTE){ next = thread; break; } + if (thread->status == PS_RUNNING || + (thread->status == PS_INTERRUPTIBLE && hassigpending(thread))) { + if(!next) + next = thread; + } } /* No process? Run idle.. */ diff --git a/kernel/syscall.c b/kernel/syscall.c index 2a0e0d8e..b6b4ddd8 100644 --- a/kernel/syscall.c +++ b/kernel/syscall.c @@ -215,10 +215,11 @@ long do_syscall(struct syscall_request *req, int cpu, int pid) dkprintf("%s: syscall num: %d waiting for Linux.. \n", __FUNCTION__, req->number); - + #define STATUS_IN_PROGRESS 0 #define STATUS_COMPLETED 1 #define STATUS_PAGE_FAULT 3 +#define STATUS_SYACALL 4 while (res.status != STATUS_COMPLETED) { while (res.status == STATUS_IN_PROGRESS) { struct cpu_local_var *v; @@ -290,6 +291,75 @@ long do_syscall(struct syscall_request *req, int cpu, int pid) res.req_thread_status = IHK_SCD_REQ_THREAD_SPINNING; send_syscall(&req2, cpu, pid, &res); } + + if (res.status == STATUS_SYACALL) { + struct syscall_request *requestp; + struct syscall_request request; + int num; + ihk_mc_user_context_t ctx; + int ns; + unsigned long syscall_ret; + unsigned long phys; + + phys = ihk_mc_map_memory(NULL, res.fault_address, + sizeof(struct syscall_request)); + requestp = ihk_mc_map_virtual(phys, 1, + PTATTR_WRITABLE | PTATTR_ACTIVE); + memcpy(&request, requestp, sizeof request); + ihk_mc_unmap_virtual(requestp, 1, 1); + ihk_mc_unmap_memory(NULL, phys, + sizeof(struct syscall_request)); + num = request.number; + + if (num == __NR_rt_sigaction) { + int sig = request.args[0]; + struct thread *thread = cpu_local_var(current); + + sig--; + if (sig < 0 || sig >= _NSIG) + syscall_ret = -EINVAL; + else + syscall_ret = (unsigned long)thread-> + sigcommon->action[sig]. 
+ sa.sa_handler; + } + else { + ns = (sizeof syscall_table / + sizeof syscall_table[0]); + if (num >= 0 && num < ns && + syscall_table[num]) { + ihk_mc_syscall_arg0(&ctx) = + request.args[0]; + ihk_mc_syscall_arg1(&ctx) = + request.args[1]; + ihk_mc_syscall_arg2(&ctx) = + request.args[2]; + ihk_mc_syscall_arg3(&ctx) = + request.args[3]; + ihk_mc_syscall_arg4(&ctx) = + request.args[4]; + ihk_mc_syscall_arg5(&ctx) = + request.args[5]; + syscall_ret = syscall_table[num](num, + &ctx); + } + else + syscall_ret = -ENOSYS; + } + + /* send result */ + req2.number = __NR_mmap; +#define PAGER_RESUME_PAGE_FAULT 0x0101 + req2.args[0] = PAGER_RESUME_PAGE_FAULT; + req2.args[1] = syscall_ret; + /* The current thread is the requester and only the waiting thread + * may serve the request */ + req2.rtid = cpu_local_var(current)->tid; + req2.ttid = res.stid; + + res.req_thread_status = IHK_SCD_REQ_THREAD_SPINNING; + send_syscall(&req2, cpu, pid, &res); + } } dkprintf("%s: syscall num: %d got host reply: %d \n", @@ -299,6 +369,7 @@ long do_syscall(struct syscall_request *req, int cpu, int pid) if(req->number != __NR_exit_group){ --thread->in_syscall_offload; +if(req->number == __NR_sched_setaffinity)kprintf("do_syscall 2 offload=%d\n", thread->in_syscall_offload); } /* -ERESTARTSYS indicates that the proxy process is gone @@ -941,15 +1012,16 @@ event_signal() } void -interrupt_syscall(int pid, int tid) +interrupt_syscall(struct thread *thread, int sig) { - dkprintf("interrupt_syscall,target pid=%d,target tid=%d\n", pid, tid); ihk_mc_user_context_t ctx; long lerror; - dkprintf("interrupt_syscall pid=%d tid=%d\n", pid, tid); - ihk_mc_syscall_arg0(&ctx) = pid; - ihk_mc_syscall_arg1(&ctx) = tid; + dkprintf("interrupt_syscall pid=%d tid=%d sig=%d\n", thread->proc->pid, + thread->tid, sig); + ihk_mc_syscall_arg0(&ctx) = thread->proc->pid; + ihk_mc_syscall_arg1(&ctx) = thread->tid; + ihk_mc_syscall_arg2(&ctx) = sig; lerror = syscall_generic_forwarding(__NR_kill, &ctx); if (lerror) { @@ 
-2044,6 +2116,7 @@ unsigned long do_fork(int clone_flags, unsigned long newsp, unsigned long cursp) { int cpuid; + int parent_cpuid; struct thread *old = cpu_local_var(current); struct process *oldproc = old->proc; struct process *newproc; @@ -2057,7 +2130,8 @@ unsigned long do_fork(int clone_flags, unsigned long newsp, dkprintf("do_fork(): stack_pointr passed in: 0x%lX, stack pointer of caller: 0x%lx\n", newsp, cursp); - + + parent_cpuid = old->cpu_id; if (((clone_flags & CLONE_VM) && !(clone_flags & CLONE_THREAD)) || (!(clone_flags & CLONE_VM) && (clone_flags & CLONE_THREAD))) { kprintf("clone(): ERROR: CLONE_VM and CLONE_THREAD should be set together\n"); @@ -2249,9 +2323,14 @@ retry_tid: new->tlsblock_base = old->tlsblock_base; } + new->parent_cpuid = parent_cpuid; + ihk_mc_syscall_ret(new->uctx) = 0; new->status = PS_RUNNING; + if (old->mod_clone == SPAWN_TO_REMOTE) { + new->mod_clone = SPAWNING_TO_REMOTE; + } chain_thread(new); if (!(clone_flags & CLONE_VM)) { newproc->status = PS_RUNNING; @@ -4800,14 +4879,16 @@ SYSCALL_DECLARE(futex) return ret; } -SYSCALL_DECLARE(exit) +static void +do_exit(int code) { struct thread *thread = cpu_local_var(current); struct thread *child; struct process *proc = thread->proc; struct mcs_rwlock_node_irqsave lock; int nproc; - int exit_status = (int)ihk_mc_syscall_arg0(ctx); + int exit_status = (code >> 8) & 255; + int sig = code & 255; dkprintf("sys_exit,pid=%d\n", proc->pid); @@ -4819,11 +4900,11 @@ SYSCALL_DECLARE(exit) mcs_rwlock_reader_unlock(&proc->threads_lock, &lock); if(nproc == 1){ // process has only one thread - terminate(exit_status, 0); + terminate(exit_status, sig); #ifdef ENABLE_RUSAGE rusage_num_threads--; #endif - return 0; + return; } #ifdef DCFA_KMOD @@ -4852,7 +4933,7 @@ SYSCALL_DECLARE(exit) #ifdef ENABLE_RUSAGE rusage_num_threads--; #endif - return 0; + return; } thread->status = PS_EXITED; sync_child_event(thread->proc->monitoring_event); @@ -4864,6 +4945,14 @@ SYSCALL_DECLARE(exit) 
rusage_num_threads--; #endif + return; +} + +SYSCALL_DECLARE(exit) +{ + int exit_status = (int)ihk_mc_syscall_arg0(ctx); + + do_exit(exit_status << 8); return 0; } @@ -6053,7 +6142,6 @@ SYSCALL_DECLARE(sched_setaffinity) struct thread *thread; int cpu_id; int empty_set = 1; - extern int num_processors; if (!u_cpu_set) { return -EFAULT; @@ -8412,6 +8500,155 @@ SYSCALL_DECLARE(pmc_reset) return ihk_mc_perfctr_reset(counter); } +extern void save_uctx(void *, void *); + +/* + * Offload the calling thread to the host as a utility thread. + * + * The user context is saved into a newly allocated page whose physical + * address is sent in a __NR_sched_setaffinity request with args[0] == 0. + * If arg is non-NULL it is copied, together with the thread's + * parent_cpuid, into a stack buffer whose physical address goes in args[2]. + * + * Returns -ENOMEM if the context page cannot be allocated, otherwise the + * result of the offload request; for a non-negative result terminate() or + * do_exit() is invoked first. + */ +int +util_thread(struct uti_attr *arg) +{ + volatile unsigned long *context; + unsigned long pcontext; + struct syscall_request request IHK_DMA_ALIGN; + long rc; + struct thread *thread = cpu_local_var(current); + unsigned long free_address; + unsigned long free_size; + struct kuti_attr { + long parent_cpuid; + struct uti_attr attr; + } kattr; + + dkprintf("util_thread called\n"); + context = (volatile unsigned long *)ihk_mc_alloc_pages(1, + IHK_MC_AP_NOWAIT); + if (!context) { + return -ENOMEM; + } + pcontext = virt_to_phys((void *)context); + save_uctx((void *)context, NULL); + + request.number = __NR_sched_setaffinity; + request.args[0] = 0; + request.args[1] = pcontext; + request.args[2] = 0; + if (arg) { + memcpy(&kattr.attr, arg, sizeof(struct uti_attr)); + kattr.parent_cpuid = thread->parent_cpuid; + request.args[2] = virt_to_phys(&kattr); + } + thread->thread_offloaded = 1; + rc = do_syscall(&request, ihk_mc_get_processor_id(), 0); + thread->thread_offloaded = 0; + /* Read back the range to release before the page is freed. */ + free_address = context[0]; + free_size = context[1]; + ihk_mc_free_pages((void *)context, 1); + + if (rc >= 0) { + if (rc & 0x10000007f) { /* exit_group || signal */ + thread->proc->nohost = 1; + terminate((rc >> 8) & 255, rc & 255); + } + else { + request.number = __NR_sched_setaffinity; + request.args[0] = 1; + request.args[1] = free_address; + request.args[2] = free_size; + do_syscall(&request, ihk_mc_get_processor_id(), 0); + do_exit(rc); + } + } + return rc; +} + +/* + * Called from enter_user_mode: a thread marked SPAWNING_TO_REMOTE is reset + * to SPAWN_TO_LOCAL and handed to util_thread(). + */ +void +utilthr_migrate() +{ + struct thread *thread = 
cpu_local_var(current); + + if (thread->mod_clone == SPAWNING_TO_REMOTE) { + thread->mod_clone = SPAWN_TO_LOCAL; + util_thread(thread->mod_clone_arg); + } +} + +/* + * Migrate the calling thread, optionally with a uti_attr read from user + * space (arg0). Returns -EFAULT if that copy fails. + */ +SYSCALL_DECLARE(util_migrate_inter_kernel) +{ + struct uti_attr *arg = (void *)ihk_mc_syscall_arg0(ctx); + struct uti_attr kattr; + + if (arg) { + if (copy_from_user(&kattr, arg, sizeof(struct uti_attr))) { + return -EFAULT; + } + } + + return util_thread(arg? &kattr: NULL); +} + +/* + * Set the calling thread's clone mode (arg0: SPAWN_TO_LOCAL or + * SPAWN_TO_REMOTE) and replace its stored uti_attr with a kernel copy of + * arg1; a NULL arg1 just drops the stored copy. + * + * Returns 0, -EINVAL for any other mode, -ENOMEM if the copy cannot be + * allocated, or -EFAULT if arg1 cannot be read. + */ +SYSCALL_DECLARE(util_indicate_clone) +{ + int mod = (int)ihk_mc_syscall_arg0(ctx); + struct uti_attr *arg = (void *)ihk_mc_syscall_arg1(ctx); + struct thread *thread = cpu_local_var(current); + struct uti_attr *kattr = NULL; + + if (mod != SPAWN_TO_LOCAL && + mod != SPAWN_TO_REMOTE) + return -EINVAL; + if (arg) { + kattr = kmalloc(sizeof(struct uti_attr), IHK_MC_AP_NOWAIT); + if (!kattr) + return -ENOMEM; + if (copy_from_user(kattr, arg, sizeof(struct uti_attr))) { + kfree(kattr); + return -EFAULT; + } + } + thread->mod_clone = mod; + if (thread->mod_clone_arg) { + kfree(thread->mod_clone_arg); + thread->mod_clone_arg = NULL; + } + if (kattr) { + thread->mod_clone_arg = kattr; + } + return 0; +} + +/* Always returns 0. */ +SYSCALL_DECLARE(get_system) +{ + return 0; +} + void reset_cputime() {