/* syscall.c COPYRIGHT FUJITSU LIMITED 2015-2019 */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

void terminate_mcexec(int, int);
extern void ptrace_report_signal(struct thread *thread, int sig);
extern void clear_single_step(struct thread *thread);
void terminate(int, int);
extern long do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact);
long syscall(int num, ihk_mc_user_context_t *ctx);
extern unsigned long do_fork(int, unsigned long, unsigned long, unsigned long,
			     unsigned long, unsigned long, unsigned long);
static void __check_signal(unsigned long rc, void *regs, int num,
			   int irq_disabled);

//#define DEBUG_PRINT_SC

#ifdef DEBUG_PRINT_SC
#undef DDEBUG_DEFAULT
#define DDEBUG_DEFAULT DDEBUG_PRINT
#endif

#define NOT_IMPLEMENTED() do { kprintf("%s is not implemented\n", __func__); while (1); } while (0)

uintptr_t debug_constants[] = {
	sizeof(struct cpu_local_var),
	offsetof(struct cpu_local_var, current),
	offsetof(struct cpu_local_var, runq),
	offsetof(struct cpu_local_var, status),
	offsetof(struct cpu_local_var, idle),
	offsetof(struct thread, ctx),
	offsetof(struct thread, sched_list),
	offsetof(struct thread, proc),
	offsetof(struct thread, status),
	offsetof(struct process, pid),
	offsetof(struct thread, tid),
	-1,
};

extern int num_processors;

int obtain_clone_cpuid(cpu_set_t *cpu_set, int use_last)
{
	int min_queue_len = -1;
	int cpu, min_cpu = -1;
#if 0
	int uti_cpu = -1;
#endif
	unsigned long irqstate = 0;
	int start, end, step;

	if (use_last) {
		start = num_processors - 1;
		end = -1;
		step = -1;
	}
	else {
		start = 0;
		end = num_processors;
		step = 1;
	}

	if (!cpu_local_var(current)->proc->nr_processes) {
		irqstate = ihk_mc_spinlock_lock(&runq_reservation_lock);
	}
	else {
		irqstate = cpu_disable_interrupt_save();
	}

	/* Find the first allowed core with the shortest run queue */
	for (cpu = start; cpu != end; cpu += step) {
		struct cpu_local_var *v;

		if (!CPU_ISSET(cpu, cpu_set))
			continue;

		v = get_cpu_local_var(cpu);
		ihk_mc_spinlock_lock_noirq(&v->runq_lock);
		dkprintf("%s: cpu=%d,runq_len=%d,runq_reserved=%d\n",
			 __func__, cpu, v->runq_len, v->runq_reserved);
		if (min_queue_len == -1 ||
		    //v->runq_len + v->runq_reserved < min_queue_len) {
		    v->runq_len < min_queue_len) {
			//min_queue_len = v->runq_len + v->runq_reserved;
			min_queue_len = v->runq_len;
			min_cpu = cpu;
		}

#if 0
		/* Record the last tie CPU */
		if (min_cpu != cpu &&
		    v->runq_len + v->runq_reserved == min_queue_len) {
			uti_cpu = cpu;
		}

		dkprintf("%s: cpu=%d,runq_len=%d,runq_reserved=%d,min_cpu=%d,uti_cpu=%d\n",
			 __func__, cpu, v->runq_len, v->runq_reserved,
			 min_cpu, uti_cpu);
#else
		ihk_mc_spinlock_unlock_noirq(&v->runq_lock);

		if (min_queue_len == 0)
			break;
#endif
	}

#if 0
	min_cpu = use_last ? uti_cpu : min_cpu;
	if (min_cpu != -1) {
		if (get_cpu_local_var(min_cpu)->status != CPU_STATUS_RESERVED)
			get_cpu_local_var(min_cpu)->status = CPU_STATUS_RESERVED;
		__sync_fetch_and_add(&get_cpu_local_var(min_cpu)->runq_reserved, 1);
	}
#else
	__sync_fetch_and_add(&get_cpu_local_var(min_cpu)->runq_reserved, 1);
#endif

	if (!cpu_local_var(current)->proc->nr_processes) {
		ihk_mc_spinlock_unlock(&runq_reservation_lock, irqstate);
	}
	else {
		cpu_restore_interrupt(irqstate);
	}

	return min_cpu;
}
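/*
 * Illustrative sketch, not part of the original source: how a clone
 * path might use obtain_clone_cpuid() to reserve a CPU for a new
 * thread.  The cpu_set below is hypothetical; the real mask normally
 * comes from the parent thread's allowed-CPU set.
 */
#if 0
static int example_pick_cpu(struct thread *parent)
{
	/* candidate CPUs: whatever the parent is allowed to run on */
	cpu_set_t set = parent->cpu_set;

	/*
	 * Scan from CPU 0 upwards (use_last = 0) and reserve the
	 * allowed core with the shortest run queue; the runq_reserved
	 * counter keeps concurrent clones from piling onto the same
	 * core before the new thread is actually enqueued.
	 */
	return obtain_clone_cpuid(&set, 0);
}
#endif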
/* architecture-dependent syscall handlers */
extern unsigned long do_fork(int clone_flags, unsigned long newsp,
			     unsigned long parent_tidptr, unsigned long child_tidptr,
			     unsigned long tlsblock_base, unsigned long curpc,
			     unsigned long cursp);

SYSCALL_DECLARE(clone)
{
	struct process *proc = cpu_local_var(current)->proc;
	struct mcs_rwlock_node_irqsave lock_dump;
	unsigned long ret;

	/* mutex coredump */
	mcs_rwlock_reader_lock(&proc->coredump_lock, &lock_dump);
	if ((int)ihk_mc_syscall_arg0(ctx) & CLONE_VFORK) {
		ret = do_fork(CLONE_VFORK|SIGCHLD, 0, 0, 0, 0,
			      ihk_mc_syscall_pc(ctx), ihk_mc_syscall_sp(ctx));
	}
	else {
		ret = do_fork((int)ihk_mc_syscall_arg0(ctx),	/* clone_flags */
			      ihk_mc_syscall_arg1(ctx),		/* newsp */
			      ihk_mc_syscall_arg2(ctx),		/* parent_tidptr */
			      ihk_mc_syscall_arg4(ctx),		/* child_tidptr (swap arg3) */
			      ihk_mc_syscall_arg3(ctx),		/* tlsblock_base (swap arg4) */
			      ihk_mc_syscall_pc(ctx),		/* curpc */
			      ihk_mc_syscall_sp(ctx));		/* cursp */
	}
	mcs_rwlock_reader_unlock(&proc->coredump_lock, &lock_dump);
	return ret;
}

SYSCALL_DECLARE(prctl)
{
	struct process *proc = cpu_local_var(current)->proc;
	int option = (int)ihk_mc_syscall_arg0(ctx);
	unsigned long arg2 = (unsigned long)ihk_mc_syscall_arg1(ctx);
	unsigned long arg3 = (unsigned long)ihk_mc_syscall_arg2(ctx);
	unsigned long arg4 = (unsigned long)ihk_mc_syscall_arg3(ctx);
	unsigned long arg5 = (unsigned long)ihk_mc_syscall_arg4(ctx);
	long error;

	switch (option) {
	case PR_SVE_SET_VL:
		error = SVE_SET_VL(ihk_mc_syscall_arg1(ctx));
		break;
	case PR_SVE_GET_VL:
		error = SVE_GET_VL();
		break;
	case PR_SET_THP_DISABLE:
		if (arg3 || arg4 || arg5) {
			return -EINVAL;
		}
		proc->thp_disable = arg2;
		error = 0;
		break;
	case PR_GET_THP_DISABLE:
		if (arg2 || arg3 || arg4 || arg5) {
			return -EINVAL;
		}
		error = proc->thp_disable;
		break;
	default:
		error = syscall_generic_forwarding(__NR_prctl, ctx);
		break;
	}
	return error;
}

/*
 * @ref.impl linux-linaro/src/linux-linaro/arch/arm64/kernel/signal.c::struct rt_sigframe
 * @ref.impl mckernel/arch/x86/kernel/syscall.c::struct sigsp
 */
struct sigsp {
	unsigned long sigrc;
	int syscallno;
	int restart;
	siginfo_t info;
	struct ucontext uc;
	uint64_t fp;
	uint64_t lr;
};

struct rt_sigframe_user_layout {
	struct sigsp __user *usigframe;
	struct sigsp *ksigframe;

	unsigned long size;	/* size of allocated sigframe data */
	unsigned long limit;	/* largest allowed size */

	unsigned long fpsimd_offset;
	unsigned long esr_offset;
	unsigned long sve_offset;
	unsigned long extra_offset;
	unsigned long end_offset;
};

static void preserve_fpsimd_context(struct fpsimd_context *ctx)
{
	struct fpsimd_state fpsimd;

	/* dump the hardware registers to the fpsimd_state structure */
	fpsimd_save_state(&fpsimd);

	/* copy the FP and status/control registers */
	memcpy(ctx->vregs, fpsimd.vregs, sizeof(fpsimd.vregs));
	ctx->fpsr = fpsimd.fpsr;
	ctx->fpcr = fpsimd.fpcr;

	/* copy the magic/size information */
	ctx->head.magic = FPSIMD_MAGIC;
	ctx->head.size = sizeof(struct fpsimd_context);
}

/* @ref.impl arch/arm64/kernel/signal.c::preserve_sve_context */
static void preserve_sve_context(void *ctx)
{
	struct sve_context *sve_ctx = ctx;
	unsigned int vl = current_thread_info()->sve_vl;
	unsigned int vq;
	unsigned int fpscr[2] = { 0, 0 };

	BUG_ON(!sve_vl_valid(vl));
	vq = sve_vq_from_vl(vl);

	/* sve_context header set */
	sve_ctx->head.magic = SVE_MAGIC;
	sve_ctx->head.size = ALIGN_UP(SVE_SIG_CONTEXT_SIZE(vq), 16);

	/* sve_context vl set */
	sve_ctx->vl = vl;

	/* sve_context reserved area 0 clear */
	memset(sve_ctx->__reserved, 0, sizeof(sve_ctx->__reserved));

	/*
	 * sve register save;
	 * fpsr & fpcr are discarded here because they were already
	 * saved by preserve_fpsimd_context()
	 */
	sve_save_state(ctx + SVE_SIG_FFR_OFFSET(vq), fpscr);
}
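/*
 * Note added for exposition (hedged): the user-visible part of the
 * frame follows the Linux arm64 layout, where uc.uc_mcontext.__reserved
 * holds a chain of size-prefixed records (struct _aarch64_ctx headers
 * tagged FPSIMD_MAGIC, ESR_MAGIC, SVE_MAGIC, EXTRA_MAGIC, then a zero
 * terminator).  A minimal sketch of walking such a chain:
 */
#if 0
static void example_walk_records(struct sigcontext *sc)
{
	char *base = (char *)&sc->__reserved;
	size_t offset = 0;

	for (;;) {
		struct _aarch64_ctx *head =
			(struct _aarch64_ctx *)(base + offset);

		if (head->magic == 0)	/* terminator record */
			break;
		/* each record is self-describing: magic + total size */
		offset += head->size;
	}
}
#endif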
static int restore_fpsimd_context(struct fpsimd_context *ctx)
{
	struct fpsimd_state fpsimd;
	unsigned int magic, size;

	/* check the magic/size information */
	magic = ctx->head.magic;
	size = ctx->head.size;
	if (magic != FPSIMD_MAGIC || size != sizeof(struct fpsimd_context))
		return -EINVAL;

	/* copy the FP and status/control registers */
	memcpy(fpsimd.vregs, ctx->vregs, sizeof(fpsimd.vregs));
	fpsimd.fpsr = ctx->fpsr;
	fpsimd.fpcr = ctx->fpcr;

	/* load the hardware registers from the fpsimd_state structure */
	fpsimd_load_state(&fpsimd);
	return 0;
}

/* @ref.impl arch/arm64/kernel/signal.c::__restore_sve_fpsimd_context */
static int __restore_sve_fpsimd_context(void *ctx, unsigned int vq,
					struct fpsimd_context *fpsimd)
{
	struct fpsimd_sve_state(vq) *sst = ctx + SVE_SIG_ZREGS_OFFSET;
	int i = 0;

	/* vq check */
	if (vq != sve_vq_from_vl(current_thread_info()->sve_vl)) {
		return -EINVAL;
	}

	/* copy from fpsimd_context vregs */
	for (i = 0; i < 32; i++) {
		sst->zregs[i][0] = fpsimd->vregs[i];
	}

	/* restore sve register */
	sve_load_state(sst->ffr, &fpsimd->fpsr, vq - 1);
	return 0;
}

/* @ref.impl arch/arm64/kernel/signal.c::restore_sve_fpsimd_context */
static int restore_sve_fpsimd_context(void *ctx, struct fpsimd_context *fpsimd)
{
	struct sve_context const *sve_ctx = ctx;
	uint16_t vl = sve_ctx->vl;
	uint16_t vq;

	/* vl check */
	if (!sve_vl_valid(vl)) {
		return -EINVAL;
	}
	vq = sve_vq_from_vl(vl);

	return __restore_sve_fpsimd_context(ctx, vq, fpsimd);
}
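/*
 * Worked example, added for exposition: the SVE vector length (vl) is
 * a byte count and a "vector quadword" (vq) is one 128-bit granule,
 * so sve_vq_from_vl(vl) is vl / 16.  A 512-bit SVE implementation has
 * vl = 64 bytes, hence vq = 4, and SVE_SIG_CONTEXT_SIZE(4) covers the
 * Z/P/FFR payload for that width.  The assert below is only a sketch
 * of this relationship, not original code.
 */
#if 0
STATIC_ASSERT(sve_vq_from_vl(64) == 4);
#endif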
/* @ref.impl arch/arm64/kernel/signal.c::SIGFRAME_MAXSZ */
/* Sanity limit on the maximum size of signal frame we'll try to generate. */
/* This is NOT ABI. */
#define SIGFRAME_MAXSZ _SZ64KB

/* @ref.impl arch/arm64/kernel/signal.c::BUILD_BUG_ON in the __sigframe_alloc */
STATIC_ASSERT(SIGFRAME_MAXSZ == ALIGN_DOWN(SIGFRAME_MAXSZ, 16));
STATIC_ASSERT(SIGFRAME_MAXSZ > ALIGN_UP(sizeof(struct _aarch64_ctx), 16));
STATIC_ASSERT(ALIGN_UP(sizeof(struct sigsp), 16) <
	      SIGFRAME_MAXSZ - ALIGN_UP(sizeof(struct _aarch64_ctx), 16));

/* @ref.impl arch/arm64/kernel/signal.c::parse_user_sigframe */
static int parse_user_sigframe(struct sigsp *sf)
{
	struct sigcontext *sc = &sf->uc.uc_mcontext;
	struct _aarch64_ctx *head;
	char *base = (char *)&sc->__reserved;
	size_t offset = 0;
	size_t limit = sizeof(sc->__reserved);
	int have_extra_context = 0, err = -EINVAL;
	void *kextra_data = NULL;
	struct fpsimd_context *fpsimd_ctx = NULL;
	struct sve_context *sve_ctx = NULL;

	if (ALIGN_UP((unsigned long)base, 16) != (unsigned long)base)
		goto invalid;

	while (1) {
		unsigned int magic, size;

		BUG_ON(limit < offset);
		if (limit - offset < sizeof(*head))
			goto invalid;

		if (ALIGN_DOWN(offset, 16) != offset)
			goto invalid;

		BUG_ON(ALIGN_UP((unsigned long)base + offset, 16) !=
		       (unsigned long)base + offset);

		head = (struct _aarch64_ctx *)(base + offset);
		magic = head->magic;
		size = head->size;

		if (limit - offset < size)
			goto invalid;

		switch (magic) {
		case 0:
			if (size)
				goto invalid;
			goto done;

		case FPSIMD_MAGIC:
			if (fpsimd_ctx)
				goto invalid;

			if (size < sizeof(struct fpsimd_context))
				goto invalid;

			fpsimd_ctx = container_of(head, struct fpsimd_context, head);
			break;

		case ESR_MAGIC:
			/* ignore */
			break;

		case SVE_MAGIC: {
			struct sve_context *sve_head =
				container_of(head, struct sve_context, head);

			if (!(elf_hwcap & HWCAP_SVE))
				goto invalid;

			if (sve_ctx)
				goto invalid;

			if (size < sizeof(*sve_ctx))
				goto invalid;

			sve_ctx = sve_head;
			break;
		} /* SVE_MAGIC */

		case EXTRA_MAGIC: {
			struct extra_context const *extra;
			void __user *extra_data;
			unsigned int extra_size;

			if (have_extra_context)
				goto invalid;

			if (size < sizeof(*extra))
				goto invalid;

			extra = (struct extra_context const *)head;
			extra_data = extra->data;
			extra_size = extra->size;

			/* Prevent looping/repeated parsing of extra_context */
			have_extra_context = 1;

			kextra_data = kmalloc(extra_size + 15, IHK_MC_AP_NOWAIT);
			if (!kextra_data) {
				goto invalid;
			}
			if (copy_from_user((char *)ALIGN_UP((unsigned long)kextra_data, 16),
					   extra_data, extra_size)) {
				goto invalid;
			}

			/*
			 * Rely on the __user accessors to reject bogus
			 * pointers.
			 */
			base = (char *)ALIGN_UP((unsigned long)kextra_data, 16);
			if (ALIGN_UP((unsigned long)base, 16) !=
			    (unsigned long)base)
				goto invalid;

			/* Reject "unreasonably large" frames: */
			limit = extra_size;
			if (limit > SIGFRAME_MAXSZ - sizeof(sc->__reserved))
				goto invalid;

			/*
			 * Ignore trailing terminator in __reserved[]
			 * and start parsing extra_data:
			 */
			offset = 0;
			continue;
		} /* EXTRA_MAGIC */

		default:
			goto invalid;
		}

		if (size < sizeof(*head))
			goto invalid;

		if (limit - offset < size)
			goto invalid;

		offset += size;
	}

done:
	if (!fpsimd_ctx)
		goto invalid;

	if (sve_ctx) {
		err = restore_sve_fpsimd_context(sve_ctx, fpsimd_ctx);
	}
	else {
		err = restore_fpsimd_context(fpsimd_ctx);
	}

invalid:
	if (kextra_data) {
		kfree(kextra_data);
		kextra_data = NULL;
	}
	return err;
}
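/*
 * Worked example, added for exposition: if the fixed __reserved[] area
 * cannot hold every record (e.g. a large SVE payload), the frame writer
 * emits an EXTRA_MAGIC record whose data/size fields point at a
 * continuation buffer above the fixed frame.  parse_user_sigframe()
 * then copies that buffer in with copy_from_user(), rebases 'base' and
 * 'limit' onto the copy, resets 'offset' to 0, and keeps walking, so a
 * large frame effectively parses as:
 *
 *	__reserved[]: FPSIMD, ESR, EXTRA, terminator
 *	extra buffer: SVE, terminator
 */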
SYSCALL_DECLARE(rt_sigreturn)
{
	int i, err = 0;
	struct thread *thread = cpu_local_var(current);
	ihk_mc_user_context_t *regs = ctx;
	struct sigsp ksigsp;
	struct sigsp __user *usigsp;
	siginfo_t info;

	/*
	 * Since we stacked the signal on a 128-bit boundary, then 'sp' should
	 * be word aligned here.
	 */
	if (regs->sp & 15)
		goto bad_frame;

	usigsp = (struct sigsp __user *)regs->sp;
	if (copy_from_user(&ksigsp, usigsp, sizeof(ksigsp))) {
		goto bad_frame;
	}

	for (i = 0; i < 31; i++) {
		regs->regs[i] = ksigsp.uc.uc_mcontext.regs[i];
	}
	regs->sp = ksigsp.uc.uc_mcontext.sp;
	regs->pc = ksigsp.uc.uc_mcontext.pc;
	regs->pstate = ksigsp.uc.uc_mcontext.pstate;

	/* Avoid sys_rt_sigreturn() restarting. */
	regs->syscallno = ~0UL;

	err = parse_user_sigframe(&ksigsp);
	if (err)
		goto bad_frame;

	thread->sigmask.__val[0] = ksigsp.uc.uc_sigmask.__val[0];
	thread->sigstack.ss_flags = ksigsp.uc.uc_stack.ss_flags;

	if (ksigsp.restart) {
		regs->orig_x0 = regs->regs[0];
		regs->orig_pc = regs->pc;
		return syscall(ksigsp.syscallno, regs);
	}

	if (thread->ctx.thread->flags & (1 << TIF_SINGLESTEP)) {
		memset(&info, 0, sizeof(info));
		info.si_code = TRAP_HWBKPT;
		regs->regs[0] = ksigsp.sigrc;
		clear_single_step(thread);
		set_signal(SIGTRAP, regs, &info);
		check_need_resched();
		check_signal(0, regs, -1);
	}

	return ksigsp.sigrc;

bad_frame:
	ekprintf("[pid:%d]: bad frame in %s: pc=%08llx sp=%08llx\n",
		 thread->proc->pid, __FUNCTION__, regs->pc, regs->sp);
	memset(&info, 0, sizeof(info));
	info.si_signo = SIGSEGV;
	info.si_code = SI_KERNEL;
	set_signal(info.si_signo, regs, &info);
	return 0;
}

extern struct cpu_local_var *clv;
extern void interrupt_syscall(struct thread *, int sig);
extern int num_processors;

long alloc_debugreg(struct thread *thread)
{
	struct user_hwdebug_state *hws = NULL;

	/* LOWER: breakpoint register area. */
	/* HIGHER: watchpoint register area. */
	hws = kmalloc(sizeof(struct user_hwdebug_state) * 2, IHK_MC_AP_NOWAIT);
	if (hws == NULL) {
		kprintf("alloc_debugreg: no memory.\n");
		return -ENOMEM;
	}
	memset(hws, 0, sizeof(struct user_hwdebug_state) * 2);

	/* initialize dbg_info */
	hws[HWS_BREAK].dbg_info = ptrace_hbp_get_resource_info(NT_ARM_HW_BREAK);
	hws[HWS_WATCH].dbg_info = ptrace_hbp_get_resource_info(NT_ARM_HW_WATCH);

	thread->ptrace_debugreg = (unsigned long *)hws;
	return 0;
}

void save_debugreg(unsigned long *debugreg)
{
	struct user_hwdebug_state *hws = (struct user_hwdebug_state *)debugreg;
	int i = 0;

	/* save DBGBVR_EL1 and DBGBCR_EL1 (n=0-(core_num_brps-1)) */
	for (i = 0; i < core_num_brps; i++) {
		hws[HWS_BREAK].dbg_regs[i].addr = read_wb_reg(AARCH64_DBG_REG_BVR, i);
		hws[HWS_BREAK].dbg_regs[i].ctrl = read_wb_reg(AARCH64_DBG_REG_BCR, i);
	}

	/* save DBGWVR_EL1 and DBGWCR_EL1 (n=0-(core_num_wrps-1)) */
	for (i = 0; i < core_num_wrps; i++) {
		hws[HWS_WATCH].dbg_regs[i].addr = read_wb_reg(AARCH64_DBG_REG_WVR, i);
		hws[HWS_WATCH].dbg_regs[i].ctrl = read_wb_reg(AARCH64_DBG_REG_WCR, i);
	}
}

void restore_debugreg(unsigned long *debugreg)
{
	struct user_hwdebug_state *hws = (struct user_hwdebug_state *)debugreg;
	unsigned int mdscr;
	int i = 0;

	/* set MDSCR_EL1.MDE */
	mdscr = mdscr_read();
	mdscr |= DBG_MDSCR_MDE;
	mdscr_write(mdscr);

	/* restore DBGBVR_EL1 and DBGBCR_EL1 (n=0-(core_num_brps-1)) */
	for (i = 0; i < core_num_brps; i++) {
		write_wb_reg(AARCH64_DBG_REG_BVR, i, hws[HWS_BREAK].dbg_regs[i].addr);
		write_wb_reg(AARCH64_DBG_REG_BCR, i, hws[HWS_BREAK].dbg_regs[i].ctrl);
	}

	/* restore DBGWVR_EL1 and DBGWCR_EL1 (n=0-(core_num_wrps-1)) */
	for (i = 0; i < core_num_wrps; i++) {
		write_wb_reg(AARCH64_DBG_REG_WVR, i, hws[HWS_WATCH].dbg_regs[i].addr);
		write_wb_reg(AARCH64_DBG_REG_WCR, i, hws[HWS_WATCH].dbg_regs[i].ctrl);
	}
}
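/*
 * Usage sketch, added for exposition (hypothetical caller): the
 * helpers above and clear_debugreg() below form the per-thread
 * hardware-debug life cycle under ptrace.  A context-switch path might
 * use them roughly like this:
 */
#if 0
static void example_debugreg_switch(struct thread *prev, struct thread *next)
{
	/* prev was being debugged: snapshot its break/watchpoints */
	if (prev->ptrace_debugreg)
		save_debugreg(prev->ptrace_debugreg);

	/* next is being debugged: reinstall its registers and MDE bit */
	if (next->ptrace_debugreg)
		restore_debugreg(next->ptrace_debugreg);
	else
		clear_debugreg();
}
#endif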
void clear_debugreg(void)
{
	unsigned int mdscr;

	/* clear DBGBVR_EL1 and DBGBCR_EL1 (n=0-(core_num_brps-1)) */
	/* clear DBGWVR_EL1 and DBGWCR_EL1 (n=0-(core_num_wrps-1)) */
	hw_breakpoint_reset();

	/* clear MDSCR_EL1.MDE */
	mdscr = mdscr_read();
	mdscr &= ~DBG_MDSCR_MDE;
	mdscr_write(mdscr);
}

void clear_single_step(struct thread *thread)
{
	clear_regs_spsr_ss(thread->uctx);
	thread->ctx.thread->flags &= ~(1 << TIF_SINGLESTEP);
}

void set_single_step(struct thread *thread)
{
	thread->ctx.thread->flags |= (1 << TIF_SINGLESTEP);
	set_regs_spsr_ss(thread->uctx);
}

extern int coredump(struct thread *thread, void *regs, int sig);

static int isrestart(int syscallno, unsigned long rc, int sig, int restart)
{
	if (sig == SIGKILL || sig == SIGSTOP)
		return 0;

	if (syscallno < 0 || rc != -EINTR)
		return 0;

	if (sig == SIGCHLD)
		return 1;

	/*
	 * The following interfaces are never restarted after being interrupted
	 * by a signal handler, regardless of the use of SA_RESTART.
	 * Interfaces used to wait for signals:
	 *	pause(2), sigsuspend(2), sigtimedwait(2), and sigwaitinfo(2).
	 * File descriptor multiplexing interfaces:
	 *	epoll_wait(2), epoll_pwait(2), poll(2), ppoll(2), select(2),
	 *	and pselect(2).
	 * System V IPC interfaces:
	 *	msgrcv(2), msgsnd(2), semop(2), and semtimedop(2).
	 * Sleep interfaces:
	 *	clock_nanosleep(2), nanosleep(2), and usleep(3).
	 * io_getevents(2).
	 *
	 * Note: the following functions issue another system call on arm64:
	 *	pause(2) -> rt_sigsuspend
	 *	epoll_wait(2) -> epoll_pwait
	 *	poll(2) -> ppoll
	 *	select(2) -> pselect6
	 */
	switch (syscallno) {
	case __NR_rt_sigsuspend:
	case __NR_rt_sigtimedwait:
	case __NR_epoll_pwait:
	case __NR_ppoll:
	case __NR_pselect6:
	case __NR_msgrcv:
	case __NR_msgsnd:
	case __NR_semop:
	case __NR_semtimedop:
	case __NR_clock_nanosleep:
	case __NR_nanosleep:
	case __NR_io_getevents:
		return 0;
	}

	if (restart)
		return 1;
	return 0;
}

/* @ref.impl arch/arm64/kernel/signal.c::init_user_layout */
static void init_user_layout(struct rt_sigframe_user_layout *user)
{
	const size_t __reserved_size =
		sizeof(user->usigframe->uc.uc_mcontext.__reserved);
	const size_t terminator_size =
		ALIGN_UP(sizeof(struct _aarch64_ctx), 16);

	memset(user, 0, sizeof *user);
	user->size = offsetof(struct sigsp, uc.uc_mcontext.__reserved);
	user->limit = user->size +
		(__reserved_size - terminator_size -
		 sizeof(struct extra_context));
	/* Reserve space for extension and terminator ^ */

	BUG_ON(user->limit <= user->size);
}

/* @ref.impl arch/arm64/kernel/signal.c::sigframe_size */
static size_t sigframe_size(struct rt_sigframe_user_layout const *user)
{
	size_t size;

	/* FIXME: take user->limit into account? */
	if (user->size > sizeof(struct sigsp)) {
		size = user->size;
	}
	else {
		size = sizeof(struct sigsp);
	}
	return ALIGN_UP(size, 16);
}
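/*
 * Example, added for exposition: the effect of isrestart() as seen
 * from user space.  With SA_RESTART set, a read(2) interrupted by a
 * handled signal is transparently re-issued; without it the caller
 * sees -EINTR.  Calls in the table above (e.g. nanosleep(2)) return
 * -EINTR either way.  The registration below is a hedged, generic
 * POSIX sketch, not McKernel code.
 */
#if 0
	struct sigaction sa = { .sa_handler = handler,
				.sa_flags = SA_RESTART };
	sigaction(SIGUSR1, &sa, NULL);
	/* read(2) now restarts after SIGUSR1 instead of failing with
	 * EINTR; nanosleep(2) still returns -EINTR. */
#endif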
/* @ref.impl arch/arm64/kernel/signal.c::__sigframe_alloc */
static int __sigframe_alloc(struct rt_sigframe_user_layout *user,
			    unsigned long *offset, size_t size,
			    unsigned char extend)
{
	unsigned long padded_size = ALIGN_UP(size, 16);

	/* Sanity-check invariants */
	BUG_ON(user->limit < user->size);
	BUG_ON(user->size != ALIGN_DOWN(user->size, 16));
	BUG_ON(size < sizeof(struct _aarch64_ctx));

	if (padded_size > user->limit - user->size &&
	    !user->extra_offset && extend) {
		int ret;

		ret = __sigframe_alloc(user, &user->extra_offset,
				       sizeof(struct extra_context), 0);
		if (ret) {
			return ret;
		}

		/*
		 * Further allocations must go after the fixed-size
		 * part of the signal frame:
		 */
		user->size = ALIGN_UP(sizeof(struct sigsp), 16);

		/*
		 * Allow expansion up to SIGFRAME_MAXSZ, ensuring space for
		 * the terminator:
		 */
		user->limit = SIGFRAME_MAXSZ -
			ALIGN_UP(sizeof(struct _aarch64_ctx), 16);
	}

	/* Still not enough space? Bad luck! */
	if (padded_size > user->limit - user->size) {
		return -ENOMEM;
	}

	/* Anti-leakage check: don't double-allocate the same block: */
	BUG_ON(*offset);

	*offset = user->size;
	user->size += padded_size;

	/* Check invariants again */
	BUG_ON(user->limit < user->size);
	BUG_ON(user->size != ALIGN_DOWN(user->size, 16));

	return 0;
}

/* @ref.impl arch/arm64/kernel/signal.c::sigframe_alloc */
/*
 * Allocate space for an optional record of <size> bytes in the user
 * signal frame.  The offset from the signal frame base address to the
 * allocated block is assigned to *offset.
 */
static int sigframe_alloc(struct rt_sigframe_user_layout *user,
			  unsigned long *offset, size_t size)
{
	return __sigframe_alloc(user, offset, size, 1);
}

/* @ref.impl arch/arm64/kernel/signal.c::sigframe_alloc_end */
/* Allocate the null terminator record and prevent further allocations */
static int sigframe_alloc_end(struct rt_sigframe_user_layout *user)
{
	int ret;
	const size_t __reserved_size =
		sizeof(user->ksigframe->uc.uc_mcontext.__reserved);
	const size_t __reserved_offset =
		offsetof(struct sigsp, uc.uc_mcontext.__reserved);
	const size_t terminator_size =
		ALIGN_UP(sizeof(struct _aarch64_ctx), 16);

	if (user->extra_offset) {
		BUG_ON(user->limit != SIGFRAME_MAXSZ - terminator_size);
	}
	else {
		BUG_ON(user->limit != __reserved_offset +
		       (__reserved_size - terminator_size -
			sizeof(struct extra_context)));
	}

	/* Un-reserve the space reserved for the terminator: */
	user->limit += terminator_size;

	ret = sigframe_alloc(user, &user->end_offset,
			     sizeof(struct _aarch64_ctx));
	if (ret) {
		return ret;
	}

	/* Prevent further allocation: */
	user->limit = user->size;
	return 0;
}

/* @ref.impl arch/arm64/kernel/signal.c::apply_user_offset */
/* changed for McKernel: returns a kernel-area address; function renamed */
static void *get_sigframe_context_kaddr(
	struct rt_sigframe_user_layout const *user,
	unsigned long offset)
{
	char *base = (char *)user->ksigframe;

	BUG_ON(!base);
	BUG_ON(!offset);

	/*
	 * TODO: sanity-check that the result is within appropriate bounds
	 * (should be ensured by the use of set_user_offset() to compute
	 * all offsets).
	 */
	return base + offset;
}

/* @ref.impl arch/arm64/kernel/signal.c::apply_user_offset */
/* changed for McKernel: function renamed */
static void __user *get_sigframe_context_uaddr(
	struct rt_sigframe_user_layout const *user,
	unsigned long offset)
{
	char __user *base = (char __user *)user->usigframe;

	BUG_ON(!base);
	BUG_ON(!offset);

	/*
	 * TODO: sanity-check that the result is within appropriate bounds
	 * (should be ensured by the use of set_user_offset() to compute
	 * all offsets).
	 */
	return base + offset;
}
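/*
 * Worked example, added for exposition: after init_user_layout(),
 * user->size equals offsetof(struct sigsp, uc.uc_mcontext.__reserved).
 * A subsequent sigframe_alloc(user, &user->fpsimd_offset,
 * sizeof(struct fpsimd_context)) hands back that offset and advances
 * user->size by the 16-byte-padded record size, so the FPSIMD record
 * begins exactly at the head of __reserved[].  A record that no longer
 * fits before user->limit triggers the EXTRA_MAGIC expansion path in
 * __sigframe_alloc(), after which new offsets land past
 * ALIGN_UP(sizeof(struct sigsp), 16), i.e. in the overflow area.
 */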
/* @ref.impl arch/arm64/kernel/signal.c::setup_sigframe_layout */
/* Determine the layout of optional records in the signal frame */
static int setup_sigframe_layout(struct rt_sigframe_user_layout *user)
{
	int err;

	err = sigframe_alloc(user, &user->fpsimd_offset,
			     sizeof(struct fpsimd_context));
	if (err)
		return err;

	/* fault information, if valid */
	if (current_thread_info()->fault_code) {
		err = sigframe_alloc(user, &user->esr_offset,
				     sizeof(struct esr_context));
		if (err)
			return err;
	}

	if (likely(elf_hwcap & (HWCAP_FP | HWCAP_ASIMD))) {
		if (likely(elf_hwcap & HWCAP_SVE)) {
			unsigned int vq =
				sve_vq_from_vl(current_thread_info()->sve_vl);

			err = sigframe_alloc(user, &user->sve_offset,
					     SVE_SIG_CONTEXT_SIZE(vq));
			if (err)
				return err;
		}
	}

	return sigframe_alloc_end(user);
}

/* @ref.impl arch/arm64/kernel/signal.c::get_sigframe */
static int get_sigframe(struct thread *thread,
			struct rt_sigframe_user_layout *user,
			struct pt_regs *regs, unsigned long sa_flags)
{
	unsigned long sp, sp_top, frame_size;
	int err;

	init_user_layout(user);

	/* get signal frame */
	if ((sa_flags & SA_ONSTACK) &&
	    !(thread->sigstack.ss_flags & SS_DISABLE) &&
	    !(thread->sigstack.ss_flags & SS_ONSTACK)) {
		unsigned long lsp;

		lsp = ((unsigned long)(((char *)thread->sigstack.ss_sp) +
				       thread->sigstack.ss_size)) & ~15UL;
		sp = sp_top = lsp;
		thread->sigstack.ss_flags |= SS_ONSTACK;
	}
	else {
		sp = sp_top = regs->sp;
	}
	sp = ALIGN_DOWN(sp, 16);

	/* calc sigframe layout */
	err = setup_sigframe_layout(user);
	if (err)
		return err;

	/* calc new user stack pointer */
	frame_size = sigframe_size(user);
	sp -= frame_size;
	BUG_ON(ALIGN_DOWN(sp, 16) != sp);

	/* set user sp address and kernel sigframe address */
	user->usigframe = (struct sigsp __user *)sp;

	return 0;
}
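/*
 * Example, added for exposition: when the handler was installed with
 * SA_ONSTACK and the thread registered an alternate stack, the frame
 * above is carved from the top of that stack instead of the
 * interrupted regs->sp.  A hedged user-space sketch of the setup:
 */
#if 0
	stack_t ss = {
		.ss_sp = altstack_buf,	/* hypothetical buffer */
		.ss_size = SIGSTKSZ,
		.ss_flags = 0,
	};
	sigaltstack(&ss, NULL);
	/* get_sigframe() will then place frames at ss_sp + ss_size,
	 * rounded down to 16 bytes */
#endif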
/* @ref.impl arch/arm64/kernel/signal.c::setup_rt_frame */
static int setup_rt_frame(int usig, unsigned long rc, int to_restart,
			  int syscallno, struct k_sigaction *k,
			  struct sig_pending *pending, struct pt_regs *regs,
			  struct thread *thread)
{
	struct rt_sigframe_user_layout user;
	struct sigsp *kframe;
	struct sigsp __user *uframe;
	int i = 0, err = 0, kpages = 0;
	struct _aarch64_ctx *end;

	/* get signal frame info */
	memset(&user, 0, sizeof(user));
	if (get_sigframe(thread, &user, regs, k->sa.sa_flags)) {
		return 1;
	}

	/* allocate kernel sigframe buffer */
	kpages = (sigframe_size(&user) + PAGE_SIZE - 1) >> PAGE_SHIFT;
	user.ksigframe = ihk_mc_alloc_pages(kpages, IHK_MC_AP_NOWAIT);
	if (!user.ksigframe) {
		return 1;
	}

	/* set kernel sigframe lowest addr */
	kframe = user.ksigframe;
	/* set user sigframe lowest addr */
	uframe = user.usigframe;

	/* init unused data. */
	kframe->uc.uc_flags = 0;
	kframe->uc.uc_link = NULL;

	/* save alternate stack information. */
	kframe->uc.uc_stack.ss_sp = uframe;
	kframe->uc.uc_stack.ss_flags = thread->sigstack.ss_flags;
	kframe->uc.uc_stack.ss_size = thread->sigstack.ss_size;

	/* save signal frame. */
	kframe->fp = regs->regs[29];
	kframe->lr = regs->regs[30];
	kframe->sigrc = rc;

	for (i = 0; i < 31; i++) {
		kframe->uc.uc_mcontext.regs[i] = regs->regs[i];
	}
	kframe->uc.uc_mcontext.sp = regs->sp;
	kframe->uc.uc_mcontext.pc = regs->pc;
	kframe->uc.uc_mcontext.pstate = regs->pstate;
	kframe->uc.uc_mcontext.fault_address =
		current_thread_info()->fault_address;
	kframe->uc.uc_sigmask = thread->sigmask;

	/* save fp simd context. */
	preserve_fpsimd_context(
		get_sigframe_context_kaddr(&user, user.fpsimd_offset));

	if (user.esr_offset) {
		/* save esr context. */
		struct esr_context *esr_ctx =
			get_sigframe_context_kaddr(&user, user.esr_offset);

		esr_ctx->head.magic = ESR_MAGIC;
		esr_ctx->head.size = sizeof(*esr_ctx);
		esr_ctx->esr = current_thread_info()->fault_code;
	}

	if (user.sve_offset) {
		/* save sve context. */
		struct sve_context *sve_ctx =
			get_sigframe_context_kaddr(&user, user.sve_offset);

		preserve_sve_context(sve_ctx);
	}

	if (user.extra_offset) {
		/* save extra context. */
		struct extra_context *extra =
			get_sigframe_context_kaddr(&user, user.extra_offset);
		struct _aarch64_ctx *end =
			(struct _aarch64_ctx *)((char *)extra +
						ALIGN_UP(sizeof(*extra), 16));
		void __user *extra_data =
			get_sigframe_context_uaddr(&user,
				ALIGN_UP(sizeof(struct sigsp), 16));
		unsigned int extra_size = ALIGN_UP(user.size, 16) -
			ALIGN_UP(sizeof(struct sigsp), 16);

		/*
		 * ^ FIXME: bounds sanity-checks: both of these should fit
		 * within __reserved!
		 */
		extra->head.magic = EXTRA_MAGIC;
		extra->head.size = sizeof(*extra);
		extra->data = extra_data;
		extra->size = extra_size;

		/* Add the terminator */
		end->magic = 0;
		end->size = 0;
	}

	/* set the "end" magic */
	end = get_sigframe_context_kaddr(&user, user.end_offset);
	end->magic = 0;
	end->size = 0;

	/* save syscall information for restart. */
	kframe->syscallno = syscallno;
	kframe->restart = to_restart;

	/* set sig handler context */
	/* set restart context */
	regs->regs[0] = usig;
	regs->sp = (unsigned long)uframe;
	regs->regs[29] = (unsigned long)&uframe->fp;
	regs->pc = (unsigned long)k->sa.sa_handler;

	if (k->sa.sa_flags & SA_RESTORER) {
		regs->regs[30] = (unsigned long)k->sa.sa_restorer;
#ifdef ENABLE_FUGAKU_HACKS
		kprintf("%s: SA_RESTORER: 0x%lx\n", __func__, regs->regs[30]);
#endif
	}
	else {
		regs->regs[30] = (unsigned long)VDSO_SYMBOL(thread->vm->vdso_addr,
							    sigtramp);
	}

	if (k->sa.sa_flags & SA_SIGINFO) {
		kframe->info = pending->info;
		regs->regs[1] = (unsigned long)&uframe->info;
		regs->regs[2] = (unsigned long)&uframe->uc;
	}

	/* copy to user sigframe */
	err = copy_to_user(user.usigframe, user.ksigframe, sigframe_size(&user));

	/* free kernel sigframe buffer */
	ihk_mc_free_pages(user.ksigframe, kpages);

	return err;
}
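/*
 * Note added for exposition: on return from the handler, control
 * reaches regs->regs[30] (the link register), i.e. either the
 * sa_restorer supplied with SA_RESTORER or the vDSO 'sigtramp' symbol.
 * Both are expected to issue rt_sigreturn(2) with sp still pointing at
 * the frame written above, which is how sys_rt_sigreturn() finds
 * struct sigsp to restore registers, sigmask, and FP/SVE state.
 */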
int do_signal(unsigned long rc, void *regs0, struct thread *thread,
	      struct sig_pending *pending, int syscallno)
{
	struct pt_regs *regs = regs0;
	struct k_sigaction *k;
	int sig;
	__sigset_t w;
	struct process *proc = thread->proc;
	int orgsig;
	int ptraceflag = 0;
	struct mcs_rwlock_node_irqsave lock;
	struct mcs_rwlock_node_irqsave mcs_rw_node;
	int restart = 0;
	int ret;

	/* Scan the pending mask for its highest set bit; the bit
	 * position + 1 is the signal number. */
	for (w = pending->sigmask.__val[0], sig = 0; w; sig++, w >>= 1)
		;
	dkprintf("do_signal(): tid=%d, pid=%d, sig=%d\n",
		 thread->tid, proc->pid, sig);
	orgsig = sig;

	if ((thread->ptrace & PT_TRACED) &&
	    pending->ptracecont == 0 &&
	    sig != SIGKILL) {
		ptraceflag = 1;
		sig = SIGSTOP;
	}

	if (regs == NULL) { /* call from syscall */
		regs = thread->uctx;

		/*
		 * When do_signal() is called directly from the syscall
		 * path, the return value needs to be saved.
		 */
		if (rc == -EINTR) {
			if (regs->syscallno == __NR_rt_sigtimedwait ||
			    regs->syscallno == __NR_rt_sigsuspend) {
				regs->regs[0] = rc;
			}
		}
	}
	else {
		rc = regs->regs[0];
	}

	mcs_rwlock_writer_lock(&thread->sigcommon->lock, &mcs_rw_node);
	k = thread->sigcommon->action + sig - 1;

	if (k->sa.sa_handler == SIG_IGN) {
		kfree(pending);
		mcs_rwlock_writer_unlock(&thread->sigcommon->lock, &mcs_rw_node);
		goto out;
	}
	else if (k->sa.sa_handler) {
		/* does this syscall need a restart? */
		restart = isrestart(syscallno, rc, sig,
				    k->sa.sa_flags & SA_RESTART);
		if (restart == 1) {
			/* Prepare for system call restart. */
			regs->regs[0] = regs->orig_x0;
		}

		if (setup_rt_frame(sig, rc, restart, syscallno, k, pending,
				   regs, thread)) {
			kfree(pending);
			mcs_rwlock_writer_unlock(&thread->sigcommon->lock,
						 &mcs_rw_node);
			kprintf("do_signal,page_fault_thread_vm failed\n");
			terminate(0, sig);
			goto out;
		}

		/* is the signal handler ONESHOT? */
		if (k->sa.sa_flags & SA_RESETHAND) {
			k->sa.sa_handler = SIG_DFL;
		}

		if (!(k->sa.sa_flags & SA_NODEFER))
			thread->sigmask.__val[0] |= pending->sigmask.__val[0];
		kfree(pending);
		mcs_rwlock_writer_unlock(&thread->sigcommon->lock, &mcs_rw_node);

		if (thread->ctx.thread->flags & (1 << TIF_SINGLESTEP)) {
			siginfo_t info = {
				.si_code = TRAP_HWBKPT,
			};

			clear_single_step(thread);
			set_signal(SIGTRAP, regs, &info);
			check_need_resched();
			check_signal(0, regs, -1);
		}
	}
	else {
		int coredumped = 0;
		siginfo_t info;
		int ptc = pending->ptracecont;

		if (ptraceflag) {
			if (thread->ptrace_recvsig)
				kfree(thread->ptrace_recvsig);
			thread->ptrace_recvsig = pending;
			if (thread->ptrace_sendsig)
				kfree(thread->ptrace_sendsig);
			thread->ptrace_sendsig = NULL;
		}
		else
			kfree(pending);
		mcs_rwlock_writer_unlock(&thread->sigcommon->lock, &mcs_rw_node);

		switch (sig) {
		case SIGSTOP:
		case SIGTSTP:
		case SIGTTIN:
		case SIGTTOU:
			if (ptraceflag) {
				ptrace_report_signal(thread, orgsig);
			}
			else {
				memset(&info, '\0', sizeof info);
				info.si_signo = SIGCHLD;
				info.si_code = CLD_STOPPED;
				info._sifields._sigchld.si_pid = thread->proc->pid;
				info._sifields._sigchld.si_status = (sig << 8) | 0x7f;
				if (ptc == 2 &&
				    thread != thread->proc->main_thread) {
					thread->signal_flags = SIGNAL_STOP_STOPPED;
					thread->status = PS_STOPPED;
					thread->exit_status = SIGSTOP;
					do_kill(thread, thread->report_proc->pid,
						-1, SIGCHLD, &info, 0);
					waitq_wakeup(
						&thread->report_proc->waitpid_q);
				}
				else {
					/* Update thread state in fork tree */
					mcs_rwlock_writer_lock(
						&proc->update_lock, &lock);
					proc->group_exit_status = SIGSTOP;

					/* Reap and set new signal_flags */
					proc->main_thread->signal_flags =
						SIGNAL_STOP_STOPPED;

					proc->status = PS_DELAY_STOPPED;
					thread->status = PS_STOPPED;
					mcs_rwlock_writer_unlock(
						&proc->update_lock, &lock);

					do_kill(thread, thread->proc->parent->pid,
						-1, SIGCHLD, &info, 0);
				}

				/* Sleep */
				schedule();
				dkprintf("SIGSTOP(): woken up\n");
			}
			break;
		case SIGTRAP:
			dkprintf("do_signal,SIGTRAP\n");
			if (!(thread->ptrace & PT_TRACED)) {
				goto core;
			}

			/* Update thread state in fork tree */
			thread->exit_status = SIGTRAP;
			thread->status = PS_TRACED;
			if (thread == proc->main_thread) {
				mcs_rwlock_writer_lock(&proc->update_lock, &lock);
				proc->group_exit_status = SIGTRAP;
				proc->status = PS_DELAY_TRACED;
				mcs_rwlock_writer_unlock(&proc->update_lock, &lock);
				do_kill(thread, thread->proc->parent->pid, -1,
					SIGCHLD, &info, 0);
			}
			else {
				do_kill(thread, thread->report_proc->pid, -1,
					SIGCHLD, &info, 0);
				waitq_wakeup(&thread->report_proc->waitpid_q);
			}

			/* Sleep */
			dkprintf("do_signal,SIGTRAP,sleeping\n");
			schedule();
			dkprintf("SIGTRAP(): woken up\n");
			break;
		case SIGCONT:
			break;
		case SIGQUIT:
		case SIGILL:
		case SIGABRT:
		case SIGFPE:
		case SIGSEGV:
		case SIGBUS:
		case SIGSYS:
		case SIGXCPU:
		case SIGXFSZ:
core:
			thread->coredump_regs =
				kmalloc(sizeof(struct pt_regs), IHK_MC_AP_NOWAIT);
			if (!thread->coredump_regs) {
				kprintf("%s: Out of memory\n", __func__);
				goto skip;
			}
			memcpy(thread->coredump_regs, regs,
			       sizeof(struct pt_regs));

			ret = coredump(thread, regs, sig);
			switch (ret) {
			case -EBUSY:
				kprintf("%s: INFO: coredump not performed, try ulimit -c\n",
					__func__);
				break;
			case 0:
				coredumped = 0x80;
				break;
			default:
				kprintf("%s: ERROR: coredump failed (%d)\n",
					__func__, ret);
				break;
			}
skip:
			terminate(0, sig | coredumped);
			break;
		case SIGCHLD:
		case SIGURG:
		case SIGWINCH:
			break;
		default:
			dkprintf("do_signal,default,terminate,sig=%d\n", sig);
			terminate(0, sig);
			break;
		}
	}

out:
	return restart;
}
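/*
 * Note added for exposition: a nonzero return from do_signal() only
 * records that the frame was prepared for a restart (x0 was rewound to
 * orig_x0 and kframe->restart was set).  The actual re-issue happens
 * later, in sys_rt_sigreturn(), which sees ksigsp.restart and
 * re-enters syscall() with the saved syscall number.
 */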
int interrupt_from_user(void *regs0)
{
	struct pt_regs *regs = regs0;

	return ((regs->pstate & PSR_MODE_MASK) == PSR_MODE_EL0t);
}

void save_syscall_return_value(int num, unsigned long rc)
{
	const struct thread *thread = cpu_local_var(current);

	/*
	 * Save syscall return value.
	 */
	if (thread && thread->uctx &&
	    ((thread->uctx->regs[0] == thread->uctx->orig_x0) &&
	     (thread->uctx->pc == thread->uctx->orig_pc))) {
		thread->uctx->regs[0] = rc;
	}
}
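/*
 * Note added for exposition: PSR_MODE_EL0t is the AArch64 encoding for
 * "exception taken from EL0 using SP_EL0", so interrupt_from_user()
 * simply answers whether the trapped context was user mode.
 */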
unsigned long do_kill(struct thread *thread, int pid, int tid, int sig,
		      siginfo_t *info, int ptracecont)
{
	dkprintf("do_kill,pid=%d,tid=%d,sig=%d\n", pid, tid, sig);
	struct thread *t;
	struct process *tproc;
	struct process *proc = thread ? thread->proc : NULL;
	struct thread *tthread = NULL;
	int i;
	__sigset_t mask;
	mcs_rwlock_lock_t *savelock = NULL;
	struct mcs_rwlock_node mcs_rw_node;
	struct list_head *head = NULL;
	int rc;
	unsigned long irqstate = 0;
	int doint;
	int found = 0;
	siginfo_t info0;
	struct resource_set *rset = cpu_local_var(resource_set);
	int hash;
	struct thread_hash *thash = rset->thread_hash;
	struct process_hash *phash = rset->process_hash;
	struct mcs_rwlock_node lock;
	struct mcs_rwlock_node updatelock;
	struct sig_pending *pending = NULL;

	if (sig > SIGRTMAX || sig < 0)
		return -EINVAL;

	if (info == NULL) {
		memset(&info0, '\0', sizeof info0);
		info = &info0;
		info0.si_signo = sig;
		info0.si_code = SI_KERNEL;
	}

	if (tid == -1 && pid <= 0) {
		struct process *p;
		struct mcs_rwlock_node_irqsave slock;
		int pgid = -pid;
		int rc = -ESRCH;
		int *pids;
		int n = 0;
		int sendme = 0;

		if (pid == 0) {
			if (thread == NULL || thread->proc->pid <= 0)
				return -ESRCH;
			pgid = thread->proc->pgid;
		}
		pids = kmalloc(sizeof(int) * num_processors, IHK_MC_AP_NOWAIT);
		if (!pids)
			return -ENOMEM;
		for (i = 0; i < HASH_SIZE; i++) {
			mcs_rwlock_reader_lock(&phash->lock[i], &slock);
			list_for_each_entry(p, &phash->list[i], hash_list) {
				if (pgid != 1 && p->pgid != pgid)
					continue;
				if (thread && p->pid == thread->proc->pid) {
					sendme = 1;
					continue;
				}
				pids[n] = p->pid;
				n++;
			}
			mcs_rwlock_reader_unlock(&phash->lock[i], &slock);
		}
		for (i = 0; i < n; i++)
			rc = do_kill(thread, pids[i], -1, sig, info, ptracecont);
		if (sendme)
			rc = do_kill(thread, thread->proc->pid, -1, sig,
				     info, ptracecont);
		kfree(pids);
		return rc;
	}

	irqstate = cpu_disable_interrupt_save();
	mask = __sigmask(sig);
	if (tid == -1) {
		struct thread *tthread0 = NULL;
		struct mcs_rwlock_node plock;
		struct mcs_rwlock_node updatelock;

		found = 0;
		hash = process_hash(pid);
		mcs_rwlock_reader_lock_noirq(&phash->lock[hash], &plock);
		list_for_each_entry(tproc, &phash->list[hash], hash_list) {
			if (tproc->pid == pid) {
				found = 1;
				break;
			}
		}
		if (!found) {
			mcs_rwlock_reader_unlock_noirq(&phash->lock[hash], &plock);
			cpu_restore_interrupt(irqstate);
			return -ESRCH;
		}

		mcs_rwlock_reader_lock_noirq(&tproc->update_lock, &updatelock);
		if (tproc->status == PS_EXITED || tproc->status == PS_ZOMBIE) {
			goto done;
		}
		mcs_rwlock_reader_lock_noirq(&tproc->threads_lock, &lock);
		list_for_each_entry(t, &tproc->threads_list, siblings_list) {
			if (t->tid == pid || tthread == NULL) {
				if (t->status == PS_EXITED) {
					continue;
				}
				if (!(mask & t->sigmask.__val[0])) {
					tthread = t;
					found = 1;
				}
				else if (tthread == NULL && tthread0 == NULL) {
					tthread0 = t;
					found = 1;
				}
			}
		}
		if (tthread == NULL) {
			tthread = tthread0;
		}
		if (tthread && tthread->status != PS_EXITED) {
			savelock = &tthread->sigcommon->lock;
			head = &tthread->sigcommon->sigpending;
			hold_thread(tthread);
		}
		else
			tthread = NULL;
		mcs_rwlock_reader_unlock_noirq(&tproc->threads_lock, &lock);
done:
		mcs_rwlock_reader_unlock_noirq(&tproc->update_lock, &updatelock);
		mcs_rwlock_reader_unlock_noirq(&phash->lock[hash], &plock);
	}
	else {
		found = 0;
		hash = thread_hash(tid);
		mcs_rwlock_reader_lock_noirq(&thash->lock[hash], &lock);
		list_for_each_entry(tthread, &thash->list[hash], hash_list) {
			if (pid != -1 && tthread->proc->pid != pid) {
				continue;
			}
			if (tthread->tid == tid &&
			    tthread->status != PS_EXITED) {
				found = 1;
				break;
			}
		}
		if (!found) {
			mcs_rwlock_reader_unlock_noirq(&thash->lock[hash], &lock);
			cpu_restore_interrupt(irqstate);
			return -ESRCH;
		}

		tproc = tthread->proc;
		mcs_rwlock_reader_lock_noirq(&tproc->update_lock, &updatelock);
		savelock = &tthread->sigpendinglock;
		head = &tthread->sigpending;
		mcs_rwlock_reader_lock_noirq(&tproc->threads_lock, &lock);
		if (tthread->status != PS_EXITED &&
		    (sig == SIGKILL ||
		     (tproc->status != PS_EXITED &&
		      tproc->status != PS_ZOMBIE))) {
			if ((rc = hold_thread(tthread))) {
				kprintf("%s: ERROR hold_thread returned %d,tid=%d\n",
					__func__, rc, tthread->tid);
				tthread = NULL;
			}
		}
		else {
			tthread = NULL;
		}
		mcs_rwlock_reader_unlock_noirq(&tproc->threads_lock, &lock);
		mcs_rwlock_reader_unlock_noirq(&tproc->update_lock, &updatelock);
		mcs_rwlock_reader_unlock_noirq(&thash->lock[hash], &lock);
	}

	if (sig != SIGCONT &&
	    proc &&
	    proc->euid != 0 &&
	    proc->ruid != tproc->ruid &&
	    proc->euid != tproc->ruid &&
	    proc->ruid != tproc->suid &&
	    proc->euid != tproc->suid) {
		if (tthread)
			release_thread(tthread);
		cpu_restore_interrupt(irqstate);
		return -EPERM;
	}

	if (sig == 0 || tthread == NULL || tthread->status == PS_EXITED) {
		if (tthread)
			release_thread(tthread);
		cpu_restore_interrupt(irqstate);
		return 0;
	}

	if (tthread->uti_state == UTI_STATE_RUNNING_IN_LINUX) {
		if (!tthread->proc->nohost) {
			interrupt_syscall(tthread, sig);
		}
		release_thread(tthread);
		return 0;
	}

	doint = 0;

	mcs_rwlock_writer_lock_noirq(savelock, &mcs_rw_node);

	rc = 0;
	/* Standard signals (below SIGRTMIN) are never queued twice */
	if (sig < SIGRTMIN) {
		list_for_each_entry(pending, head, list) {
			if (pending->sigmask.__val[0] == mask &&
			    pending->ptracecont == ptracecont)
				break;
		}
		if (&pending->list == head)
			pending = NULL;
	}

	if (pending == NULL) {
		doint = 1;
		pending = kmalloc(sizeof(struct sig_pending), IHK_MC_AP_NOWAIT);
		if (!pending) {
			rc = -ENOMEM;
		}
		else {
			memset(pending, 0, sizeof(struct sig_pending));
			pending->sigmask.__val[0] = mask;
			memcpy(&pending->info, info, sizeof(siginfo_t));
			pending->ptracecont = ptracecont;
			if (sig == SIGKILL || sig == SIGSTOP)
				list_add(&pending->list, head);
			else
				list_add_tail(&pending->list, head);
			tthread->sigevent = 1;
		}
	}
	mcs_rwlock_writer_unlock_noirq(savelock, &mcs_rw_node);
	cpu_restore_interrupt(irqstate);

	if (sig == SIGCONT || ptracecont == 1) {
		/* Wake up the target only when stopped by SIGSTOP */
		if (sched_wakeup_thread(tthread, PS_STOPPED) == 0) {
			struct siginfo info;

			tthread->proc->main_thread->signal_flags =
				SIGNAL_STOP_CONTINUED;
			tthread->proc->status = PS_RUNNING;
			memset(&info, '\0', sizeof(info));
			info.si_signo = SIGCHLD;
			info.si_code = CLD_CONTINUED;
			info._sifields._sigchld.si_pid = tthread->proc->pid;
			info._sifields._sigchld.si_status = 0x0000ffff;
			do_kill(tthread, tthread->proc->parent->pid, -1,
				SIGCHLD, &info, 0);
			if (thread != tthread) {
				ihk_mc_interrupt_cpu(tthread->cpu_id,
						     ihk_mc_get_vector(IHK_GV_IKC));
			}
			doint = 0;
		}
	}

	if (doint && !(mask & tthread->sigmask.__val[0])) {
		int status = tthread->status;

		if (thread != tthread) {
			dkprintf("do_kill,ipi,pid=%d,cpu_id=%d\n",
				 tproc->pid, tthread->cpu_id);
			ihk_mc_interrupt_cpu(tthread->cpu_id, INTRID_CPU_NOTIFY);
		}

		if (status != PS_RUNNING) {
			if (sig == SIGKILL) {
				/* Wake up the target only when stopped by ptrace-reporting */
				sched_wakeup_thread(tthread,
						    PS_TRACED | PS_STOPPED |
						    PS_INTERRUPTIBLE);
			}
			else {
				sched_wakeup_thread(tthread, PS_INTERRUPTIBLE);
			}
		}
	}
	release_thread(tthread);
	return rc;
}
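/*
 * Usage sketch, added for exposition (hedged; 'me' is a hypothetical
 * pointer to the calling thread): how the pid/tid pair selects the
 * delivery scope of do_kill().
 */
#if 0
	/* kill(pid, sig): some thread of process pid */
	do_kill(me, pid, -1, sig, NULL, 0);
	/* tgkill(pid, tid, sig): exactly thread tid of process pid */
	do_kill(me, pid, tid, sig, NULL, 0);
	/* kill(-pgid, sig): every process in process group pgid */
	do_kill(me, -pgid, -1, sig, NULL, 0);
	/* kill(0, sig): every process in the caller's process group */
	do_kill(me, 0, -1, sig, NULL, 0);
#endif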
void set_signal(int sig, void *regs0, siginfo_t *info)
{
	ihk_mc_user_context_t *regs = regs0;
	struct thread *thread = cpu_local_var(current);

	if (thread == NULL || thread->proc->pid == 0)
		return;

	if (!interrupt_from_user(regs)) {
		ihk_mc_debug_show_interrupt_context(regs);
		panic("panic: kernel mode signal");
	}

	if ((__sigmask(sig) & thread->sigmask.__val[0])) {
		coredump(thread, regs0, sig);
		terminate(0, sig | 0x80);
	}

	do_kill(thread, thread->proc->pid, thread->tid, sig, info, 0);
}

SYSCALL_DECLARE(mmap)
{
	const unsigned int supported_flags = 0
		| MAP_SHARED		// 0x01
		| MAP_PRIVATE		// 0x02
		| MAP_FIXED		// 0x10
		| MAP_ANONYMOUS		// 0x20
		| MAP_LOCKED		// 0x2000
		| MAP_POPULATE		// 0x8000
		| MAP_HUGETLB		// 00040000
		| (0x3FU << MAP_HUGE_SHIFT) // FC000000
		;
	const int ignored_flags = 0
		| MAP_DENYWRITE		// 0x0800
		| MAP_NORESERVE		// 0x4000
		| MAP_STACK		// 0x20000
		;
	const int error_flags = 0
		| MAP_GROWSDOWN		// 0x0100
		| MAP_EXECUTABLE	// 0x1000
		| MAP_NONBLOCK		// 0x10000
		;

	const uintptr_t addr0 = ihk_mc_syscall_arg0(ctx);
	size_t len0 = ihk_mc_syscall_arg1(ctx);
	const int prot = ihk_mc_syscall_arg2(ctx);
	const int flags0 = ihk_mc_syscall_arg3(ctx);
	const int fd = ihk_mc_syscall_arg4(ctx);
	const off_t off0 = ihk_mc_syscall_arg5(ctx);
	struct thread *thread = cpu_local_var(current);
	struct vm_regions *region = &thread->vm->region;
	int error;
	uintptr_t addr = 0;
	size_t len;
	int flags = flags0;
	size_t pgsize;

	dkprintf("sys_mmap(%lx,%lx,%x,%x,%d,%lx)\n",
		 addr0, len0, prot, flags0, fd, off0);

	/* check constants for flags */
	if (1) {
		int dup_flags;

		dup_flags = (supported_flags & ignored_flags);
		dup_flags |= (ignored_flags & error_flags);
		dup_flags |= (error_flags & supported_flags);

		if (dup_flags) {
			ekprintf("sys_mmap:duplicate flags: %lx\n", dup_flags);
			ekprintf("s-flags: %08x\n", supported_flags);
			ekprintf("i-flags: %08x\n", ignored_flags);
			ekprintf("e-flags: %08x\n", error_flags);
			panic("sys_mmap:duplicate flags\n");
			/* no return */
		}
	}

	/* check arguments */
	pgsize = PAGE_SIZE;
#ifndef ENABLE_FUGAKU_HACKS
	if (flags & MAP_HUGETLB) {
		int hugeshift = flags & (0x3F << MAP_HUGE_SHIFT);

		/* OpenMPI expects -EINVAL when trying to map
		 * /dev/shm/ file with MAP_SHARED | MAP_HUGETLB
		 */
		if (!(flags & MAP_ANONYMOUS)) {
			error = -EINVAL;
			goto out;
		}

		if (hugeshift == 0) {
			/* default hugepage size */
			flags |= ihk_mc_get_linux_default_huge_page_shift() <<
				MAP_HUGE_SHIFT;
		}
		else if ((first_level_block_support &&
			  hugeshift == MAP_HUGE_FIRST_BLOCK) ||
			 (first_level_block_support &&
			  hugeshift == MAP_HUGE_FIRST_CONT_BLOCK) ||
			 hugeshift == MAP_HUGE_SECOND_BLOCK ||
			 hugeshift == MAP_HUGE_SECOND_CONT_BLOCK ||
			 hugeshift == MAP_HUGE_THIRD_CONT_BLOCK) {
			/* nop */
		}
		else {
			ekprintf("sys_mmap(%lx,%lx,%x,%x,%x,%lx):"
				 "not supported page size.\n",
				 addr0, len0, prot, flags0, fd, off0);
			error = -EINVAL;
			goto out;
		}

		pgsize = (size_t)1 << ((flags >> MAP_HUGE_SHIFT) & 0x3F);
		/* Round-up map length by pagesize */
		len0 = ALIGN(len0, pgsize);
		if (rusage_check_overmap(len0,
					 (flags >> MAP_HUGE_SHIFT) & 0x3F)) {
			error = -ENOMEM;
			goto out;
		}
	}
#else
	if (flags & MAP_HUGETLB) {
		flags &= ~(MAP_HUGETLB);
	}
#endif

#define	VALID_DUMMY_ADDR	((region->user_start + PTL3_SIZE - 1) & ~(PTL3_SIZE - 1))
	addr = (flags & MAP_FIXED) ? addr0 : VALID_DUMMY_ADDR;
	len = (len0 + pgsize - 1) & ~(pgsize - 1);
	if ((addr & (pgsize - 1))
	    || (len == 0)
	    || !(flags & (MAP_SHARED | MAP_PRIVATE))
	    || ((flags & MAP_SHARED) && (flags & MAP_PRIVATE))
	    || (off0 & (pgsize - 1))) {
		ekprintf("sys_mmap(%lx,%lx,%x,%x,%x,%lx):EINVAL\n",
			 addr0, len0, prot, flags0, fd, off0);
		error = -EINVAL;
		goto out;
	}

	if (addr < region->user_start
	    || region->user_end <= addr
	    || len > (region->user_end - region->user_start)) {
		ekprintf("sys_mmap(%lx,%lx,%x,%x,%x,%lx):ENOMEM\n",
			 addr0, len0, prot, flags0, fd, off0);
		error = -ENOMEM;
		goto out;
	}

	/* check not supported requests */
	if ((flags & error_flags)
	    || (flags & ~(supported_flags | ignored_flags))) {
		ekprintf("sys_mmap(%lx,%lx,%x,%x,%x,%lx):unknown flags %x\n",
			 addr0, len0, prot, flags0, fd, off0,
			 (flags & ~(supported_flags | ignored_flags)));
		error = -EINVAL;
		goto out;
	}

	addr = do_mmap(addr, len, prot, flags, fd, off0, 0, NULL);

	error = 0;
out:
	dkprintf("sys_mmap(%lx,%lx,%x,%x,%d,%lx): %ld %lx\n",
		 addr0, len0, prot, flags0, fd, off0, error, addr);
	return (!error) ? addr : error;
}
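/*
 * Worked example, added for exposition: the huge-page size requested
 * with MAP_HUGETLB travels in bits [31:26] of the flags word
 * (MAP_HUGE_SHIFT == 26, hence the 0x3F << MAP_HUGE_SHIFT == 0xFC000000
 * mask above), encoded as log2 of the page size.  A hypothetical 2 MiB
 * request from user space therefore looks like:
 */
#if 0
	mmap(NULL, len, PROT_READ | PROT_WRITE,
	     MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB |
	     (21 << MAP_HUGE_SHIFT),	/* 21 == log2(2 MiB) */
	     -1, 0);
#endif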
SYSCALL_DECLARE(shmget)
{
	const key_t key = ihk_mc_syscall_arg0(ctx);
	const size_t size = ihk_mc_syscall_arg1(ctx);
	const int shmflg0 = ihk_mc_syscall_arg2(ctx);
	int shmid = -EINVAL;
	int error;
	int shmflg = shmflg0;

	dkprintf("shmget(%#lx,%#lx,%#x)\n", key, size, shmflg0);

	if (shmflg & SHM_HUGETLB) {
		int hugeshift = shmflg & (0x3F << SHM_HUGE_SHIFT);

		if (hugeshift == 0) {
			/* default hugepage size */
			shmflg |= ihk_mc_get_linux_default_huge_page_shift() <<
				MAP_HUGE_SHIFT;
		}
		else if ((first_level_block_support &&
			  hugeshift == SHM_HUGE_FIRST_BLOCK) ||
			 (first_level_block_support &&
			  hugeshift == SHM_HUGE_FIRST_CONT_BLOCK) ||
			 hugeshift == SHM_HUGE_SECOND_BLOCK ||
			 hugeshift == SHM_HUGE_SECOND_CONT_BLOCK ||
			 hugeshift == SHM_HUGE_THIRD_CONT_BLOCK) {
			/* nop */
		}
		else {
			error = -EINVAL;
			goto out;
		}
	}

	shmid = do_shmget(key, size, shmflg);

	error = 0;
out:
	dkprintf("shmget(%#lx,%#lx,%#x): %d %d\n",
		 key, size, shmflg0, error, shmid);
	return (error) ?: shmid;
} /* sys_shmget() */

void save_uctx(void *uctx, struct pt_regs *regs)
{
	struct trans_uctx {
		volatile int cond;
		int fregsize;
		struct user_pt_regs regs;
		unsigned long tls_baseaddr;
	} *ctx = uctx;

	if (!regs) {
		regs = current_pt_regs();
	}

	ctx->cond = 0;
	ctx->fregsize = 0;
	ctx->regs = regs->user_regs;

	asm volatile(
		"	mrs	%0, tpidr_el0"
		: "=r" (ctx->tls_baseaddr));
}
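/*
 * Note added for exposition: tpidr_el0 is the AArch64 user thread
 * pointer (the TLS base), so save_uctx() captures the general-purpose
 * register file plus the TLS base, which is the minimum state needed
 * to re-materialize this user context elsewhere, e.g. when a system
 * call is handed over to the Linux side.
 */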
int do_process_vm_read_writev(int pid,
			      const struct iovec *local_iov,
			      unsigned long liovcnt,
			      const struct iovec *remote_iov,
			      unsigned long riovcnt,
			      unsigned long flags,
			      int op)
{
	int ret = -EINVAL;
	int li, ri;
	int pli, pri;
	off_t loff, roff;
	size_t llen = 0, rlen = 0;
	size_t copied = 0;
	size_t to_copy;
	struct thread *lthread = cpu_local_var(current);
	struct process *rproc;
	struct process *lproc = lthread->proc;
	struct process_vm *rvm = NULL;
	unsigned long lphys, rphys;
	unsigned long lpage_left, rpage_left;
	unsigned long lpsize, rpsize;
	void *rva, *lva;
#if 0
	struct vm_range *range;
#endif
	struct mcs_rwlock_node_irqsave lock;
	struct mcs_rwlock_node update_lock;

	/* Sanity checks */
	if (flags) {
		return -EINVAL;
	}

	if (liovcnt > IOV_MAX || riovcnt > IOV_MAX) {
		return -EINVAL;
	}

#if 0
	/* Check if parameters are okay */
	ihk_rwspinlock_read_lock_noirq(&lthread->vm->memory_range_lock);

	range = lookup_process_memory_range(lthread->vm,
					    (uintptr_t)local_iov,
					    (uintptr_t)(local_iov + liovcnt));
	if (!range) {
		ret = -EFAULT;
		goto arg_out;
	}

	range = lookup_process_memory_range(lthread->vm,
					    (uintptr_t)remote_iov,
					    (uintptr_t)(remote_iov + riovcnt));
	if (!range) {
		ret = -EFAULT;
		goto arg_out;
	}

	ret = 0;
arg_out:
	ihk_rwspinlock_read_unlock_noirq(&lthread->vm->memory_range_lock);

	if (ret != 0) {
		goto out;
	}
#endif

	for (li = 0; li < liovcnt; ++li) {
		llen += local_iov[li].iov_len;
		dkprintf("local_iov[%d].iov_base: 0x%lx, len: %lu\n",
			 li, local_iov[li].iov_base, local_iov[li].iov_len);
	}

	for (ri = 0; ri < riovcnt; ++ri) {
		rlen += remote_iov[ri].iov_len;
		dkprintf("remote_iov[%d].iov_base: 0x%lx, len: %lu\n",
			 ri, remote_iov[ri].iov_base, remote_iov[ri].iov_len);
	}

	if (llen != rlen) {
		return -EINVAL;
	}

	/* Find remote process */
	rproc = find_process(pid, &lock);
	if (!rproc) {
		ret = -ESRCH;
		goto out;
	}

	mcs_rwlock_reader_lock_noirq(&rproc->update_lock, &update_lock);
	if (rproc->status == PS_EXITED || rproc->status == PS_ZOMBIE) {
		mcs_rwlock_reader_unlock_noirq(&rproc->update_lock, &update_lock);
		process_unlock(rproc, &lock);
		ret = -ESRCH;
		goto out;
	}
	rvm = rproc->vm;
	hold_process_vm(rvm);
	mcs_rwlock_reader_unlock_noirq(&rproc->update_lock, &update_lock);
	process_unlock(rproc, &lock);

	if (lproc->euid != 0 &&
	    (lproc->ruid != rproc->ruid ||
	     lproc->ruid != rproc->euid ||
	     lproc->ruid != rproc->suid ||
	     lproc->rgid != rproc->rgid ||
	     lproc->rgid != rproc->egid ||
	     lproc->rgid != rproc->sgid)) {
		ret = -EPERM;
		goto out;
	}

	dkprintf("pid %d found, doing %s: liovcnt: %d, riovcnt: %d\n",
		 pid,
		 (op == PROCESS_VM_READ) ? "PROCESS_VM_READ" : "PROCESS_VM_WRITE",
		 liovcnt, riovcnt);

	pli = pri = -1;		/* Previous indices in iovecs */
	li = ri = 0;		/* Current indices in iovecs */
	loff = roff = 0;	/* Offsets in current iovec */
	/* Now iterate and do the copy */
	while (copied < llen) {
		int faulted = 0;

		/* New local vector? */
		if (pli != li) {
			struct vm_range *range;

			ihk_rwspinlock_read_lock_noirq(&lthread->vm->memory_range_lock);

			/* Is base valid? */
			range = lookup_process_memory_range(lthread->vm,
					(uintptr_t)local_iov[li].iov_base,
					(uintptr_t)(local_iov[li].iov_base + 1));
			if (!range) {
				ret = -EFAULT;
				goto pli_out;
			}

			/* Is range valid? */
			range = lookup_process_memory_range(lthread->vm,
					(uintptr_t)local_iov[li].iov_base,
					(uintptr_t)(local_iov[li].iov_base +
						    local_iov[li].iov_len));
			if (range == NULL) {
				ret = -EINVAL;
				goto pli_out;
			}

			if (!(range->flag & ((op == PROCESS_VM_READ) ?
					     VR_PROT_WRITE : VR_PROT_READ))) {
				ret = -EFAULT;
				goto pli_out;
			}

			ret = 0;
pli_out:
			ihk_rwspinlock_read_unlock_noirq(&lthread->vm->memory_range_lock);

			if (ret != 0) {
				goto out;
			}

			pli = li;
		}

		/* New remote vector? */
		if (pri != ri) {
			struct vm_range *range;

			ihk_rwspinlock_read_lock_noirq(&rvm->memory_range_lock);

			/* Is base valid? */
			range = lookup_process_memory_range(rvm,
					(uintptr_t)remote_iov[ri].iov_base,
					(uintptr_t)(remote_iov[ri].iov_base + 1));
			if (range == NULL) {
				ret = -EFAULT;
				goto pri_out;
			}

			/* Is range valid? */
			range = lookup_process_memory_range(rvm,
					(uintptr_t)remote_iov[ri].iov_base,
					(uintptr_t)(remote_iov[ri].iov_base +
						    remote_iov[ri].iov_len));
			if (range == NULL) {
				ret = -EINVAL;
				goto pri_out;
			}

			if (!(range->flag & ((op == PROCESS_VM_READ) ?
					     VR_PROT_READ : VR_PROT_WRITE))) {
				ret = -EFAULT;
				goto pri_out;
			}

			ret = 0;
pri_out:
			ihk_rwspinlock_read_unlock_noirq(&rvm->memory_range_lock);

			if (ret != 0) {
				goto out;
			}

			pri = ri;
		}

		/* Figure out how much we can copy at most in this iteration */
		to_copy = (local_iov[li].iov_len - loff);
		if ((remote_iov[ri].iov_len - roff) < to_copy) {
			to_copy = remote_iov[ri].iov_len - roff;
		}

retry_llookup:
		/* Figure out local physical */
		/* TODO: remember page and do this only if necessary */
		ret = ihk_mc_pt_virt_to_phys_size(lthread->vm->address_space->page_table,
						  local_iov[li].iov_base + loff,
						  &lphys, &lpsize);
		if (ret) {
			uint64_t reason = PF_POPULATE | PF_WRITE | PF_USER;
			void *addr;

			if (faulted) {
				ret = -EFAULT;
				goto out;
			}

			/* Fault in pages */
			for (addr = (void *)
				     (((unsigned long)local_iov[li].iov_base +
				       loff) & PAGE_MASK);
			     addr < (local_iov[li].iov_base + loff + to_copy);
			     addr += PAGE_SIZE) {
				ret = page_fault_process_vm(lthread->vm, addr, reason);
				if (ret) {
					ret = -EFAULT;
					goto out;
				}
			}

			faulted = 1;
			goto retry_llookup;
		}

		lpage_left = ((((unsigned long)local_iov[li].iov_base + loff +
				lpsize) & ~(lpsize - 1)) -
			      ((unsigned long)local_iov[li].iov_base + loff));
		if (lpage_left < to_copy) {
			to_copy = lpage_left;
		}

		lva = phys_to_virt(lphys);

retry_rlookup:
		/* Figure out remote physical */
		/* TODO: remember page and do this only if necessary */
		ret = ihk_mc_pt_virt_to_phys_size(rvm->address_space->page_table,
						  remote_iov[ri].iov_base + roff,
						  &rphys, &rpsize);
		if (ret) {
			uint64_t reason = PF_POPULATE | PF_WRITE | PF_USER;
			void *addr;

			if (faulted) {
				ret = -EFAULT;
				goto out;
			}

			/* Fault in pages */
			for (addr = (void *)
				     (((unsigned long)remote_iov[ri].iov_base +
				       roff) & PAGE_MASK);
			     addr < (remote_iov[ri].iov_base + roff + to_copy);
			     addr += PAGE_SIZE) {
				ret = page_fault_process_vm(rvm, addr, reason);
				if (ret) {
					ret = -EFAULT;
					goto out;
				}
			}

			faulted = 1;
			goto retry_rlookup;
		}

		rpage_left = ((((unsigned long)remote_iov[ri].iov_base + roff +
				rpsize) & ~(rpsize - 1)) -
			      ((unsigned long)remote_iov[ri].iov_base + roff));
		if (rpage_left < to_copy) {
			to_copy = rpage_left;
		}

		rva = phys_to_virt(rphys);

		fast_memcpy((op == PROCESS_VM_READ) ? lva : rva,
			    (op == PROCESS_VM_READ) ? rva : lva,
			    to_copy);

		copied += to_copy;
		dkprintf("local_iov[%d]: 0x%lx %s remote_iov[%d]: 0x%lx, %lu copied, rpsize: %lu, rpage_left: %lu\n",
			 li, local_iov[li].iov_base + loff,
			 (op == PROCESS_VM_READ) ? "<-" : "->",
			 ri, remote_iov[ri].iov_base + roff,
			 to_copy, rpsize, rpage_left);

		loff += to_copy;
		roff += to_copy;

		if (loff == local_iov[li].iov_len) {
			li++;
			loff = 0;
		}

		if (roff == remote_iov[ri].iov_len) {
			ri++;
			roff = 0;
		}
	}

	release_process_vm(rvm);
	return copied;

out:
	if (rvm)
		release_process_vm(rvm);
	return ret;
}
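/*
 * Usage sketch, added for exposition: this helper implements both
 * process_vm_readv(2) and process_vm_writev(2); op selects the copy
 * direction.  A hedged user-space example reading len bytes from a
 * remote address:
 */
#if 0
	struct iovec local = { .iov_base = buf, .iov_len = len };
	struct iovec remote = { .iov_base = (void *)remote_addr,
				.iov_len = len };

	ssize_t n = process_vm_readv(pid, &local, 1, &remote, 1, 0);
#endif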
int move_pages_smp_handler(int cpu_index, int nr_cpus, void *arg)
{
	int i, i_s, i_e, phase = 1;
	struct move_pages_smp_req *mpsr =
		(struct move_pages_smp_req *)arg;
	struct process_vm *vm = mpsr->proc->vm;
	int count = mpsr->count;
	struct page_table *save_pt;
	extern struct page_table *get_init_page_table(void);

	i_s = (count / nr_cpus) * cpu_index;
	i_e = i_s + (count / nr_cpus);
	if (cpu_index == (nr_cpus - 1)) {
		i_e = count;
	}

	/* Load target process' PT so that we can access user-space */
	save_pt = cpu_local_var(current) == &cpu_local_var(idle) ?
		get_init_page_table() :
		cpu_local_var(current)->vm->address_space->page_table;

	if (save_pt != vm->address_space->page_table) {
		ihk_mc_load_page_table(vm->address_space->page_table);
	}
	else {
		save_pt = NULL;
	}

	if (nr_cpus == 1) {
		switch (cpu_index) {
		case 0:
			memcpy(mpsr->virt_addr, mpsr->user_virt_addr,
			       sizeof(void *) * count);
			if (mpsr->user_nodes) {
				memcpy(mpsr->nodes, mpsr->user_nodes,
				       sizeof(int) * count);
			}
			memset(mpsr->ptep, 0, sizeof(pte_t) * count);
			memset(mpsr->status, 0, sizeof(int) * count);
			memset(mpsr->nr_pages, 0, sizeof(int) * count);
			memset(mpsr->dst_phys, 0, sizeof(unsigned long) * count);
			mpsr->nodes_ready = 1;
			break;
		default:
			break;
		}
	}
	else if (nr_cpus > 1 && nr_cpus < 4) {
		switch (cpu_index) {
		case 0:
			memcpy(mpsr->virt_addr, mpsr->user_virt_addr,
			       sizeof(void *) * count);
			if (mpsr->user_nodes) {
				memcpy(mpsr->nodes, mpsr->user_nodes,
				       sizeof(int) * count);
			}
			mpsr->nodes_ready = 1;
			break;
		case 1:
			memset(mpsr->ptep, 0, sizeof(pte_t) * count);
			memset(mpsr->status, 0, sizeof(int) * count);
			memset(mpsr->nr_pages, 0, sizeof(int) * count);
			memset(mpsr->dst_phys, 0, sizeof(unsigned long) * count);
			break;
		default:
			break;
		}
	}
	else if (nr_cpus >= 4 && nr_cpus < 7) {
		switch (cpu_index) {
		case 0:
			memcpy(mpsr->virt_addr, mpsr->user_virt_addr,
			       sizeof(void *) * count);
			break;
		case 1:
			if (mpsr->user_nodes) {
				memcpy(mpsr->nodes, mpsr->user_nodes,
				       sizeof(int) * count);
			}
			mpsr->nodes_ready = 1;
			break;
		case 2:
			memset(mpsr->ptep, 0, sizeof(pte_t) * count);
			memset(mpsr->status, 0, sizeof(int) * count);
			break;
		case 3:
			memset(mpsr->nr_pages, 0, sizeof(int) * count);
			memset(mpsr->dst_phys, 0, sizeof(unsigned long) * count);
			break;
		default:
			break;
		}
	}
	else {
		switch (cpu_index) {
		case 0:
			memcpy(mpsr->virt_addr, mpsr->user_virt_addr,
			       sizeof(void *) * (count / 2));
			break;
		case 1:
			memcpy(mpsr->virt_addr + (count / 2),
			       mpsr->user_virt_addr + (count / 2),
			       sizeof(void *) * (count / 2));
			break;
		case 2:
			if (mpsr->user_nodes) {
				memcpy(mpsr->nodes, mpsr->user_nodes,
				       sizeof(int) * count);
			}
			mpsr->nodes_ready = 1;
			break;
		case 3:
			memset(mpsr->ptep, 0, sizeof(pte_t) * count);
			break;
		case 4:
			memset(mpsr->status, 0, sizeof(int) * count);
			break;
		case 5:
			memset(mpsr->nr_pages, 0, sizeof(int) * count);
			break;
		case 6:
			memset(mpsr->dst_phys, 0, sizeof(unsigned long) * count);
			break;
		default:
			break;
		}
	}

	while (!(volatile int)mpsr->nodes_ready) {
		cpu_pause();
	}

	/* NUMA verification in parallel */
	if (mpsr->user_nodes) {
		for (i = i_s; i < i_e; i++) {
			if (mpsr->nodes[i] < 0 ||
			    mpsr->nodes[i] >= ihk_mc_get_nr_numa_nodes() ||
			    !test_bit(mpsr->nodes[i],
				      mpsr->proc->vm->numa_mask)) {
				mpsr->phase_ret = -EINVAL;
				break;
			}
		}
	}

	/* Barrier */
	ihk_atomic_inc(&mpsr->phase_done);
	while (ihk_atomic_read(&mpsr->phase_done) < (phase * nr_cpus)) {
		cpu_pause();
	}

	if (mpsr->phase_ret != 0) {
		goto out;
	}

	dkprintf("%s: phase %d done\n", __FUNCTION__, phase);
	++phase;
	/* PTE lookup in parallel */
	for (i = i_s; i < i_e; i++) {
		void *phys;
		size_t pgsize;
		int p2align;

		/*
		 * XXX: No page structures for anonymous mappings.
		 * Look up physical addresses by scanning page tables.
		 */
		mpsr->ptep[i] = ihk_mc_pt_lookup_pte(vm->address_space->page_table,
						     (void *)mpsr->virt_addr[i],
						     0, &phys, &pgsize, &p2align);

		/* PTE valid? */
		if (!mpsr->ptep[i] || !pte_is_present(mpsr->ptep[i])) {
			mpsr->status[i] = -EFAULT;
			mpsr->ptep[i] = NULL;
			continue;
		}

		/* PTE is file? */
		if (pte_is_fileoff(mpsr->ptep[i], PAGE_SIZE)) {
			mpsr->status[i] = -EINVAL;
			mpsr->ptep[i] = NULL;
			continue;
		}

		dkprintf("%s: virt 0x%lx:%lu requested to be moved to node %d\n",
			 __FUNCTION__, mpsr->virt_addr[i], pgsize, mpsr->nodes[i]);

		/* Large page? */
		if (pgsize > PAGE_SIZE) {
			int nr_sub_pages = (pgsize / PAGE_SIZE);
			int j;

			if (i + nr_sub_pages > count) {
				kprintf("%s: ERROR: page at index %d exceeds the region\n",
					__FUNCTION__, i);
				mpsr->status[i] = -EINVAL;
				break;
			}

			/* Is it contiguous across nr_sub_pages and all
			 * requested to be moved to the same target node?
			 */
			for (j = 0; j < nr_sub_pages; ++j) {
				if (mpsr->virt_addr[i + j] !=
				    (mpsr->virt_addr[i] + (j * PAGE_SIZE)) ||
				    mpsr->nodes[i] != mpsr->nodes[i + j]) {
					kprintf("%s: ERROR: virt address or node at index %d"
						" is inconsistent\n",
						__FUNCTION__, i + j);
					mpsr->phase_ret = -EINVAL;
					goto pte_out;
				}
			}

			mpsr->nr_pages[i] = nr_sub_pages;
			i += (nr_sub_pages - 1);
		}
		else {
			mpsr->nr_pages[i] = 1;
		}
	}

pte_out:
	/* Barrier */
	ihk_atomic_inc(&mpsr->phase_done);
	while (ihk_atomic_read(&mpsr->phase_done) < (phase * nr_cpus)) {
		cpu_pause();
	}

	if (mpsr->phase_ret != 0) {
		goto out;
	}

	dkprintf("%s: phase %d done\n", __FUNCTION__, phase);
	++phase;

	/*
	 * When the nodes array is NULL, move_pages doesn't move any pages;
	 * instead it reports the node where each page currently resides
	 * through the status array.
	 */
	if (!mpsr->user_nodes) {
		/* get nid in parallel */
		for (i = i_s; i < i_e; i++) {
			if (mpsr->status[i] < 0) {
				continue;
			}
			mpsr->status[i] = phys_to_nid(
				pte_get_phys(mpsr->ptep[i]));
		}
		mpsr->phase_ret = 0;
		goto out;	/* return node information */
	}

	/* Processing of move pages */
	if (cpu_index == 0) {
		/* Allocate new pages on target NUMA nodes */
		for (i = 0; i < count; i++) {
			int pgalign = 0;
			int j;
			void *dst;

			if (!mpsr->ptep[i] || mpsr->status[i] < 0 ||
			    !mpsr->nr_pages[i])
				continue;

			/* TODO: store pgalign info in an array as well? */
			if (mpsr->nr_pages[i] > 1) {
				int nr_pages;

				for (pgalign = 0, nr_pages = mpsr->nr_pages[i];
				     nr_pages != 1;
				     pgalign++, nr_pages >>= 1) {
				}
			}

			dst = ihk_mc_alloc_aligned_pages_node(mpsr->nr_pages[i],
							      pgalign,
							      IHK_MC_AP_USER,
							      mpsr->nodes[i]);
			if (!dst) {
				mpsr->status[i] = -ENOMEM;
				continue;
			}

			for (j = i; j < (i + mpsr->nr_pages[i]); ++j) {
				mpsr->status[j] = mpsr->nodes[i];
			}

			mpsr->dst_phys[i] = virt_to_phys(dst);

			dkprintf("%s: virt 0x%lx:%lu to node %d, pgalign: %d,"
				 " allocated phys: 0x%lx\n",
				 __FUNCTION__, mpsr->virt_addr[i],
				 mpsr->nr_pages[i] * PAGE_SIZE,
				 mpsr->nodes[i], pgalign, mpsr->dst_phys[i]);
		}
	}

	/* Barrier */
	ihk_atomic_inc(&mpsr->phase_done);
	while (ihk_atomic_read(&mpsr->phase_done) < (phase * nr_cpus)) {
		cpu_pause();
	}

	if (mpsr->phase_ret != 0) {
		goto out;
	}

	dkprintf("%s: phase %d done\n", __FUNCTION__, phase);
	++phase;

	/* Copy, PTE update, memfree in parallel */
	for (i = i_s; i < i_e; ++i) {
		if (!mpsr->dst_phys[i])
			continue;

		fast_memcpy(phys_to_virt(mpsr->dst_phys[i]),
			    phys_to_virt(pte_get_phys(mpsr->ptep[i])),
			    mpsr->nr_pages[i] * PAGE_SIZE);

		ihk_mc_free_pages(
			phys_to_virt(pte_get_phys(mpsr->ptep[i])),
			mpsr->nr_pages[i]);

		pte_update_phys(mpsr->ptep[i], mpsr->dst_phys[i]);

		dkprintf("%s: virt 0x%lx:%lu copied and remapped to phys: 0x%lu\n",
			 __FUNCTION__, mpsr->virt_addr[i],
			 mpsr->nr_pages[i] * PAGE_SIZE, mpsr->dst_phys[i]);
	}

	/* XXX: do a separate SMP call with only CPUs running threads
	 * of this process?
	 */
	if (cpu_local_var(current)->proc == mpsr->proc) {
		/* Invalidate all TLBs */
		for (i = 0; i < mpsr->count; i++) {
			if (!mpsr->dst_phys[i])
				continue;

			flush_tlb_single((unsigned long)mpsr->virt_addr[i]);
		}
	}

out:
	if (save_pt) {
		ihk_mc_load_page_table(save_pt);
	}

	return mpsr->phase_ret;
}
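/*
 * Note added for exposition: move_pages_smp_handler() runs concurrently
 * on nr_cpus CPUs and advances through phases (argument copy-in, NUMA
 * validation, PTE lookup, allocation on the target node, copy/remap).
 * The counting barrier between phases works because each CPU increments
 * phase_done exactly once per phase: after phase N the counter reaches
 * N * nr_cpus, and no CPU proceeds until it does, so the work of phase
 * N is complete before any CPU starts phase N+1.
 */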
time_t time(void)
{
	struct timespec ats;
	time_t ret = 0;

	if (gettime_local_support) {
		calculate_time_from_tsc(&ats);
		ret = ats.tv_sec;
	}

	return ret;
}

SYSCALL_DECLARE(time)
{
	return time();
}

void calculate_time_from_tsc(struct timespec *ts)
{
	long ver;
	unsigned long current_tsc;
	time_t sec_delta;
	long ns_delta;

	for (;;) {
		while ((ver = ihk_atomic64_read(&tod_data.version)) & 1) {
			/* settimeofday() is in progress */
			cpu_pause();
		}
		rmb();	/* fetch version before time */
		*ts = tod_data.origin;
		rmb();	/* fetch time before checking version */
		if (ver == ihk_atomic64_read(&tod_data.version)) {
			break;
		}
		/* settimeofday() has intervened */
		cpu_pause();
	}

	current_tsc = rdtsc();
	sec_delta = current_tsc / tod_data.clocks_per_sec;
	ns_delta = NS_PER_SEC * (current_tsc % tod_data.clocks_per_sec) /
		tod_data.clocks_per_sec;
	/* calc. of ns_delta overflows if clocks_per_sec exceeds 18.44 GHz */

	ts->tv_sec += sec_delta;
	ts->tv_nsec += ns_delta;
	if (ts->tv_nsec >= NS_PER_SEC) {
		ts->tv_nsec -= NS_PER_SEC;
		++ts->tv_sec;
	}
}

extern void ptrace_syscall_event(struct thread *thread);

long arch_ptrace_syscall_event(struct thread *thread,
			       ihk_mc_user_context_t *ctx, long setret)
{
	ptrace_syscall_event(thread);
	return setret;
}

/*** End of File ***/