/* syscall.c COPYRIGHT FUJITSU LIMITED 2015-2019 */
/**
 * \file syscall.c
 *  License details are found in the file LICENSE.
 * \brief
 *  system call handlers
 * \author Taku Shimosawa <shimosawa@is.s.u-tokyo.ac.jp> \par
 *	Copyright (C) 2011 - 2012 Taku Shimosawa
 * \author Balazs Gerofi <bgerofi@riken.jp> \par
 *	Copyright (C) 2012 RIKEN AICS
 * \author Masamichi Takagi <m-takagi@ab.jp.nec.com> \par
 *	Copyright (C) 2012 - 2013 NEC Corporation
 * \author Min Si <msi@is.s.u-tokyo.ac.jp> \par
 *	Copyright (C) 2012 Min Si
 * \author Balazs Gerofi <bgerofi@is.s.u-tokyo.ac.jp> \par
 *	Copyright (C) 2013 The University of Tokyo
 * \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com> \par
 *	Copyright (C) 2013 Hitachi, Ltd.
 * \author Tomoki Shirasawa <tomoki.shirasawa.kk@hitachi-solutions.com> \par
 *	Copyright (C) 2013 Hitachi, Ltd.
 */
/*
 * HISTORY:
 */

#include <types.h>
#include <kmsg.h>
#include <ihk/cpu.h>
#include <cpulocal.h>
#include <ihk/mm.h>
#include <ihk/ikc.h>
#include <errno.h>
#include <cls.h>
#include <syscall.h>
#include <page.h>
#include <amemcpy.h>
#include <uio.h>
#include <ihk/lock.h>
#include <ctype.h>
#include <waitq.h>
#include <rlimit.h>
#include <affinity.h>
#include <time.h>
#include <ihk/perfctr.h>
#include <mman.h>
#include <kmalloc.h>
#include <memobj.h>
#include <shm.h>
#include <prio.h>
#include <arch/cpu.h>
#include <limits.h>
#include <mc_perf_event.h>
#include <march.h>
#include <process.h>
#include <bitops.h>
#include <bitmap.h>
#include <xpmem.h>
#include <rusage_private.h>
#include <ihk/monitor.h>
#include <profile.h>
#include <ihk/debug.h>
#include "../executer/include/uti.h"

/* Headers taken from kitten LWK */
#include <lwk/stddef.h>
#include <futex.h>

#define SYSCALL_BY_IKC

//#define DEBUG_PRINT_SC

#ifdef DEBUG_PRINT_SC
#undef DDEBUG_DEFAULT
#define DDEBUG_DEFAULT DDEBUG_PRINT
#endif

//static ihk_atomic_t pid_cnt = IHK_ATOMIC_INIT(1024);

/* generate system call handler's prototypes */
#define SYSCALL_HANDLED(number,name) extern long sys_##name(int n, ihk_mc_user_context_t *ctx);
#define SYSCALL_DELEGATED(number,name)
#include <syscall_list.h>
#undef SYSCALL_HANDLED
#undef SYSCALL_DELEGATED

/* generate syscall_table[] */
static long (*syscall_table[])(int, ihk_mc_user_context_t *) = {
#define SYSCALL_HANDLED(number,name) [number] = &sys_##name,
#define SYSCALL_DELEGATED(number,name)
#include <syscall_list.h>
#undef SYSCALL_HANDLED
#undef SYSCALL_DELEGATED
};

/* generate syscall_name[] */
#define MCKERNEL_UNUSED __attribute__ ((unused))
char *syscall_name[] MCKERNEL_UNUSED = {
#define DECLARATOR(number,name) [number] = #name,
#define SYSCALL_HANDLED(number,name) DECLARATOR(number,#name)
#define SYSCALL_DELEGATED(number,name) DECLARATOR(number,#name)
#include <syscall_list.h>
#undef DECLARATOR
#undef SYSCALL_HANDLED
#undef SYSCALL_DELEGATED
};
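
/*
 * Illustrative note (an assumption about syscall_list.h, not an excerpt
 * from it): the tables above are generated with an X-macro pattern.  A
 * hypothetical pair of entries such as
 *
 *	SYSCALL_HANDLED(61, wait4)
 *	SYSCALL_DELEGATED(63, uname)
 *
 * would expand here into a prototype "extern long sys_wait4(...)" and a
 * table slot "[61] = &sys_wait4,", while the delegated call gets no local
 * handler and is offloaded to the Linux side.  Both kinds still receive
 * an entry in syscall_name[].
 */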

static ihk_spinlock_t tod_data_lock = SPIN_LOCK_UNLOCKED;
static unsigned long uti_desc; /* Address of struct uti_desc object in syscall_intercept.c */

void save_syscall_return_value(int num, unsigned long rc);
extern long alloc_debugreg(struct thread *thread);
extern int num_processors;
extern unsigned long ihk_mc_get_ns_per_tsc(void);
extern int ptrace_detach(int pid, int data);
extern void debug_log(unsigned long);
extern long arch_ptrace(long request, int pid, long addr, long data);
extern struct cpu_local_var *clv;

int prepare_process_ranges_args_envs(struct thread *thread,
		struct program_load_desc *pn,
		struct program_load_desc *p,
		enum ihk_mc_pt_attribute attr,
		char *args, int args_len,
		char *envs, int envs_len);

#ifdef DCFA_KMOD
static void do_mod_exit(int status);
#endif

/* Size of tid table. It needs to be more than #CPUs when CPU
 * oversubscription is needed. The examples of CPU oversubscription are:
 * (1) pmi_proxy + gdb + #CPU OMP threads
 * (2) pmi_proxy + #CPU OMP threads + POSIX AIO IO + POSIX AIO notification
 */
#define NR_TIDS (allow_oversubscribe ? (num_processors * 2) : num_processors)

long (*linux_wait_event)(void *_resp, unsigned long nsec_timeout);
int (*linux_printk)(const char *fmt, ...);
int (*linux_clock_gettime)(clockid_t clk_id, struct timespec *tp);

static void send_syscall(struct syscall_request *req, int cpu,
		struct syscall_response *res)
{
	struct ikc_scd_packet packet IHK_DMA_ALIGN;
	struct ihk_ikc_channel_desc *syscall_channel = get_cpu_local_var(cpu)->ikc2linux;
	int ret;

	res->status = 0;
	req->valid = 0;

	memcpy(&packet.req, req, sizeof(*req));

	barrier();
	smp_store_release(&packet.req.valid, 1);

#ifdef SYSCALL_BY_IKC
	packet.msg = SCD_MSG_SYSCALL_ONESIDE;
	packet.ref = cpu;
	packet.pid = cpu_local_var(current)->proc->pid;
	packet.resp_pa = virt_to_phys(res);
	dkprintf("send syscall, nr: %d, pid: %d\n", req->number, packet.pid);

	ret = ihk_ikc_send(syscall_channel, &packet, 0);
	if (ret < 0) {
		kprintf("ERROR: sending IKC msg, ret: %d\n", ret);
	}
#endif
}

long do_syscall(struct syscall_request *req, int cpu)
{
	struct syscall_response res;
	long rc;
	struct thread *thread = cpu_local_var(current);
	struct ihk_os_cpu_monitor *monitor = cpu_local_var(monitor);
	int mstatus = 0;

#ifdef PROFILE_ENABLE
	/* We cannot use thread->profile_start_ts here because the
	 * caller may be utilizing it already */
	unsigned long t_s = 0;
	if (thread->profile) {
		t_s = rdtsc();
	}
#endif // PROFILE_ENABLE

	dkprintf("SC(%d)[%3d] sending syscall\n",
			ihk_mc_get_processor_id(),
			req->number);

	mstatus = monitor->status;
	monitor->status = IHK_OS_MONITOR_KERNEL_OFFLOAD;

	barrier();

	if (req->number != __NR_exit_group) {
		++thread->in_syscall_offload;
	}

#ifdef ENABLE_FUGAKU_HACKS
#if 0
	if (req->number == __NR_write && req->args[0] == 1) {
		return req->args[2];
	}
#endif
#endif

	/* The current thread is the requester */
	req->rtid = cpu_local_var(current)->tid;

	if (req->number == __NR_sched_setaffinity && req->args[0] == 0) {
		/* mcexec thread serving migrate-to-Linux request must have
		   the same tid as the requesting McKernel thread because the
		   serving thread jumps to hfi driver and then jumps to
		   rus_vm_fault() without registering it into per thread data
		   by mcctrl_add_per_thread_data()). */
		req->ttid = cpu_local_var(current)->tid/*0*/;
		dkprintf("%s: uti, ttid=%d\n", __FUNCTION__, req->ttid);
	} else {
		/* Any thread from the pool may serve the request */
		req->ttid = 0;
	}
	res.req_thread_status = IHK_SCD_REQ_THREAD_SPINNING;
#ifdef ENABLE_TOFU
	res.pde_data = NULL;
#endif
	send_syscall(req, cpu, &res);

	if (req->rtid == -1) {
		preempt_disable();
	}

	dkprintf("%s: syscall num: %d waiting for Linux.. \n",
			__FUNCTION__, req->number);

#define STATUS_IN_PROGRESS	0
#define STATUS_COMPLETED	1
#define STATUS_PAGE_FAULT	3
#define STATUS_SYSCALL		4
#define __NR_syscall_response	8001
	while (smp_load_acquire(&res.status) != STATUS_COMPLETED) {
		while (smp_load_acquire(&res.status) == STATUS_IN_PROGRESS) {
			struct cpu_local_var *v;
			int do_schedule = 0;
			long runq_irqstate;
			unsigned long flags;
			DECLARE_WAITQ_ENTRY(scd_wq_entry, cpu_local_var(current));

#ifdef ENABLE_FUGAKU_HACKS
			if (req->number == __NR_epoll_wait ||
					req->number == __NR_epoll_pwait)
				goto schedule;
#endif

			if (thread->rpf_backlog) {
				void (*func)(void *) = thread->rpf_backlog;
				void *arg = thread->rpf_arg;

				thread->rpf_backlog = NULL;
				thread->rpf_arg = NULL;
				func(arg);
				kfree(arg);
			}

			check_sig_pending();
			cpu_pause();

			/* Spin if not preemptable */
			if (ihk_atomic_read(&cpu_local_var(no_preempt))
					|| !thread->tid) {
				continue;
			}

			/* Spin by default, but if re-schedule is requested let
			 * the other thread run */
			runq_irqstate = cpu_disable_interrupt_save();
			ihk_mc_spinlock_lock_noirq(
				&(get_this_cpu_local_var()->runq_lock));
			v = get_this_cpu_local_var();

			if (v->flags & CPU_FLAG_NEED_RESCHED ||
					v->runq_len > 1 ||
					req->number == __NR_sched_setaffinity) {
				v->flags &= ~CPU_FLAG_NEED_RESCHED;
				do_schedule = 1;
			}

			ihk_mc_spinlock_unlock_noirq(&v->runq_lock);
			cpu_restore_interrupt(runq_irqstate);

			if (!do_schedule) {
				ihk_numa_zero_free_pages(ihk_mc_get_numa_node_by_distance(0));
				continue;
			}

#ifdef ENABLE_FUGAKU_HACKS
schedule:
#endif
			flags = cpu_disable_interrupt_save();

			/* Try to sleep until notified */
			if (smp_load_acquire(&res.req_thread_status) ==
					IHK_SCD_REQ_THREAD_DESCHEDULED ||
					(cmpxchg(&res.req_thread_status,
						IHK_SCD_REQ_THREAD_SPINNING,
						IHK_SCD_REQ_THREAD_DESCHEDULED) ==
					 IHK_SCD_REQ_THREAD_SPINNING)) {
				dkprintf("%s: tid %d waiting for syscall reply...\n",
						__FUNCTION__, thread->tid);
				waitq_init(&thread->scd_wq);
				waitq_prepare_to_wait(&thread->scd_wq, &scd_wq_entry,
						PS_INTERRUPTIBLE);
				cpu_restore_interrupt(flags);
				schedule();
				waitq_finish_wait(&thread->scd_wq, &scd_wq_entry);
				continue;
			}
			else {
				if (do_schedule) {
					runq_irqstate =
						ihk_mc_spinlock_lock(
							&v->runq_lock);
					v->flags |= CPU_FLAG_NEED_RESCHED;
					ihk_mc_spinlock_unlock(
						&v->runq_lock, runq_irqstate);
				}
			}

			cpu_restore_interrupt(flags);
		}

		if (smp_load_acquire(&res.status) == STATUS_SYSCALL) {
			struct syscall_request *requestp;
			struct syscall_request request;
			int num;
			ihk_mc_user_context_t ctx;
			int ns;
			unsigned long syscall_ret;
			unsigned long phys;
			struct syscall_request req2 IHK_DMA_ALIGN; /* debug */

			phys = ihk_mc_map_memory(NULL, res.fault_address,
					sizeof(struct syscall_request));
			requestp = ihk_mc_map_virtual(phys, 1,
					PTATTR_WRITABLE | PTATTR_ACTIVE);
			memcpy(&request, requestp, sizeof request);
			ihk_mc_unmap_virtual(requestp, 1);
			ihk_mc_unmap_memory(NULL, phys,
					sizeof(struct syscall_request));
			num = request.number;

			if (num == __NR_rt_sigaction) {
				int sig = request.args[0];
				struct thread *thread = cpu_local_var(current);

				sig--;
				if (sig < 0 || sig >= _NSIG)
					syscall_ret = -EINVAL;
				else
					syscall_ret = (unsigned long)thread->
						sigcommon->action[sig].
						sa.sa_handler;
			}
			else {
				ns = (sizeof syscall_table /
						sizeof syscall_table[0]);
				if (num >= 0 && num < ns &&
						syscall_table[num]) {
					ihk_mc_syscall_arg0(&ctx) =
						request.args[0];
					ihk_mc_syscall_arg1(&ctx) =
						request.args[1];
					ihk_mc_syscall_arg2(&ctx) =
						request.args[2];
					ihk_mc_syscall_arg3(&ctx) =
						request.args[3];
					ihk_mc_syscall_arg4(&ctx) =
						request.args[4];
					ihk_mc_syscall_arg5(&ctx) =
						request.args[5];
					syscall_ret = syscall_table[num](num,
							&ctx);
				}
				else
					syscall_ret = -ENOSYS;
			}

			/* send result */
			req2.number = __NR_syscall_response;
			req2.args[1] = syscall_ret;
			/* The current thread is the requester and only the waiting thread
			 * may serve the request */
			req2.rtid = cpu_local_var(current)->tid;
			req2.ttid = res.stid;

			res.req_thread_status = IHK_SCD_REQ_THREAD_SPINNING;
			send_syscall(&req2, cpu, &res);
		}
	}
	if (req->rtid == -1) {
		preempt_enable();
	}

	dkprintf("%s: syscall num: %d got host reply: %d \n",
			__FUNCTION__, req->number, res.ret);

	rc = res.ret;

#ifdef ENABLE_TOFU
	if ((req->number == __NR_ioctl && rc == 0) ||
			(req->number == __NR_openat && rc > 0)) {
		int fd = req->number == __NR_ioctl ? req->args[0] : rc;
		char *path = req->number == __NR_ioctl ?
			thread->proc->fd_path[fd] : thread->fd_path_in_open;

		if (cpu_local_var(current)->proc->enable_tofu &&
				res.pde_data &&
				fd < MAX_FD_PDE &&
				!thread->proc->fd_pde_data[fd] &&
				!strncmp(path, "/proc/tofu/dev/", 15)) {
			unsigned long irqstate;

			irqstate = ihk_mc_spinlock_lock(&thread->proc->mckfd_lock);
			thread->proc->fd_pde_data[fd] = res.pde_data;
			ihk_mc_spinlock_unlock(&thread->proc->mckfd_lock, irqstate);

			dkprintf("%s: PID: %d, ioctl fd: %d, filename: "
					"%s, pde_data: 0x%lx\n",
					__FUNCTION__,
					thread->proc->pid,
					fd,
					path,
					res.pde_data);
		}
	}
#endif

	if (req->number != __NR_exit_group) {
		--thread->in_syscall_offload;
	}

	/* -ERESTARTSYS indicates that the proxy process is gone
	 * and the application should be terminated */
	if (rc == -ERESTARTSYS) {
		dkprintf("%s: proxy PID %d is dead, terminate()\n",
				__FUNCTION__, thread->proc->pid);
		thread->proc->nohost = 1;
	}

#ifdef PROFILE_ENABLE
	if (req->number < PROFILE_SYSCALL_MAX) {
		profile_event_add(profile_syscall2offload(req->number),
				(rdtsc() - t_s));
	}
	else {
		dkprintf("%s: offload syscall > %d ?? : %d\n",
				__FUNCTION__, PROFILE_SYSCALL_MAX, req->number);
	}
#endif // PROFILE_ENABLE

	monitor->status = mstatus;
	monitor->counter++;
	return rc;
}

long syscall_generic_forwarding(int n, ihk_mc_user_context_t *ctx)
{
	SYSCALL_HEADER;
	dkprintf("syscall_generic_forwarding(%d)\n", n);
	SYSCALL_ARGS_6(D,D,D,D,D,D);
	SYSCALL_FOOTER;
}
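
/*
 * Illustrative sketch (an assumption, not code from this file): a system
 * call that has no McKernel-side implementation can forward its arguments
 * to Linux verbatim through the function above.  A hypothetical handler
 * would look like:
 *
 *	SYSCALL_DECLARE(some_delegated_call)
 *	{
 *		return syscall_generic_forwarding(__NR_some_delegated_call, ctx);
 *	}
 *
 * The SYSCALL_HEADER / SYSCALL_ARGS_6 / SYSCALL_FOOTER macros presumably
 * build a struct syscall_request from ctx and hand it to do_syscall(),
 * which performs the IKC offload handshake shown above.
 */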

static int wait_zombie(struct thread *thread, struct process *child, int *status, int options) {
	int ret;
	struct syscall_request request IHK_DMA_ALIGN;

	dkprintf("wait_zombie,found PS_ZOMBIE process: %d\n", child->pid);

	if (status) {
		*status = child->group_exit_status;
	}

	if (child->ppid_parent->pid != thread->proc->pid || child->nowait)
		return child->pid;
	request.number = __NR_wait4;
	request.args[0] = child->pid;
	request.args[1] = 0;
	request.args[2] = options;
	/* Ask host to clean up exited child */
	ret = do_syscall(&request, ihk_mc_get_processor_id());

	if (ret != child->pid)
		kprintf("WARNING: host waitpid failed?\n");
	dkprintf("wait_zombie,child->pid=%d,status=%08x\n",
			child->pid, status ? *status : -1);

	return ret;
}

static int wait_stopped(struct thread *thread, struct process *child, struct thread *c_thread, int *status, int options)
{
	dkprintf("wait_stopped,proc->pid=%d,child->pid=%d,options=%08x\n",
			thread->proc->pid, child->pid, options);
	int ret;

	if (c_thread) {
		/* Skip this process because exit_status has been reaped. */
		if (!c_thread->exit_status) {
			ret = 0;
			goto out;
		}

		/* TODO: define 0x7f in kernel/include/process.h */
		if (status) {
			*status = (c_thread->exit_status << 8) | 0x7f;
		}

		/* Reap exit_status. signal_flags is reaped on receiving */
		/* signal in do_kill(). */
		if (!(options & WNOWAIT)) {
			c_thread->exit_status = 0;
		}
	}
	else if (child->status & (PS_STOPPED | PS_DELAY_STOPPED)) {
		/* Skip this process because exit_status has been reaped. */
		if (!child->group_exit_status) {
			ret = 0;
			goto out;
		}

		/* TODO: define 0x7f in kernel/include/process.h */
		if (status) {
			*status = (child->group_exit_status << 8) | 0x7f;
		}

		/* Reap exit_status. signal_flags is reaped on receiving */
		/* signal in do_kill(). */
		if (!(options & WNOWAIT)) {
			child->group_exit_status = 0;
		}
	}
	else {
		/* Skip this process because exit_status has been reaped. */
		if (!child->main_thread->exit_status) {
			ret = 0;
			goto out;
		}

		/* TODO: define 0x7f in kernel/include/process.h */
		if (status) {
			*status = (child->main_thread->exit_status << 8) | 0x7f;
		}

		/* Reap exit_status. signal_flags is reaped on receiving */
		/* signal in do_kill(). */
		if (!(options & WNOWAIT)) {
			child->main_thread->exit_status = 0;
		}
	}

	dkprintf("wait_stopped,child->pid=%d,status=%08x\n",
			child->pid, status ? *status : -1);
	ret = c_thread ? c_thread->tid : child->pid;
out:
	return ret;
}

static int wait_continued(struct thread *thread, struct process *child,
		struct thread *c_thread, int *status, int options)
{
	int ret;

	if (status) {
		*status = 0xffff;
	}

	/* Reap signal_flags */
	if (!(options & WNOWAIT)) {
		if (c_thread)
			c_thread->signal_flags &= ~SIGNAL_STOP_CONTINUED;
		else
			child->main_thread->signal_flags &=
				~SIGNAL_STOP_CONTINUED;
	}

	dkprintf("wait4,SIGNAL_STOP_CONTINUED,pid=%d,status=%08x\n",
			child->pid, status ? *status : -1);
	ret = c_thread ? c_thread->tid : child->pid;
	return ret;
}

static void
thread_exit_signal(struct thread *thread)
{
	int sig;
	struct siginfo info;
	int error;
	struct timespec ats;

	if (thread->report_proc == NULL) {
		return;
	}

	if (thread->ptrace)
		sig = SIGCHLD;
	else
		sig = thread->termsig;
	memset(&info, '\0', sizeof(info));
	info.si_signo = sig;
	info.si_code = (thread->exit_status & 0x7f) ?
		((thread->exit_status & 0x80) ?
		 CLD_DUMPED : CLD_KILLED) : CLD_EXITED;
	info._sifields._sigchld.si_pid = thread->tid;
	info._sifields._sigchld.si_status = thread->exit_status;
	tsc_to_ts(thread->user_tsc, &ats);
	info._sifields._sigchld.si_utime = timespec_to_jiffy(&ats);
	tsc_to_ts(thread->system_tsc, &ats);
	info._sifields._sigchld.si_stime = timespec_to_jiffy(&ats);
	error = do_kill(NULL, thread->report_proc->pid, -1, sig, &info, 0);
	dkprintf("terminate,klll %d,error=%d\n", sig, error);
	/* Wake parent (if sleeping in wait4()) */
	waitq_wakeup(&thread->report_proc->waitpid_q);
}

static void
finalize_process(struct process *proc)
{
	struct resource_set *resource_set = cpu_local_var(resource_set);
	struct process *pid1 = resource_set->pid1;
	int exit_status = proc->group_exit_status;
	struct mcs_rwlock_node updatelock;

	mcs_rwlock_writer_lock_noirq(&proc->update_lock, &updatelock);
	// Send signal to parent
	if (proc->parent == pid1) {
		proc->status = PS_ZOMBIE;
		mcs_rwlock_writer_unlock_noirq(&proc->update_lock, &updatelock);
		release_process(proc);
	}
	else {
		proc->status = PS_ZOMBIE;
		mcs_rwlock_writer_unlock_noirq(&proc->update_lock, &updatelock);

		dkprintf("terminate,wakeup\n");

		/* Signal parent if still attached */
		if (proc->termsig != 0) {
			struct siginfo info;
			int error;

			memset(&info, '\0', sizeof info);
			info.si_signo = SIGCHLD;
			info.si_code = (exit_status & 0x7f)?
				((exit_status & 0x80)?
				 CLD_DUMPED: CLD_KILLED): CLD_EXITED;
			info._sifields._sigchld.si_pid = proc->pid;
			info._sifields._sigchld.si_status = exit_status;
			info._sifields._sigchld.si_utime =
				timespec_to_jiffy(&proc->utime);
			info._sifields._sigchld.si_stime =
				timespec_to_jiffy(&proc->stime);
			error = do_kill(NULL, proc->parent->pid, -1, SIGCHLD, &info, 0);
			dkprintf("terminate,klll %d,error=%d\n",
					proc->termsig, error);
		}
		/* Wake parent (if sleeping in wait4()) */
		waitq_wakeup(&proc->parent->waitpid_q);
	}
}

static void
ptrace_detach_thread(struct thread *thread, int data)
{
	struct resource_set *resource_set = cpu_local_var(resource_set);
	struct process *pid1 = resource_set->pid1;
	struct thread *mythread = cpu_local_var(current);
	struct process *proc = mythread->proc;
	struct process *report_proc = NULL;
	struct mcs_rwlock_node_irqsave lock;
	struct process *term_proc = NULL;

	if (thread == thread->proc->main_thread) {
		struct process *tracee_proc = thread->proc;
		struct process *parent = tracee_proc->ppid_parent;

		if (thread->proc->status == PS_ZOMBIE &&
				thread->proc->parent != parent) {
			term_proc = thread->proc;
		}
		mcs_rwlock_reader_lock(&proc->children_lock, &lock);

		list_del(&tracee_proc->siblings_list);
		mcs_rwlock_reader_unlock(&proc->children_lock, &lock);

		mcs_rwlock_reader_lock(&tracee_proc->children_lock, &lock);
		list_del(&tracee_proc->ptraced_siblings_list);
		list_add_tail(&tracee_proc->siblings_list,
				&parent->children_list);
		tracee_proc->parent = parent;

		mcs_rwlock_reader_unlock(&tracee_proc->children_lock, &lock);
	}
	if (thread->termsig &&
			thread->termsig != SIGCHLD &&
			thread->proc != pid1) {
		report_proc = thread->proc;
	}
	thread->report_proc = report_proc;
	mcs_rwlock_reader_lock(&proc->threads_lock, &lock);
	list_del(&thread->report_siblings_list);
	mcs_rwlock_reader_unlock(&proc->threads_lock, &lock);
	thread->ptrace = 0;
	kfree(thread->ptrace_debugreg);
	thread->ptrace_debugreg = NULL;

	clear_single_step(thread);
	if (report_proc) {
		mcs_rwlock_reader_lock(&report_proc->threads_lock, &lock);
		list_add_tail(&thread->report_siblings_list,
				&report_proc->report_threads_list);
		mcs_rwlock_reader_unlock(&report_proc->threads_lock, &lock);
		if (thread->status == PS_EXITED ||
				thread->status == PS_ZOMBIE) {
			/*
			 * Traced thread reports to the original parent with
			 * the termination signal in addition to the report
			 * to the tracer.
			 */
			thread_exit_signal(thread);
		}
	}

	if (data) {
		struct siginfo info;

		memset(&info, '\0', sizeof(info));
		info.si_signo = data;
		info.si_code = SI_USER;
		info._sifields._kill.si_pid = proc->pid;
		do_kill(mythread, thread->proc->pid, thread->tid,
				data, &info, 1);
	}
	sched_wakeup_thread(thread, PS_TRACED | PS_STOPPED);
	release_thread(thread);
	if (term_proc) {
		finalize_process(term_proc);
	}
}

static void
set_process_rusage(struct process *proc, struct rusage *usage)
{
	ts_to_tv(&usage->ru_utime, &proc->utime);
	ts_to_tv(&usage->ru_stime, &proc->stime);
	usage->ru_maxrss = proc->maxrss / 1024;
}

static int
wait_proc(int pid, int *status, int options, void *rusage, int *empty)
{
	struct thread *thread = cpu_local_var(current);
	struct process *proc = thread->proc;
	struct process *child, *next;
	int pgid = proc->pgid;
	int ret = 0;
	struct mcs_rwlock_node lock;
	struct mcs_rwlock_node child_lock;
	struct thread *c_thread = NULL;

	mcs_rwlock_writer_lock_noirq(&proc->children_lock, &lock);
	list_for_each_entry_safe(child, next, &proc->children_list,
			siblings_list) {
		/*
		 * Find thread with pid == tid, this will be either the main
		 * thread or the one we are looking for specifically when
		 * __WCLONE is passed
		 */
		if ((pid >= 0 || -pid != child->pgid) &&
				pid != -1 &&
				(pid != 0 || pgid != child->pgid) &&
				(pid <= 0 || pid != child->pid))
			continue;

		*empty = 0;

		if ((options & WEXITED) &&
				child->status == PS_ZOMBIE) {
			ret = wait_zombie(thread, child, status, options);
			if (!(options & WNOWAIT) &&
					child->parent == child->ppid_parent) {
				struct mcs_rwlock_node updatelock;
				struct mcs_rwlock_node childlock;
				struct process *pid1;

				pid1 = cpu_local_var(resource_set)->pid1;

				mcs_rwlock_writer_lock_noirq(&proc->update_lock,
						&updatelock);
				ts_add(&proc->stime_children, &child->stime);
				ts_add(&proc->utime_children, &child->utime);
				ts_add(&proc->stime_children,
						&child->stime_children);
				ts_add(&proc->utime_children,
						&child->utime_children);
				if (child->maxrss > proc->maxrss_children)
					proc->maxrss_children = child->maxrss;
				if (child->maxrss_children >
						proc->maxrss_children)
					proc->maxrss_children =
						child->maxrss_children;
				set_process_rusage(child, rusage);
				mcs_rwlock_writer_unlock_noirq(
						&proc->update_lock, &updatelock);
				list_del(&child->siblings_list);
				mcs_rwlock_writer_unlock_noirq(
						&proc->children_lock, &lock);

				mcs_rwlock_writer_lock_noirq(
						&child->update_lock, &updatelock);
				child->parent = pid1;
				child->ppid_parent = pid1;
				mcs_rwlock_writer_lock_noirq(
						&pid1->children_lock, &childlock);
				list_add_tail(&child->siblings_list,
						&pid1->children_list);
				mcs_rwlock_writer_unlock_noirq(
						&pid1->children_lock, &childlock);
				mcs_rwlock_writer_unlock_noirq(
						&child->update_lock, &updatelock);
				mcs_rwlock_writer_lock_noirq(
						&child->threads_lock, &child_lock);
				c_thread = child->main_thread;
				if (c_thread &&
						(c_thread->ptrace & PT_TRACED)) {
					mcs_rwlock_writer_unlock_noirq(
						&child->threads_lock, &child_lock);
					ptrace_detach_thread(c_thread, 0);
				}
				else {
					mcs_rwlock_writer_unlock_noirq(
						&child->threads_lock, &child_lock);
				}
				release_process(child);
			}
			else {
				mcs_rwlock_writer_lock_noirq(
						&child->threads_lock, &child_lock);
				c_thread = child->main_thread;
				if (c_thread && !(options & WNOWAIT) &&
						(c_thread->ptrace & PT_TRACED)) {
					mcs_rwlock_writer_unlock_noirq(
						&child->threads_lock, &child_lock);
					mcs_rwlock_writer_unlock_noirq(
						&proc->children_lock, &lock);
					ptrace_detach_thread(c_thread, 0);
				}
				else {
					mcs_rwlock_writer_unlock_noirq(
						&child->threads_lock, &child_lock);
					mcs_rwlock_writer_unlock_noirq(
						&proc->children_lock, &lock);
				}
			}

			goto out_found;
		}

		mcs_rwlock_writer_lock_noirq(&child->threads_lock, &child_lock);
		c_thread = child->main_thread;

		if (!(c_thread->ptrace & PT_TRACED) &&
				(c_thread->signal_flags & SIGNAL_STOP_STOPPED) &&
				(options & WUNTRACED)) {
			/*
			 * Not ptraced and in stopped state and WUNTRACED is
			 * specified
			 */
			ret = wait_stopped(thread, child, NULL, status,
					options);
			if (!(options & WNOWAIT)) {
				c_thread->signal_flags &= ~SIGNAL_STOP_STOPPED;
			}
			mcs_rwlock_writer_unlock_noirq(&proc->children_lock,
					&lock);
			mcs_rwlock_writer_unlock_noirq(&child->threads_lock,
					&child_lock);
			goto out_found;
		}

		if ((c_thread->ptrace & PT_TRACED) &&
				(child->status & (PS_STOPPED | PS_TRACED))) {
			ret = wait_stopped(thread, child, NULL, status,
					options);
			if (ret == child->pid) {
				/* Are we looking for a specific thread? */
				if (pid == c_thread->tid) {
					ret = c_thread->tid;
				}
				if (!(options & WNOWAIT)) {
					c_thread->signal_flags &=
						~SIGNAL_STOP_STOPPED;
				}
				mcs_rwlock_writer_unlock_noirq(
						&proc->children_lock, &lock);
				mcs_rwlock_writer_unlock_noirq(
						&child->threads_lock, &child_lock);
				goto out_found;
			}
		}

		if ((c_thread->signal_flags & SIGNAL_STOP_CONTINUED) &&
				(options & WCONTINUED)) {
			ret = wait_continued(thread, child, NULL, status,
					options);
			if (!(options & WNOWAIT)) {
				c_thread->signal_flags &=
					~SIGNAL_STOP_CONTINUED;
			}
			mcs_rwlock_writer_unlock_noirq(&proc->children_lock,
					&lock);
			mcs_rwlock_writer_unlock_noirq(&child->threads_lock,
					&child_lock);
			goto out_found;
		}
		mcs_rwlock_writer_unlock_noirq(&child->threads_lock,
				&child_lock);
	}

	if (*empty) {
		list_for_each_entry(child, &proc->ptraced_children_list,
				ptraced_siblings_list) {
			if ((pid < 0 && -pid == child->pgid) ||
					pid == -1 ||
					(pid == 0 && pgid == child->pgid) ||
					(pid > 0 && pid == child->pid)) {
				*empty = 0;
				break;
			}
		}
	}
	mcs_rwlock_writer_unlock_noirq(&proc->children_lock, &lock);
out_found:

	return ret;
}

static int
wait_thread(int tid, int *status, int options, void *rusage, int *empty)
{
	struct thread *thread = cpu_local_var(current);
	struct process *proc = thread->proc;
	struct thread *child, *next;
	int ret = 0;
	struct mcs_rwlock_node lock;

	mcs_rwlock_writer_lock_noirq(&thread->proc->threads_lock, &lock);
	list_for_each_entry_safe(child, next, &proc->report_threads_list,
			report_siblings_list) {
		if (tid != -1 && child->tid != tid)
			continue;
		if (child == child->proc->main_thread)
			continue;
		*empty = 0;
		if ((options & WEXITED) &&
				(child->status == PS_EXITED ||
				 child->status == PS_ZOMBIE)) {
			ret = child->tid;
			if (!(options & WNOWAIT)) {
				if (child->ptrace & PT_TRACED) {
					mcs_rwlock_writer_unlock_noirq(
						&thread->proc->threads_lock, &lock);
					ptrace_detach_thread(child, 0);
				}
				else {
					list_del(&child->report_siblings_list);
					child->report_proc = NULL;
					mcs_rwlock_writer_unlock_noirq(
						&thread->proc->threads_lock, &lock);
					release_thread(child);
				}
			}
			else
				mcs_rwlock_writer_unlock_noirq(
					&thread->proc->threads_lock, &lock);
			goto out_found;
		}

		if (!(child->ptrace & PT_TRACED) &&
				(child->signal_flags & SIGNAL_STOP_STOPPED) &&
				(options & WUNTRACED)) {
			/*
			 * Not ptraced and in stopped state and WUNTRACED is
			 * specified
			 */
			ret = wait_stopped(thread, child->proc, child, status,
					options);
			if (!(options & WNOWAIT)) {
				child->signal_flags &= ~SIGNAL_STOP_STOPPED;
			}
			mcs_rwlock_writer_unlock_noirq(
					&thread->proc->threads_lock, &lock);
			goto out_found;
		}

		if ((child->ptrace & PT_TRACED) &&
				(child->status & (PS_STOPPED | PS_TRACED))) {
			ret = wait_stopped(thread, child->proc, child, status,
					options);
			if (ret == child->tid) {
				/* Are we looking for a specific thread? */
				if (!(options & WNOWAIT)) {
					child->signal_flags &=
						~SIGNAL_STOP_STOPPED;
				}
				mcs_rwlock_writer_unlock_noirq(
						&thread->proc->threads_lock, &lock);
				goto out_found;
			}
		}

		if ((child->signal_flags & SIGNAL_STOP_CONTINUED) &&
				(options & WCONTINUED)) {
			ret = wait_continued(thread, child->proc, child, status,
					options);
			if (!(options & WNOWAIT)) {
				child->signal_flags &= ~SIGNAL_STOP_CONTINUED;
			}
			mcs_rwlock_writer_unlock_noirq(
					&thread->proc->threads_lock, &lock);
			goto out_found;
		}
	}

	if (*empty) {
		list_for_each_entry(child, &proc->threads_list,
				siblings_list) {
			if (child == child->proc->main_thread)
				continue;
			if (child->termsig && child->termsig != SIGCHLD) {
				*empty = 0;
				break;
			}
		}
	}
	mcs_rwlock_writer_unlock_noirq(&thread->proc->threads_lock, &lock);
out_found:
	return ret;
}

/*
 * From glibc: INLINE_SYSCALL (wait4, 4, pid, stat_loc, options, NULL);
 */
static int
do_wait(int pid, int *status, int options, void *rusage)
{
	struct thread *thread = cpu_local_var(current);
	int ret;
	struct waitq_entry waitpid_wqe;
	int empty = 1;
	int orgpid = pid;

	dkprintf("wait4(): current->proc->pid: %d, pid: %d\n",
			thread->proc->pid, pid);

rescan:
	waitq_init_entry(&waitpid_wqe, thread);
	waitq_prepare_to_wait(&thread->proc->waitpid_q, &waitpid_wqe,
			PS_INTERRUPTIBLE);
	pid = orgpid;

	if (!(options & __WCLONE)) {
		if ((ret = wait_proc(pid, status, options, rusage, &empty))) {
			goto out_found;
		}
	}
	if ((pid == -1 || pid > 0) &&
			(options & (__WCLONE | __WALL))) {
		if ((ret = wait_thread(pid, status, options, rusage, &empty))) {
			goto out_found;
		}
	}

	if (empty) {
		ret = -ECHILD;
		goto out_notfound;
	}

	/* Don't sleep if WNOHANG requested */
	if (options & WNOHANG) {
		*status = 0;
		ret = 0;
		goto out_notfound;
	}

	/* Sleep */
	dkprintf("wait4,sleeping\n");

	if (hassigpending(thread)) {
		waitq_finish_wait(&thread->proc->waitpid_q, &waitpid_wqe);
		return -EINTR;
	}

	schedule();
	dkprintf("wait4(): woken up\n");

	waitq_finish_wait(&thread->proc->waitpid_q, &waitpid_wqe);

	goto rescan;

exit:
	waitq_finish_wait(&thread->proc->waitpid_q, &waitpid_wqe);
	return ret;
out_found:
	dkprintf("wait4,out_found\n");
	goto exit;
out_notfound:
	dkprintf("wait4,out_notfound\n");
	goto exit;
}

SYSCALL_DECLARE(wait4)
{
	int pid = (int)ihk_mc_syscall_arg0(ctx);
	int *status = (int *)ihk_mc_syscall_arg1(ctx);
	int options = (int)ihk_mc_syscall_arg2(ctx);
	void *rusage = (void *)ihk_mc_syscall_arg3(ctx);
	int st;
	int rc;
	struct rusage usage;

	if (options & ~(WNOHANG | WUNTRACED | WCONTINUED | __WCLONE | __WALL)) {
		dkprintf("wait4: unexpected options(%x).\n", options);
		return -EINVAL;
	}
	memset(&usage, '\0', sizeof usage);
	rc = do_wait(pid, &st, WEXITED | options, &usage);
	if (rc >= 0 && status)
		copy_to_user(status, &st, sizeof(int));
	if (rusage)
		copy_to_user(rusage, &usage, sizeof usage);
	return rc;
}
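
/*
 * Reference sketch of the wait status word produced by the helpers above
 * (an informal note mirroring the glibc WIF* conventions; the patterns are
 * taken from wait_stopped()/wait_continued() in this file, not from a
 * header definition):
 *
 *	status = (exit_code << 8) | (term_signal & 0x7f)   exited / killed
 *	status = (stop_signal << 8) | 0x7f                 stopped (WIFSTOPPED)
 *	status = 0xffff                                    continued (WIFCONTINUED)
 *
 * sys_waitid() below decodes exactly these patterns when choosing
 * siginfo.si_code (CLD_STOPPED / CLD_CONTINUED / CLD_KILLED / CLD_EXITED).
 */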

SYSCALL_DECLARE(waitid)
{
	int idtype = (int)ihk_mc_syscall_arg0(ctx);
	int id = (int)ihk_mc_syscall_arg1(ctx);
	siginfo_t *infop = (siginfo_t *)ihk_mc_syscall_arg2(ctx);
	int options = (int)ihk_mc_syscall_arg3(ctx);
	int pid;
	int status;
	int rc;
	struct rusage usage;

	if (idtype == P_PID)
		pid = id;
	else if (idtype == P_PGID)
		pid = -id;
	else if (idtype == P_ALL)
		pid = -1;
	else
		return -EINVAL;
	if (options & ~(WEXITED | WSTOPPED | WCONTINUED | WNOHANG | WNOWAIT | __WCLONE | __WALL)) {
		dkprintf("waitid: unexpected options(%x).\n", options);
		return -EINVAL;
	}
	if (!(options & (WEXITED | WSTOPPED | WCONTINUED))) {
		dkprintf("waitid: no waiting status(%x).\n", options);
		return -EINVAL;
	}
	memset(&usage, '\0', sizeof usage);
	rc = do_wait(pid, &status, options, &usage);
	if (rc < 0)
		return rc;
	if (rc && infop) {
		siginfo_t info;

		memset(&info, '\0', sizeof(siginfo_t));
		info.si_signo = SIGCHLD;
		info._sifields._sigchld.si_pid = rc;
		info._sifields._sigchld.si_status = status;
		info._sifields._sigchld.si_utime =
			timeval_to_jiffy(&usage.ru_utime);
		info._sifields._sigchld.si_stime =
			timeval_to_jiffy(&usage.ru_stime);
		if ((status & 0x000000ff) == 0x0000007f)
			info.si_code = CLD_STOPPED;
		else if ((status & 0x0000ffff) == 0x0000ffff)
			info.si_code = CLD_CONTINUED;
		else if (status & 0x000000ff)
			info.si_code = CLD_KILLED;
		else
			info.si_code = CLD_EXITED;
		copy_to_user(infop, &info, sizeof info);
	}
	return 0;
}

void terminate_mcexec(int rc, int sig)
{
	unsigned long old_exit_status;
	unsigned long exit_status;
	struct thread *mythread = cpu_local_var(current);
	struct process *proc = mythread->proc;
	struct syscall_request request IHK_DMA_ALIGN;

	if ((old_exit_status = proc->group_exit_status) & 0x0000000100000000L)
		return;
	exit_status = 0x0000000100000000L | ((rc & 0x00ff) << 8) | (sig & 0xff);
	if (cmpxchg(&proc->group_exit_status,
			old_exit_status, exit_status) != old_exit_status)
		return;
	if (!proc->nohost) {
		request.number = __NR_exit_group;
		request.args[0] = proc->group_exit_status;
		proc->nohost = 1;
		do_syscall(&request, ihk_mc_get_processor_id());
	}
}
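
/*
 * Layout sketch of proc->group_exit_status as used above (an informal
 * note, not a definition from a header): bit 32 marks "exit already
 * reported to mcexec", bits 15..8 carry the exit code and bits 7..0 the
 * terminating signal, i.e.
 *
 *	exit_status = (1UL << 32) | ((rc & 0xff) << 8) | (sig & 0xff);
 *
 * The cmpxchg() ensures that only the first caller forwards
 * __NR_exit_group to the Linux side.
 */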

void sync_child_event(struct mc_perf_event *event)
{
	struct mc_perf_event *leader;
	struct mc_perf_event *sub;

	if (!event)
		return;
	if (!(event->attr.inherit) && (event->pid == 0))
		return;

	leader = event->group_leader;
	if (leader->pid == 0) {
		leader->child_count_total +=
			ihk_mc_perfctr_read(leader->counter_id);
	}
	else if (leader->pid > 0) {
		uint64_t count = ihk_mc_perfctr_read(leader->counter_id);

		ihk_atomic64_set(&leader->count, count);
	}
	else
		return; // Error

	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
		if (event->pid == 0) {
			sub->child_count_total +=
				ihk_mc_perfctr_read(sub->counter_id);
		}
		else if (event->pid > 0) {
			uint64_t count = ihk_mc_perfctr_read(sub->counter_id);

			ihk_atomic64_set(&sub->count, count);
		}
	}
}

void terminate(int rc, int sig)
{
	struct resource_set *resource_set = cpu_local_var(resource_set);
	struct thread *mythread = cpu_local_var(current);
	struct thread *thread;
	struct process *proc = mythread->proc;
	struct process *child;
	struct process *next;
	struct process *pid1 = resource_set->pid1;
	struct process_vm *vm;
	struct mcs_rwlock_node_irqsave lock;
	struct mcs_rwlock_node updatelock;
	struct mcs_rwlock_node childlock;
	struct mcs_rwlock_node childlock1;
	int i;
	int n;
	int *ids = NULL;
	int exit_status;
	struct timespec ats;
	int found;

	// sync perf info
	if (proc->monitoring_event)
		sync_child_event(proc->monitoring_event);

	// clean up threads
	mcs_rwlock_writer_lock_noirq(&proc->update_lock, &updatelock);
	mcs_rwlock_writer_lock(&proc->threads_lock, &lock); // conflict clone
	if (proc->status == PS_EXITED) {
		dkprintf("%s: PID: %d, TID: %d PS_EXITED already\n",
				__FUNCTION__, proc->pid, mythread->tid);
		preempt_disable();
		tsc_to_ts(mythread->user_tsc, &ats);
		ts_add(&proc->utime, &ats);
		tsc_to_ts(mythread->system_tsc, &ats);
		ts_add(&proc->stime, &ats);
		mythread->user_tsc = 0;
		mythread->system_tsc = 0;
		mythread->status = PS_EXITED;
		mythread->exit_status = proc->group_exit_status;
		thread_exit_signal(mythread);
		mcs_rwlock_writer_unlock(&proc->threads_lock, &lock);
		mcs_rwlock_writer_unlock_noirq(&proc->update_lock, &updatelock);
		release_thread(mythread);
		preempt_enable();
		schedule();
		// no return
		return;
	}

	dkprintf("%s: PID: %d, TID: %d setting PS_EXITED\n",
			__FUNCTION__, proc->pid, mythread->tid);
	tsc_to_ts(mythread->user_tsc, &ats);
	ts_add(&proc->utime, &ats);
	tsc_to_ts(mythread->system_tsc, &ats);
	ts_add(&proc->stime, &ats);
	mythread->user_tsc = 0;
	mythread->system_tsc = 0;
	exit_status = ((rc & 0x00ff) << 8) | (sig & 0xff);
	proc->group_exit_status = exit_status;
	mythread->exit_status = exit_status;
	proc->status = PS_EXITED;
	mcs_rwlock_writer_unlock(&proc->threads_lock, &lock);
	mcs_rwlock_writer_unlock_noirq(&proc->update_lock, &updatelock);

#ifdef ENABLE_TOFU
	/* Tofu: cleanup, must be done before mcexec is gone */
	if (proc->enable_tofu) {
		int fd;

		for (fd = 0; fd < MAX_FD_PDE; ++fd) {
			/* Tofu? */
			if (proc->enable_tofu && proc->fd_pde_data[fd]) {
				extern void tof_utofu_release_fd(struct process *proc, int fd);

				dkprintf("%s: -> tof_utofu_release_fd() @ fd: %d (%s)\n",
						__func__, fd, proc->fd_path[fd]);
				tof_utofu_release_fd(proc, fd);
				proc->fd_pde_data[fd] = NULL;
			}

			if (proc->fd_path[fd]) {
				kfree(proc->fd_path[fd]);
				proc->fd_path[fd] = NULL;
			}
		}
	}
#endif

	terminate_mcexec(rc, sig);

	mcs_rwlock_writer_lock(&proc->threads_lock, &lock);
	list_del(&mythread->siblings_list);
	mcs_rwlock_writer_unlock(&proc->threads_lock, &lock);

	mcs_rwlock_reader_lock(&proc->threads_lock, &lock);
	n = 0;
	list_for_each_entry(thread, &proc->threads_list, siblings_list) {
		if (thread != mythread) {
			n++;
		}
	}

	if (n) {
		ids = kmalloc(sizeof(int) * n, IHK_MC_AP_NOWAIT);
		i = 0;
		if (ids) {
			list_for_each_entry(thread, &proc->threads_list, siblings_list) {
				if (thread != mythread) {
					ids[i] = thread->tid;
					i++;
				}
			}
		}
	}
	mcs_rwlock_reader_unlock(&proc->threads_lock, &lock);

	if (ids) {
		for (i = 0; i < n; i++) {
			do_kill(mythread, proc->pid, ids[i], SIGKILL, NULL, 0);
		}
		kfree(ids);
		ids = NULL;
	}

	for (;;) {
		__mcs_rwlock_reader_lock(&proc->threads_lock, &lock);
		found = 0;
		list_for_each_entry(thread, &proc->threads_list,
				siblings_list) {
			if (thread->status != PS_EXITED &&
					thread->status != PS_ZOMBIE) {
				found = 1;
				break;
			}
		}
		mcs_rwlock_reader_unlock(&proc->threads_lock, &lock);
		if (!found) {
			break;
		}

		/* We might be waiting for another thread on same CPU */
		schedule();
	}

	mcs_rwlock_writer_lock(&proc->threads_lock, &lock);
	list_add_tail(&mythread->siblings_list, &proc->threads_list);
	mcs_rwlock_writer_unlock(&proc->threads_lock, &lock);

	vm = proc->vm;

#ifdef ENABLE_TOFU
	if (proc->enable_tofu) {
		extern void tof_utofu_finalize();

		tof_utofu_finalize();
	}
#endif

	free_all_process_memory_range(vm);

	if (proc->saved_cmdline) {
		kfree(proc->saved_cmdline);
	}

	while (!list_empty(&proc->report_threads_list)) {
		struct thread *thr;

		thr = list_first_entry(&proc->report_threads_list,
				struct thread, report_siblings_list);
		if (thr->ptrace) {
			int release_flag = thr->proc == proc &&
				thr->termsig &&
				thr->termsig != SIGCHLD;

			if (release_flag) {
				thr->termsig = 0;
			}
			ptrace_detach_thread(thr, 0);
			if (release_flag) {
				release_thread(thr);
			}
		}
		else {
			mcs_rwlock_writer_lock(&proc->threads_lock, &lock);
			list_del(&thr->report_siblings_list);
			thr->report_proc = NULL;
			mcs_rwlock_writer_unlock(&proc->threads_lock, &lock);
			release_thread(thr);
		}
	}

	if (!list_empty(&proc->children_list) ||
			!list_empty(&proc->ptraced_children_list)) {
		// clean up children
		for (i = 0; i < HASH_SIZE; i++) {
			mcs_rwlock_writer_lock(&resource_set->process_hash->lock[i],
					&lock);
			list_for_each_entry_safe(child, next,
					&resource_set->process_hash->list[i],
					hash_list) {
				int free_child = 0;

				mcs_rwlock_writer_lock_noirq(&child->update_lock,
						&updatelock);

				if (child->ppid_parent == proc &&
						child->status == PS_ZOMBIE) {
					list_del_init(&child->hash_list);
					list_del_init(&child->siblings_list);
					free_child = 1;
				}
				else if (child->ppid_parent == proc) {
					mcs_rwlock_writer_lock_noirq(&proc->children_lock,
							&childlock);
					mcs_rwlock_writer_lock_noirq(&pid1->children_lock,
							&childlock1);
					child->ppid_parent = pid1;
					if (child->parent == proc) {
						child->parent = pid1;
						list_del(&child->siblings_list);
						list_add_tail(&child->siblings_list,
								&pid1->children_list);
					}
					else {
						list_del(&child->ptraced_siblings_list);
						list_add_tail(&child->ptraced_siblings_list,
								&pid1->ptraced_children_list);
					}
					mcs_rwlock_writer_unlock_noirq(&pid1->children_lock,
							&childlock1);
					mcs_rwlock_writer_unlock_noirq(&proc->children_lock,
							&childlock);
				}

				mcs_rwlock_writer_unlock_noirq(&child->update_lock,
						&updatelock);

				if (free_child)
					release_process(child);
			}
			mcs_rwlock_writer_unlock(&resource_set->process_hash->lock[i],
					&lock);
		}
	}

	dkprintf("terminate,pid=%d\n", proc->pid);

#ifdef DCFA_KMOD
	do_mod_exit(rc);
#endif

	// clean up memory
	finalize_process(proc);

	preempt_disable();
	mcs_rwlock_writer_lock(&proc->threads_lock, &lock);
	mythread->status = PS_EXITED;
	mcs_rwlock_writer_unlock(&proc->threads_lock, &lock);
	release_thread(mythread);
	release_process_vm(vm);
	preempt_enable();
	schedule();
	kprintf("%s: ERROR: returned from terminate() -> schedule()\n", __FUNCTION__);
	panic("panic");
}

int __process_cleanup_fd(struct process *proc, int fd)
{
#ifdef ENABLE_TOFU
	/* Tofu? */
	if (proc->enable_tofu) {
		extern void tof_utofu_release_fd(struct process *proc, int fd);

		dkprintf("%s: -> tof_utofu_release_fd() @ fd: %d (%s)\n",
				__func__, fd, proc->fd_path[fd]);
		tof_utofu_release_fd(proc, fd);
		proc->fd_pde_data[fd] = NULL;

		if (proc->fd_path[fd]) {
			kfree(proc->fd_path[fd]);
			proc->fd_path[fd] = NULL;
		}
	}
#endif
	return 0;
}

int process_cleanup_fd(int pid, int fd)
{
	struct process *proc;
	struct mcs_rwlock_node_irqsave lock;

	proc = find_process(pid, &lock);
	if (!proc) {
		/* This is normal behavior */
		dkprintf("%s: PID %d couldn't be found\n", __func__, pid);
		return 0;
	}

	__process_cleanup_fd(proc, fd);

	process_unlock(proc, &lock);
	return 0;
}

int process_cleanup_before_terminate(int pid)
{
	struct process *proc;
	struct mcs_rwlock_node_irqsave lock;
#ifdef ENABLE_TOFU
	int fd;
#endif

	proc = find_process(pid, &lock);
	if (!proc) {
		/* This is normal behavior */
		return 0;
	}

#ifdef ENABLE_TOFU
	/* Clean up PDE file descriptors */
	for (fd = 2; fd < MAX_FD_PDE; ++fd) {
		__process_cleanup_fd(proc, fd);
	}
#endif

	process_unlock(proc, &lock);
	return 0;
}

void
terminate_host(int pid, struct thread *thread)
{
	struct process *proc;
	struct mcs_rwlock_node_irqsave lock;

	proc = find_process(pid, &lock);
	if (!proc) {
		if (thread) {
			proc = thread->proc;
			ihk_atomic_set(&thread->refcount, 1);
			release_thread(thread);
			release_process(proc);
		}
		return;
	}

	if (proc->nohost != 1) {
		proc->nohost = 1;
		process_unlock(proc, &lock);
		do_kill(cpu_local_var(current), pid, -1, SIGKILL, NULL, 0);
	}
	else {
		process_unlock(proc, &lock);
	}
}

void eventfd(int type)
{
	struct ihk_ikc_channel_desc *syscall_channel;
	struct ikc_scd_packet pckt;

	syscall_channel = get_cpu_local_var(0)->ikc2linux;
	memset(&pckt, '\0', sizeof pckt);
	pckt.msg = SCD_MSG_EVENTFD;
	pckt.eventfd_type = type;
	ihk_ikc_send(syscall_channel, &pckt, 0);
}

void
interrupt_syscall(struct thread *thread, int sig)
{
	ihk_mc_user_context_t ctx;
	long lerror;

	dkprintf("interrupt_syscall pid=%d tid=%d sig=%d\n", thread->proc->pid,
			thread->tid, sig);
	ihk_mc_syscall_arg0(&ctx) = thread->proc->pid;
	ihk_mc_syscall_arg1(&ctx) = thread->tid;
	ihk_mc_syscall_arg2(&ctx) = sig;

	lerror = syscall_generic_forwarding(__NR_kill, &ctx);
	if (lerror) {
		kprintf("interrupt_syscall failed. %ld\n", lerror);
	}
	return;
}

SYSCALL_DECLARE(exit_group)
{
	dkprintf("sys_exit_group,pid=%d\n", cpu_local_var(current)->proc->pid);
	terminate((int)ihk_mc_syscall_arg0(ctx), 0);

	return 0;
}

void clear_host_pte(uintptr_t addr, size_t len, int holding_memory_range_lock)
{
	ihk_mc_user_context_t ctx;
	long lerror;
	struct thread *thread = cpu_local_var(current);

	ihk_mc_syscall_arg0(&ctx) = addr;
	ihk_mc_syscall_arg1(&ctx) = len;
	/* NOTE: 3rd parameter denotes new rpgtable of host process (if not zero) */
	ihk_mc_syscall_arg2(&ctx) = 0;

	/* #986: Let remote page fault code skip read-locking
	 * memory_range_lock. It's safe because other writers are warded off
	 * until the remote PF handling code calls up_write(&current->mm->mmap_sem)
	 * and vm_range is consistent when calling this function. */
	if (holding_memory_range_lock) {
		thread->vm->is_memory_range_lock_taken = ihk_mc_get_processor_id();
	}
	lerror = syscall_generic_forwarding(__NR_munmap, &ctx);
	if (holding_memory_range_lock) {
		thread->vm->is_memory_range_lock_taken = -1;
	}
	if (lerror) {
		kprintf("clear_host_pte failed. %ld\n", lerror);
	}
	return;
}

static int set_host_vma(uintptr_t addr, size_t len, int prot, int holding_memory_range_lock)
{
	ihk_mc_user_context_t ctx;
	long lerror;
	struct thread *thread = cpu_local_var(current);

	ihk_mc_syscall_arg0(&ctx) = addr;
	ihk_mc_syscall_arg1(&ctx) = len;
	ihk_mc_syscall_arg2(&ctx) = prot;

	/*
	 * XXX: Certain fabric drivers (e.g., the Tofu driver) use read-only
	 * mappings for the completion queue on which the kernel driver calls
	 * get_user_pages() with FOLL_FORCE and FOLL_WRITE flags requested.
	 * get_user_pages() on read-only mappings with FOLL_WRITE, however, only
	 * works if the underlying mapping is copy-on-write (i.e., private
	 * ANONYMOUS or private file mapping). Because mcexec's address space
	 * reservation uses a shared pseudo-file mapping to cover McKernel
	 * ANONYMOUS areas, we would need to mark it private so that the condition
	 * holds. However, that would cause Linux to COW its pages and map to
	 * different physical memory thus make it inconsistent with the original
	 * McKernel mapping.
	 *
	 * For the above reason, we do NOT set the host VMA read-only.
	 */
	return 0;

	dkprintf("%s: offloading __NR_mprotect\n", __FUNCTION__);
	/* #986: Let remote page fault code skip read-locking
	 * memory_range_lock. It's safe because other writers are warded off
	 * until the remote PF handling code calls up_write(&current->mm->mmap_sem)
	 * and vm_range is consistent when calling this function. */
	if (holding_memory_range_lock) {
		thread->vm->is_memory_range_lock_taken = ihk_mc_get_processor_id();
	}
	lerror = syscall_generic_forwarding(__NR_mprotect, &ctx);
	if (lerror) {
		kprintf("set_host_vma(%lx,%lx,%x) failed. %ld\n",
				addr, len, prot, lerror);
		goto out;
	}

	lerror = 0;
out:
	if (holding_memory_range_lock) {
		thread->vm->is_memory_range_lock_taken = -1;
	}
	return (int)lerror;
}

int do_munmap(void *addr, size_t len, int holding_memory_range_lock)
{
	int error;
	int ro_freed;
	struct thread *thread = cpu_local_var(current);

	begin_free_pages_pending();
	error = remove_process_memory_range(cpu_local_var(current)->vm,
			(intptr_t)addr, (intptr_t)addr+len, &ro_freed);

	/* No host involvement for straight mapping ranges */
	if (thread->proc->straight_va &&
			addr >= thread->proc->straight_va &&
			(addr + len) <=
			(thread->proc->straight_va + thread->proc->straight_len)) {
		goto out;
	}

	if (error || !ro_freed) {
		clear_host_pte((uintptr_t)addr, len, holding_memory_range_lock);
	}
	else {
		error = set_host_vma((uintptr_t)addr, len, PROT_READ | PROT_WRITE | PROT_EXEC, holding_memory_range_lock);
		if (error) {
			kprintf("sys_munmap:set_host_vma failed. %d\n", error);
			/* through */
		}
	}

out:
	finish_free_pages_pending();

	dkprintf("%s: 0x%lx:%lu, error: %ld\n",
			__FUNCTION__, addr, len, error);
	return error;
}

static int search_free_space(size_t len, int pgshift, uintptr_t *addrp)
{
	struct thread *thread = cpu_local_var(current);
	struct vm_regions *region = &thread->vm->region;
	intptr_t addr;
	int error = 0;
	struct vm_range *range;
	size_t pgsize = (size_t)1 << pgshift;

	dkprintf("%s: len: %lu, pgshift: %d\n",
			__FUNCTION__, len, pgshift);

	/* try given addr first */
	addr = *addrp;
	if (addr != 0) {
		if ((region->user_end <= addr)
				|| ((region->user_end - len) < addr)) {
			error = -ENOMEM;
			goto out;
		}

		range = lookup_process_memory_range(thread->vm, addr, addr+len);
		if (range == NULL)
			goto out;
	}

	addr = region->map_end;
	for (;;) {
		addr = (addr + pgsize - 1) & ~(pgsize - 1);
		if ((region->user_end <= addr)
				|| ((region->user_end - len) < addr)) {
			ekprintf("%s: error: addr 0x%lx is outside the user region\n",
					__FUNCTION__, addr);

			error = -ENOMEM;
			goto out;
		}

		range = lookup_process_memory_range(thread->vm, addr, addr+len);
		if (range == NULL) {
			break;
		}
		addr = range->end;
	}

	region->map_end = addr + len;
	*addrp = addr;

out:
	dkprintf("%s: len: %lu, pgshift: %d, addr: 0x%lx\n",
			__FUNCTION__, len, pgshift, addr);
	return error;
}
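
/*
 * Worked example of the round-up used above (an informal note): with
 * pgsize = 2 MiB (0x200000), a candidate address of 0x1234567 becomes
 *
 *	(0x1234567 + 0x1fffff) & ~0x1fffff = 0x1400000
 *
 * i.e. the search advances to the next pgsize-aligned boundary before
 * the range lookup; already-aligned addresses are left unchanged.
 */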
intptr_t
|
|
do_mmap(const uintptr_t addr0, const size_t len0, const int prot,
|
|
const int flags, const int fd, const off_t off0,
|
|
const int vrf0, void *private_data)
|
|
{
|
|
struct thread *thread = cpu_local_var(current);
|
|
struct vm_regions *region = &thread->vm->region;
|
|
uintptr_t addr = addr0;
|
|
size_t len = len0;
|
|
size_t populate_len = 0;
|
|
off_t off;
|
|
int error = 0;
|
|
intptr_t npages = 0;
|
|
int p2align;
|
|
void *p = NULL;
|
|
int vrflags = VR_NONE;
|
|
uintptr_t phys;
|
|
intptr_t straight_phys;
|
|
struct memobj *memobj = NULL;
|
|
int maxprot;
|
|
int denied;
|
|
int ro_vma_mapped = 0;
|
|
struct shmid_ds ads;
|
|
int populated_mapping = 0;
|
|
struct process *proc = thread->proc;
|
|
struct mckfd *fdp = NULL;
|
|
int pgshift;
|
|
struct vm_range *range = NULL;
|
|
|
|
dkprintf("do_mmap(%lx,%lx,%x,%x,%d,%lx)\n",
|
|
addr0, len0, prot, flags, fd, off0);
|
|
|
|
if (!(flags & MAP_ANONYMOUS)) {
|
|
ihk_mc_spinlock_lock_noirq(&proc->mckfd_lock);
|
|
for(fdp = proc->mckfd; fdp; fdp = fdp->next)
|
|
if(fdp->fd == fd)
|
|
break;
|
|
ihk_mc_spinlock_unlock_noirq(&proc->mckfd_lock);
|
|
|
|
if(fdp){
|
|
ihk_mc_user_context_t ctx;
|
|
|
|
memset(&ctx, '\0', sizeof ctx);
|
|
ihk_mc_syscall_arg0(&ctx) = addr0;
|
|
ihk_mc_syscall_arg1(&ctx) = len0;
|
|
ihk_mc_syscall_arg2(&ctx) = prot;
|
|
ihk_mc_syscall_arg3(&ctx) = flags;
|
|
ihk_mc_syscall_arg4(&ctx) = fd;
|
|
ihk_mc_syscall_arg5(&ctx) = off0;
|
|
|
|
if(fdp->mmap_cb){
|
|
return fdp->mmap_cb(fdp, &ctx);
|
|
}
|
|
return -EBADF;
|
|
}
|
|
}
|
|
|
|
flush_nfo_tlb();
|
|
|
|
/* Initialize straight large memory mapping */
|
|
if (proc->straight_map && !proc->straight_va) {
|
|
unsigned long straight_pa_start = 0xFFFFFFFFFFFFFFFF;
|
|
unsigned long straight_pa_end = 0;
|
|
int i;
|
|
int p2align = PAGE_P2ALIGN;
|
|
size_t psize = PAGE_SIZE;
|
|
unsigned long vrflags;
|
|
enum ihk_mc_pt_attribute ptattr;
|
|
struct vm_range *range;
|
|
|
|
vrflags = PROT_TO_VR_FLAG(PROT_READ | PROT_WRITE);
|
|
vrflags |= VRFLAG_PROT_TO_MAXPROT(vrflags);
|
|
vrflags |= VR_DEMAND_PAGING;
|
|
|
|
for (i = 0; i < ihk_mc_get_nr_memory_chunks(); ++i) {
|
|
unsigned long start, end;
|
|
|
|
ihk_mc_get_memory_chunk(i, &start, &end, NULL);
|
|
|
|
if (straight_pa_start > start) {
|
|
straight_pa_start = start;
|
|
}
|
|
|
|
if (straight_pa_end < end) {
|
|
straight_pa_end = end;
|
|
}
|
|
}
|
|
|
|
kprintf("%s: straight_pa_start: 0x%lx, straight_pa_end: 0x%lx\n",
|
|
__FUNCTION__, straight_pa_start, straight_pa_end);
|
|
|
|
error = arch_get_smaller_page_size(NULL,
|
|
straight_pa_end - straight_pa_start,
|
|
&psize, &p2align);
|
|
|
|
if (error) {
|
|
kprintf("%s: arch_get_smaller_page_size failed: %d\n",
|
|
__FUNCTION__, error);
|
|
goto straight_out;
|
|
}
|
|
//psize = PTL2_SIZE;
|
|
//p2align = PTL2_SHIFT - PTL1_SHIFT;
|
|
|
|
// Force 512G page
|
|
//psize = (1UL << 39);
|
|
//p2align = 39 - PAGE_SHIFT;
|
|
|
|
// Force 512MB page
|
|
psize = (1UL << 29);
|
|
p2align = 29 - PAGE_SHIFT;
|
|
|
|
kprintf("%s: using page shift: %d, psize: %lu\n",
|
|
__FUNCTION__, p2align + PAGE_SHIFT, psize);
|
|
|
|
straight_pa_start &= ~(psize - 1);
|
|
straight_pa_end = (straight_pa_end + psize - 1) & ~(psize - 1);
|
|
|
|
kprintf("%s: aligned straight_pa_start: 0x%lx, straight_pa_end: 0x%lx\n",
|
|
__FUNCTION__, straight_pa_start, straight_pa_end);
|
|
|
|
proc->straight_len = straight_pa_end - straight_pa_start;
|
|
error = search_free_space(proc->straight_len,
|
|
PAGE_SHIFT + p2align, (uintptr_t *)&proc->straight_va);
|
|
|
|
if (error) {
|
|
kprintf("%s: search_free_space() failed: %d\n",
|
|
__FUNCTION__, error);
|
|
proc->straight_va = 0;
|
|
goto straight_out;
|
|
}
|
|
|
|
dkprintf("%s: straight_va: 0x%lx to be used\n",
|
|
__FUNCTION__, proc->straight_va);
|
|
|
|
if (add_process_memory_range(proc->vm, (unsigned long)proc->straight_va,
|
|
(unsigned long)proc->straight_va + proc->straight_len,
|
|
NOPHYS, vrflags, NULL, 0,
|
|
PAGE_SHIFT + p2align, private_data, &range) != 0) {
|
|
kprintf("%s: error: adding straight memory range \n",
|
|
__FUNCTION__);
|
|
proc->straight_va = 0;
|
|
goto straight_out;
|
|
}
|
|
|
|
kprintf("%s: straight_va: 0x%lx, range->pgshift: %d, range OK\n",
|
|
__FUNCTION__, proc->straight_va, range->pgshift);
|
|
|
|
ptattr = arch_vrflag_to_ptattr(range->flag, PF_POPULATE, NULL);
|
|
|
|
#ifdef ENABLE_FUGAKU_HACKS
|
|
if (1) { // Un-safe mapping of covering physical range
|
|
#endif
|
|
error = ihk_mc_pt_set_range(proc->vm->address_space->page_table,
|
|
proc->vm,
|
|
(void *)range->start,
|
|
(void *)range->end,
|
|
straight_pa_start, ptattr,
|
|
range->pgshift,
|
|
range, 0);
|
|
|
|
if (error) {
|
|
kprintf("%s: ihk_mc_pt_set_range() failed: %d\n",
|
|
__FUNCTION__, error);
|
|
proc->straight_va = 0;
|
|
goto straight_out;
|
|
}
|
|
//ihk_mc_pt_print_pte(proc->vm->address_space->page_table, range->start);
|
|
|
|
region->map_end = (unsigned long)proc->straight_va + proc->straight_len;
|
|
proc->straight_pa = straight_pa_start;
|
|
kprintf("%s: straight mapping: 0x%lx:%lu @ 0x%lx, "
|
|
"psize: %lu, straight_map_threshold: %lu\n",
|
|
__FUNCTION__,
|
|
proc->straight_va,
|
|
proc->straight_len,
|
|
proc->straight_pa,
|
|
psize,
|
|
proc->straight_map_threshold);
|
|
|
|
#ifdef ENABLE_FUGAKU_HACKS
|
|
}
|
|
else { // Safe mapping of only LWK memory ranges
|
|
size_t max_pgsize = 0;
|
|
size_t min_pgsize = 0xFFFFFFFFFFFFFFFF;
|
|
|
|
/*
|
|
		 * Iterate LWK physical memory chunks and map them to their
|
|
* corresponding offset in the straight range using the largest
|
|
* suitable pages.
|
|
*/
|
|
for (i = 0; i < ihk_mc_get_nr_memory_chunks(); ++i) {
|
|
unsigned long start, end, pa;
|
|
void *va, *va_end;
|
|
size_t pgsize;
|
|
int pg2align;
|
|
|
|
ihk_mc_get_memory_chunk(i, &start, &end, NULL);
|
|
va = proc->straight_va + (start - straight_pa_start);
|
|
va_end = va + (end - start);
|
|
pa = start;
|
|
|
|
while (va < va_end) {
|
|
pgsize = (va_end - va) + 1;
|
|
retry:
|
|
error = arch_get_smaller_page_size(NULL, pgsize,
|
|
&pgsize, &pg2align);
|
|
if (error) {
|
|
ekprintf("%s: arch_get_smaller_page_size() failed"
|
|
" during straight mapping: %d\n",
|
|
__func__, error);
|
|
proc->straight_va = 0;
|
|
goto straight_out;
|
|
}
|
|
|
|
				/* Is either the virtual or the physical address not aligned to this page size? */
|
|
if (((unsigned long)va & (pgsize - 1)) ||
|
|
(pa & (pgsize - 1))) {
|
|
goto retry;
|
|
}
|
|
|
|
error = ihk_mc_pt_set_range(
|
|
proc->vm->address_space->page_table,
|
|
proc->vm,
|
|
va,
|
|
va + pgsize,
|
|
pa,
|
|
ptattr,
|
|
pg2align + PAGE_SHIFT,
|
|
range,
|
|
0);
|
|
|
|
if (error) {
|
|
kprintf("%s: ihk_mc_pt_set_range() failed"
|
|
" during straight mapping: %d\n",
|
|
__func__, error);
|
|
proc->straight_va = 0;
|
|
goto straight_out;
|
|
}
|
|
|
|
if (pgsize > max_pgsize)
|
|
max_pgsize = pgsize;
|
|
|
|
if (pgsize < min_pgsize)
|
|
min_pgsize = pgsize;
|
|
|
|
va += pgsize;
|
|
pa += pgsize;
|
|
}
|
|
}
|
|
|
|
region->map_end = (unsigned long)proc->straight_va +
|
|
proc->straight_len;
|
|
proc->straight_pa = straight_pa_start;
|
|
kprintf("%s: straight mapping: 0x%lx:%lu @ "
|
|
"min_pgsize: %lu, max_pgsize: %lu\n",
|
|
__FUNCTION__,
|
|
proc->straight_va,
|
|
proc->straight_len,
|
|
min_pgsize,
|
|
max_pgsize);
|
|
}
|
|
#endif
|
|
}
|
|
straight_out:
|
|
|
|
if (flags & MAP_HUGETLB) {
|
|
pgshift = (flags >> MAP_HUGE_SHIFT) & 0x3F;
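		/*
		 * The huge page size is encoded in the mmap() flags as
		 * log2(size) << MAP_HUGE_SHIFT (6 bits), e.g. 21 for 2MB or
		 * 30 for 1GB pages; zero falls back to the Linux default
		 * huge page size below.
		 */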
|
|
if (!pgshift) {
|
|
pgshift = ihk_mc_get_linux_default_huge_page_shift();
|
|
}
|
|
p2align = pgshift - PAGE_SHIFT;
|
|
}
|
|
else if ((((flags & (MAP_PRIVATE | MAP_SHARED))
|
|
&& (flags & MAP_ANONYMOUS))
|
|
|| (vrf0 & VR_XPMEM))
|
|
&& !proc->thp_disable) {
|
|
pgshift = 0; /* transparent huge page */
|
|
p2align = PAGE_P2ALIGN;
|
|
|
|
if (len > PAGE_SIZE) {
|
|
error = arch_get_smaller_page_size(NULL, len+1, NULL, &p2align);
|
|
if (error) {
|
|
ekprintf("do_mmap:arch_get_smaller_page_size failed. %d\n", error);
|
|
goto out;
|
|
}
|
|
}
|
|
}
|
|
else {
|
|
pgshift = PAGE_SHIFT; /* basic page size */
|
|
p2align = PAGE_P2ALIGN;
|
|
}
|
|
|
|
ihk_rwspinlock_write_lock_noirq(&thread->vm->memory_range_lock);
|
|
|
|
if ((flags & MAP_FIXED) && proc->straight_va &&
|
|
((void *)addr >= proc->straight_va) &&
|
|
((void *)addr + len) <= (proc->straight_va + proc->straight_len)) {
|
|
kprintf("%s: can't map MAP_FIXED into straight mapping\n",
|
|
__FUNCTION__);
|
|
error = -EINVAL;
|
|
goto out;
|
|
}
|
|
|
|
if (flags & MAP_FIXED) {
|
|
/* clear specified address range */
|
|
error = do_munmap((void *)addr, len, 1/* holding memory_range_lock */);
|
|
if (error) {
|
|
ekprintf("do_mmap:do_munmap(%lx,%lx) failed. %d\n",
|
|
addr, len, error);
|
|
goto out;
|
|
}
|
|
}
|
|
else if (flags & MAP_ANONYMOUS) {
|
|
/* Obtain mapping address */
|
|
error = search_free_space(len,
|
|
PAGE_SHIFT + p2align, &addr);
|
|
if (error) {
|
|
kprintf("%s: error: search_free_space(%lx,%lx,%lx) failed. %d\n",
|
|
__func__, len, PAGE_SHIFT + p2align, addr, error);
|
|
goto out;
|
|
}
|
|
}
|
|
|
|
/* do the map */
|
|
vrflags = VR_NONE;
|
|
vrflags |= vrf0;
|
|
vrflags |= PROT_TO_VR_FLAG(prot);
|
|
vrflags |= (flags & MAP_PRIVATE)? VR_PRIVATE: 0;
|
|
vrflags |= (flags & MAP_LOCKED)? VR_LOCKED: 0;
|
|
vrflags |= VR_DEMAND_PAGING;
|
|
if (flags & MAP_ANONYMOUS && !anon_on_demand) {
|
|
if (flags & MAP_PRIVATE) {
|
|
vrflags &= ~VR_DEMAND_PAGING;
|
|
}
|
|
}
|
|
|
|
if (flags & (MAP_POPULATE | MAP_LOCKED)) {
|
|
dkprintf("%s: 0x%lx:%lu %s%s|\n",
|
|
__func__, addr, len,
|
|
flags & MAP_POPULATE ? "|MAP_POPULATE" : "",
|
|
flags & MAP_LOCKED ? "|MAP_LOCKED" : "");
|
|
populated_mapping = 1;
|
|
}
|
|
|
|
#if 0
|
|
/* XXX: Intel MPI 128MB mapping.. */
|
|
if (len == 134217728) {
|
|
dkprintf("%s: %ld bytes mapping -> no prefault\n",
|
|
__FUNCTION__, len);
|
|
vrflags |= VR_DEMAND_PAGING;
|
|
populated_mapping = 0;
|
|
}
|
|
#endif
|
|
|
|
if ((flags & MAP_ANONYMOUS) && !(prot & PROT_WRITE)) {
|
|
error = set_host_vma(addr, len, PROT_READ | PROT_EXEC, 1/* holding memory_range_lock */);
|
|
if (error) {
|
|
kprintf("do_mmap:set_host_vma failed. %d\n", error);
|
|
goto out;
|
|
}
|
|
|
|
ro_vma_mapped = 1;
|
|
}
|
|
|
|
phys = 0;
|
|
straight_phys = 0;
|
|
off = 0;
|
|
maxprot = PROT_READ | PROT_WRITE | PROT_EXEC;
|
|
if (!(flags & MAP_ANONYMOUS)) {
|
|
off = off0;
|
|
error = fileobj_create(fd, &memobj, &maxprot,
|
|
flags, addr0);
|
|
if (memobj && memobj->path && !strncmp(memobj->path, "/dev/shm/ucx_posix", 18)) {
|
|
kprintf("%s: mmap flags: %lx, path: %s, memobj->flags: %lx, "
|
|
"pgshift: %d, p2align: %d -> FIXING page size\n",
|
|
__func__, flags, memobj->path, memobj->flags, pgshift, p2align);
|
|
pgshift = PAGE_SHIFT;
|
|
p2align = PAGE_P2ALIGN;
|
|
populated_mapping = 1;
|
|
}
|
|
#ifdef ATTACHED_MIC
|
|
/*
|
|
* XXX: refuse device mapping in attached-mic now:
|
|
*
|
|
		 * In attached-mic, ihk_mc_map_memory() cannot convert a remote
		 * physical address pointing to KNC's memory into a local physical address.
|
|
* It seems that ihk_mc_map_memory() needs to set up SMPT.
|
|
*/
|
|
if (error == -ESRCH) {
|
|
error = -ENODEV;
|
|
}
|
|
#endif
|
|
#ifdef PROFILE_ENABLE
|
|
if (!error) {
|
|
profile_event_add(PROFILE_mmap_regular_file, len);
|
|
}
|
|
#endif // PROFILE_ENABLE
|
|
if (error == -ESRCH) {
|
|
int populate_flags = 0;
|
|
|
|
dkprintf("do_mmap:hit non VREG\n");
|
|
/*
|
|
* XXX: temporary:
|
|
*
|
|
			 * device mappings are uncacheable
			 * until memory-type setting code is implemented.
|
|
*/
|
|
if (1) {
|
|
vrflags &= ~VR_MEMTYPE_MASK;
|
|
vrflags |= VR_MEMTYPE_UC;
|
|
}
|
|
|
|
#ifdef ENABLE_FUGAKU_HACKS
|
|
#ifdef ENABLE_TOFU
|
|
if (!strncmp("/var/opt/FJSVtcs/ple/daemonif/",
|
|
thread->proc->fd_path[fd], 30)) {
|
|
dkprintf("%s: MAP_POPULATE | MAP_LOCKED for %s\n",
|
|
__func__, thread->proc->fd_path[fd]);
|
|
populate_flags = (MAP_POPULATE | MAP_LOCKED);
|
|
}
|
|
#endif
|
|
#endif
|
|
|
|
error = devobj_create(fd, len, off, &memobj, &maxprot,
|
|
prot,
|
|
populate_flags | (flags & (MAP_POPULATE | MAP_LOCKED)));
|
|
|
|
if (!error) {
|
|
#ifdef PROFILE_ENABLE
|
|
profile_event_add(PROFILE_mmap_device_file, len);
|
|
#endif // PROFILE_ENABLE
|
|
if (memobj->path &&
|
|
(!strncmp("/tmp/ompi.", memobj->path, 10) ||
|
|
!strncmp("/dev/shm/", memobj->path, 9))) {
|
|
pgshift = PAGE_SHIFT;
|
|
p2align = PAGE_P2ALIGN;
|
|
populated_mapping = 1;
|
|
}
|
|
}
|
|
}
|
|
if (error) {
|
|
kprintf("%s: error: file mapping failed, fd: %d, error: %d\n",
|
|
__func__, fd, error);
|
|
goto out;
|
|
}
|
|
|
|
/* hugetlbfs files are pre-created in fileobj_create, but
|
|
* need extra processing
|
|
*/
|
|
if (memobj && (memobj->flags & MF_HUGETLBFS)) {
|
|
error = hugefileobj_create(memobj, len, off, &pgshift,
|
|
addr0);
|
|
if (error) {
|
|
memobj->ops->free(memobj);
|
|
kprintf("%s: error creating hugetlbfs memobj, fd: %d, error: %d\n",
|
|
__func__, fd, error);
|
|
goto out;
|
|
}
|
|
p2align = pgshift - PAGE_SHIFT;
|
|
}
|
|
|
|
/* Obtain mapping address - delayed to use proper p2align */
|
|
if (!(flags & MAP_FIXED))
|
|
error = search_free_space(len, PAGE_SHIFT + p2align,
|
|
&addr);
|
|
if (error) {
|
|
ekprintf("do_mmap:search_free_space(%lx,%lx,%d) failed. %d\n",
|
|
len, region->map_end, p2align, error);
|
|
goto out;
|
|
}
|
|
if (!(prot & PROT_WRITE)) {
|
|
error = set_host_vma(addr, len, PROT_READ | PROT_EXEC,
|
|
1/* holding memory_range_lock */);
|
|
if (error) {
|
|
kprintf("do_mmap:set_host_vma failed. %d\n",
|
|
error);
|
|
goto out;
|
|
}
|
|
|
|
ro_vma_mapped = 1;
|
|
}
|
|
if (memobj->flags & MF_HUGETLBFS) {
|
|
dkprintf("Created hugefileobj %p (%d:%x %llx-%llx, fd %d, pgshift %d)\n",
|
|
memobj, len, off, addr, addr+len, fd, pgshift);
|
|
} else if (memobj->flags & MF_DEV_FILE) {
|
|
dkprintf("%s: device fd: %d off: %lu mapping at %p - %p\n",
|
|
__func__, fd, off, addr, addr + len);
|
|
}
|
|
}
|
|
/* Prepopulated ANONYMOUS mapping */
|
|
else if (!(vrflags & VR_DEMAND_PAGING)
|
|
&& !(flags & MAP_SHARED)
|
|
&& ((vrflags & VR_PROT_MASK) != VR_PROT_NONE)) {
|
|
npages = len >> PAGE_SHIFT;
|
|
		/* Small allocations mostly benefit from the closest RAM;
		 * otherwise follow the user-requested policy */
|
|
unsigned long ap_flag =
|
|
(!(flags & MAP_STACK) && len >= thread->proc->mpol_threshold) ||
|
|
((flags & MAP_STACK) && !(thread->proc->mpol_flags & MPOL_NO_STACK)) ?
|
|
IHK_MC_AP_USER : 0;
|
|
|
|
if (ap_flag) {
|
|
vrflags |= VR_AP_USER;
|
|
}
|
|
|
|
p = ihk_mc_alloc_aligned_pages_user(npages, p2align,
|
|
IHK_MC_AP_NOWAIT | ap_flag, addr0);
|
|
if (p == NULL) {
|
|
dkprintf("%s: warning: failed to allocate %d contiguous pages "
|
|
" (bytes: %lu, pgshift: %d), enabling demand paging\n",
|
|
__FUNCTION__,
|
|
npages, npages * PAGE_SIZE, p2align);
|
|
|
|
/* Give demand paging a chance */
|
|
vrflags |= VR_DEMAND_PAGING;
|
|
populated_mapping = 0;
|
|
|
|
#ifdef PROFILE_ENABLE
|
|
profile_event_add(PROFILE_mmap_anon_no_contig_phys, len);
|
|
#endif // PROFILE_ENABLE
|
|
error = zeroobj_create(&memobj);
|
|
if (error) {
|
|
ekprintf("%s: zeroobj_create failed, error: %d\n",
|
|
__FUNCTION__, error);
|
|
goto out;
|
|
}
|
|
}
|
|
else {
|
|
#ifdef PROFILE_ENABLE
|
|
profile_event_add(PROFILE_mmap_anon_contig_phys, len);
|
|
#endif // PROFILE_ENABLE
|
|
dkprintf("%s: 0x%x:%lu MAP_ANONYMOUS "
|
|
"allocated %d pages, p2align: %lx\n",
|
|
__FUNCTION__, addr, len, npages, p2align);
|
|
phys = virt_to_phys(p);
|
|
}
|
|
}
|
|
else if (flags & MAP_SHARED) {
|
|
dkprintf("%s: MAP_SHARED,flags=%x,len=%ld\n", __FUNCTION__, flags, len);
|
|
memset(&ads, 0, sizeof(ads));
|
|
ads.shm_segsz = len;
|
|
ads.shm_perm.mode = SHM_DEST;
|
|
ads.init_pgshift = PAGE_SHIFT + p2align;
|
|
error = shmobj_create(&ads, &memobj);
|
|
if (error) {
|
|
ekprintf("do_mmap:shmobj_create failed. %d\n", error);
|
|
goto out;
|
|
}
|
|
}
|
|
else {
|
|
dkprintf("%s: anon&demand-paging\n", __FUNCTION__);
|
|
error = zeroobj_create(&memobj);
|
|
if (error) {
|
|
ekprintf("do_mmap:zeroobj_create failed. %d\n", error);
|
|
goto out;
|
|
}
|
|
}
|
|
|
|
if ((flags & MAP_PRIVATE) && (maxprot & PROT_READ)) {
|
|
maxprot |= PROT_WRITE;
|
|
}
|
|
denied = prot & ~maxprot;
|
|
if (denied) {
|
|
ekprintf("do_mmap:denied %x. %x %x\n", denied, prot, maxprot);
|
|
error = (denied == PROT_EXEC)? -EPERM: -EACCES;
|
|
goto out;
|
|
}
|
|
vrflags |= VRFLAG_PROT_TO_MAXPROT(PROT_TO_VR_FLAG(maxprot));
|
|
|
|
	/*
	 * Large anonymous non-fixed allocations go through the straight
	 * mapping; pretend demand paging so we avoid filling in PTEs.
	 */
|
|
if ((flags & MAP_ANONYMOUS) && proc->straight_map &&
|
|
!(flags & MAP_FIXED) && phys) {
|
|
if (len >= proc->straight_map_threshold) {
|
|
dkprintf("%s: range 0x%lx:%lu will be straight, addding VR_DEMAND\n",
|
|
__FUNCTION__, addr, len);
|
|
vrflags |= VR_DEMAND_PAGING;
|
|
straight_phys = phys;
|
|
phys = 0;
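			/*
			 * The contiguous block just allocated will be accessed
			 * through the straight window: pass phys = 0 so
			 * add_process_memory_range() below doesn't fill in PTEs,
			 * and keep straight_phys to compute range->straight_start.
			 */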
|
|
#ifdef PROFILE_ENABLE
|
|
profile_event_add(PROFILE_mmap_anon_straight, len);
|
|
#endif // PROFILE_ENABLE
|
|
}
|
|
else {
|
|
#ifdef PROFILE_ENABLE
|
|
if (cpu_local_var(current)->profile)
|
|
kprintf("%s: contiguous but not straight? len: %lu\n", __func__, len);
|
|
profile_event_add(PROFILE_mmap_anon_not_straight, len);
|
|
#endif // PROFILE_ENABLE
|
|
}
|
|
}
|
|
|
|
error = add_process_memory_range(thread->vm, addr, addr+len, phys,
|
|
vrflags, memobj, off, pgshift, private_data, &range);
|
|
if (error) {
|
|
kprintf("%s: add_process_memory_range failed for 0x%lx:%lu"
|
|
" flags: %lx, vrflags: %lx, pgshift: %d, error: %d\n",
|
|
__FUNCTION__, addr, addr+len,
|
|
flags, vrflags, pgshift, error);
|
|
goto out;
|
|
}
|
|
|
|
/* Update straight mapping start address */
|
|
if (straight_phys) {
|
|
extern int zero_at_free;
|
|
range->straight_start =
|
|
(unsigned long)proc->straight_va +
|
|
(straight_phys - proc->straight_pa);
|
|
#ifndef ENABLE_FUGAKU_HACKS
|
|
dkprintf("%s: range 0x%lx:%lu is straight starting at 0x%lx\n",
|
|
__FUNCTION__, addr, len, range->straight_start);
|
|
#else
|
|
dkprintf("%s: range 0x%lx:%lu is straight starting at 0x%lx"
|
|
" (phys: 0x%lx)\n",
|
|
__FUNCTION__, addr, len, range->straight_start,
|
|
straight_phys);
|
|
#endif
|
|
if (!zero_at_free) {
|
|
memset((void *)phys_to_virt(straight_phys), 0, len);
|
|
}
|
|
}
|
|
|
|
/* Determine pre-populated size */
|
|
populate_len = memobj ? min(len, memobj->size) : len;
|
|
|
|
if (!(flags & MAP_ANONYMOUS)) {
|
|
if (cmpxchg(&memobj->status, MEMOBJ_TO_BE_PREFETCHED,
|
|
MEMOBJ_READY) == MEMOBJ_TO_BE_PREFETCHED) {
|
|
populated_mapping = 1;
|
|
}
|
|
|
|
/* Update PTEs for pre-mapped memory object */
|
|
if ((memobj->flags & MF_PREMAP) &&
|
|
(proc->mpol_flags & MPOL_SHM_PREMAP)) {
|
|
if (memobj->flags & MF_ZEROFILL) {
|
|
int i;
|
|
enum ihk_mc_pt_attribute ptattr;
|
|
ptattr = arch_vrflag_to_ptattr(range->flag, PF_POPULATE, NULL);
|
|
|
|
for (i = 0; i < memobj->nr_pages; ++i) {
|
|
error = ihk_mc_pt_set_range(proc->vm->address_space->page_table,
|
|
proc->vm,
|
|
(void *)range->start + (i * PAGE_SIZE),
|
|
(void *)range->start + (i * PAGE_SIZE) +
|
|
PAGE_SIZE,
|
|
virt_to_phys(memobj->pages[i]),
|
|
ptattr,
|
|
PAGE_SHIFT,
|
|
range,
|
|
0);
|
|
if (error) {
|
|
kprintf("%s: ERROR: mapping %d page of pre-mapped file\n",
|
|
__FUNCTION__, i);
|
|
}
|
|
}
|
|
dkprintf("%s: memobj 0x%lx pre-mapped\n", __FUNCTION__, memobj);
|
|
// fileobj && MF_PREMAP && MPOL_SHM_PREMAP case: memory_stat_rss_add() is called in fileobj_create()
|
|
}
|
|
else {
|
|
populated_mapping = 1;
|
|
}
|
|
}
|
|
/*
|
|
else if (memobj->flags & MF_REG_FILE) {
|
|
populated_mapping = 1;
|
|
populate_len = memobj->size;
|
|
}
|
|
*/
|
|
}
|
|
|
|
error = 0;
|
|
p = NULL;
|
|
memobj = NULL;
|
|
ro_vma_mapped = 0;
|
|
|
|
out:
|
|
if (ro_vma_mapped && !range->straight_start) {
|
|
(void)set_host_vma(addr, len, PROT_READ | PROT_WRITE | PROT_EXEC, 1/* holding memory_range_lock */);
|
|
}
|
|
ihk_rwspinlock_write_unlock_noirq(&thread->vm->memory_range_lock);
|
|
|
|
ihk_rwspinlock_read_lock_noirq(&thread->vm->memory_range_lock);
|
|
if (!error && range && range->memobj &&
|
|
(range->memobj->flags & MF_XPMEM)) {
|
|
error = xpmem_update_process_page_table(thread->vm, range);
|
|
if (error) {
|
|
ekprintf("%s: xpmem_update_process_page_table(): "
|
|
"vm: %p, range: %lx-%lx failed %d\n",
|
|
__func__, thread->vm,
|
|
range->start, range->end, error);
|
|
}
|
|
}
|
|
ihk_rwspinlock_read_unlock_noirq(&thread->vm->memory_range_lock);
|
|
|
|
if (!error && populated_mapping &&
|
|
!((vrflags & VR_PROT_MASK) == VR_PROT_NONE) && !range->straight_start) {
|
|
error = populate_process_memory(thread->vm,
|
|
(void *)addr, populate_len);
|
|
|
|
if (error) {
|
|
ekprintf("%s: WARNING: populate_process_memory(): "
|
|
"vm: %p, addr: %p, len: %d (flags: %s%s) failed %d\n",
|
|
__FUNCTION__,
|
|
thread->vm, (void *)addr, len,
|
|
(flags & MAP_POPULATE) ? "MAP_POPULATE " : "",
|
|
(flags & MAP_LOCKED) ? "MAP_LOCKED ": "",
|
|
error);
|
|
			/*
			 * In this case, the mapping established by this call
			 * should be unmapped before mmap() returns with an error.
			 *
			 * However, the mapping cannot simply be unmapped, because
			 * another thread may have modified it now that
			 * memory_range_lock has been released.
			 *
			 * For the moment, as in Linux 2.6.38-8, the physical page
			 * allocation failure is ignored.
			 */
|
|
error = 0;
|
|
}
|
|
}
|
|
|
|
if (p && npages > 0) {
|
|
ihk_mc_free_pages_user(p, npages);
|
|
}
|
|
if (memobj) {
|
|
memobj_unref(memobj);
|
|
}
|
|
|
|
#ifndef ENABLE_FUGAKU_HACKS
|
|
dkprintf("%s: 0x%lx:%8lu, (req: 0x%lx:%lu), prot: %x, flags: %x, "
|
|
#else
|
|
if (cpu_local_var(current)->profile) {
|
|
kprintf("%s: 0x%lx:%8lu, (req: 0x%lx:%lu), prot: %x, flags: %x, "
|
|
#endif
|
|
"fd: %d, off: %lu, error: %ld, addr: 0x%lx\n",
|
|
__FUNCTION__,
|
|
addr, len, addr0, len0, prot, flags,
|
|
fd, off0, error, addr);
|
|
#ifdef ENABLE_FUGAKU_HACKS
|
|
}
|
|
#endif
|
|
|
|
return !error ?
|
|
(range->straight_start ? range->straight_start : addr) :
|
|
error;
|
|
}
|
|
|
|
SYSCALL_DECLARE(munmap)
|
|
{
|
|
const uintptr_t addr = ihk_mc_syscall_arg0(ctx);
|
|
const size_t len0 = ihk_mc_syscall_arg1(ctx);
|
|
struct thread *thread = cpu_local_var(current);
|
|
struct vm_regions *region = &thread->vm->region;
|
|
size_t len;
|
|
int error;
|
|
|
|
dkprintf("[%d]sys_munmap(%lx,%lx)\n",
|
|
ihk_mc_get_processor_id(), addr, len0);
|
|
|
|
len = (len0 + PAGE_SIZE - 1) & PAGE_MASK;
|
|
if ((addr & (PAGE_SIZE - 1))
|
|
|| (addr < region->user_start)
|
|
|| (region->user_end <= addr)
|
|
|| (len == 0)
|
|
|| (len > (region->user_end - region->user_start))
|
|
|| ((region->user_end - len) < addr)) {
|
|
error = -EINVAL;
|
|
goto out;
|
|
}
|
|
|
|
ihk_rwspinlock_write_lock_noirq(&thread->vm->memory_range_lock);
|
|
error = do_munmap((void *)addr, len, 1/* holding memory_range_lock */);
|
|
ihk_rwspinlock_write_unlock_noirq(&thread->vm->memory_range_lock);
|
|
|
|
out:
|
|
dkprintf("[%d]sys_munmap(%lx,%lx): %d\n",
|
|
ihk_mc_get_processor_id(), addr, len0, error);
|
|
#ifdef ENABLE_FUGAKU_HACKS
|
|
if (error) {
|
|
kprintf("%s: error: %d\n", __func__, error);
|
|
}
|
|
#endif
|
|
return error;
|
|
}
|
|
|
|
SYSCALL_DECLARE(mprotect)
|
|
{
|
|
const intptr_t start = ihk_mc_syscall_arg0(ctx);
|
|
const size_t len0 = ihk_mc_syscall_arg1(ctx);
|
|
const int prot = ihk_mc_syscall_arg2(ctx);
|
|
struct thread *thread = cpu_local_var(current);
|
|
struct vm_regions *region = &thread->vm->region;
|
|
size_t len;
|
|
intptr_t end;
|
|
struct vm_range *first;
|
|
intptr_t addr;
|
|
struct vm_range *range;
|
|
int error;
|
|
struct vm_range *changed;
|
|
const unsigned long protflags = PROT_TO_VR_FLAG(prot);
|
|
unsigned long denied;
|
|
int ro_changed = 0;
|
|
|
|
dkprintf("[%d]sys_mprotect(%lx,%lx,%x)\n",
|
|
ihk_mc_get_processor_id(), start, len0, prot);
|
|
|
|
len = (len0 + PAGE_SIZE - 1) & PAGE_MASK;
|
|
end = start + len;
|
|
|
|
/* check arguments */
|
|
if (start & (PAGE_SIZE - 1)) {
|
|
ekprintf("[%d]sys_mprotect(%lx,%lx,%x): -EINVAL\n",
|
|
ihk_mc_get_processor_id(), start, len0, prot);
|
|
return -EINVAL;
|
|
}
|
|
|
|
if ((start < region->user_start)
|
|
|| (region->user_end <= start)
|
|
|| ((region->user_end - start) < len)) {
|
|
ekprintf("[%d]sys_mprotect(%lx,%lx,%x): -ENOMEM\n",
|
|
ihk_mc_get_processor_id(), start, len0, prot);
|
|
return -ENOMEM;
|
|
}
|
|
|
|
if (len == 0) {
|
|
/* nothing to do */
|
|
return 0;
|
|
}
|
|
|
|
if (thread->proc->straight_va &&
|
|
((void *)start >= thread->proc->straight_va) &&
|
|
(void *)end <= (thread->proc->straight_va +
|
|
thread->proc->straight_len)) {
|
|
kprintf("%s: ignored for straight mapping 0x%lx\n",
|
|
__FUNCTION__, start);
|
|
error = 0;
|
|
goto out_straight;
|
|
}
|
|
|
|
flush_nfo_tlb();
|
|
|
|
ihk_rwspinlock_write_lock_noirq(&thread->vm->memory_range_lock);
|
|
|
|
first = lookup_process_memory_range(thread->vm, start, start+PAGE_SIZE);
|
|
|
|
/* do the mprotect */
|
|
changed = NULL;
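	/*
	 * Walk the ranges covering [start, end): split partially covered
	 * ranges at the boundaries, change the protection of each piece,
	 * and join consecutive changed pieces back together.
	 */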
|
|
for (addr = start; addr < end; addr = changed->end) {
|
|
if (changed == NULL) {
|
|
range = first;
|
|
}
|
|
else {
|
|
range = next_process_memory_range(thread->vm, changed);
|
|
}
|
|
|
|
if ((range == NULL) || (addr < range->start)) {
|
|
/* not contiguous */
|
|
ekprintf("sys_mprotect(%lx,%lx,%x):not contiguous\n",
|
|
start, len0, prot);
|
|
error = -ENOMEM;
|
|
goto out;
|
|
}
|
|
|
|
denied = protflags & ~VRFLAG_MAXPROT_TO_PROT(range->flag);
|
|
if (denied) {
|
|
ekprintf("sys_mprotect(%lx,%lx,%x):denied %lx. %lx %lx\n",
|
|
start, len0, prot, denied, protflags, range->flag);
|
|
error = -EACCES;
|
|
goto out;
|
|
}
|
|
|
|
if (range->flag & (VR_REMOTE | VR_RESERVED | VR_IO_NOCACHE)) {
|
|
ekprintf("sys_mprotect(%lx,%lx,%x):cannot change\n",
|
|
start, len0, prot);
|
|
error = -ENOMEM;
|
|
goto out;
|
|
}
|
|
|
|
if (range->start < addr) {
|
|
error = split_process_memory_range(thread->vm, range, addr, &range);
|
|
if (error) {
|
|
ekprintf("sys_mprotect(%lx,%lx,%x):split failed. %d\n",
|
|
start, len0, prot, error);
|
|
goto out;
|
|
}
|
|
}
|
|
if (end < range->end) {
|
|
error = split_process_memory_range(thread->vm, range, end, NULL);
|
|
if (error) {
|
|
ekprintf("sys_mprotect(%lx,%lx,%x):split failed. %d\n",
|
|
start, len0, prot, error);
|
|
goto out;
|
|
}
|
|
}
|
|
|
|
if ((range->flag ^ protflags) & VR_PROT_WRITE) {
|
|
ro_changed = 1;
|
|
}
|
|
|
|
error = change_prot_process_memory_range(thread->vm, range, protflags);
|
|
if (error) {
|
|
ekprintf("sys_mprotect(%lx,%lx,%x):change failed. %d\n",
|
|
start, len0, prot, error);
|
|
goto out;
|
|
}
|
|
|
|
if (changed == NULL) {
|
|
changed = range;
|
|
}
|
|
else {
|
|
error = join_process_memory_range(thread->vm, changed, range);
|
|
if (error) {
|
|
ekprintf("sys_mprotect(%lx,%lx,%x):join failed. %d\n",
|
|
start, len0, prot, error);
|
|
changed = range;
|
|
/* through */
|
|
}
|
|
}
|
|
}
|
|
|
|
error = 0;
|
|
out:
|
|
// XXX: TLB flush
|
|
flush_tlb();
|
|
if (ro_changed && !error) {
|
|
error = set_host_vma(start, len, prot & (PROT_READ | PROT_WRITE | PROT_EXEC), 1/* holding memory_range_lock */);
|
|
if (error) {
|
|
kprintf("sys_mprotect:set_host_vma failed. %d\n", error);
|
|
/* through */
|
|
}
|
|
}
|
|
ihk_rwspinlock_write_unlock_noirq(&thread->vm->memory_range_lock);
|
|
|
|
out_straight:
|
|
dkprintf("[%d]sys_mprotect(%lx,%lx,%x): %d\n",
|
|
ihk_mc_get_processor_id(), start, len0, prot, error);
|
|
return error;
|
|
}
|
|
|
|
SYSCALL_DECLARE(brk)
|
|
{
|
|
unsigned long address = ihk_mc_syscall_arg0(ctx);
|
|
struct vm_regions *region = &cpu_local_var(current)->vm->region;
|
|
unsigned long r;
|
|
unsigned long vrflag;
|
|
unsigned long old_brk_end_allocated = 0;
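	/*
	 * brk() only grows here: requests below brk_start or brk_end are
	 * answered with the current break (which also serves glibc's
	 * brk(0) probe), requests within the already-allocated area just
	 * advance brk_end, and anything beyond it tries to extend the
	 * region.
	 */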
|
|
|
|
dkprintf("SC(%d)[sys_brk] brk_start=%lx,end=%lx\n",
|
|
ihk_mc_get_processor_id(), region->brk_start, region->brk_end);
|
|
|
|
flush_nfo_tlb();
|
|
|
|
	/* brk change fails; this also covers the glibc trick of calling brk(0) to obtain the current brk */
|
|
if (address < region->brk_start) {
|
|
r = region->brk_end;
|
|
goto out;
|
|
}
|
|
|
|
	/* brk change fails because we don't shrink the memory region */
|
|
if (address < region->brk_end) {
|
|
r = region->brk_end;
|
|
goto out;
|
|
}
|
|
|
|
/* If already allocated, just expand and return */
|
|
if (address <= region->brk_end_allocated) {
|
|
region->brk_end = address;
|
|
r = region->brk_end;
|
|
goto out;
|
|
}
|
|
|
|
/* Try to extend memory region */
|
|
vrflag = VR_PROT_READ | VR_PROT_WRITE;
|
|
vrflag |= VR_PRIVATE;
|
|
vrflag |= VRFLAG_PROT_TO_MAXPROT(vrflag);
|
|
old_brk_end_allocated = region->brk_end_allocated;
|
|
ihk_rwspinlock_write_lock_noirq(&cpu_local_var(current)->vm->memory_range_lock);
|
|
region->brk_end_allocated =
|
|
extend_process_region(cpu_local_var(current)->vm,
|
|
region->brk_end_allocated, address, vrflag);
|
|
ihk_rwspinlock_write_unlock_noirq(&cpu_local_var(current)->vm->memory_range_lock);
|
|
|
|
if (old_brk_end_allocated == region->brk_end_allocated) {
|
|
r = old_brk_end_allocated;
|
|
goto out;
|
|
}
|
|
|
|
region->brk_end = address;
|
|
r = region->brk_end;
|
|
dkprintf("SC(%d)[sys_brk] brk_end set to %lx\n",
|
|
ihk_mc_get_processor_id(), region->brk_end);
|
|
|
|
out:
|
|
return r;
|
|
}
|
|
|
|
SYSCALL_DECLARE(getpid)
|
|
{
|
|
return cpu_local_var(current)->proc->pid;
|
|
}
|
|
|
|
SYSCALL_DECLARE(getppid)
|
|
{
|
|
struct thread *thread = cpu_local_var(current);
|
|
|
|
return thread->proc->ppid_parent->pid;
|
|
}
|
|
|
|
static int settid(struct thread *thread, int nr_tids, int *tids)
|
|
{
|
|
int ret;
|
|
struct syscall_request request IHK_DMA_ALIGN;
|
|
|
|
memset(&request, 0, sizeof(request));
|
|
|
|
request.number = __NR_gettid;
|
|
/*
|
|
* If nr_tids is non-zero, tids should point to an array of ints
|
|
* where the thread ids of the mcexec process are expected.
|
|
*/
|
|
request.args[4] = nr_tids;
|
|
request.args[5] = virt_to_phys(tids);
|
|
if ((ret = do_syscall(&request, ihk_mc_get_processor_id())) < 0) {
|
|
kprintf("%s: WARNING: do_syscall returns %d\n",
|
|
__FUNCTION__, ret);
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
SYSCALL_DECLARE(gettid)
|
|
{
|
|
return cpu_local_var(current)->tid;
|
|
}
|
|
|
|
extern void ptrace_report_signal(struct thread *thread, int sig);
|
|
static int ptrace_report_exec(struct thread *thread)
|
|
{
|
|
int ptrace = thread->ptrace;
|
|
|
|
if (ptrace & (PT_TRACE_EXEC|PTRACE_O_TRACEEXEC)) {
|
|
ihk_mc_kernel_context_t ctx;
|
|
int sig = (SIGTRAP | (PTRACE_EVENT_EXEC << 8));
|
|
|
|
memcpy(&ctx, &thread->ctx, sizeof ctx);
|
|
preempt_enable();
|
|
ptrace_report_signal(thread, sig);
|
|
preempt_disable();
|
|
memcpy(&thread->ctx, &ctx, sizeof ctx);
|
|
thread->ptrace |= PT_TRACED_AFTER_EXEC;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
void ptrace_syscall_event(struct thread *thread)
|
|
{
|
|
int ptrace = thread->ptrace;
|
|
|
|
if (ptrace & PT_TRACE_SYSCALL) {
|
|
int sig = (SIGTRAP | ((ptrace & PTRACE_O_TRACESYSGOOD) ? 0x80 : 0));
|
|
ptrace_report_signal(thread, sig);
|
|
}
|
|
}
|
|
|
|
static int ptrace_check_clone_event(struct thread *thread, int clone_flags)
|
|
{
|
|
int event = 0;
|
|
|
|
if (clone_flags & CLONE_VFORK) {
|
|
/* vfork */
|
|
if (thread->ptrace & PTRACE_O_TRACEVFORK) {
|
|
event = PTRACE_EVENT_VFORK;
|
|
}
|
|
if (thread->ptrace & PTRACE_O_TRACEVFORKDONE) {
|
|
event = PTRACE_EVENT_VFORK_DONE;
|
|
}
|
|
} else if ((clone_flags & CSIGNAL) == SIGCHLD) {
|
|
/* fork */
|
|
if (thread->ptrace & PTRACE_O_TRACEFORK) {
|
|
event = PTRACE_EVENT_FORK;
|
|
}
|
|
} else {
|
|
/* clone */
|
|
if (thread->ptrace & PTRACE_O_TRACECLONE) {
|
|
event = PTRACE_EVENT_CLONE;
|
|
}
|
|
}
|
|
|
|
return event;
|
|
}
|
|
|
|
static int ptrace_attach_thread(struct thread *thread, struct process *proc)
|
|
{
|
|
struct process *child;
|
|
struct process *parent;
|
|
struct mcs_rwlock_node_irqsave lock;
|
|
int error = 0;
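	/*
	 * Move the thread onto the tracer's report list; for a main
	 * thread, also re-parent its process under the tracer so the
	 * tracer can reap stop events with wait4().
	 */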
|
|
|
|
if (thread->report_proc) {
|
|
mcs_rwlock_writer_lock(&thread->report_proc->threads_lock,
|
|
&lock);
|
|
list_del(&thread->report_siblings_list);
|
|
mcs_rwlock_writer_unlock(&thread->report_proc->threads_lock,
|
|
&lock);
|
|
}
|
|
|
|
mcs_rwlock_writer_lock(&proc->threads_lock, &lock);
|
|
list_add_tail(&thread->report_siblings_list,
|
|
&proc->report_threads_list);
|
|
thread->report_proc = proc;
|
|
mcs_rwlock_writer_unlock(&proc->threads_lock, &lock);
|
|
|
|
child = thread->proc;
|
|
if (thread == child->main_thread) {
|
|
parent = child->parent;
|
|
dkprintf("ptrace_attach() parent->pid=%d\n", parent->pid);
|
|
mcs_rwlock_writer_lock(&parent->children_lock, &lock);
|
|
list_del(&child->siblings_list);
|
|
list_add_tail(&child->ptraced_siblings_list,
|
|
&parent->ptraced_children_list);
|
|
mcs_rwlock_writer_unlock(&parent->children_lock, &lock);
|
|
|
|
mcs_rwlock_writer_lock(&proc->children_lock, &lock);
|
|
list_add_tail(&child->siblings_list, &proc->children_list);
|
|
child->parent = proc;
|
|
mcs_rwlock_writer_unlock(&proc->children_lock, &lock);
|
|
}
|
|
|
|
if (thread->ptrace_debugreg == NULL) {
|
|
error = alloc_debugreg(thread);
|
|
if (error < 0) {
|
|
goto out;
|
|
}
|
|
}
|
|
hold_thread(thread);
|
|
|
|
clear_single_step(thread);
|
|
out:
|
|
return error;
|
|
}
|
|
|
|
static int ptrace_report_clone(struct thread *thread, struct thread *new, int event)
|
|
{
|
|
dkprintf("ptrace_report_clone,enter\n");
|
|
int error = 0;
|
|
long rc;
|
|
struct siginfo info;
|
|
struct mcs_rwlock_node lock;
|
|
struct mcs_rwlock_node updatelock;
|
|
int parent_pid;
|
|
|
|
/* Save reason why stopped and process state for wait4() to reap */
|
|
mcs_rwlock_writer_lock_noirq(&thread->proc->update_lock, &lock);
|
|
thread->exit_status = (SIGTRAP | (event << 8));
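	/* ptrace convention: the event is reported in bits 8-15 of the
	 * stop signal seen by the tracer. */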
|
|
/* Transition process state */
|
|
thread->proc->status = PS_TRACED;
|
|
thread->status = PS_TRACED;
|
|
thread->ptrace_eventmsg = new->tid;
|
|
thread->ptrace &= ~PT_TRACE_SYSCALL;
|
|
parent_pid = thread->proc->parent->pid;
|
|
mcs_rwlock_writer_unlock_noirq(&thread->proc->update_lock, &lock);
|
|
|
|
if (event != PTRACE_EVENT_VFORK_DONE) {
|
|
/* PTRACE_EVENT_FORK or PTRACE_EVENT_VFORK or PTRACE_EVENT_CLONE */
|
|
|
|
mcs_rwlock_writer_lock_noirq(&new->proc->update_lock, &updatelock);
|
|
		/* propagate ptrace settings to the new process */
|
|
new->ptrace = thread->ptrace;
|
|
|
|
ptrace_attach_thread(new, thread->proc->parent);
|
|
|
|
/* trace and SIGSTOP */
|
|
new->exit_status = SIGSTOP;
|
|
new->proc->status = PS_TRACED;
|
|
new->status = PS_TRACED;
|
|
|
|
mcs_rwlock_writer_unlock_noirq(&new->proc->update_lock, &updatelock);
|
|
}
|
|
|
|
dkprintf("ptrace_report_clone,kill SIGCHLD\n");
|
|
memset(&info, '\0', sizeof info);
|
|
info.si_signo = SIGCHLD;
|
|
info.si_code = CLD_TRAPPED;
|
|
info._sifields._sigchld.si_pid = thread->proc->pid;
|
|
info._sifields._sigchld.si_status = thread->exit_status;
|
|
rc = do_kill(cpu_local_var(current), parent_pid, -1, SIGCHLD, &info, 0);
|
|
if(rc < 0) {
|
|
dkprintf("ptrace_report_clone,do_kill failed\n");
|
|
}
|
|
|
|
/* Wake parent (if sleeping in wait4()) */
|
|
waitq_wakeup(&thread->proc->parent->waitpid_q);
|
|
|
|
return error;
|
|
}
|
|
|
|
static void munmap_all(void)
|
|
{
|
|
struct thread *thread = cpu_local_var(current);
|
|
struct process_vm *vm = thread->vm;
|
|
struct vm_range *range;
|
|
struct vm_range *next;
|
|
void *addr;
|
|
size_t size;
|
|
int error;
|
|
|
|
ihk_rwspinlock_write_lock_noirq(&vm->memory_range_lock);
|
|
next = lookup_process_memory_range(vm, 0, -1);
|
|
while ((range = next)) {
|
|
next = next_process_memory_range(vm, range);
|
|
|
|
addr = (void *)range->start;
|
|
size = range->end - range->start;
|
|
error = do_munmap(addr, size, 1/* holding memory_range_lock */);
|
|
if (error) {
|
|
kprintf("munmap_all():do_munmap(%p,%lx) failed. %d\n",
|
|
addr, size, error);
|
|
/* through */
|
|
}
|
|
}
|
|
ihk_rwspinlock_write_unlock_noirq(&vm->memory_range_lock);
|
|
|
|
/* free vm_ranges which do_munmap() failed to remove. */
|
|
free_process_memory_ranges(thread->vm);
|
|
|
|
/* reset vm_region's map area */
|
|
thread->vm->region.map_end = thread->vm->region.map_start;
|
|
|
|
return;
|
|
} /* munmap_all() */
|
|
|
|
static int do_execveat(ihk_mc_user_context_t *ctx, int dirfd,
|
|
const char *filename, char **argv, char **envp, int flags)
|
|
{
|
|
int error;
|
|
long ret;
|
|
|
|
char *argv_flat = NULL;
|
|
int argv_flat_len = 0;
|
|
char *envp_flat = NULL;
|
|
int envp_flat_len = 0;
|
|
|
|
struct syscall_request request IHK_DMA_ALIGN;
|
|
struct program_load_desc *desc;
|
|
struct thread *thread = cpu_local_var(current);
|
|
struct process_vm *vm = thread->vm;
|
|
struct vm_range *range;
|
|
struct process *proc = thread->proc;
|
|
int i;
|
|
|
|
ihk_rwspinlock_read_lock_noirq(&vm->memory_range_lock);
|
|
|
|
range = lookup_process_memory_range(vm, (unsigned long)filename,
|
|
(unsigned long)filename+1);
|
|
|
|
if (range == NULL || !(range->flag & VR_PROT_READ)) {
|
|
ihk_rwspinlock_read_unlock_noirq(&vm->memory_range_lock);
|
|
kprintf("execve(): ERROR: filename is bad address\n");
|
|
return -EFAULT;
|
|
}
|
|
|
|
ihk_rwspinlock_read_unlock_noirq(&vm->memory_range_lock);
|
|
|
|
desc = ihk_mc_alloc_pages(4, IHK_MC_AP_NOWAIT);
|
|
if (!desc) {
|
|
kprintf("execve(): ERROR: allocating program descriptor\n");
|
|
return -ENOMEM;
|
|
}
|
|
|
|
memset((void*)desc, 0, 4 * PAGE_SIZE);
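	/*
	 * execve() talks to the host in two phases: phase 1 asks it to
	 * open the executable and fill in the program_load_desc, phase 2
	 * (after the old userspace has been unmapped and the new ranges
	 * prepared) asks it to transfer the ELF image.
	 */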
|
|
|
|
/* Request host to open executable and load ELF section descriptions */
|
|
request.number = __NR_execve;
|
|
request.args[0] = 1; /* 1st phase - get ELF desc */
|
|
request.args[1] = dirfd;
|
|
request.args[2] = (unsigned long)filename;
|
|
request.args[3] = virt_to_phys(desc);
|
|
request.args[4] = flags;
|
|
ret = do_syscall(&request, ihk_mc_get_processor_id());
|
|
|
|
if (ret != 0) {
|
|
dkprintf("execve(): ERROR: host failed to load elf header, errno: %d\n",
|
|
ret);
|
|
ret = -ret;
|
|
goto end;
|
|
}
|
|
|
|
dkprintf("execve(): ELF desc received, num sections: %d\n",
|
|
desc->num_sections);
|
|
|
|
	/* for shebang scripts we get extra argv entries from mcexec */
|
|
if (desc->args_len) {
|
|
desc->args = ((char *)desc) + sizeof(struct program_load_desc) +
|
|
sizeof(struct program_image_section) *
|
|
desc->num_sections;
|
|
}
|
|
|
|
/* Flatten argv and envp into kernel-space buffers */
|
|
argv_flat_len = flatten_strings_from_user(desc->args, argv,
|
|
&argv_flat);
|
|
if (argv_flat_len < 0) {
|
|
char *kfilename;
|
|
int len = strlen_user(filename);
|
|
|
|
kfilename = kmalloc(len + 1, IHK_MC_AP_NOWAIT);
|
|
if(kfilename)
|
|
strcpy_from_user(kfilename, filename);
|
|
kprintf("ERROR: no argv for executable: %s?\n", kfilename? kfilename: "");
|
|
if(kfilename)
|
|
kfree(kfilename);
|
|
ret = argv_flat_len;
|
|
goto end;
|
|
}
|
|
desc->args = NULL;
|
|
desc->args_len = 0;
|
|
|
|
envp_flat_len = flatten_strings_from_user(NULL, envp, &envp_flat);
|
|
if (envp_flat_len < 0) {
|
|
char *kfilename;
|
|
int len = strlen_user(filename);
|
|
|
|
kfilename = kmalloc(len + 1, IHK_MC_AP_NOWAIT);
|
|
if(kfilename)
|
|
strcpy_from_user(kfilename, filename);
|
|
kprintf("ERROR: no envp for executable: %s?\n", kfilename? kfilename: "");
|
|
if(kfilename)
|
|
kfree(kfilename);
|
|
ret = envp_flat_len;
|
|
goto end;
|
|
}
|
|
|
|
/* Unmap all memory areas of the process, userspace will be gone */
|
|
munmap_all();
|
|
|
|
/* Code assumes no process switch from here on */
|
|
preempt_disable();
|
|
ihk_mc_init_user_process(&thread->ctx, &thread->uctx,
|
|
((char *)thread) +
|
|
KERNEL_STACK_NR_PAGES * PAGE_SIZE, desc->entry, 0);
|
|
|
|
	/* map_start / map_end track the memory area
	 * into which the program is loaded
	 */
|
|
vm->region.map_start = vm->region.map_end = LD_TASK_UNMAPPED_BASE;
|
|
|
|
/* Create virtual memory ranges and update args/envs */
|
|
if ((ret = prepare_process_ranges_args_envs(thread, desc, desc,
|
|
PTATTR_NO_EXECUTE | PTATTR_WRITABLE | PTATTR_FOR_USER,
|
|
argv_flat, argv_flat_len, envp_flat, envp_flat_len)) != 0) {
|
|
kprintf("execve(): ERROR: preparing ranges, args, envs, stack, ret: %d\n",
|
|
ret);
|
|
preempt_enable();
|
|
/* control can't be rolled back because vm_range is gone */
|
|
do_kill(thread, thread->proc->pid, thread->tid, SIGKILL, NULL, 0);
|
|
goto end;
|
|
}
|
|
|
|
/* Clear host user space PTEs */
|
|
clear_host_pte(vm->region.user_start,
|
|
(vm->region.user_end - vm->region.user_start), 0);
|
|
|
|
/* Request host to transfer ELF image */
|
|
request.number = __NR_execve;
|
|
request.args[0] = 2; /* 2nd phase - transfer ELF image */
|
|
request.args[1] = virt_to_phys(desc);
|
|
request.args[2] = sizeof(struct program_load_desc) +
|
|
sizeof(struct program_image_section) * desc->num_sections;
|
|
|
|
if ((ret = do_syscall(&request, ihk_mc_get_processor_id())) != 0) {
|
|
preempt_enable();
|
|
/* control can't be rolled back because vm_range is gone */
|
|
do_kill(thread, thread->proc->pid, thread->tid, SIGKILL, NULL, 0);
|
|
goto end;
|
|
}
|
|
|
|
for(i = 0; i < _NSIG; i++){
|
|
if(thread->sigcommon->action[i].sa.sa_handler != SIG_IGN &&
|
|
thread->sigcommon->action[i].sa.sa_handler != SIG_DFL)
|
|
thread->sigcommon->action[i].sa.sa_handler = SIG_DFL;
|
|
}
|
|
|
|
/* Reset floating-point environment to default. */
|
|
clear_fp_regs();
|
|
|
|
/* Reset sigaltstack to default */
|
|
thread->sigstack.ss_sp = NULL;
|
|
thread->sigstack.ss_flags = SS_DISABLE;
|
|
thread->sigstack.ss_size = 0;
|
|
|
|
error = ptrace_report_exec(thread);
|
|
if (error) {
|
|
kprintf("execve(): ERROR: ptrace_report_exec()\n");
|
|
}
|
|
|
|
/* Switch to new execution context */
|
|
dkprintf("execve(): switching to new process\n");
|
|
proc->execed = 1;
|
|
|
|
ret = 0;
|
|
end:
|
|
if (envp_flat) {
|
|
kfree(envp_flat);
|
|
}
|
|
if (argv_flat) {
|
|
kfree(argv_flat);
|
|
}
|
|
ihk_mc_free_pages(desc, 4);
|
|
|
|
if (!ret) {
|
|
unsigned long irqstate;
|
|
|
|
/* Lock run queue because enter_user_mode expects to release it */
|
|
irqstate = cpu_disable_interrupt_save();
|
|
ihk_mc_spinlock_lock_noirq(
|
|
&(get_this_cpu_local_var()->runq_lock));
|
|
cpu_local_var(runq_irqstate) = irqstate;
|
|
preempt_enable();
|
|
|
|
ihk_mc_switch_context(NULL, &thread->ctx, thread);
|
|
|
|
/* not reached */
|
|
return -EFAULT;
|
|
}
|
|
|
|
/* no preempt_enable, errors can only happen before we disabled it */
|
|
|
|
return ret;
|
|
}
|
|
|
|
SYSCALL_DECLARE(execve)
|
|
{
|
|
return do_execveat(ctx, AT_FDCWD,
|
|
(const char *)ihk_mc_syscall_arg0(ctx),
|
|
(char **)ihk_mc_syscall_arg1(ctx),
|
|
(char **)ihk_mc_syscall_arg2(ctx), 0);
|
|
}
|
|
|
|
unsigned long do_fork(int clone_flags, unsigned long newsp,
|
|
unsigned long parent_tidptr, unsigned long child_tidptr,
|
|
unsigned long tlsblock_base, unsigned long curpc,
|
|
unsigned long cursp)
|
|
{
|
|
int cpuid;
|
|
int parent_cpuid;
|
|
struct thread *old = cpu_local_var(current);
|
|
struct process *oldproc = old->proc;
|
|
struct process *newproc;
|
|
struct thread *new;
|
|
struct syscall_request request1 IHK_DMA_ALIGN;
|
|
int ptrace_event = 0;
|
|
int termsig = clone_flags & 0x000000ff;
|
|
#if 0
|
|
const struct ihk_mc_cpu_info *cpu_info = ihk_mc_get_cpu_info();
|
|
#endif
|
|
int err = 0;
|
|
unsigned long clone_pthread_start_routine = 0;
|
|
struct vm_range *range = NULL;
|
|
int helper_thread = 0;
|
|
|
|
dkprintf("%s,flags=%08x,newsp=%lx,ptidptr=%lx,"
|
|
"ctidptr=%lx,tls=%lx,curpc=%lx,cursp=%lx",
|
|
__func__, clone_flags, newsp, parent_tidptr,
|
|
child_tidptr, tlsblock_base, curpc, cursp);
|
|
|
|
dkprintf("do_fork(): stack_pointr passed in: 0x%lX, stack pointer of caller: 0x%lx\n",
|
|
newsp, cursp);
|
|
|
|
	/* CLONE_VM with newsp == parent_tidptr implies parent_tidptr carries the pthread start routine address */
|
|
if ((clone_flags & CLONE_VM) && newsp == parent_tidptr) {
|
|
old->clone_pthread_start_routine = parent_tidptr;
|
|
dkprintf("%s: clone_pthread_start_routine: 0x%lx\n", __func__,
|
|
old->clone_pthread_start_routine);
|
|
return 0;
|
|
}
|
|
|
|
	/* Clear the pthread routine address regardless of whether we succeed */
|
|
clone_pthread_start_routine = old->clone_pthread_start_routine;
|
|
old->clone_pthread_start_routine = 0;
|
|
|
|
parent_cpuid = old->cpu_id;
|
|
if (((clone_flags & CLONE_VM) && !(clone_flags & CLONE_THREAD)) ||
|
|
(!(clone_flags & CLONE_VM) && (clone_flags & CLONE_THREAD))) {
|
|
kprintf("clone(): ERROR: CLONE_VM and CLONE_THREAD should be set together\n");
|
|
return -EINVAL;
|
|
}
|
|
|
|
if (termsig < 0 || _NSIG < termsig) {
|
|
return -EINVAL;
|
|
}
|
|
|
|
if((clone_flags & CLONE_SIGHAND) &&
|
|
!(clone_flags & CLONE_VM)){
|
|
return -EINVAL;
|
|
}
|
|
if((clone_flags & CLONE_THREAD) &&
|
|
!(clone_flags & CLONE_SIGHAND)){
|
|
return -EINVAL;
|
|
}
|
|
if((clone_flags & CLONE_FS) &&
|
|
(clone_flags & CLONE_NEWNS)){
|
|
return -EINVAL;
|
|
}
|
|
if((clone_flags & CLONE_NEWIPC) &&
|
|
(clone_flags & CLONE_SYSVSEM)){
|
|
return -EINVAL;
|
|
}
|
|
if((clone_flags & CLONE_NEWPID) &&
|
|
(clone_flags & CLONE_THREAD)){
|
|
return -EINVAL;
|
|
}
|
|
|
|
#if 0
|
|
if (!allow_oversubscribe && rusage.num_threads >= cpu_info->ncpus) {
|
|
kprintf("%s: ERROR: CPU oversubscription is not allowed. Specify -O option in mcreboot.sh to allow it.\n", __FUNCTION__);
|
|
return -EINVAL;
|
|
}
|
|
#endif
|
|
|
|
if (oldproc->coredump_barrier_count) {
|
|
return -EINVAL;
|
|
}
|
|
|
|
	/* The N-th thread creation places the new thread on a Linux CPU.
	 * This is disabled when uti_thread_rank is set to zero. */
|
|
if (oldproc->uti_thread_rank) {
|
|
if (oldproc->clone_count + 1 == oldproc->uti_thread_rank) {
|
|
old->mod_clone = SPAWN_TO_REMOTE;
|
|
kprintf("%s: mod_clone is set to %d\n", __FUNCTION__, old->mod_clone);
|
|
} else {
|
|
old->mod_clone = SPAWN_TO_LOCAL;
|
|
kprintf("%s: mod_clone is set to %d\n", __FUNCTION__, old->mod_clone);
|
|
}
|
|
}
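	/*
	 * Classify the new thread by where its start routine lives: if it
	 * is not in the OpenMP/Fujitsu runtime ("omp.so" / "libfj90"),
	 * treat it as a helper thread and keep it on the current CPU
	 * instead of picking a new compute CPU.
	 */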
|
|
|
|
if (clone_pthread_start_routine) {
|
|
ihk_rwspinlock_read_lock_noirq(&old->vm->memory_range_lock);
|
|
range = lookup_process_memory_range(old->vm,
|
|
clone_pthread_start_routine,
|
|
clone_pthread_start_routine + 1);
|
|
ihk_rwspinlock_read_unlock_noirq(&old->vm->memory_range_lock);
|
|
|
|
if (range && range->memobj && range->memobj->path) {
|
|
if (!strstr(range->memobj->path, "omp.so") &&
|
|
!strstr(range->memobj->path, "libfj90")) {
|
|
helper_thread = 1;
|
|
}
|
|
dkprintf("clone(): %s thread from %s\n",
|
|
helper_thread ? "helper" : "compute",
|
|
range->memobj->path);
|
|
}
|
|
}
|
|
|
|
if (helper_thread) {
|
|
cpuid = ihk_mc_get_processor_id();
|
|
//cpuid = obtain_clone_cpuid(&oldproc->cpu_set, 1);
|
|
}
|
|
else {
|
|
cpuid = obtain_clone_cpuid(&oldproc->cpu_set,
|
|
(old->mod_clone == SPAWN_TO_REMOTE && oldproc->uti_use_last_cpu));
|
|
if (cpuid == -1) {
|
|
kprintf("do_fork,core not available\n");
|
|
return -EAGAIN;
|
|
}
|
|
}
|
|
|
|
new = clone_thread(old, curpc,
|
|
newsp ? newsp : cursp, clone_flags);
|
|
|
|
if (!new) {
|
|
err = -ENOMEM;
|
|
goto release_cpuid;
|
|
}
|
|
|
|
if (clone_pthread_start_routine &&
|
|
range && range->memobj && range->memobj->path) {
|
|
|
|
sprintf(new->pthread_routine, "0x%lx @ %s",
|
|
clone_pthread_start_routine,
|
|
range->memobj->path);
|
|
}
|
|
else {
|
|
sprintf(new->pthread_routine, "%s", "[unknown]");
|
|
}
|
|
|
|
newproc = new->proc;
|
|
|
|
cpu_set(cpuid, &new->vm->address_space->cpu_set,
|
|
&new->vm->address_space->cpu_set_lock);
|
|
|
|
if (clone_flags & CLONE_VM) {
|
|
int *tids = NULL;
|
|
int i;
|
|
struct mcs_rwlock_node_irqsave lock;
|
|
|
|
mcs_rwlock_writer_lock(&newproc->threads_lock, &lock);
|
|
/* Obtain mcexec TIDs if not known yet */
|
|
if (!newproc->nr_tids) {
|
|
tids = kmalloc(sizeof(int) * NR_TIDS, IHK_MC_AP_NOWAIT);
|
|
if (!tids) {
|
|
mcs_rwlock_writer_unlock(&newproc->threads_lock, &lock);
|
|
err = -ENOMEM;
|
|
goto destroy_thread;
|
|
}
|
|
|
|
newproc->tids = kmalloc(sizeof(struct mcexec_tid) *
|
|
NR_TIDS, IHK_MC_AP_NOWAIT);
|
|
if (!newproc->tids) {
|
|
mcs_rwlock_writer_unlock(&newproc->threads_lock, &lock);
|
|
kfree(tids);
|
|
err = -ENOMEM;
|
|
goto destroy_thread;
|
|
}
|
|
|
|
if ((err = settid(new, NR_TIDS, tids)) < 0) {
|
|
mcs_rwlock_writer_unlock(&newproc->threads_lock,
|
|
&lock);
|
|
kfree(tids);
|
|
goto release_ids;
|
|
}
|
|
|
|
for (i = 0; (i < NR_TIDS) && tids[i]; ++i) {
|
|
dkprintf("%s: tids[%d]: %d\n",
|
|
__func__, i, tids[i]);
|
|
newproc->tids[i].tid = tids[i];
|
|
newproc->tids[i].thread = NULL;
|
|
++newproc->nr_tids;
|
|
}
|
|
|
|
kfree(tids);
|
|
}
|
|
|
|
/* Find an unused TID */
|
|
new->tid = 0;
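		/*
		 * Claim a free mcexec TID slot lock-free: try to install the
		 * new thread with cmpxchg() and rescan from the start if
		 * another thread raced us for the same slot.
		 */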
|
|
retry_tid:
|
|
for (i = 0; i < newproc->nr_tids; ++i) {
|
|
if (!newproc->tids[i].thread) {
|
|
if (cmpxchg(&newproc->tids[i].thread,
|
|
NULL, new) != NULL) {
|
|
goto retry_tid;
|
|
}
|
|
new->tid = newproc->tids[i].tid;
|
|
dkprintf("%s: tid %d assigned to %p\n", __FUNCTION__, new->tid, new);
|
|
break;
|
|
}
|
|
}
|
|
|
|
mcs_rwlock_writer_unlock(&newproc->threads_lock, &lock);
|
|
|
|
/* TODO: spawn more mcexec threads */
|
|
if (!new->tid) {
|
|
kprintf("%s: no more TIDs available\n", __func__);
|
|
for (i = 0; i < newproc->nr_tids; ++i) {
|
|
kprintf("%s: i=%d,tid=%d,thread=%p\n",
|
|
__func__, i, newproc->tids[i].tid,
|
|
newproc->tids[i].thread);
|
|
}
|
|
err = -ENOMEM;
|
|
goto release_ids;
|
|
}
|
|
}
|
|
/* fork() a new process on the host */
|
|
else {
|
|
request1.number = __NR_clone;
|
|
request1.args[0] = 0;
|
|
request1.args[1] = new->vm->region.user_start;
|
|
request1.args[2] = new->vm->region.user_end -
|
|
new->vm->region.user_start;
|
|
request1.args[3] =
|
|
virt_to_phys(new->vm->address_space->page_table);
|
|
if(clone_flags & CLONE_PARENT){
|
|
if(oldproc->ppid_parent->pid != 1)
|
|
request1.args[0] = clone_flags;
|
|
}
|
|
newproc->pid = do_syscall(&request1, ihk_mc_get_processor_id());
|
|
if (newproc->pid < 0) {
|
|
kprintf("ERROR: forking host process\n");
|
|
err = newproc->pid;
|
|
goto destroy_thread;
|
|
}
|
|
|
|
		/* In a single-threaded process the TID equals the PID */
|
|
new->tid = newproc->pid;
|
|
new->vm->address_space->pids[0] = new->proc->pid;
|
|
|
|
dkprintf("fork(): new pid: %d\n", new->proc->pid);
|
|
if(oldproc->monitoring_event &&
|
|
oldproc->monitoring_event->attr.inherit){
|
|
newproc->monitoring_event = oldproc->monitoring_event;
|
|
}
|
|
}
|
|
|
|
if (clone_flags & CLONE_PARENT_SETTID) {
|
|
dkprintf("clone_flags & CLONE_PARENT_SETTID: 0x%lX\n",
|
|
parent_tidptr);
|
|
|
|
err = setint_user((int *)parent_tidptr, new->tid);
|
|
if (err) {
|
|
goto release_ids;
|
|
}
|
|
}
|
|
|
|
if (clone_flags & CLONE_CHILD_CLEARTID) {
|
|
dkprintf("clone_flags & CLONE_CHILD_CLEARTID: 0x%lX\n",
|
|
child_tidptr);
|
|
|
|
new->clear_child_tid = (int*)child_tidptr;
|
|
}
|
|
|
|
if (clone_flags & CLONE_CHILD_SETTID) {
|
|
unsigned long phys;
|
|
dkprintf("clone_flags & CLONE_CHILD_SETTID: 0x%lX\n",
|
|
child_tidptr);
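		/*
		 * The child's address space is not active on this CPU, so
		 * resolve child_tidptr through the child's page table and
		 * write the TID via the kernel's linear mapping instead.
		 */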
|
|
|
|
if (ihk_mc_pt_virt_to_phys(new->vm->address_space->page_table,
|
|
(void *)child_tidptr, &phys)) {
|
|
kprintf("ERROR: looking up physical addr for child process\n");
|
|
err = -EFAULT;
|
|
goto release_ids;
|
|
}
|
|
|
|
*((int*)phys_to_virt(phys)) = new->tid;
|
|
}
|
|
|
|
if (clone_flags & CLONE_SETTLS) {
|
|
dkprintf("clone_flags & CLONE_SETTLS: 0x%lX\n",
|
|
tlsblock_base);
|
|
|
|
new->tlsblock_base = tlsblock_base;
|
|
}
|
|
else {
|
|
new->tlsblock_base = old->tlsblock_base;
|
|
}
|
|
|
|
new->parent_cpuid = parent_cpuid;
|
|
|
|
ihk_mc_syscall_ret(new->uctx) = 0;
|
|
|
|
new->status = PS_RUNNING;
|
|
|
|
/* Only the first do_fork() call creates a thread on a Linux CPU */
|
|
if (cmpxchg(&old->mod_clone, SPAWN_TO_REMOTE, SPAWN_TO_LOCAL) ==
|
|
SPAWN_TO_REMOTE) {
|
|
new->mod_clone = SPAWNING_TO_REMOTE;
|
|
if (old->mod_clone_arg) {
|
|
new->mod_clone_arg = kmalloc(sizeof(struct uti_attr),
|
|
IHK_MC_AP_NOWAIT);
|
|
if (!new->mod_clone_arg) {
|
|
kprintf("%s: error: allocating mod_clone_arg\n",
|
|
__func__);
|
|
err = -ENOMEM;
|
|
goto release_ids;
|
|
}
|
|
memcpy(new->mod_clone_arg, old->mod_clone_arg,
|
|
sizeof(struct uti_attr));
|
|
}
|
|
}
|
|
chain_thread(new);
|
|
if (!(clone_flags & CLONE_VM)) {
|
|
newproc->status = PS_RUNNING;
|
|
if(clone_flags & CLONE_PARENT){
|
|
struct mcs_rwlock_node_irqsave lock;
|
|
struct process *parent;
|
|
struct mcs_rwlock_node parent_lock;
|
|
|
|
mcs_rwlock_reader_lock(&oldproc->update_lock, &lock);
|
|
parent = oldproc->ppid_parent;
|
|
mcs_rwlock_reader_lock_noirq(&parent->update_lock, &parent_lock);
|
|
if(parent->status == PS_EXITED || parent->status == PS_ZOMBIE){
|
|
mcs_rwlock_reader_unlock_noirq(&parent->update_lock, &parent_lock);
|
|
parent = cpu_local_var(resource_set)->pid1;
|
|
mcs_rwlock_reader_lock_noirq(&parent->update_lock, &parent_lock);
|
|
}
|
|
newproc->parent = parent;
|
|
newproc->ppid_parent = parent;
|
|
newproc->nowait = 1;
|
|
chain_process(newproc);
|
|
mcs_rwlock_reader_unlock_noirq(&parent->update_lock, &parent_lock);
|
|
mcs_rwlock_reader_unlock(&oldproc->update_lock, &lock);
|
|
}
|
|
else
|
|
chain_process(newproc);
|
|
}
|
|
|
|
if (old->ptrace) {
|
|
ptrace_event = ptrace_check_clone_event(old, clone_flags);
|
|
if (ptrace_event) {
|
|
ptrace_report_clone(old, new, ptrace_event);
|
|
}
|
|
}
|
|
|
|
dkprintf("clone: kicking scheduler!,cpuid=%d pid=%d tid %d -> tid=%d\n",
|
|
cpuid, newproc->pid,
|
|
old->tid,
|
|
new->tid);
|
|
|
|
if (!(clone_flags & CLONE_VM)) {
|
|
request1.number = __NR_clone;
|
|
request1.args[0] = 1;
|
|
request1.args[1] = new->tid;
|
|
err = do_syscall(&request1, ihk_mc_get_processor_id());
|
|
if (err) {
|
|
goto free_mod_clone_arg;
|
|
}
|
|
}
|
|
else if (termsig && termsig != SIGCHLD) {
|
|
struct mcs_rwlock_node_irqsave lock;
|
|
|
|
mcs_rwlock_writer_lock(&oldproc->threads_lock, &lock);
|
|
new->termsig = termsig;
|
|
new->report_proc = oldproc;
|
|
list_add_tail(&new->report_siblings_list,
|
|
&oldproc->report_threads_list);
|
|
mcs_rwlock_writer_unlock(&oldproc->threads_lock, &lock);
|
|
hold_thread(new);
|
|
}
|
|
|
|
runq_add_thread(new, cpuid);
|
|
|
|
if (ptrace_event) {
|
|
schedule();
|
|
}
|
|
|
|
return new->tid;
|
|
|
|
free_mod_clone_arg:
|
|
kfree(new->mod_clone_arg);
|
|
new->mod_clone_arg = NULL;
|
|
|
|
ihk_atomic_dec(&new->vm->refcount);
|
|
|
|
release_ids:
|
|
if (clone_flags & CLONE_VM) {
|
|
kfree(newproc->tids);
|
|
newproc->tids = NULL;
|
|
} else {
|
|
request1.number = __NR_kill;
|
|
request1.args[0] = newproc->pid;
|
|
request1.args[1] = SIGKILL;
|
|
do_syscall(&request1, ihk_mc_get_processor_id());
|
|
}
|
|
|
|
destroy_thread:
|
|
if (!(clone_flags & CLONE_VM)) {
|
|
/* in case of fork, destroy struct process */
|
|
ihk_atomic_set(&new->proc->refcount, 1);
|
|
kfree(newproc->saved_cmdline);
|
|
newproc->saved_cmdline = NULL;
|
|
}
|
|
ihk_atomic_set(&new->refcount, 1);
|
|
release_thread(new);
|
|
|
|
release_cpuid:
|
|
release_cpuid(cpuid);
|
|
return err;
|
|
}
|
|
|
|
SYSCALL_DECLARE(set_tid_address)
|
|
{
|
|
cpu_local_var(current)->clear_child_tid =
|
|
(int*)ihk_mc_syscall_arg0(ctx);
|
|
|
|
return cpu_local_var(current)->proc->pid;
|
|
}
|
|
|
|
SYSCALL_DECLARE(times)
|
|
{
|
|
struct tms {
|
|
unsigned long tms_utime;
|
|
unsigned long tms_stime;
|
|
unsigned long tms_cutime;
|
|
unsigned long tms_cstime;
|
|
};
|
|
struct tms mytms;
|
|
struct tms *buf = (struct tms *)ihk_mc_syscall_arg0(ctx);
|
|
struct thread *thread = cpu_local_var(current);
|
|
struct process *proc = thread->proc;
|
|
struct timespec ats;
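	/*
	 * User/system time is accounted per thread in TSC ticks and
	 * converted to jiffies here; children's times are folded in from
	 * the per-process *_children accumulators.
	 */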
|
|
|
|
tsc_to_ts(thread->user_tsc, &ats);
|
|
mytms.tms_utime = timespec_to_jiffy(&ats);
|
|
tsc_to_ts(thread->system_tsc, &ats);
|
|
mytms.tms_stime = timespec_to_jiffy(&ats);
|
|
ats.tv_sec = proc->utime.tv_sec;
|
|
ats.tv_nsec = proc->utime.tv_nsec;
|
|
ts_add(&ats, &proc->utime_children);
|
|
mytms.tms_cutime = timespec_to_jiffy(&ats);
|
|
ats.tv_sec = proc->stime.tv_sec;
|
|
ats.tv_nsec = proc->stime.tv_nsec;
|
|
ts_add(&ats, &proc->stime_children);
|
|
mytms.tms_cstime = timespec_to_jiffy(&ats);
|
|
if(copy_to_user(buf, &mytms, sizeof mytms))
|
|
return -EFAULT;
|
|
if(gettime_local_support){
|
|
calculate_time_from_tsc(&ats);
|
|
}
|
|
else{
|
|
ats.tv_sec = 0;
|
|
ats.tv_nsec = 0;
|
|
}
|
|
|
|
return timespec_to_jiffy(&ats);
|
|
}
|
|
|
|
SYSCALL_DECLARE(kill)
|
|
{
|
|
int pid = ihk_mc_syscall_arg0(ctx);
|
|
int sig = ihk_mc_syscall_arg1(ctx);
|
|
struct thread *thread = cpu_local_var(current);
|
|
struct siginfo info;
|
|
int error;
|
|
|
|
memset(&info, '\0', sizeof info);
|
|
info.si_signo = sig;
|
|
info.si_code = SI_USER;
|
|
info._sifields._kill.si_pid = thread->proc->pid;
|
|
|
|
dkprintf("sys_kill,enter,pid=%d,sig=%d\n", pid, sig);
|
|
error = do_kill(thread, pid, -1, sig, &info, 0);
|
|
dkprintf("sys_kill,returning,pid=%d,sig=%d,error=%d\n", pid, sig, error);
|
|
return error;
|
|
}
|
|
|
|
SYSCALL_DECLARE(tgkill)
|
|
{
|
|
int tgid = ihk_mc_syscall_arg0(ctx);
|
|
int tid = ihk_mc_syscall_arg1(ctx);
|
|
int sig = ihk_mc_syscall_arg2(ctx);
|
|
struct thread *thread = cpu_local_var(current);
|
|
struct siginfo info;
|
|
|
|
if (tgid <= 0 || tid <= 0) {
|
|
return -EINVAL;
|
|
}
|
|
|
|
memset(&info, '\0', sizeof info);
|
|
info.si_signo = sig;
|
|
info.si_code = SI_TKILL;
|
|
info._sifields._kill.si_pid = thread->proc->pid;
|
|
|
|
return do_kill(thread, tgid, tid, sig, &info, 0);
|
|
}
|
|
|
|
SYSCALL_DECLARE(tkill)
|
|
{
|
|
int tid = ihk_mc_syscall_arg0(ctx);
|
|
int sig = ihk_mc_syscall_arg1(ctx);
|
|
struct thread *thread = cpu_local_var(current);
|
|
struct siginfo info;
|
|
|
|
memset(&info, '\0', sizeof info);
|
|
info.si_signo = sig;
|
|
info.si_code = SI_TKILL;
|
|
info._sifields._kill.si_pid = thread->proc->pid;
|
|
|
|
if(tid <= 0)
|
|
return -EINVAL;
|
|
|
|
return do_kill(thread, -1, tid, sig, &info, 0);
|
|
}
|
|
|
|
int *
|
|
getcred(int *_buf)
|
|
{
|
|
int *buf;
|
|
struct syscall_request request IHK_DMA_ALIGN;
|
|
unsigned long phys;
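	/*
	 * The host fills in 8 credential ints through the physical
	 * address, so the window must not cross a page boundary.  _buf
	 * has room for 16 ints; if the first half straddles a page, use
	 * the second half, which then lies entirely within the next page.
	 */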
|
|
|
|
if ((((unsigned long)_buf) ^ ((unsigned long)(_buf + 8))) & PAGE_MASK)
|
|
buf = _buf + 8;
|
|
else
|
|
buf = _buf;
|
|
phys = virt_to_phys(buf);
|
|
request.number = __NR_setfsuid;
|
|
request.args[0] = phys;
|
|
request.args[1] = 1;
|
|
do_syscall(&request, ihk_mc_get_processor_id());
|
|
|
|
return buf;
|
|
}
|
|
|
|
void
|
|
do_setresuid()
|
|
{
|
|
int _buf[16];
|
|
int *buf;
|
|
struct thread *thread = cpu_local_var(current);
|
|
struct process *proc = thread->proc;
|
|
|
|
buf = getcred(_buf);
|
|
|
|
proc->ruid = buf[0];
|
|
proc->euid = buf[1];
|
|
proc->suid = buf[2];
|
|
proc->fsuid = buf[3];
|
|
}
|
|
|
|
void
|
|
do_setresgid()
|
|
{
|
|
int _buf[16];
|
|
int *buf;
|
|
struct thread *thread = cpu_local_var(current);
|
|
struct process *proc = thread->proc;
|
|
|
|
buf = getcred(_buf);
|
|
|
|
proc->rgid = buf[4];
|
|
proc->egid = buf[5];
|
|
proc->sgid = buf[6];
|
|
proc->fsgid = buf[7];
|
|
}
|
|
|
|
SYSCALL_DECLARE(setresuid)
|
|
{
|
|
int rc;
|
|
|
|
rc = syscall_generic_forwarding(__NR_setresuid, ctx);
|
|
if(rc == 0){
|
|
do_setresuid();
|
|
}
|
|
return rc;
|
|
}
|
|
|
|
SYSCALL_DECLARE(setreuid)
|
|
{
|
|
int rc;
|
|
|
|
rc = syscall_generic_forwarding(__NR_setreuid, ctx);
|
|
if(rc == 0){
|
|
do_setresuid();
|
|
}
|
|
return rc;
|
|
}
|
|
|
|
SYSCALL_DECLARE(setuid)
|
|
{
|
|
long rc;
|
|
|
|
rc = syscall_generic_forwarding(__NR_setuid, ctx);
|
|
if(rc == 0){
|
|
do_setresuid();
|
|
}
|
|
return rc;
|
|
}
|
|
|
|
SYSCALL_DECLARE(setfsuid)
|
|
{
|
|
	int fsuid = (int)ihk_mc_syscall_arg0(ctx);
|
|
unsigned long newfsuid;
|
|
struct syscall_request request IHK_DMA_ALIGN;
|
|
|
|
request.number = __NR_setfsuid;
|
|
request.args[0] = fsuid;
|
|
request.args[1] = 0;
|
|
newfsuid = do_syscall(&request, ihk_mc_get_processor_id());
|
|
do_setresuid();
|
|
return newfsuid;
|
|
}
|
|
|
|
SYSCALL_DECLARE(setresgid)
|
|
{
|
|
int rc;
|
|
|
|
rc = syscall_generic_forwarding(__NR_setresgid, ctx);
|
|
if(rc == 0){
|
|
do_setresgid();
|
|
}
|
|
return rc;
|
|
}
|
|
|
|
SYSCALL_DECLARE(setregid)
|
|
{
|
|
int rc;
|
|
|
|
rc = syscall_generic_forwarding(__NR_setregid, ctx);
|
|
if(rc == 0){
|
|
do_setresgid();
|
|
}
|
|
return rc;
|
|
}
|
|
|
|
SYSCALL_DECLARE(setgid)
|
|
{
|
|
long rc;
|
|
|
|
rc = syscall_generic_forwarding(__NR_setgid, ctx);
|
|
if(rc == 0){
|
|
do_setresgid();
|
|
}
|
|
return rc;
|
|
}
|
|
|
|
SYSCALL_DECLARE(setfsgid)
|
|
{
|
|
	int fsgid = (int)ihk_mc_syscall_arg0(ctx);
|
|
unsigned long newfsgid;
|
|
struct syscall_request request IHK_DMA_ALIGN;
|
|
|
|
request.number = __NR_setfsgid;
|
|
request.args[0] = fsgid;
|
|
newfsgid = do_syscall(&request, ihk_mc_get_processor_id());
|
|
do_setresgid();
|
|
return newfsgid;
|
|
}
|
|
|
|
SYSCALL_DECLARE(getuid)
|
|
{
|
|
struct thread *thread = cpu_local_var(current);
|
|
struct process *proc = thread->proc;
|
|
|
|
return proc->ruid;
|
|
}
|
|
|
|
SYSCALL_DECLARE(geteuid)
|
|
{
|
|
struct thread *thread = cpu_local_var(current);
|
|
struct process *proc = thread->proc;
|
|
|
|
return proc->euid;
|
|
}
|
|
|
|
SYSCALL_DECLARE(getresuid)
|
|
{
|
|
struct thread *thread = cpu_local_var(current);
|
|
struct process *proc = thread->proc;
|
|
int *ruid = (int *)ihk_mc_syscall_arg0(ctx);
|
|
int *euid = (int *)ihk_mc_syscall_arg1(ctx);
|
|
int *suid = (int *)ihk_mc_syscall_arg2(ctx);
|
|
|
|
if(copy_to_user(ruid, &proc->ruid, sizeof(int)))
|
|
return -EFAULT;
|
|
if(copy_to_user(euid, &proc->euid, sizeof(int)))
|
|
return -EFAULT;
|
|
if(copy_to_user(suid, &proc->suid, sizeof(int)))
|
|
return -EFAULT;
|
|
|
|
return 0;
|
|
}
|
|
|
|
SYSCALL_DECLARE(getgid)
|
|
{
|
|
struct thread *thread = cpu_local_var(current);
|
|
struct process *proc = thread->proc;
|
|
|
|
return proc->rgid;
|
|
}
|
|
|
|
SYSCALL_DECLARE(getegid)
|
|
{
|
|
struct thread *thread = cpu_local_var(current);
|
|
struct process *proc = thread->proc;
|
|
|
|
return proc->egid;
|
|
}
|
|
|
|
SYSCALL_DECLARE(getresgid)
|
|
{
|
|
struct thread *thread = cpu_local_var(current);
|
|
struct process *proc = thread->proc;
|
|
int *rgid = (int *)ihk_mc_syscall_arg0(ctx);
|
|
int *egid = (int *)ihk_mc_syscall_arg1(ctx);
|
|
int *sgid = (int *)ihk_mc_syscall_arg2(ctx);
|
|
|
|
if(copy_to_user(rgid, &proc->rgid, sizeof(int)))
|
|
return -EFAULT;
|
|
if(copy_to_user(egid, &proc->egid, sizeof(int)))
|
|
return -EFAULT;
|
|
if(copy_to_user(sgid, &proc->sgid, sizeof(int)))
|
|
return -EFAULT;
|
|
|
|
return 0;
|
|
}
|
|
|
|
SYSCALL_DECLARE(setpgid)
|
|
{
|
|
int pid = ihk_mc_syscall_arg0(ctx);
|
|
int pgid = ihk_mc_syscall_arg1(ctx);
|
|
struct thread *thread = cpu_local_var(current);
|
|
struct process *proc = thread->proc;
|
|
struct mcs_rwlock_node_irqsave lock;
|
|
long rc;
|
|
|
|
if(pid == 0)
|
|
pid = proc->pid;
|
|
if(pgid == 0)
|
|
pgid = pid;
|
|
|
|
if(proc->pid != pid){
|
|
proc = find_process(pid, &lock);
|
|
if(proc){
|
|
if(proc->execed){
|
|
process_unlock(proc, &lock);
|
|
return -EACCES;
|
|
}
|
|
process_unlock(proc, &lock);
|
|
}
|
|
else
|
|
return -ESRCH;
|
|
}
|
|
|
|
rc = syscall_generic_forwarding(__NR_setpgid, ctx);
|
|
if(rc == 0){
|
|
proc = find_process(pid, &lock);
|
|
if(proc){
|
|
proc->pgid = pgid;
|
|
process_unlock(proc, &lock);
|
|
}
|
|
}
|
|
return rc;
|
|
}
|
|
|
|
/* Ignore the registration by start_thread() (in pthread_create.c) because
 * McKernel doesn't unlock mutexes held by a thread that has been killed. */
|
|
#define ROBUST_LIST_HEAD_SIZE 24
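/* 24 bytes == sizeof(struct robust_list_head) on 64-bit targets:
 * a list pointer, futex_offset and list_op_pending. */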
|
|
SYSCALL_DECLARE(set_robust_list)
|
|
{
|
|
size_t len = (size_t)ihk_mc_syscall_arg1(ctx);
|
|
|
|
if (len != ROBUST_LIST_HEAD_SIZE) {
|
|
return -EINVAL;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
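/*
 * Install a signal action locally and, when a new action is given, forward
 * the handler address and flags to the host so that its view of the signal
 * disposition stays in sync.
 */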
int
|
|
do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
|
|
{
|
|
struct thread *thread = cpu_local_var(current);
|
|
struct k_sigaction *k;
|
|
struct mcs_rwlock_node_irqsave mcs_rw_node;
|
|
ihk_mc_user_context_t ctx0;
|
|
|
|
if (!valid_signal(sig) || sig < 1) {
|
|
return -EINVAL;
|
|
}
|
|
if (act && (sig == SIGKILL || sig == SIGSTOP)) {
|
|
return -EINVAL;
|
|
}
|
|
|
|
mcs_rwlock_writer_lock(&thread->sigcommon->lock, &mcs_rw_node);
|
|
k = thread->sigcommon->action + sig - 1;
|
|
if(oact)
|
|
memcpy(oact, k, sizeof(struct k_sigaction));
|
|
if(act)
|
|
memcpy(k, act, sizeof(struct k_sigaction));
|
|
mcs_rwlock_writer_unlock(&thread->sigcommon->lock, &mcs_rw_node);
|
|
|
|
if(act){
|
|
ihk_mc_syscall_arg0(&ctx0) = sig;
|
|
ihk_mc_syscall_arg1(&ctx0) = (unsigned long)act->sa.sa_handler;
|
|
ihk_mc_syscall_arg2(&ctx0) = act->sa.sa_flags;
|
|
syscall_generic_forwarding(__NR_rt_sigaction, &ctx0);
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
SYSCALL_DECLARE(read)
|
|
{
|
|
int fd = ihk_mc_syscall_arg0(ctx);
|
|
long rc;
|
|
struct thread *thread = cpu_local_var(current);
|
|
struct process *proc = thread->proc;
|
|
struct mckfd *fdp;
|
|
long irqstate;
|
|
|
|
irqstate = ihk_mc_spinlock_lock(&proc->mckfd_lock);
|
|
for(fdp = proc->mckfd; fdp; fdp = fdp->next)
|
|
if(fdp->fd == fd)
|
|
break;
|
|
ihk_mc_spinlock_unlock(&proc->mckfd_lock, irqstate);
|
|
|
|
if(fdp && fdp->read_cb){
|
|
//kprintf("read: found system fd %d\n", fd);
|
|
rc = fdp->read_cb(fdp, ctx);
|
|
}
|
|
else{
|
|
rc = syscall_generic_forwarding(__NR_read, ctx);
|
|
}
|
|
return rc;
|
|
}
|
|
|
|
SYSCALL_DECLARE(ioctl)
|
|
{
|
|
int fd = ihk_mc_syscall_arg0(ctx);
|
|
long rc;
|
|
struct thread *thread = cpu_local_var(current);
|
|
struct process *proc = thread->proc;
|
|
struct mckfd *fdp;
|
|
long irqstate;
|
|
|
|
irqstate = ihk_mc_spinlock_lock(&proc->mckfd_lock);
|
|
for(fdp = proc->mckfd; fdp; fdp = fdp->next)
|
|
if(fdp->fd == fd)
|
|
break;
|
|
ihk_mc_spinlock_unlock(&proc->mckfd_lock, irqstate);
|
|
|
|
#ifdef ENABLE_TOFU
|
|
/* Tofu? */
|
|
if (proc->enable_tofu &&
|
|
fd < MAX_FD_PDE && thread->proc->fd_pde_data[fd]) {
|
|
extern long tof_utofu_unlocked_ioctl(int fd,
|
|
unsigned int cmd, unsigned long arg);
|
|
|
|
rc = tof_utofu_unlocked_ioctl(fd,
|
|
ihk_mc_syscall_arg1(ctx),
|
|
ihk_mc_syscall_arg2(ctx));
|
|
|
|
/* Do we need to offload? */
|
|
if (rc != -ENOTSUPP)
|
|
return rc;
|
|
}
|
|
#endif
|
|
|
|
if(fdp && fdp->ioctl_cb){
|
|
//kprintf("ioctl: found system fd %d\n", fd);
|
|
rc = fdp->ioctl_cb(fdp, ctx);
|
|
}
|
|
else{
|
|
rc = syscall_generic_forwarding(__NR_ioctl, ctx);
|
|
}
|
|
|
|
return rc;
|
|
}
|
|
|
|
SYSCALL_DECLARE(open)
|
|
{
|
|
const char *_pathname = (const char *)ihk_mc_syscall_arg0(ctx);
|
|
int flags = (int)ihk_mc_syscall_arg1(ctx);
|
|
int len;
|
|
char *pathname;
|
|
long rc;
|
|
|
|
len = strlen_user(_pathname);
|
|
if (len < 0)
|
|
return len;
|
|
len++;
|
|
|
|
pathname = kmalloc(len, IHK_MC_AP_NOWAIT);
|
|
if (!pathname) {
|
|
dkprintf("%s: error allocating pathname\n", __func__);
|
|
return -ENOMEM;
|
|
}
|
|
if (copy_from_user(pathname, _pathname, len)) {
|
|
dkprintf("%s: error: copy_from_user pathname\n", __func__);
|
|
rc = -EFAULT;
|
|
goto out;
|
|
}
|
|
|
|
#ifdef ENABLE_TOFU
|
|
cpu_local_var(current)->fd_path_in_open = pathname;
|
|
#endif
|
|
|
|
dkprintf("open(): pathname=%s\n", pathname);
|
|
if (!strncmp(pathname, XPMEM_DEV_PATH, len)) {
|
|
rc = xpmem_open(pathname, flags, ctx);
|
|
} else {
|
|
rc = syscall_generic_forwarding(__NR_open, ctx);
|
|
}
|
|
|
|
#ifdef ENABLE_TOFU
|
|
cpu_local_var(current)->fd_path_in_open = NULL;
|
|
#endif
|
|
|
|
out:
|
|
#ifdef ENABLE_TOFU
|
|
if (rc > 0 && rc < MAX_FD_PDE) {
|
|
cpu_local_var(current)->proc->fd_path[rc] = pathname;
|
|
}
|
|
else {
|
|
kfree(pathname);
|
|
}
|
|
#else
|
|
kfree(pathname);
|
|
#endif
|
|
return rc;
|
|
}
|
|
|
|
SYSCALL_DECLARE(openat)
|
|
{
|
|
const char *_pathname = (const char *)ihk_mc_syscall_arg1(ctx);
|
|
int flags = (int)ihk_mc_syscall_arg2(ctx);
|
|
char *pathname;
|
|
int len;
|
|
long rc;
|
|
|
|
len = strlen_user(_pathname);
|
|
if (len < 0)
|
|
return len;
|
|
len++;
|
|
|
|
pathname = kmalloc(len, IHK_MC_AP_NOWAIT);
|
|
if (!pathname) {
|
|
dkprintf("%s: error allocating pathname\n", __func__);
|
|
return -ENOMEM;
|
|
}
|
|
if (copy_from_user(pathname, _pathname, len)) {
|
|
dkprintf("%s: error: copy_from_user pathname\n", __func__);
|
|
rc = -EFAULT;
|
|
goto out;
|
|
}
|
|
|
|
#ifdef ENABLE_TOFU
|
|
cpu_local_var(current)->fd_path_in_open = pathname;
|
|
#endif
|
|
|
|
dkprintf("openat(): pathname=%s\n", pathname);
|
|
if (!strncmp(pathname, XPMEM_DEV_PATH, len)) {
|
|
rc = xpmem_openat(pathname, flags, ctx);
|
|
} else {
|
|
rc = syscall_generic_forwarding(__NR_openat, ctx);
|
|
}
|
|
|
|
#ifdef ENABLE_TOFU
|
|
cpu_local_var(current)->fd_path_in_open = NULL;
|
|
#endif
|
|
|
|
out:
|
|
#ifdef ENABLE_TOFU
|
|
if (rc > 0 && rc < MAX_FD_PDE) {
|
|
cpu_local_var(current)->proc->fd_path[rc] = pathname;
|
|
}
|
|
else {
|
|
kfree(pathname);
|
|
}
|
|
#else
|
|
kfree(pathname);
|
|
#endif
|
|
return rc;
|
|
}
|
|
|
|
SYSCALL_DECLARE(execveat)
|
|
{
|
|
int dirfd = (int)ihk_mc_syscall_arg0(ctx);
|
|
const char *filename = (const char *)ihk_mc_syscall_arg1(ctx);
|
|
int flags = (int)ihk_mc_syscall_arg4(ctx);
|
|
long ret;
|
|
|
|
/* validate flags */
|
|
if ((flags & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0) {
|
|
ret = -EINVAL;
|
|
goto out;
|
|
}
|
|
|
|
if (filename[0] == '/' || dirfd == AT_FDCWD) {
|
|
/* behave same as execve */
|
|
goto exec;
|
|
}
|
|
|
|
/* validate dirfd */
|
|
if (dirfd < 0 && dirfd != AT_FDCWD) {
|
|
ret = -EBADF;
|
|
goto out;
|
|
}
|
|
|
|
exec:
|
|
ret = do_execveat(ctx, dirfd, filename,
|
|
(char **)ihk_mc_syscall_arg2(ctx),
|
|
(char **)ihk_mc_syscall_arg3(ctx), flags);
|
|
|
|
out:
|
|
return ret;
|
|
}
|
|
|
|
SYSCALL_DECLARE(close)
|
|
{
|
|
int fd = ihk_mc_syscall_arg0(ctx);
|
|
long rc;
|
|
struct thread *thread = cpu_local_var(current);
|
|
struct process *proc = thread->proc;
|
|
struct mckfd *fdp;
|
|
struct mckfd *fdq;
|
|
long irqstate;
|
|
|
|
irqstate = ihk_mc_spinlock_lock(&proc->mckfd_lock);
|
|
|
|
#ifdef ENABLE_TOFU
|
|
/* Clear path and PDE data */
|
|
if (thread->proc->enable_tofu &&
|
|
fd >= 0 && fd < MAX_FD_PDE) {
|
|
/* Tofu? */
|
|
if (thread->proc->fd_pde_data[fd]) {
|
|
extern void tof_utofu_release_fd(struct process *proc, int fd);
|
|
|
|
dkprintf("%s: -> tof_utofu_release_fd() @ fd: %d (%s)\n",
|
|
__func__, fd, thread->proc->fd_path[fd]);
|
|
tof_utofu_release_fd(thread->proc, fd);
|
|
thread->proc->fd_pde_data[fd] = NULL;
|
|
}
|
|
|
|
if (thread->proc->fd_path[fd]) {
|
|
dkprintf("%s: %d -> %s\n", __func__, fd, thread->proc->fd_path[fd]);
|
|
kfree(thread->proc->fd_path[fd]);
|
|
thread->proc->fd_path[fd] = NULL;
|
|
}
|
|
}
|
|
#endif
|
|
|
|
for(fdp = proc->mckfd, fdq = NULL; fdp; fdq = fdp, fdp = fdp->next)
|
|
if(fdp->fd == fd)
|
|
break;
|
|
|
|
if(fdp){
|
|
//kprintf("close: found system fd %d pid=%d\n", fd, proc->pid);
|
|
if(fdq)
|
|
fdq->next = fdp->next;
|
|
else
|
|
proc->mckfd = fdp->next;
|
|
ihk_mc_spinlock_unlock(&proc->mckfd_lock, irqstate);
|
|
if(fdp->close_cb)
|
|
fdp->close_cb(fdp, ctx);
|
|
kfree(fdp);
|
|
rc = syscall_generic_forwarding(__NR_close, ctx);
|
|
}
|
|
else{
|
|
ihk_mc_spinlock_unlock(&proc->mckfd_lock, irqstate);
|
|
rc = syscall_generic_forwarding(__NR_close, ctx);
|
|
}
|
|
return rc;
|
|
}
|
|
|
|
SYSCALL_DECLARE(fcntl)
|
|
{
|
|
int fd = ihk_mc_syscall_arg0(ctx);
|
|
// int cmd = ihk_mc_syscall_arg1(ctx);
|
|
long rc;
|
|
struct thread *thread = cpu_local_var(current);
|
|
struct process *proc = thread->proc;
|
|
struct mckfd *fdp;
|
|
long irqstate;
|
|
|
|
irqstate = ihk_mc_spinlock_lock(&proc->mckfd_lock);
|
|
for(fdp = proc->mckfd; fdp; fdp = fdp->next)
|
|
if(fdp->fd == fd)
|
|
break;
|
|
ihk_mc_spinlock_unlock(&proc->mckfd_lock, irqstate);
|
|
|
|
if(fdp && fdp->fcntl_cb){
|
|
rc = fdp->fcntl_cb(fdp, ctx);
|
|
}
|
|
else{
|
|
rc = syscall_generic_forwarding(__NR_fcntl, ctx);
|
|
}
|
|
return rc;
|
|
}
|
|
|
|
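/*
 * epoll_pwait()/ppoll()/pselect6(): temporarily install the caller's signal
 * mask on the McKernel thread, forward the call to the host, then restore
 * the previous mask.
 */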
SYSCALL_DECLARE(epoll_pwait)
|
|
{
|
|
long rc;
|
|
sigset_t *set = (sigset_t *)ihk_mc_syscall_arg4(ctx);
|
|
__sigset_t oldset;
|
|
__sigset_t wset;
|
|
struct thread *thread = cpu_local_var(current);
|
|
|
|
oldset = thread->sigmask.__val[0];
|
|
if (set) {
|
|
if (copy_from_user(&wset, set->__val, sizeof(wset))) {
|
|
return -EFAULT;
|
|
}
|
|
thread->sigmask.__val[0] = wset;
|
|
}
|
|
rc = syscall_generic_forwarding(__NR_epoll_pwait, ctx);
|
|
thread->sigmask.__val[0] = oldset;
|
|
|
|
return rc;
|
|
}
|
|
|
|
SYSCALL_DECLARE(ppoll)
|
|
{
|
|
long rc;
|
|
sigset_t *set = (sigset_t *)ihk_mc_syscall_arg3(ctx);
|
|
__sigset_t oldset;
|
|
__sigset_t wset;
|
|
struct thread *thread = cpu_local_var(current);
|
|
|
|
oldset = thread->sigmask.__val[0];
|
|
if (set) {
|
|
if (copy_from_user(&wset, set->__val, sizeof(wset))) {
|
|
return -EFAULT;
|
|
}
|
|
thread->sigmask.__val[0] = wset;
|
|
}
|
|
rc = syscall_generic_forwarding(__NR_ppoll, ctx);
|
|
thread->sigmask.__val[0] = oldset;
|
|
|
|
return rc;
|
|
}
|
|
|
|
SYSCALL_DECLARE(pselect6)
|
|
{
|
|
long rc;
|
|
sigset_t **_set = (sigset_t **)ihk_mc_syscall_arg5(ctx);
|
|
sigset_t *set = NULL;
|
|
__sigset_t oldset;
|
|
__sigset_t wset;
|
|
struct thread *thread = cpu_local_var(current);
|
|
|
|
if (_set) {
|
|
if (copy_from_user(&set, _set, sizeof(void *))) {
|
|
return -EFAULT;
|
|
}
|
|
}
|
|
oldset = thread->sigmask.__val[0];
|
|
if (set) {
|
|
if (copy_from_user(&wset, set->__val, sizeof(wset))) {
|
|
return -EFAULT;
|
|
}
|
|
thread->sigmask.__val[0] = wset;
|
|
}
|
|
rc = syscall_generic_forwarding(__NR_pselect6, ctx);
|
|
thread->sigmask.__val[0] = oldset;
|
|
|
|
return rc;
|
|
}
|
|
|
|
SYSCALL_DECLARE(rt_sigprocmask)
|
|
{
|
|
int how = ihk_mc_syscall_arg0(ctx);
|
|
const sigset_t *set = (const sigset_t *)ihk_mc_syscall_arg1(ctx);
|
|
sigset_t *oldset = (sigset_t *)ihk_mc_syscall_arg2(ctx);
|
|
size_t sigsetsize = (size_t)ihk_mc_syscall_arg3(ctx);
|
|
struct thread *thread = cpu_local_var(current);
|
|
__sigset_t wsig;
|
|
ihk_mc_user_context_t ctx0;
|
|
|
|
if(sigsetsize != sizeof(sigset_t))
|
|
return -EINVAL;
|
|
|
|
if(set &&
|
|
how != SIG_BLOCK &&
|
|
how != SIG_UNBLOCK &&
|
|
how != SIG_SETMASK)
|
|
return -EINVAL;
|
|
|
|
if(oldset){
|
|
wsig = thread->sigmask.__val[0];
|
|
if(copy_to_user(oldset->__val, &wsig, sizeof wsig))
|
|
goto fault;
|
|
}
|
|
if(set){
|
|
if(copy_from_user(&wsig, set->__val, sizeof wsig))
|
|
goto fault;
|
|
switch(how){
|
|
case SIG_BLOCK:
|
|
thread->sigmask.__val[0] |= wsig;
|
|
break;
|
|
case SIG_UNBLOCK:
|
|
thread->sigmask.__val[0] &= ~wsig;
|
|
break;
|
|
case SIG_SETMASK:
|
|
thread->sigmask.__val[0] = wsig;
|
|
break;
|
|
}
|
|
}
|
|
thread->sigmask.__val[0] &= ~__sigmask(SIGKILL);
|
|
thread->sigmask.__val[0] &= ~__sigmask(SIGSTOP);
|
|
wsig = thread->sigmask.__val[0];
|
|
|
|
ihk_mc_syscall_arg0(&ctx0) = wsig;
|
|
syscall_generic_forwarding(__NR_rt_sigprocmask, &ctx0);
|
|
return 0;
|
|
fault:
|
|
return -EFAULT;
|
|
}
|
|
|
|
SYSCALL_DECLARE(rt_sigpending)
|
|
{
|
|
struct sig_pending *pending;
|
|
struct list_head *head;
|
|
mcs_rwlock_lock_t *lock;
|
|
struct mcs_rwlock_node_irqsave mcs_rw_node;
|
|
__sigset_t w = 0;
|
|
struct thread *thread = cpu_local_var(current);
|
|
sigset_t *set = (sigset_t *)ihk_mc_syscall_arg0(ctx);
|
|
size_t sigsetsize = (size_t)ihk_mc_syscall_arg1(ctx);
|
|
|
|
if (sigsetsize > sizeof(sigset_t))
|
|
return -EINVAL;
|
|
|
|
lock = &thread->sigcommon->lock;
|
|
head = &thread->sigcommon->sigpending;
|
|
mcs_rwlock_writer_lock(lock, &mcs_rw_node);
|
|
list_for_each_entry(pending, head, list){
|
|
w |= pending->sigmask.__val[0];
|
|
}
|
|
mcs_rwlock_writer_unlock(lock, &mcs_rw_node);
|
|
|
|
lock = &thread->sigpendinglock;
|
|
head = &thread->sigpending;
|
|
mcs_rwlock_writer_lock(lock, &mcs_rw_node);
|
|
list_for_each_entry(pending, head, list){
|
|
w |= pending->sigmask.__val[0];
|
|
}
|
|
mcs_rwlock_writer_unlock(lock, &mcs_rw_node);
|
|
|
|
if(copy_to_user(set->__val, &w, sizeof w))
|
|
return -EFAULT;
|
|
|
|
return 0;
|
|
}
|
|
|
|
SYSCALL_DECLARE(signalfd)
|
|
{
|
|
return -EOPNOTSUPP;
|
|
}
|
|
|
|
SYSCALL_DECLARE(signalfd4)
|
|
{
|
|
int fd = ihk_mc_syscall_arg0(ctx);
|
|
struct thread *thread = cpu_local_var(current);
|
|
struct process *proc = thread->proc;
|
|
struct mckfd *sfd;
|
|
long irqstate;
|
|
	sigset_t *maskp = (sigset_t *)ihk_mc_syscall_arg1(ctx);
|
|
__sigset_t mask;
|
|
size_t sigsetsize = (size_t)ihk_mc_syscall_arg2(ctx);
|
|
int flags = ihk_mc_syscall_arg3(ctx);
|
|
|
|
if(sigsetsize != sizeof(sigset_t))
|
|
return -EINVAL;
|
|
if(copy_from_user(&mask, maskp, sizeof mask))
|
|
return -EFAULT;
|
|
if(flags & ~(SFD_NONBLOCK | SFD_CLOEXEC))
|
|
return -EINVAL;
|
|
|
|
if(fd == -1){
|
|
struct syscall_request request IHK_DMA_ALIGN;
|
|
|
|
request.number = __NR_signalfd4;
|
|
request.args[0] = 0;
|
|
request.args[1] = flags;
|
|
fd = do_syscall(&request, ihk_mc_get_processor_id());
|
|
if(fd < 0){
|
|
return fd;
|
|
}
|
|
sfd = kmalloc(sizeof(struct mckfd), IHK_MC_AP_NOWAIT);
|
|
if(!sfd)
|
|
return -ENOMEM;
|
|
memset(sfd, '\0', sizeof(struct mckfd));
|
|
sfd->fd = fd;
|
|
irqstate = ihk_mc_spinlock_lock(&proc->mckfd_lock);
|
|
sfd->next = proc->mckfd;
|
|
proc->mckfd = sfd;
|
|
}
|
|
else{
|
|
irqstate = ihk_mc_spinlock_lock(&proc->mckfd_lock);
|
|
for(sfd = proc->mckfd; sfd; sfd = sfd->next)
|
|
if(sfd->fd == fd)
|
|
break;
|
|
if(!sfd){
|
|
ihk_mc_spinlock_unlock(&proc->mckfd_lock, irqstate);
|
|
return -EINVAL;
|
|
}
|
|
}
|
|
memcpy(&sfd->data, &mask, sizeof mask);
|
|
ihk_mc_spinlock_unlock(&proc->mckfd_lock, irqstate);
|
|
return sfd->fd;
|
|
}
|
|
|
|
#ifdef ENABLE_PERF
|
|
int perf_counter_set(struct mc_perf_event *event)
|
|
{
|
|
int ret = 0;
|
|
struct perf_event_attr *attr = &event->attr;
|
|
int mode = 0x00;
|
|
|
|
if(!attr->exclude_kernel) {
|
|
mode |= PERFCTR_KERNEL_MODE;
|
|
}
|
|
if(!attr->exclude_user) {
|
|
mode |= PERFCTR_USER_MODE;
|
|
}
|
|
|
|
if (event->extra_reg.reg) {
|
|
if (ihk_mc_perfctr_set_extra(event)) {
|
|
ret = -1;
|
|
goto out;
|
|
}
|
|
}
|
|
ret = ihk_mc_perfctr_init_raw(event->counter_id,
|
|
event->hw_config, mode);
|
|
|
|
out:
|
|
return ret;
|
|
}
|
|
|
|
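/*
 * Compute the current value of a counter: REF_CPU_CYCLES events
 * (use_invariant_tsc) are derived from the per-thread user/system TSC
 * accumulators, other events are read back from the PMC via
 * ihk_mc_event_update(); accumulated counts and any inherited child
 * counts are folded in.
 */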
unsigned long perf_event_read_value(struct mc_perf_event *event)
|
|
{
|
|
unsigned long rtn_count = 0;
|
|
unsigned long pmc_count = 0;
|
|
struct thread *thread = cpu_local_var(current);
|
|
unsigned long cur_user_tsc, cur_system_tsc;
|
|
|
|
if (event->stopped_user_tsc) {
|
|
cur_user_tsc = event->stopped_user_tsc;
|
|
}
|
|
else {
|
|
cur_user_tsc = thread->user_tsc;
|
|
}
|
|
|
|
if (event->stopped_system_tsc) {
|
|
cur_system_tsc = event->stopped_system_tsc;
|
|
}
|
|
else {
|
|
cur_system_tsc = thread->system_tsc;
|
|
}
|
|
|
|
/* -- For use_invariant_tsc --
|
|
* Add sum of counts in the previous start-stop periods to
|
|
* the current count in the start-read period
|
|
*/
|
|
if(event->pid == 0) {
|
|
if (event->use_invariant_tsc) {
|
|
if (!event->attr.exclude_user) {
|
|
pmc_count += cur_user_tsc -
|
|
event->base_user_tsc +
|
|
event->user_accum_count;
|
|
}
|
|
if (!event->attr.exclude_kernel) {
|
|
/* Add sum of counts in the previous
|
|
* start-stop periods to the current count
|
|
* in the start-read period
|
|
*/
|
|
pmc_count += cur_system_tsc -
|
|
event->base_system_tsc +
|
|
event->system_accum_count;
|
|
}
|
|
}
|
|
else {
|
|
ihk_mc_event_update(event);
|
|
}
|
|
}
|
|
|
|
rtn_count += ihk_atomic64_read(&event->count) + pmc_count;
|
|
|
|
if(event->attr.inherit)
|
|
rtn_count += event->child_count_total;
|
|
|
|
return rtn_count;
|
|
}
|
|
|
|
static int
|
|
perf_event_read_group(struct mc_perf_event *event, unsigned long read_format, char *buf)
|
|
{
|
|
struct mc_perf_event *leader = event->group_leader, *sub;
|
|
int n = 0, size = 0, ret;
|
|
unsigned long count;
|
|
long long values[5];
|
|
|
|
count = perf_event_read_value(leader);
|
|
|
|
values[n++] = 1 + leader->nr_siblings;
|
|
values[n++] = count;
|
|
|
|
size = n * sizeof(long long);
|
|
|
|
if (copy_to_user(buf, values, size))
|
|
return -EFAULT;
|
|
|
|
ret = size;
|
|
|
|
list_for_each_entry(sub, &leader->sibling_list, group_entry) {
|
|
n = 0;
|
|
values[n++] = perf_event_read_value(sub);
|
|
|
|
size = n * sizeof(long long);
|
|
|
|
if (copy_to_user(buf + ret, values, size)) {
|
|
return -EFAULT;
|
|
}
|
|
|
|
ret += size;
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
static int
|
|
perf_event_read_one(struct mc_perf_event *event, unsigned long read_format, char *buf)
|
|
{
|
|
unsigned long values[4];
|
|
int n = 0;
|
|
int size = 0;
|
|
|
|
values[n++] = perf_event_read_value(event);
|
|
|
|
size = n * sizeof(unsigned long);
|
|
|
|
if (copy_to_user(buf, values, size))
|
|
return -EFAULT;
|
|
|
|
return size;
|
|
}
|
|
|
|
static long
|
|
perf_read(struct mckfd *sfd, ihk_mc_user_context_t *ctx)
|
|
{
|
|
char *buf = (char *)ihk_mc_syscall_arg1(ctx);
|
|
struct mc_perf_event *event = (struct mc_perf_event*)sfd->data;
|
|
unsigned long read_format = event->attr.read_format;
|
|
long ret;
|
|
|
|
if (read_format & PERF_FORMAT_GROUP) {
|
|
ret = perf_event_read_group(event, read_format, buf);
|
|
} else {
|
|
ret = perf_event_read_one(event, read_format, buf);
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
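/*
 * Start every inactive counter in the event's group: invariant-TSC events
 * just record the base user/system TSC (accumulating previous start-stop
 * periods), hardware events are (re)programmed and started through
 * ihk_mc_perfctr_start().
 */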
void perf_start(struct mc_perf_event *event)
|
|
{
|
|
int counter_id;
|
|
unsigned long counter_mask = 0;
|
|
struct mc_perf_event *leader = event->group_leader, *sub;
|
|
struct thread *thread = cpu_local_var(current);
|
|
|
|
	/* -- For use_invariant_tsc --
	 * Record the sum of counts from the previous start-stop periods
	 * into accum_count, because only the count since the last start
	 * would be accounted otherwise.
	 */
|
|
counter_id = leader->counter_id;
|
|
if (ihk_mc_perf_counter_mask_check(1UL << counter_id) &&
|
|
leader->state == PERF_EVENT_STATE_INACTIVE) {
|
|
if (leader->use_invariant_tsc) {
|
|
if (leader->stopped_user_tsc) {
|
|
leader->user_accum_count +=
|
|
leader->stopped_user_tsc -
|
|
leader->base_user_tsc;
|
|
leader->stopped_user_tsc = 0;
|
|
}
|
|
leader->base_user_tsc = thread->user_tsc;
|
|
|
|
if (leader->stopped_system_tsc) {
|
|
leader->system_accum_count +=
|
|
leader->stopped_system_tsc -
|
|
leader->base_system_tsc;
|
|
leader->stopped_system_tsc = 0;
|
|
}
|
|
leader->base_system_tsc = thread->system_tsc;
|
|
}
|
|
else {
|
|
ihk_mc_event_set_period(leader);
|
|
perf_counter_set(leader);
|
|
counter_mask |= 1UL << counter_id;
|
|
}
|
|
|
|
leader->state = PERF_EVENT_STATE_ACTIVE;
|
|
}
|
|
|
|
list_for_each_entry(sub, &leader->sibling_list, group_entry) {
|
|
counter_id = sub->counter_id;
|
|
if (ihk_mc_perf_counter_mask_check(1UL << counter_id) &&
|
|
sub->state == PERF_EVENT_STATE_INACTIVE) {
|
|
if (sub->use_invariant_tsc) {
|
|
if (sub->stopped_user_tsc) {
|
|
sub->user_accum_count +=
|
|
sub->stopped_user_tsc -
|
|
sub->base_user_tsc;
|
|
sub->stopped_user_tsc = 0;
|
|
}
|
|
sub->base_user_tsc = thread->user_tsc;
|
|
|
|
if (sub->stopped_system_tsc) {
|
|
sub->system_accum_count +=
|
|
sub->stopped_system_tsc -
|
|
sub->base_system_tsc;
|
|
sub->stopped_system_tsc = 0;
|
|
}
|
|
sub->base_system_tsc = thread->system_tsc;
|
|
}
|
|
else {
|
|
ihk_mc_event_set_period(sub);
|
|
perf_counter_set(sub);
|
|
counter_mask |= 1UL << counter_id;
|
|
}
|
|
|
|
sub->state = PERF_EVENT_STATE_ACTIVE;
|
|
}
|
|
}
|
|
|
|
if (counter_mask) {
|
|
ihk_mc_perfctr_start(counter_mask);
|
|
}
|
|
thread->proc->perf_status = PP_COUNT;
|
|
}
|
|
|
|
void
|
|
perf_reset(struct mc_perf_event *event)
|
|
{
|
|
int counter_id;
|
|
struct mc_perf_event *leader = event->group_leader, *sub;
|
|
struct thread *thread = cpu_local_var(current);
|
|
|
|
counter_id = leader->counter_id;
|
|
if (ihk_mc_perf_counter_mask_check(1UL << counter_id)) {
|
|
/* Let perf_event_read_value return zero when stopped */
|
|
if (leader->use_invariant_tsc) {
|
|
if (leader->stopped_user_tsc) {
|
|
leader->base_user_tsc =
|
|
leader->stopped_user_tsc;
|
|
}
|
|
else {
|
|
leader->base_user_tsc = thread->user_tsc;
|
|
}
|
|
leader->user_accum_count = 0;
|
|
|
|
if (leader->stopped_system_tsc) {
|
|
leader->base_system_tsc =
|
|
leader->stopped_system_tsc;
|
|
}
|
|
else {
|
|
leader->base_system_tsc = thread->system_tsc;
|
|
}
|
|
leader->system_accum_count = 0;
|
|
}
|
|
else {
|
|
perf_event_read_value(leader);
|
|
ihk_atomic64_set(&leader->count, 0);
|
|
}
|
|
}
|
|
|
|
list_for_each_entry(sub, &leader->sibling_list, group_entry) {
|
|
counter_id = sub->counter_id;
|
|
if (ihk_mc_perf_counter_mask_check(1UL << counter_id)) {
|
|
/* Let perf_event_read_value return zero when stopped */
|
|
if (sub->use_invariant_tsc) {
|
|
if (sub->stopped_user_tsc) {
|
|
sub->base_user_tsc =
|
|
sub->stopped_user_tsc;
|
|
}
|
|
else {
|
|
sub->base_user_tsc = thread->user_tsc;
|
|
}
|
|
sub->user_accum_count = 0;
|
|
|
|
if (sub->stopped_system_tsc) {
|
|
sub->base_system_tsc =
|
|
sub->stopped_system_tsc;
|
|
}
|
|
else {
|
|
sub->base_system_tsc =
|
|
thread->system_tsc;
|
|
}
|
|
sub->system_accum_count = 0;
|
|
}
|
|
else {
|
|
perf_event_read_value(sub);
|
|
ihk_atomic64_set(&sub->count, 0);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
static void
|
|
perf_stop(struct mc_perf_event *event)
|
|
{
|
|
int counter_id;
|
|
unsigned long counter_mask = 0;
|
|
struct mc_perf_event *leader = event->group_leader, *sub;
|
|
struct thread *thread = cpu_local_var(current);
|
|
|
|
struct mc_perf_event *stop_event[PMC_ALLOC_MAP_BITS + 1];
|
|
int stop_event_idx = 0;
|
|
|
|
stop_event[0] = NULL;
|
|
counter_id = leader->counter_id;
|
|
if (ihk_mc_perf_counter_mask_check(1UL << counter_id) &&
|
|
leader->state == PERF_EVENT_STATE_ACTIVE) {
|
|
if (leader->use_invariant_tsc) {
|
|
if (leader->stopped_user_tsc == 0) {
|
|
leader->stopped_user_tsc = thread->user_tsc;
|
|
}
|
|
if (leader->stopped_system_tsc == 0) {
|
|
leader->stopped_system_tsc = thread->system_tsc;
|
|
}
|
|
}
|
|
else {
|
|
counter_mask |= 1UL << counter_id;
|
|
stop_event[stop_event_idx++] = leader;
|
|
stop_event[stop_event_idx] = NULL;
|
|
}
|
|
|
|
leader->state = PERF_EVENT_STATE_INACTIVE;
|
|
}
|
|
|
|
list_for_each_entry(sub, &leader->sibling_list, group_entry) {
|
|
counter_id = sub->counter_id;
|
|
if (ihk_mc_perf_counter_mask_check(1UL << counter_id) &&
|
|
sub->state == PERF_EVENT_STATE_ACTIVE) {
|
|
if (sub->use_invariant_tsc) {
|
|
if (sub->stopped_user_tsc == 0) {
|
|
sub->stopped_user_tsc =
|
|
thread->user_tsc;
|
|
}
|
|
if (sub->stopped_system_tsc == 0) {
|
|
sub->stopped_system_tsc =
|
|
thread->system_tsc;
|
|
}
|
|
}
|
|
else {
|
|
counter_mask |= 1UL << counter_id;
|
|
stop_event[stop_event_idx++] = sub;
|
|
stop_event[stop_event_idx] = NULL;
|
|
}
|
|
sub->state = PERF_EVENT_STATE_INACTIVE;
|
|
}
|
|
}
|
|
|
|
if (counter_mask) {
|
|
ihk_mc_perfctr_stop(counter_mask, 0);
|
|
stop_event_idx = 0;
|
|
while (stop_event[stop_event_idx]) {
|
|
ihk_mc_event_update(stop_event[stop_event_idx++]);
|
|
}
|
|
}
|
|
cpu_local_var(current)->proc->monitoring_event = NULL;
|
|
cpu_local_var(current)->proc->perf_status = PP_NONE;
|
|
}
|
|
|
|
static int
|
|
perf_ioctl(struct mckfd *sfd, ihk_mc_user_context_t *ctx)
|
|
{
|
|
unsigned int cmd = ihk_mc_syscall_arg1(ctx);
|
|
struct mc_perf_event *event = (struct mc_perf_event*)sfd->data;
|
|
struct mcs_rwlock_node_irqsave lock;
|
|
struct process *proc;
|
|
|
|
switch (cmd) {
|
|
case PERF_EVENT_IOC_ENABLE:
|
|
if(event->pid == 0){
|
|
cpu_local_var(current)->proc->monitoring_event = event;
|
|
perf_start(event);
|
|
}
|
|
else if(event->pid > 0){
|
|
proc = find_process(event->pid, &lock);
|
|
if(!proc)
|
|
return -EINVAL;
|
|
if(proc->monitoring_event == NULL){
|
|
proc->monitoring_event = event;
|
|
proc->perf_status = PP_RESET;
|
|
}
|
|
process_unlock(proc, &lock);
|
|
}
|
|
break;
|
|
case PERF_EVENT_IOC_DISABLE:
|
|
if(event->pid == 0){
|
|
perf_stop(event);
|
|
}
|
|
// TODO: stop other process
|
|
/*
|
|
else if(event->pid > 0){
|
|
proc = find_process(event->pid, &lock);
|
|
if(!proc)
|
|
return -EINVAL;
|
|
proc->monitoring_event = NULL;
|
|
proc->perf_status = PP_NONE;
|
|
process_unlock(proc, &lock);
|
|
}
|
|
*/
|
|
break;
|
|
case PERF_EVENT_IOC_RESET:
|
|
// TODO: reset other process
|
|
perf_reset(event);
|
|
break;
|
|
case PERF_EVENT_IOC_REFRESH:
|
|
// TODO: refresh other process
|
|
|
|
// not supported on inherited events
|
|
if(event->attr.inherit)
|
|
return -EINVAL;
|
|
|
|
/* REFRESH doesn't need to include ENABLE */
|
|
/* perf_start(event); */
|
|
|
|
break;
|
|
default :
|
|
return -1;
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int
|
|
perf_close(struct mckfd *sfd, ihk_mc_user_context_t *ctx)
|
|
{
|
|
struct mc_perf_event *event = (struct mc_perf_event*)sfd->data;
|
|
struct thread *thread = cpu_local_var(current);
|
|
|
|
thread->pmc_alloc_map &= ~(1UL << event->counter_id);
|
|
if (event->extra_reg.reg) {
|
|
thread->extra_reg_alloc_map &= ~(1UL << event->extra_reg.idx);
|
|
}
|
|
|
|
kfree(event);
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int
|
|
perf_fcntl(struct mckfd *sfd, ihk_mc_user_context_t *ctx)
|
|
{
|
|
int cmd = ihk_mc_syscall_arg1(ctx);
|
|
long arg = ihk_mc_syscall_arg2(ctx);
|
|
int rc = 0;
|
|
|
|
switch(cmd) {
|
|
case 10: // F_SETSIG
|
|
sfd->sig_no = arg;
|
|
break;
|
|
case 0xf: // F_SETOWN_EX
|
|
break;
|
|
default :
|
|
break;
|
|
}
|
|
|
|
rc = syscall_generic_forwarding(__NR_fcntl, ctx);
|
|
|
|
return rc;
|
|
}
|
|
|
|
static long
|
|
perf_mmap(struct mckfd *sfd, ihk_mc_user_context_t *ctx)
|
|
{
|
|
intptr_t addr0 = ihk_mc_syscall_arg0(ctx);
|
|
size_t len0 = ihk_mc_syscall_arg1(ctx);
|
|
int prot = ihk_mc_syscall_arg2(ctx);
|
|
int flags = ihk_mc_syscall_arg3(ctx);
|
|
int fd = ihk_mc_syscall_arg4(ctx);
|
|
off_t off0 = ihk_mc_syscall_arg5(ctx);
|
|
struct perf_event_mmap_page *page = NULL;
|
|
long rc;
|
|
|
|
flags |= MAP_ANONYMOUS;
|
|
prot |= PROT_WRITE;
|
|
rc = do_mmap(addr0, len0, prot, flags, fd, off0, 0, NULL);
|
|
|
|
// setup perf_event_mmap_page
|
|
page = (struct perf_event_mmap_page *)rc;
|
|
page->data_head = 16;
|
|
page->cap_user_rdpmc = 1;
|
|
|
|
return rc;
|
|
}
|
|
#endif /*ENABLE_PERF*/
|
|
|
|
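/*
 * Look up the NUMA memory policy range containing addr in the per-VM
 * red-black tree; returns NULL when no policy has been set for addr.
 */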
struct vm_range_numa_policy *vm_range_policy_search(struct process_vm *vm, uintptr_t addr)
|
|
{
|
|
struct rb_root *root = &vm->vm_range_numa_policy_tree;
|
|
struct rb_node *node = root->rb_node;
|
|
struct vm_range_numa_policy *numa_policy = NULL;
|
|
|
|
while (node) {
|
|
numa_policy = rb_entry(node, struct vm_range_numa_policy, policy_rb_node);
|
|
if (addr < numa_policy->start) {
|
|
node = node->rb_left;
|
|
} else if (addr >= numa_policy->end) {
|
|
node = node->rb_right;
|
|
} else {
|
|
return numa_policy;
|
|
}
|
|
}
|
|
|
|
return NULL;
|
|
}
|
|
|
|
static int vm_policy_insert(struct process_vm *vm,
|
|
struct vm_range_numa_policy *newrange)
|
|
{
|
|
struct rb_root *root = &vm->vm_range_numa_policy_tree;
|
|
struct rb_node **new = &(root->rb_node), *parent = NULL;
|
|
struct vm_range_numa_policy *range;
|
|
|
|
while (*new) {
|
|
range = rb_entry(*new, struct vm_range_numa_policy,
|
|
policy_rb_node);
|
|
parent = *new;
|
|
if (newrange->end <= range->start) {
|
|
new = &((*new)->rb_left);
|
|
} else if (newrange->start >= range->end) {
|
|
new = &((*new)->rb_right);
|
|
} else {
|
|
ekprintf("%s(%p,%lx-%lx (nodemask)%lx (policy)%d): overlap %lx-%lx (nodemask)%lx (policy)%d\n",
|
|
__func__, vm, newrange->start,
|
|
newrange->end, newrange->numa_mask,
|
|
newrange->numa_mem_policy, range->start,
|
|
range->end, range->numa_mask,
|
|
range->numa_mem_policy);
|
|
return -EFAULT;
|
|
}
|
|
}
|
|
|
|
dkprintf("%s: %p,%p: %lx-%lx (nodemask)%lx (policy)%d\n",
|
|
__func__, vm, newrange, newrange->start, newrange->end,
|
|
newrange->numa_mask, newrange->numa_mem_policy);
|
|
|
|
rb_link_node(&newrange->policy_rb_node, parent, new);
|
|
rb_insert_color(&newrange->policy_rb_node, root);
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int vm_policy_clear_range(struct process_vm *vm,
|
|
unsigned long start, unsigned long end)
|
|
{
|
|
struct rb_root *root = &vm->vm_range_numa_policy_tree;
|
|
struct vm_range_numa_policy *range, *range_policy_iter;
|
|
struct vm_range_numa_policy *range_policy;
|
|
struct rb_node *node;
|
|
int error = 0;
|
|
|
|
/*
|
|
* Adjust overlapping range settings and add new one
|
|
* case: front part of new range overlaps existing one
|
|
* case: new range is a part of existing range
|
|
*/
|
|
range_policy_iter = vm_range_policy_search(vm, start);
|
|
if (range_policy_iter) {
|
|
int adjusted = 0;
|
|
unsigned long orig_end = range_policy_iter->end;
|
|
|
|
if (range_policy_iter->start == start &&
|
|
range_policy_iter->end == end) {
|
|
rb_erase(&range_policy_iter->policy_rb_node,
|
|
&vm->vm_range_numa_policy_tree);
|
|
kfree(range_policy_iter);
|
|
error = 0;
|
|
goto out;
|
|
}
|
|
|
|
/* Overlapping partially? */
|
|
if (range_policy_iter->start < start) {
|
|
orig_end = range_policy_iter->end;
|
|
range_policy_iter->end = start;
|
|
adjusted = 1;
|
|
}
|
|
|
|
/* Do we need to keep the end? */
|
|
if (orig_end > end) {
|
|
if (adjusted) {
|
|
/* Add a new entry after */
|
|
range_policy = kmalloc(
|
|
sizeof(struct vm_range_numa_policy),
|
|
IHK_MC_AP_NOWAIT);
|
|
if (!range_policy) {
|
|
dkprintf("%s: error allocating range_policy\n",
|
|
__func__);
|
|
error = -ENOMEM;
|
|
goto out;
|
|
}
|
|
|
|
RB_CLEAR_NODE(&range_policy->policy_rb_node);
|
|
range_policy->start = end;
|
|
range_policy->end = orig_end;
|
|
range_policy->numa_mem_policy =
|
|
range_policy_iter->numa_mem_policy;
|
|
|
|
memcpy(range_policy->numa_mask,
|
|
&range_policy_iter->numa_mask,
|
|
sizeof(range_policy->numa_mask));
|
|
|
|
error = vm_policy_insert(vm, range_policy);
|
|
if (error) {
|
|
kprintf("%s: ERROR: could not insert range: %d\n",
|
|
__func__, error);
|
|
goto out;
|
|
}
|
|
}
|
|
else {
|
|
range_policy_iter->start = end;
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Adjust overlapping range settings
|
|
* case: rear part of new range overlaps existing range
|
|
*/
|
|
range_policy_iter = vm_range_policy_search(vm, end - 1);
|
|
if (range_policy_iter) {
|
|
range_policy_iter->start = end;
|
|
}
|
|
|
|
	/* Search for fully contained ranges */
|
|
again_search:
|
|
for (node = rb_first(root); node; node = rb_next(node)) {
|
|
range = rb_entry(node, struct vm_range_numa_policy,
|
|
policy_rb_node);
|
|
|
|
/* existing range is fully contained */
|
|
if (range->start >= start && range->end <= end) {
|
|
rb_erase(&range->policy_rb_node,
|
|
&vm->vm_range_numa_policy_tree);
|
|
kfree(range);
|
|
goto again_search;
|
|
}
|
|
}
|
|
|
|
out:
|
|
return error;
|
|
}
|
|
|
|
#ifdef ENABLE_PERF
|
|
static int mc_perf_event_alloc(struct mc_perf_event **out,
|
|
struct perf_event_attr *attr)
|
|
{
|
|
int ret = 0;
|
|
unsigned long val = 0, extra_config = 0;
|
|
struct mc_perf_event *event = NULL;
|
|
int ereg_id;
|
|
struct hw_perf_event *hwc;
|
|
|
|
if (!attr) {
|
|
ret = -EINVAL;
|
|
goto out;
|
|
}
|
|
|
|
event = kmalloc(sizeof(struct mc_perf_event), IHK_MC_AP_NOWAIT);
|
|
if (!event) {
|
|
ret = -ENOMEM;
|
|
goto out;
|
|
}
|
|
memset(event, 0, sizeof(struct mc_perf_event));
|
|
|
|
INIT_LIST_HEAD(&event->group_entry);
|
|
INIT_LIST_HEAD(&event->sibling_list);
|
|
event->attr = *attr;
|
|
|
|
event->sample_freq = attr->sample_freq;
|
|
event->nr_siblings = 0;
|
|
ihk_atomic64_set(&event->count, 0);
|
|
event->child_count_total = 0;
|
|
event->parent = NULL;
|
|
|
|
hwc = &event->hw;
|
|
hwc->sample_period = attr->sample_period;
|
|
if (attr->freq && attr->sample_freq) {
|
|
		/*
		 * Mark that struct perf_event_attr::sample_freq was set by
		 * the user.  It is okay to reuse
		 * struct hw_perf_event::sample_period for this purpose
		 * because it is not a union member and is not used when
		 * struct perf_event_attr::freq is set.
		 */
|
|
hwc->sample_period = 1;
|
|
}
|
|
hwc->last_period = hwc->sample_period;
|
|
ihk_atomic64_set(&hwc->period_left, hwc->sample_period);
|
|
|
|
if (attr->type == PERF_TYPE_HARDWARE &&
|
|
attr->config == PERF_COUNT_HW_REF_CPU_CYCLES) {
|
|
event->use_invariant_tsc = 1;
|
|
|
|
/*
|
|
* REF_CPU_CYCLES is counted by thread's tsc.
|
|
* Always support.
|
|
*/
|
|
*out = event;
|
|
ret = 0;
|
|
goto out;
|
|
}
|
|
|
|
switch (attr->type) {
|
|
case PERF_TYPE_HARDWARE :
|
|
val = ihk_mc_hw_event_map(attr->config);
|
|
break;
|
|
case PERF_TYPE_HW_CACHE :
|
|
val = ihk_mc_hw_cache_event_map(attr->config);
|
|
extra_config = ihk_mc_hw_cache_extra_reg_map(attr->config);
|
|
break;
|
|
case PERF_TYPE_RAW :
|
|
val = ihk_mc_raw_event_map(attr->config);
|
|
break;
|
|
|
|
default:
|
|
// Unexpected type
|
|
ret = -EINVAL;
|
|
goto out;
|
|
}
|
|
|
|
if (!ihk_mc_validate_event(val)) {
|
|
ret = -ENOENT;
|
|
goto out;
|
|
}
|
|
|
|
event->hw_config = val;
|
|
event->hw_config_ext = extra_config;
|
|
|
|
ereg_id = ihk_mc_get_extra_reg_id(event->hw_config, event->hw_config_ext);
|
|
if (ereg_id >= 0) {
|
|
event->extra_reg.config = event->hw_config_ext;
|
|
event->extra_reg.reg = ihk_mc_get_extra_reg_msr(ereg_id);
|
|
event->extra_reg.idx = ihk_mc_get_extra_reg_idx(ereg_id);
|
|
}
|
|
|
|
ret = hw_perf_event_init(event);
|
|
|
|
*out = event;
|
|
|
|
out:
|
|
if (ret) {
|
|
kfree(event);
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
SYSCALL_DECLARE(perf_event_open)
|
|
{
|
|
int ret;
|
|
struct syscall_request request IHK_DMA_ALIGN;
|
|
struct thread *thread = cpu_local_var(current);
|
|
struct process *proc = thread->proc;
|
|
struct mckfd *sfd, *cfd;
|
|
int fd;
|
|
int counter_idx;
|
|
long irqstate;
|
|
struct perf_event_attr *arg0 = (void *)ihk_mc_syscall_arg0(ctx);
|
|
int pid = ihk_mc_syscall_arg1(ctx);
|
|
int cpu = ihk_mc_syscall_arg2(ctx);
|
|
int group_fd = ihk_mc_syscall_arg3(ctx);
|
|
unsigned long flags = ihk_mc_syscall_arg4(ctx);
|
|
struct mc_perf_event *event;
|
|
struct perf_event_attr attr_user,*attr;
|
|
|
|
int not_supported_flag = 0;
|
|
|
|
#ifndef ENABLE_PERF
|
|
return -ENOSYS;
|
|
#endif // ENABLE_PERF
|
|
|
|
if (copy_from_user(&attr_user, arg0, sizeof(struct perf_event_attr))){
|
|
return -EFAULT;
|
|
}
|
|
attr = &attr_user;
|
|
|
|
// check Not supported
|
|
if (cpu > 0) {
|
|
not_supported_flag = 1;
|
|
}
|
|
if (flags > 0) {
|
|
not_supported_flag = 1;
|
|
}
|
|
|
|
if ((attr->type != PERF_TYPE_RAW) &&
|
|
(attr->type != PERF_TYPE_HARDWARE) &&
|
|
(attr->type != PERF_TYPE_HW_CACHE)) {
|
|
not_supported_flag = 1;
|
|
}
|
|
if (attr->read_format &
|
|
(PERF_FORMAT_TOTAL_TIME_ENABLED |
|
|
PERF_FORMAT_TOTAL_TIME_RUNNING |
|
|
PERF_FORMAT_ID)) {
|
|
not_supported_flag = 1;
|
|
}
|
|
|
|
if (attr->freq) {
|
|
not_supported_flag = 1;
|
|
} else {
|
|
if (attr->sample_period & (1ULL << 63)) {
|
|
return -EINVAL;
|
|
}
|
|
}
|
|
|
|
if (not_supported_flag) {
|
|
return -ENOENT;
|
|
}
|
|
|
|
ret = mc_perf_event_alloc(&event, (struct perf_event_attr *)attr);
|
|
if (ret) {
|
|
return ret;
|
|
}
|
|
|
|
event->pid = pid;
|
|
|
|
counter_idx = ihk_mc_perfctr_alloc(thread, event);
|
|
if (counter_idx < 0) {
|
|
return counter_idx;
|
|
}
|
|
event->counter_id = counter_idx;
|
|
|
|
if (group_fd == -1) {
|
|
event->group_leader = event;
|
|
event->pmc_status = 0x0UL;
|
|
}
|
|
else {
|
|
for (cfd = proc->mckfd; cfd; cfd = cfd->next) {
|
|
if (cfd->fd == group_fd) {
|
|
event->group_leader = (struct mc_perf_event*)cfd->data;
|
|
list_add_tail(&event->group_entry, &event->group_leader->sibling_list);
|
|
event->group_leader->nr_siblings++;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
event->group_leader->pmc_status |= (1UL << counter_idx);
|
|
|
|
request.number = __NR_perf_event_open;
|
|
request.args[0] = 0;
|
|
fd = do_syscall(&request, ihk_mc_get_processor_id());
|
|
if(fd < 0){
|
|
return fd;
|
|
}
|
|
|
|
thread->pmc_alloc_map |= 1UL << counter_idx;
|
|
|
|
sfd = kmalloc(sizeof(struct mckfd), IHK_MC_AP_NOWAIT);
|
|
if(!sfd)
|
|
return -ENOMEM;
|
|
sfd->fd = fd;
|
|
sfd->sig_no = -1;
|
|
sfd->read_cb = perf_read;
|
|
sfd->ioctl_cb = perf_ioctl;
|
|
sfd->close_cb = perf_close;
|
|
sfd->mmap_cb = perf_mmap;
|
|
sfd->fcntl_cb = perf_fcntl;
|
|
sfd->data = (long)event;
|
|
irqstate = ihk_mc_spinlock_lock(&proc->mckfd_lock);
|
|
|
|
if(proc->mckfd == NULL) {
|
|
proc->mckfd = sfd;
|
|
sfd->next = NULL;
|
|
} else {
|
|
sfd->next = proc->mckfd;
|
|
proc->mckfd = sfd;
|
|
}
|
|
|
|
ihk_mc_spinlock_unlock(&proc->mckfd_lock, irqstate);
|
|
return sfd->fd;
|
|
}
|
|
#endif /* ENABLE_PERF */
|
|
|
|
SYSCALL_DECLARE(rt_sigtimedwait)
|
|
{
|
|
const sigset_t *set = (const sigset_t *)ihk_mc_syscall_arg0(ctx);
|
|
siginfo_t *info = (siginfo_t *)ihk_mc_syscall_arg1(ctx);
|
|
void *timeout = (void *)ihk_mc_syscall_arg2(ctx);
|
|
size_t sigsetsize = (size_t)ihk_mc_syscall_arg3(ctx);
|
|
struct thread *thread = cpu_local_var(current);
|
|
siginfo_t winfo;
|
|
__sigset_t bset;
|
|
__sigset_t wset;
|
|
__sigset_t nset;
|
|
struct timespec wtimeout;
|
|
struct sig_pending *pending;
|
|
struct list_head *head;
|
|
mcs_rwlock_lock_t *lock;
|
|
struct mcs_rwlock_node_irqsave mcs_rw_node;
|
|
__sigset_t w;
|
|
int sig;
|
|
struct timespec ats;
|
|
struct timespec ets;
|
|
struct ihk_os_cpu_monitor *monitor = cpu_local_var(monitor);
|
|
|
|
monitor->status = IHK_OS_MONITOR_KERNEL_HEAVY;
|
|
|
|
if (sigsetsize > sizeof(sigset_t))
|
|
return -EINVAL;
|
|
|
|
if(set == NULL)
|
|
return -EFAULT;
|
|
memset(&winfo, '\0', sizeof winfo);
|
|
if(copy_from_user(&wset, set, sizeof wset))
|
|
return -EFAULT;
|
|
if(timeout){
|
|
if(copy_from_user(&wtimeout, timeout, sizeof wtimeout))
|
|
return -EFAULT;
|
|
if(wtimeout.tv_nsec >= 1000000000L || wtimeout.tv_nsec < 0 ||
|
|
wtimeout.tv_sec < 0)
|
|
return -EINVAL;
|
|
if (!gettime_local_support &&
|
|
(wtimeout.tv_sec || wtimeout.tv_nsec)) {
|
|
return -EOPNOTSUPP;
|
|
}
|
|
}
|
|
|
|
wset &= ~__sigmask(SIGKILL);
|
|
wset &= ~__sigmask(SIGSTOP);
|
|
bset = thread->sigmask.__val[0];
|
|
thread->sigmask.__val[0] = bset | wset;
|
|
nset = ~(bset | wset);
|
|
|
|
if(timeout){
|
|
if (gettime_local_support) {
|
|
calculate_time_from_tsc(&ets);
|
|
ets.tv_sec += wtimeout.tv_sec;
|
|
ets.tv_nsec += wtimeout.tv_nsec;
|
|
if(ets.tv_nsec >= 1000000000L){
|
|
ets.tv_sec++;
|
|
ets.tv_nsec -= 1000000000L;
|
|
}
|
|
}
|
|
else {
|
|
memset(&ats, '\0', sizeof ats);
|
|
memset(&ets, '\0', sizeof ets);
|
|
}
|
|
}
|
|
|
|
thread->sigevent = 1;
|
|
for(;;){
|
|
while(thread->sigevent == 0){
|
|
thread->status = PS_INTERRUPTIBLE;
|
|
if(timeout){
|
|
if (gettime_local_support)
|
|
calculate_time_from_tsc(&ats);
|
|
if(ats.tv_sec > ets.tv_sec ||
|
|
(ats.tv_sec == ets.tv_sec &&
|
|
ats.tv_nsec >= ets.tv_nsec)){
|
|
return -EAGAIN;
|
|
}
|
|
}
|
|
|
|
cpu_pause();
|
|
}
|
|
/*
|
|
* Sending signal here is detected
|
|
* by the following list check
|
|
*/
|
|
thread->sigevent = 0;
|
|
|
|
thread->status = PS_RUNNING;
|
|
lock = &thread->sigcommon->lock;
|
|
head = &thread->sigcommon->sigpending;
|
|
mcs_rwlock_writer_lock(lock, &mcs_rw_node);
|
|
list_for_each_entry(pending, head, list){
|
|
if(pending->sigmask.__val[0] & wset)
|
|
break;
|
|
}
|
|
|
|
if(&pending->list == head){
|
|
mcs_rwlock_writer_unlock(lock, &mcs_rw_node);
|
|
|
|
lock = &thread->sigpendinglock;
|
|
head = &thread->sigpending;
|
|
mcs_rwlock_writer_lock(lock, &mcs_rw_node);
|
|
list_for_each_entry(pending, head, list){
|
|
if(pending->sigmask.__val[0] & wset)
|
|
break;
|
|
}
|
|
}
|
|
|
|
if(&pending->list != head){
|
|
list_del(&pending->list);
|
|
thread->sigmask.__val[0] = bset;
|
|
mcs_rwlock_writer_unlock(lock, &mcs_rw_node);
|
|
break;
|
|
}
|
|
mcs_rwlock_writer_unlock(lock, &mcs_rw_node);
|
|
|
|
lock = &thread->sigcommon->lock;
|
|
head = &thread->sigcommon->sigpending;
|
|
mcs_rwlock_writer_lock(lock, &mcs_rw_node);
|
|
list_for_each_entry(pending, head, list){
|
|
if(pending->sigmask.__val[0] & nset)
|
|
break;
|
|
}
|
|
|
|
if(&pending->list == head){
|
|
mcs_rwlock_writer_unlock(lock, &mcs_rw_node);
|
|
|
|
lock = &thread->sigpendinglock;
|
|
head = &thread->sigpending;
|
|
mcs_rwlock_writer_lock(lock, &mcs_rw_node);
|
|
list_for_each_entry(pending, head, list){
|
|
if(pending->sigmask.__val[0] & nset)
|
|
break;
|
|
}
|
|
}
|
|
|
|
if(&pending->list != head){
|
|
list_del(&pending->list);
|
|
thread->sigmask.__val[0] = bset;
|
|
mcs_rwlock_writer_unlock(lock, &mcs_rw_node);
|
|
do_signal(-EINTR, NULL, thread, pending, -1);
|
|
return -EINTR;
|
|
}
|
|
mcs_rwlock_writer_unlock(lock, &mcs_rw_node);
|
|
}
|
|
|
|
if(info){
|
|
if(copy_to_user(info, &pending->info, sizeof(siginfo_t))){
|
|
kfree(pending);
|
|
return -EFAULT;
|
|
}
|
|
}
|
|
for(w = pending->sigmask.__val[0], sig = 0; w; sig++, w >>= 1);
|
|
kfree(pending);
|
|
|
|
return sig;
|
|
}
|
|
|
|
SYSCALL_DECLARE(rt_sigqueueinfo)
|
|
{
|
|
int pid = (int)ihk_mc_syscall_arg0(ctx);
|
|
int sig = (int)ihk_mc_syscall_arg1(ctx);
|
|
void *winfo = (void *)ihk_mc_syscall_arg2(ctx);
|
|
struct siginfo info;
|
|
|
|
if(pid <= 0)
|
|
return -ESRCH;
|
|
|
|
if(copy_from_user(&info, winfo, sizeof info))
|
|
return -EFAULT;
|
|
|
|
return do_kill(cpu_local_var(current), pid, -1, sig, &info, 0);
|
|
}
|
|
|
|
static int
|
|
do_sigsuspend(struct thread *thread, const sigset_t *set)
|
|
{
|
|
__sigset_t wset;
|
|
__sigset_t bset;
|
|
struct sig_pending *pending;
|
|
struct list_head *head;
|
|
mcs_rwlock_lock_t *lock;
|
|
struct mcs_rwlock_node_irqsave mcs_rw_node;
|
|
struct ihk_os_cpu_monitor *monitor = cpu_local_var(monitor);
|
|
|
|
monitor->status = IHK_OS_MONITOR_KERNEL_HEAVY;
|
|
|
|
wset = set->__val[0];
|
|
wset &= ~__sigmask(SIGKILL);
|
|
wset &= ~__sigmask(SIGSTOP);
|
|
bset = thread->sigmask.__val[0];
|
|
thread->sigmask.__val[0] = wset;
|
|
|
|
thread->sigevent = 1;
|
|
for (;;) {
|
|
while (thread->sigevent == 0) {
|
|
int do_schedule = 0;
|
|
struct cpu_local_var *v;
|
|
long runq_irqstate;
|
|
|
|
thread->status = PS_INTERRUPTIBLE;
|
|
runq_irqstate = cpu_disable_interrupt_save();
|
|
ihk_mc_spinlock_lock_noirq(
|
|
&(get_this_cpu_local_var()->runq_lock));
|
|
v = get_this_cpu_local_var();
|
|
|
|
if (v->flags & CPU_FLAG_NEED_RESCHED) {
|
|
v->flags &= ~CPU_FLAG_NEED_RESCHED;
|
|
do_schedule = 1;
|
|
}
|
|
|
|
ihk_mc_spinlock_unlock_noirq(&v->runq_lock);
|
|
cpu_restore_interrupt(runq_irqstate);
|
|
|
|
if (do_schedule) {
|
|
schedule();
|
|
}
|
|
else {
|
|
cpu_pause();
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Sending signal here is detected
|
|
* by the following list check
|
|
*/
|
|
thread->sigevent = 0;
|
|
|
|
thread->status = PS_RUNNING;
|
|
lock = &thread->sigcommon->lock;
|
|
head = &thread->sigcommon->sigpending;
|
|
mcs_rwlock_writer_lock(lock, &mcs_rw_node);
|
|
list_for_each_entry(pending, head, list){
|
|
if(!(pending->sigmask.__val[0] & wset))
|
|
break;
|
|
}
|
|
|
|
if(&pending->list == head){
|
|
mcs_rwlock_writer_unlock(lock, &mcs_rw_node);
|
|
|
|
lock = &thread->sigpendinglock;
|
|
head = &thread->sigpending;
|
|
mcs_rwlock_writer_lock(lock, &mcs_rw_node);
|
|
list_for_each_entry(pending, head, list){
|
|
if(!(pending->sigmask.__val[0] & wset))
|
|
break;
|
|
}
|
|
}
|
|
if(&pending->list == head){
|
|
mcs_rwlock_writer_unlock(lock, &mcs_rw_node);
|
|
continue;
|
|
}
|
|
|
|
list_del(&pending->list);
|
|
mcs_rwlock_writer_unlock(lock, &mcs_rw_node);
|
|
thread->sigmask.__val[0] = bset;
|
|
do_signal(-EINTR, NULL, thread, pending, -1);
|
|
break;
|
|
}
|
|
return -EINTR;
|
|
}
|
|
|
|
SYSCALL_DECLARE(pause)
|
|
{
|
|
struct thread *thread = cpu_local_var(current);
|
|
|
|
return do_sigsuspend(thread, &thread->sigmask);
|
|
}
|
|
|
|
SYSCALL_DECLARE(rt_sigsuspend)
|
|
{
|
|
struct thread *thread = cpu_local_var(current);
|
|
const sigset_t *set = (const sigset_t *)ihk_mc_syscall_arg0(ctx);
|
|
size_t sigsetsize = (size_t)ihk_mc_syscall_arg1(ctx);
|
|
sigset_t wset;
|
|
|
|
if (sigsetsize > sizeof(sigset_t))
|
|
return -EINVAL;
|
|
if(copy_from_user(&wset, set, sizeof wset))
|
|
return -EFAULT;
|
|
|
|
return do_sigsuspend(thread, &wset);
|
|
}
|
|
|
|
SYSCALL_DECLARE(rt_sigaction)
|
|
{
|
|
int sig = ihk_mc_syscall_arg0(ctx);
|
|
const struct sigaction *act =
|
|
(const struct sigaction *)ihk_mc_syscall_arg1(ctx);
|
|
struct sigaction *oact = (struct sigaction *)ihk_mc_syscall_arg2(ctx);
|
|
size_t sigsetsize = ihk_mc_syscall_arg3(ctx);
|
|
struct k_sigaction new_sa, old_sa;
|
|
int rc;
|
|
|
|
if (sigsetsize != sizeof(sigset_t))
|
|
return -EINVAL;
|
|
|
|
if (act) {
|
|
if (copy_from_user(&new_sa.sa, act, sizeof(new_sa.sa))) {
|
|
goto fault;
|
|
}
|
|
}
|
|
|
|
rc = do_sigaction(sig, act ? &new_sa : NULL, oact ? &old_sa : NULL);
|
|
if (rc == 0 && oact) {
|
|
if (copy_to_user(oact, &old_sa.sa, sizeof(old_sa.sa))) {
|
|
goto fault;
|
|
}
|
|
}
|
|
|
|
return rc;
|
|
fault:
|
|
return -EFAULT;
|
|
}
|
|
|
|
SYSCALL_DECLARE(sigaltstack)
|
|
{
|
|
struct thread *thread = cpu_local_var(current);
|
|
const stack_t *ss = (const stack_t *)ihk_mc_syscall_arg0(ctx);
|
|
stack_t *oss = (stack_t *)ihk_mc_syscall_arg1(ctx);
|
|
stack_t wss;
|
|
|
|
if(oss)
|
|
if(copy_to_user(oss, &thread->sigstack, sizeof wss))
|
|
return -EFAULT;
|
|
if(ss){
|
|
if(copy_from_user(&wss, ss, sizeof wss))
|
|
return -EFAULT;
|
|
if(wss.ss_flags != 0 && wss.ss_flags != SS_DISABLE)
|
|
return -EINVAL;
|
|
if(wss.ss_flags == SS_DISABLE){
|
|
thread->sigstack.ss_sp = NULL;
|
|
thread->sigstack.ss_flags = SS_DISABLE;
|
|
thread->sigstack.ss_size = 0;
|
|
}
|
|
else{
|
|
if(wss.ss_size < MINSIGSTKSZ)
|
|
return -ENOMEM;
|
|
|
|
memcpy(&thread->sigstack, &wss, sizeof wss);
|
|
}
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|
|
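/*
 * mincore(): for each page of the range report residency; a page counts as
 * resident when a present PTE exists or the backing memobj already holds
 * the page.
 */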
SYSCALL_DECLARE(mincore)
|
|
{
|
|
const uintptr_t start = ihk_mc_syscall_arg0(ctx);
|
|
const size_t len = ihk_mc_syscall_arg1(ctx);
|
|
uint8_t * const vec = (void *)ihk_mc_syscall_arg2(ctx);
|
|
const uintptr_t end = start + len;
|
|
struct thread *thread = cpu_local_var(current);
|
|
struct process_vm *vm = thread->vm;
|
|
void *up;
|
|
uintptr_t addr;
|
|
struct vm_range *range;
|
|
uint8_t value;
|
|
int error;
|
|
pte_t *ptep;
|
|
|
|
if (start & (PAGE_SIZE - 1)) {
|
|
dkprintf("mincore(0x%lx,0x%lx,%p): EINVAL\n", start, len, vec);
|
|
return -EINVAL;
|
|
}
|
|
if ((start < vm->region.user_start)
|
|
|| (vm->region.user_end <= start)
|
|
|| ((vm->region.user_end - start) < len))
|
|
{
|
|
dkprintf("mincore(0x%lx,0x%lx,%p): EINVAL\n", start, len, vec);
|
|
return -ENOMEM;
|
|
}
|
|
|
|
range = NULL;
|
|
up = vec;
|
|
for (addr = start; addr < end; addr += PAGE_SIZE) {
|
|
ihk_rwspinlock_read_lock_noirq(&vm->memory_range_lock);
|
|
range = lookup_process_memory_range(vm, addr, addr+1);
|
|
if (!range) {
|
|
ihk_rwspinlock_read_unlock_noirq(&vm->memory_range_lock);
|
|
dkprintf("mincore(0x%lx,0x%lx,%p):lookup failed. ENOMEM\n",
|
|
start, len, vec);
|
|
return -ENOMEM;
|
|
}
|
|
|
|
ihk_mc_spinlock_lock_noirq(&vm->page_table_lock);
|
|
ptep = ihk_mc_pt_lookup_pte(vm->address_space->page_table,
|
|
(void *)addr, 0, NULL, NULL, NULL);
|
|
if (ptep && pte_is_present(ptep)) {
|
|
value = 1;
|
|
}
|
|
else if (range->memobj) {
|
|
error = memobj_lookup_page(range->memobj,
|
|
range->objoff + (addr - range->start),
|
|
PAGE_P2ALIGN, NULL, NULL);
|
|
value = (!error)? 1: 0;
|
|
}
|
|
else {
|
|
value = 0;
|
|
}
|
|
ihk_mc_spinlock_unlock_noirq(&vm->page_table_lock);
|
|
ihk_rwspinlock_read_unlock_noirq(&vm->memory_range_lock);
|
|
|
|
error = copy_to_user(up, &value, sizeof(value));
|
|
if (error) {
|
|
dkprintf("mincore(0x%lx,0x%lx,%p):copy failed. %d\n",
|
|
start, len, vec, error);
|
|
return error;
|
|
}
|
|
++up;
|
|
}
|
|
|
|
dkprintf("mincore(0x%lx,0x%lx,%p): 0\n", start, len, vec);
|
|
return 0;
|
|
} /* sys_mincore() */
|
|
|
|
static int
|
|
set_memory_range_flag(struct vm_range *range, unsigned long arg)
|
|
{
|
|
range->flag |= arg;
|
|
return 0;
|
|
}
|
|
|
|
static int
|
|
clear_memory_range_flag(struct vm_range *range, unsigned long arg)
|
|
{
|
|
range->flag &= ~arg;
|
|
return 0;
|
|
}
|
|
|
|
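/*
 * Apply change_proc() to every VM range overlapping [start, end), splitting
 * ranges at the boundaries first and re-joining neighbouring ranges
 * afterwards when possible.
 */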
static int
|
|
change_attr_process_memory_range(struct process_vm *vm,
|
|
uintptr_t start, uintptr_t end,
|
|
int (*change_proc)(struct vm_range *,
|
|
unsigned long),
|
|
unsigned long arg)
|
|
{
|
|
uintptr_t addr;
|
|
int error;
|
|
struct vm_range *range;
|
|
struct vm_range *prev;
|
|
struct vm_range *next;
|
|
int join_flag = 0;
|
|
|
|
error = 0;
|
|
range = lookup_process_memory_range(vm, start, start + PAGE_SIZE);
|
|
if(!range){
|
|
error = -ENOMEM;
|
|
goto out;
|
|
}
|
|
|
|
prev = previous_process_memory_range(vm, range);
|
|
if(!prev)
|
|
prev = range;
|
|
for (addr = start; addr < end; addr = range->start) {
|
|
if (range->start < addr) {
|
|
if((error = split_process_memory_range(vm, range, addr, &range))) {
|
|
break;
|
|
}
|
|
}
|
|
if (end < range->end) {
|
|
if((error = split_process_memory_range(vm, range, end, NULL))) {
|
|
break;
|
|
}
|
|
}
|
|
|
|
if((error = change_proc(range, arg)) != 0){
|
|
break;
|
|
}
|
|
range = next_process_memory_range(vm, range);
|
|
}
|
|
|
|
if(error == 0){
|
|
next = next_process_memory_range(vm, range);
|
|
if(!next)
|
|
next = range;
|
|
}
|
|
else{
|
|
next = range;
|
|
}
|
|
|
|
while(prev != next){
|
|
int wkerr;
|
|
|
|
range = next_process_memory_range(vm, prev);
|
|
if(!range)
|
|
break;
|
|
wkerr = join_process_memory_range(vm, prev, range);
|
|
if(range == next)
|
|
join_flag = 1;
|
|
if (wkerr) {
|
|
if(join_flag)
|
|
break;
|
|
prev = range;
|
|
}
|
|
}
|
|
|
|
out:
|
|
return error;
|
|
}
|
|
|
|
SYSCALL_DECLARE(madvise)
|
|
{
|
|
const uintptr_t start = (uintptr_t)ihk_mc_syscall_arg0(ctx);
|
|
const size_t len0 = (size_t)ihk_mc_syscall_arg1(ctx);
|
|
const int advice = (int)ihk_mc_syscall_arg2(ctx);
|
|
size_t len;
|
|
uintptr_t end;
|
|
struct thread *thread = cpu_local_var(current);
|
|
struct vm_regions *region = &thread->vm->region;
|
|
struct vm_range *first;
|
|
uintptr_t addr;
|
|
struct vm_range *range;
|
|
int error;
|
|
uintptr_t s;
|
|
uintptr_t e;
|
|
|
|
dkprintf("[%d]sys_madvise(%lx,%lx,%x)\n",
|
|
ihk_mc_get_processor_id(), start, len0, advice);
|
|
|
|
|
|
len = (len0 + PAGE_SIZE - 1) & PAGE_MASK;
|
|
end = start + len;
|
|
|
|
if ((start & (PAGE_SIZE - 1))
|
|
|| (len < len0)
|
|
|| (end < start)) {
|
|
error = -EINVAL;
|
|
goto out2;
|
|
}
|
|
|
|
if ((start < region->user_start)
|
|
|| (region->user_end <= start)
|
|
|| (len > (region->user_end - region->user_start))
|
|
|| ((region->user_end - len) < start)) {
|
|
error = -ENOMEM;
|
|
goto out2;
|
|
}
|
|
|
|
error = 0;
|
|
switch (advice) {
|
|
default:
|
|
case MADV_MERGEABLE:
|
|
case MADV_UNMERGEABLE:
|
|
error = -EINVAL;
|
|
break;
|
|
|
|
case MADV_HUGEPAGE:
|
|
case MADV_NOHUGEPAGE:
|
|
case MADV_NORMAL:
|
|
case MADV_RANDOM:
|
|
case MADV_SEQUENTIAL:
|
|
case MADV_WILLNEED:
|
|
case MADV_DONTNEED:
|
|
case MADV_DONTFORK:
|
|
case MADV_DOFORK:
|
|
case MADV_REMOVE:
|
|
case MADV_DONTDUMP:
|
|
case MADV_DODUMP:
|
|
case MADV_WIPEONFORK:
|
|
case MADV_KEEPONFORK:
|
|
break;
|
|
|
|
case MADV_HWPOISON:
|
|
case MADV_SOFT_OFFLINE:
|
|
error = -EPERM;
|
|
break;
|
|
|
|
}
|
|
if (error) {
|
|
goto out2;
|
|
}
|
|
|
|
if (start == end) {
|
|
error = 0;
|
|
goto out2;
|
|
}
|
|
|
|
ihk_rwspinlock_write_lock_noirq(&thread->vm->memory_range_lock);
|
|
/* check contiguous map */
|
|
first = NULL;
|
|
	range = NULL;	/* avoid a maybe-uninitialized warning */
|
|
for (addr = start; addr < end; addr = range->end) {
|
|
if (first == NULL) {
|
|
range = lookup_process_memory_range(thread->vm, start, start+PAGE_SIZE);
|
|
first = range;
|
|
}
|
|
else {
|
|
range = next_process_memory_range(thread->vm, range);
|
|
}
|
|
|
|
if ((range == NULL) || (addr < range->start)) {
|
|
/* not contiguous */
|
|
dkprintf("[%d]sys_madvise(%lx,%lx,%x):not contig "
|
|
"%lx [%lx-%lx)\n",
|
|
ihk_mc_get_processor_id(), start,
|
|
len0, advice, addr, range?range->start:0,
|
|
range?range->end:0);
|
|
error = -ENOMEM;
|
|
goto out;
|
|
}
|
|
|
|
if (advice == MADV_REMOVE) {
|
|
if (range->flag & VR_LOCKED) {
|
|
error = -EINVAL;
|
|
goto out;
|
|
}
|
|
|
|
if (!range->memobj || !memobj_is_removable(range->memobj)) {
|
|
dkprintf("sys_madvise(%lx,%lx,%x):"
|
|
"not removable [%lx-%lx)\n",
|
|
start, len0, advice,
|
|
range->start, range->end);
|
|
error = -EACCES;
|
|
goto out;
|
|
}
|
|
}
|
|
		else if (advice == MADV_DONTFORK || advice == MADV_DOFORK) {
			/* nothing to validate for these */
		}
|
|
else if (advice == MADV_DONTDUMP || advice == MADV_DODUMP) {
|
|
}
|
|
else if (advice == MADV_NORMAL) {
|
|
/*
|
|
* Normally, the settings of MADV_RANDOM and
|
|
* MADV_SEQUENTIAL are cleared.
|
|
* MADV_RANDOM and MADV_SEQUENTIAL are not supported,
|
|
* so do nothing.
|
|
*/
|
|
}
|
|
else if (advice == MADV_WIPEONFORK
|
|
|| advice == MADV_KEEPONFORK) {
|
|
if (range->memobj && memobj_has_pager(range->memobj)) {
|
|
/* device mapping, file mapping */
|
|
error = -EINVAL;
|
|
goto out;
|
|
}
|
|
if (!(range->flag & VR_PRIVATE)) {
|
|
/* VR_SHARED */
|
|
error = -EINVAL;
|
|
goto out;
|
|
}
|
|
}
|
|
else if (!range->memobj || !memobj_has_pager(range->memobj)) {
|
|
dkprintf("[%d]sys_madvise(%lx,%lx,%x):has not pager"
|
|
"[%lx-%lx) %lx\n",
|
|
ihk_mc_get_processor_id(), start,
|
|
len0, advice, range->start,
|
|
range->end, range->memobj);
|
|
error = -EBADF;
|
|
goto out;
|
|
}
|
|
|
|
if ((advice == MADV_DONTNEED)
|
|
&& (range->flag & VR_LOCKED)) {
|
|
dkprintf("[%d]sys_madvise(%lx,%lx,%x):locked"
|
|
"[%lx-%lx) %lx\n",
|
|
ihk_mc_get_processor_id(), start,
|
|
len0, advice, range->start,
|
|
range->end, range->flag);
|
|
error = -EINVAL;
|
|
goto out;
|
|
}
|
|
|
|
/* only hugetlbfs and shm map support hugepage */
|
|
if ((advice == MADV_HUGEPAGE || advice == MADV_NOHUGEPAGE)
|
|
&& !(range->memobj->flags & (MF_HUGETLBFS | MF_SHM))) {
|
|
error = -EINVAL;
|
|
goto out;
|
|
}
|
|
|
|
s = start;
|
|
if (s < range->start) {
|
|
s = range->start;
|
|
}
|
|
e = end;
|
|
if (range->end < e) {
|
|
e = range->end;
|
|
}
|
|
|
|
if (advice == MADV_REMOVE) {
|
|
error = invalidate_process_memory_range(
|
|
thread->vm, range, s, e);
|
|
if (error) {
|
|
kprintf("sys_madvise(%lx,%lx,%x):[%lx-%lx):"
|
|
"invalidate failed. %d\n",
|
|
start, len0, advice,
|
|
range->start, range->end,
|
|
error);
|
|
goto out;
|
|
}
|
|
}
|
|
}
|
|
|
|
if(advice == MADV_DONTFORK){
|
|
error = change_attr_process_memory_range(thread->vm, start, end,
|
|
set_memory_range_flag,
|
|
VR_DONTFORK);
|
|
if(error){
|
|
goto out;
|
|
}
|
|
}
|
|
if(advice == MADV_DOFORK){
|
|
error = change_attr_process_memory_range(thread->vm, start, end,
|
|
clear_memory_range_flag,
|
|
VR_DONTFORK);
|
|
if(error){
|
|
goto out;
|
|
}
|
|
}
|
|
if(advice == MADV_DONTDUMP){
|
|
error = change_attr_process_memory_range(thread->vm, start, end,
|
|
set_memory_range_flag,
|
|
VR_DONTDUMP);
|
|
if(error){
|
|
goto out;
|
|
}
|
|
}
|
|
if(advice == MADV_DODUMP){
|
|
error = change_attr_process_memory_range(thread->vm, start, end,
|
|
clear_memory_range_flag,
|
|
VR_DONTDUMP);
|
|
if(error){
|
|
goto out;
|
|
}
|
|
}
|
|
if(advice == MADV_DONTFORK ||
|
|
advice == MADV_DOFORK){
|
|
error = syscall_generic_forwarding(__NR_madvise, ctx);
|
|
}
|
|
if (advice == MADV_WIPEONFORK) {
|
|
error = change_attr_process_memory_range(
|
|
thread->vm, start, end,
|
|
set_memory_range_flag,
|
|
VR_WIPEONFORK);
|
|
if (error) {
|
|
goto out;
|
|
}
|
|
}
|
|
if (advice == MADV_KEEPONFORK) {
|
|
error = change_attr_process_memory_range(
|
|
thread->vm, start, end,
|
|
clear_memory_range_flag,
|
|
VR_WIPEONFORK);
|
|
if (error) {
|
|
goto out;
|
|
}
|
|
}
|
|
|
|
error = 0;
|
|
out:
|
|
ihk_rwspinlock_write_unlock_noirq(&thread->vm->memory_range_lock);
|
|
|
|
out2:
|
|
dkprintf("[%d]sys_madvise(%lx,%lx,%x): %d\n",
|
|
ihk_mc_get_processor_id(), start, len0, advice, error);
|
|
return error;
|
|
}
|
|
|
|
struct kshmid_ds {
|
|
int destroy;
|
|
int padding;
|
|
struct shmobj *obj;
|
|
struct memobj *memobj;
|
|
struct list_head chain;
|
|
};
|
|
|
|
unsigned long shmid_index[512];
|
|
|
|
static int get_shmid_max_index(void)
|
|
{
|
|
int i;
|
|
int index = -1;
|
|
|
|
for (i = 511; i >= 0; i--) {
|
|
if (shmid_index[i]) {
|
|
index = i * 64 + 63 - __builtin_clzl(shmid_index[i]);
|
|
break;
|
|
}
|
|
}
|
|
return index;
|
|
}
|
|
|
|
static int get_shmid_index(void)
|
|
{
|
|
int index = get_shmid_max_index();
|
|
int i;
|
|
unsigned long x;
|
|
|
|
for (index = 0;; index++) {
|
|
i = index / 64;
|
|
x = 1UL << (index % 64);
|
|
if (!(shmid_index[i] & x)) {
|
|
shmid_index[i] |= x;
|
|
break;
|
|
}
|
|
}
|
|
return index;
|
|
}
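/*
 * Illustrative sketch only: get_shmid_index() above allocates the lowest
 * clear bit in the shmid_index[] bitmap.  A release operation (hypothetical
 * helper, not used by the code in this file) would clear the same bit:
 */
static void __attribute__((unused))
example_put_shmid_index(int index)
{
	/* word = index / 64, mask = bit position within that word */
	shmid_index[index / 64] &= ~(1UL << (index % 64));
}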
|
|
|
|
LIST_HEAD(kds_list);
|
|
struct shminfo the_shminfo = {
|
|
.shmmax = 64L * 1024 * 1024 * 1024,
|
|
.shmmin = 1,
|
|
.shmmni = 4 * 1024,
|
|
.shmall = 4L * 1024 * 1024 * 1024,
|
|
};
|
|
struct shm_info the_shm_info = { 0, };
|
|
|
|
static int make_shmid(struct shmobj *obj)
|
|
{
|
|
return ((int)obj->index << 16) | obj->ds.shm_perm.seq;
|
|
} /* make_shmid() */
|
|
|
|
static int shmid_to_index(int shmid)
|
|
{
|
|
return (shmid >> 16);
|
|
} /* shmid_to_index() */
|
|
|
|
static int shmid_to_seq(int shmid)
|
|
{
|
|
return (shmid & ((1 << 16) - 1));
|
|
} /* shmid_to_seq() */
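/*
 * Illustrative sketch: a shmid packs the slot index into the upper bits and
 * the per-slot sequence number into the low 16 bits, so both can be
 * recovered independently.  The hypothetical, unused check below just
 * demonstrates the round trip with the helpers above.
 */
static int __attribute__((unused))
example_shmid_roundtrip(int index, int seq)
{
	int shmid = (index << 16) | (seq & ((1 << 16) - 1));

	/* e.g. index=3, seq=5 -> shmid=0x30005 -> (3, 5) again */
	return (shmid_to_index(shmid) == index) &&
	       (shmid_to_seq(shmid) == (seq & ((1 << 16) - 1)));
}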
|
|
|
|
int shmobj_list_lookup(int shmid, struct shmobj **objp)
|
|
{
|
|
int index;
|
|
int seq;
|
|
struct shmobj *obj;
|
|
|
|
index = shmid_to_index(shmid);
|
|
seq = shmid_to_seq(shmid);
|
|
|
|
list_for_each_entry(obj, &kds_list, chain) {
|
|
if (obj->index == index) {
|
|
break;
|
|
}
|
|
}
|
|
if (&obj->chain == &kds_list) {
|
|
return -EINVAL;
|
|
}
|
|
if (obj->ds.shm_perm.seq != seq) {
|
|
return -EIDRM;
|
|
}
|
|
|
|
memobj_ref(&obj->memobj);
|
|
*objp = obj;
|
|
return 0;
|
|
} /* shmobj_list_lookup() */
|
|
|
|
int shmobj_list_lookup_by_key(key_t key, struct shmobj **objp)
|
|
{
|
|
struct shmobj *obj;
|
|
|
|
list_for_each_entry(obj, &kds_list, chain) {
|
|
if (obj->ds.shm_perm.key == key &&
|
|
!(obj->ds.shm_perm.mode & SHM_DEST)) {
|
|
break;
|
|
}
|
|
}
|
|
if (&obj->chain == &kds_list) {
|
|
return -EINVAL;
|
|
}
|
|
|
|
memobj_ref(&obj->memobj);
|
|
*objp = obj;
|
|
return 0;
|
|
} /* shmobj_list_lookup_by_key() */
|
|
|
|
int shmobj_list_lookup_by_index(int index, struct shmobj **objp)
|
|
{
|
|
struct shmobj *obj;
|
|
|
|
list_for_each_entry(obj, &kds_list, chain) {
|
|
if (obj->index == index) {
|
|
break;
|
|
}
|
|
}
|
|
if (&obj->chain == &kds_list) {
|
|
return -EINVAL;
|
|
}
|
|
|
|
memobj_ref(&obj->memobj);
|
|
*objp = obj;
|
|
return 0;
|
|
} /* shmobj_list_lookup_by_index() */
|
|
|
|
int do_shmget(const key_t key, const size_t size, const int shmflg)
|
|
{
|
|
struct thread *thread = cpu_local_var(current);
|
|
struct process *proc = thread->proc;
|
|
time_t now = time();
|
|
int shmid;
|
|
int error;
|
|
struct shmid_ds ads;
|
|
struct shmobj *obj;
|
|
int pgshift;
|
|
|
|
dkprintf("do_shmget(%#lx,%#lx,%#x)\n", key, size, shmflg);
|
|
|
|
if (size < the_shminfo.shmmin) {
|
|
dkprintf("do_shmget(%#lx,%#lx,%#x): -EINVAL\n", key, size, shmflg);
|
|
return -EINVAL;
|
|
}
|
|
|
|
shmobj_list_lock();
|
|
obj = NULL;
|
|
if (key != IPC_PRIVATE) {
|
|
error = shmobj_list_lookup_by_key(key, &obj);
|
|
if (error == -EINVAL) {
|
|
obj = NULL;
|
|
}
|
|
else if (error) {
|
|
shmobj_list_unlock();
|
|
dkprintf("do_shmget(%#lx,%#lx,%#x): lookup: %d\n", key, size, shmflg, error);
|
|
return error;
|
|
}
|
|
if (!obj && !(shmflg & IPC_CREAT)) {
|
|
shmobj_list_unlock();
|
|
dkprintf("do_shmget(%#lx,%#lx,%#x): -ENOENT\n", key, size, shmflg);
|
|
return -ENOENT;
|
|
}
|
|
if (obj && (shmflg & IPC_CREAT) && (shmflg & IPC_EXCL)) {
|
|
shmobj_list_unlock();
|
|
memobj_unref(&obj->memobj);
|
|
dkprintf("do_shmget(%#lx,%#lx,%#x): -EEXIST\n", key, size, shmflg);
|
|
return -EEXIST;
|
|
}
|
|
}
|
|
|
|
if (obj) {
|
|
if (proc->euid) {
|
|
int req;
|
|
|
|
req = (shmflg | (shmflg << 3) | (shmflg << 6)) & 0700;
|
|
if ((obj->ds.shm_perm.uid == proc->euid)
|
|
|| (obj->ds.shm_perm.cuid == proc->euid)) {
|
|
/* nothing to do */
|
|
}
|
|
else if ((obj->ds.shm_perm.gid == proc->egid)
|
|
|| (obj->ds.shm_perm.cgid == proc->egid)) {
|
|
/*
|
|
* XXX: need to check supplementary group IDs
|
|
*/
|
|
req >>= 3;
|
|
}
|
|
else {
|
|
req >>= 6;
|
|
}
|
|
if (req & ~obj->ds.shm_perm.mode) {
|
|
shmobj_list_unlock();
|
|
memobj_unref(&obj->memobj);
|
|
dkprintf("do_shmget(%#lx,%#lx,%#x): -EINVAL\n", key, size, shmflg);
|
|
return -EACCES;
|
|
}
|
|
}
|
|
if (obj->ds.shm_segsz < size) {
|
|
shmobj_list_unlock();
|
|
memobj_unref(&obj->memobj);
|
|
dkprintf("do_shmget(%#lx,%#lx,%#x): -EINVAL\n", key, size, shmflg);
|
|
return -EINVAL;
|
|
}
|
|
shmid = make_shmid(obj);
|
|
shmobj_list_unlock();
|
|
memobj_unref(&obj->memobj);
|
|
dkprintf("do_shmget(%#lx,%#lx,%#x): %d\n", key, size, shmflg, shmid);
|
|
return shmid;
|
|
}
|
|
|
|
if (the_shm_info.used_ids >= the_shminfo.shmmni) {
|
|
shmobj_list_unlock();
|
|
dkprintf("do_shmget(%#lx,%#lx,%#x): -ENOSPC\n", key, size, shmflg);
|
|
return -ENOSPC;
|
|
}
|
|
|
|
if (shmflg & SHM_HUGETLB) {
|
|
pgshift = (shmflg >> SHM_HUGE_SHIFT) & 0x3F;
|
|
if (!pgshift) {
|
|
pgshift = ihk_mc_get_linux_default_huge_page_shift();
|
|
}
|
|
} else if (proc->thp_disable) {
|
|
pgshift = PAGE_SHIFT;
|
|
} else {
|
|
/* transparent huge page */
|
|
size_t pgsize;
|
|
int p2align;
|
|
|
|
if (size > PAGE_SIZE) {
|
|
error = arch_get_smaller_page_size(NULL, size + 1,
|
|
&pgsize, &p2align);
|
|
if (error) {
|
|
ekprintf("%s: WARNING: arch_get_smaller_page_size failed. size: %ld, error: %d\n",
|
|
__func__, size, error);
|
|
pgshift = PAGE_SHIFT;
|
|
} else {
|
|
pgshift = p2align + PAGE_SHIFT;
|
|
}
|
|
} else {
|
|
pgshift = PAGE_SHIFT;
|
|
}
|
|
}
|
|
|
|
memset(&ads, 0, sizeof(ads));
|
|
ads.shm_perm.key = key;
|
|
ads.shm_perm.uid = proc->euid;
|
|
ads.shm_perm.cuid = proc->euid;
|
|
ads.shm_perm.gid = proc->egid;
|
|
ads.shm_perm.cgid = proc->egid;
|
|
ads.shm_perm.mode = shmflg & 0777;
|
|
ads.shm_segsz = size;
|
|
ads.shm_ctime = now;
|
|
ads.shm_cpid = proc->pid;
|
|
ads.init_pgshift = pgshift;
|
|
|
|
error = shmobj_create_indexed(&ads, &obj);
|
|
if (error) {
|
|
shmobj_list_unlock();
|
|
dkprintf("do_shmget(%#lx,%#lx,%#x): shmobj_create: %d\n", key, size, shmflg, error);
|
|
return error;
|
|
}
|
|
|
|
obj->index = get_shmid_index();
|
|
|
|
list_add(&obj->chain, &kds_list);
|
|
++the_shm_info.used_ids;
|
|
|
|
shmid = make_shmid(obj);
|
|
shmobj_list_unlock();
|
|
|
|
dkprintf("do_shmget(%#lx,%#lx,%#x): %d\n", key, size, shmflg, shmid);
|
|
return shmid;
|
|
} /* do_shmget() */
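/*
 * Illustrative sketch of the System V IPC permission check above: the
 * requested access bits are compared against the owner (0700), group
 * (0070), or other (0007) class of shm_perm.mode, chosen by which
 * credentials match.  do_shmget() normalizes the request to the owner
 * class and shifts it right; shmat() builds it in the other class and
 * shifts it left.  Hypothetical helper (mirroring the do_shmget()
 * direction), not called anywhere in this file.
 */
static int __attribute__((unused))
example_ipc_perm_class(int req_owner_bits, int uid_matches, int gid_matches)
{
	int req = req_owner_bits & 0700;

	if (uid_matches)
		return req;		/* compare against owner bits */
	if (gid_matches)
		return req >> 3;	/* compare against group bits */
	return req >> 6;		/* compare against other bits */
}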
|
|
|
|
SYSCALL_DECLARE(shmat)
|
|
{
|
|
const int shmid = ihk_mc_syscall_arg0(ctx);
|
|
void * const shmaddr = (void *)ihk_mc_syscall_arg1(ctx);
|
|
const int shmflg = ihk_mc_syscall_arg2(ctx);
|
|
struct thread *thread = cpu_local_var(current);
|
|
struct process *proc = thread->proc;
|
|
struct process_vm *vm = thread->vm;
|
|
size_t len;
|
|
int error;
|
|
uintptr_t addr;
|
|
int prot;
|
|
int vrflags;
|
|
int req;
|
|
struct shmobj *obj;
|
|
size_t pgsize;
|
|
|
|
dkprintf("shmat(%#x,%p,%#x)\n", shmid, shmaddr, shmflg);
|
|
|
|
shmobj_list_lock();
|
|
error = shmobj_list_lookup(shmid, &obj);
|
|
if (error) {
|
|
shmobj_list_unlock();
|
|
dkprintf("shmat(%#x,%p,%#x): lookup: %d\n", shmid, shmaddr, shmflg, error);
|
|
return error;
|
|
}
|
|
|
|
pgsize = (size_t)1 << obj->pgshift;
|
|
if (shmaddr && ((uintptr_t)shmaddr & (pgsize - 1)) && !(shmflg & SHM_RND)) {
|
|
shmobj_list_unlock();
|
|
memobj_unref(&obj->memobj);
|
|
dkprintf("shmat(%#x,%p,%#x): -EINVAL\n", shmid, shmaddr, shmflg);
|
|
return -EINVAL;
|
|
}
|
|
addr = (uintptr_t)shmaddr & ~(pgsize - 1);
|
|
len = obj->real_segsz;
|
|
|
|
prot = PROT_READ;
|
|
req = 4;
|
|
if (!(shmflg & SHM_RDONLY)) {
|
|
prot |= PROT_WRITE;
|
|
req |= 2;
|
|
}
|
|
|
|
if (!proc->euid) {
|
|
req = 0;
|
|
}
|
|
else if ((proc->euid == obj->ds.shm_perm.uid)
|
|
|| (proc->euid == obj->ds.shm_perm.cuid)) {
|
|
req <<= 6;
|
|
}
|
|
else if ((proc->egid == obj->ds.shm_perm.gid)
|
|
|| (proc->egid == obj->ds.shm_perm.cgid)) {
|
|
req <<= 3;
|
|
}
|
|
else {
|
|
req <<= 0;
|
|
}
|
|
if (~obj->ds.shm_perm.mode & req) {
|
|
shmobj_list_unlock();
|
|
memobj_unref(&obj->memobj);
|
|
dkprintf("shmat(%#x,%p,%#x): -EINVAL\n", shmid, shmaddr, shmflg);
|
|
return -EACCES;
|
|
}
|
|
|
|
ihk_rwspinlock_write_lock_noirq(&vm->memory_range_lock);
|
|
|
|
if (addr) {
|
|
if (lookup_process_memory_range(vm, addr, addr+len)) {
|
|
ihk_rwspinlock_write_unlock_noirq(&vm->memory_range_lock);
|
|
shmobj_list_unlock();
|
|
memobj_unref(&obj->memobj);
|
|
dkprintf("shmat(%#x,%p,%#x):lookup_process_memory_range succeeded. -ENOMEM\n", shmid, shmaddr, shmflg);
|
|
return -ENOMEM;
|
|
}
|
|
}
|
|
else {
|
|
error = search_free_space(len, obj->pgshift, &addr);
|
|
if (error) {
|
|
ihk_rwspinlock_write_unlock_noirq(&vm->memory_range_lock);
|
|
shmobj_list_unlock();
|
|
memobj_unref(&obj->memobj);
|
|
dkprintf("shmat(%#x,%p,%#x):search_free_space failed. %d\n", shmid, shmaddr, shmflg, error);
|
|
return error;
|
|
}
|
|
}
|
|
|
|
vrflags = VR_NONE;
|
|
vrflags |= VR_DEMAND_PAGING;
|
|
vrflags |= PROT_TO_VR_FLAG(prot);
|
|
vrflags |= VRFLAG_PROT_TO_MAXPROT(vrflags);
|
|
|
|
if (!(prot & PROT_WRITE)) {
|
|
error = set_host_vma(addr, len, PROT_READ | PROT_EXEC, 1/* holding memory_range_lock */);
|
|
if (error) {
|
|
ihk_rwspinlock_write_unlock_noirq(&vm->memory_range_lock);
|
|
shmobj_list_unlock();
|
|
memobj_unref(&obj->memobj);
|
|
dkprintf("shmat(%#x,%p,%#x):set_host_vma failed. %d\n", shmid, shmaddr, shmflg, error);
|
|
return error;
|
|
}
|
|
}
|
|
|
|
error = add_process_memory_range(vm, addr, addr+len, -1,
|
|
vrflags, &obj->memobj, 0, obj->pgshift, NULL, NULL);
|
|
if (error) {
|
|
if (!(prot & PROT_WRITE)) {
|
|
(void)set_host_vma(addr, len, PROT_READ | PROT_WRITE | PROT_EXEC, 1/* holding memory_range_lock */);
|
|
}
|
|
memobj_unref(&obj->memobj);
|
|
ihk_rwspinlock_write_unlock_noirq(&vm->memory_range_lock);
|
|
shmobj_list_unlock();
|
|
dkprintf("shmat(%#x,%p,%#x):add_process_memory_range failed. %d\n", shmid, shmaddr, shmflg, error);
|
|
return error;
|
|
}
|
|
|
|
ihk_rwspinlock_write_unlock_noirq(&vm->memory_range_lock);
|
|
shmobj_list_unlock();
|
|
|
|
dkprintf("shmat(%#x,%p,%#x): 0x%lx. %d\n", shmid, shmaddr, shmflg, addr);
|
|
return addr;
|
|
} /* sys_shmat() */
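/*
 * Illustrative sketch: with SHM_RND, shmat() above rounds the user-supplied
 * address down to the segment's page size by masking off the low bits.
 * Hypothetical helper, shown only to make the arithmetic explicit; it is
 * not called by the handler above.
 */
static uintptr_t __attribute__((unused))
example_shm_rnd(uintptr_t shmaddr, int pgshift)
{
	size_t pgsize = (size_t)1 << pgshift;

	/* e.g. shmaddr=0x401234, pgshift=12 (4 KiB pages) -> 0x401000 */
	return shmaddr & ~(pgsize - 1);
}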
|
|
|
|
SYSCALL_DECLARE(shmctl)
|
|
{
|
|
const int shmid = ihk_mc_syscall_arg0(ctx);
|
|
const int cmd = ihk_mc_syscall_arg1(ctx);
|
|
struct shmid_ds * const buf = (void *)ihk_mc_syscall_arg2(ctx);
|
|
struct thread *thread = cpu_local_var(current);
|
|
struct process *proc = thread->proc;
|
|
int error;
|
|
struct shmid_ds ads;
|
|
time_t now = time();
|
|
int req;
|
|
int maxi;
|
|
struct shmobj *obj;
|
|
struct rlimit *rlim;
|
|
size_t size;
|
|
struct shmlock_user *user;
|
|
uid_t ruid = proc->ruid;
|
|
uint16_t oldmode;
|
|
|
|
dkprintf("shmctl(%#x,%d,%p)\n", shmid, cmd, buf);
|
|
switch (cmd) {
|
|
case IPC_RMID:
|
|
shmobj_list_lock();
|
|
error = shmobj_list_lookup(shmid, &obj);
|
|
if (error) {
|
|
shmobj_list_unlock();
|
|
dkprintf("shmctl(%#x,%d,%p): lookup: %d\n", shmid, cmd, buf, error);
|
|
return error;
|
|
}
|
|
if (!has_cap_sys_admin(thread)
|
|
&& (obj->ds.shm_perm.uid != proc->euid)
|
|
&& (obj->ds.shm_perm.cuid != proc->euid)) {
|
|
shmobj_list_unlock();
|
|
memobj_unref(&obj->memobj);
|
|
dkprintf("shmctl(%#x,%d,%p): -EPERM\n", shmid, cmd, buf);
|
|
return -EPERM;
|
|
}
|
|
oldmode = obj->ds.shm_perm.mode;
|
|
obj->ds.shm_perm.mode |= SHM_DEST;
|
|
shmobj_list_unlock();
|
|
// unref twice if this is the first time rmid is called
|
|
if (!(oldmode & SHM_DEST))
|
|
memobj_unref(&obj->memobj);
|
|
memobj_unref(&obj->memobj);
|
|
|
|
dkprintf("shmctl(%#x,%d,%p): 0\n", shmid, cmd, buf);
|
|
return 0;
|
|
case IPC_SET:
|
|
shmobj_list_lock();
|
|
error = shmobj_list_lookup(shmid, &obj);
|
|
if (error) {
|
|
shmobj_list_unlock();
|
|
dkprintf("shmctl(%#x,%d,%p): lookup: %d\n", shmid, cmd, buf, error);
|
|
return error;
|
|
}
|
|
if ((obj->ds.shm_perm.uid != proc->euid)
|
|
&& (obj->ds.shm_perm.cuid != proc->euid)) {
|
|
shmobj_list_unlock();
|
|
memobj_unref(&obj->memobj);
|
|
dkprintf("shmctl(%#x,%d,%p): -EPERM\n", shmid, cmd, buf);
|
|
return -EPERM;
|
|
}
|
|
error = copy_from_user(&ads, buf, sizeof(ads));
|
|
if (error) {
|
|
shmobj_list_unlock();
|
|
memobj_unref(&obj->memobj);
|
|
dkprintf("shmctl(%#x,%d,%p): %d\n", shmid, cmd, buf, error);
|
|
return error;
|
|
}
|
|
obj->ds.shm_perm.uid = ads.shm_perm.uid;
|
|
obj->ds.shm_perm.gid = ads.shm_perm.gid;
|
|
obj->ds.shm_perm.mode &= ~0777;
|
|
obj->ds.shm_perm.mode |= ads.shm_perm.mode & 0777;
|
|
obj->ds.shm_ctime = now;
|
|
|
|
shmobj_list_unlock();
|
|
memobj_unref(&obj->memobj);
|
|
dkprintf("shmctl(%#x,%d,%p): 0\n", shmid, cmd, buf);
|
|
return 0;
|
|
case IPC_STAT:
|
|
case SHM_STAT:
|
|
shmobj_list_lock();
|
|
if (cmd == IPC_STAT) {
|
|
error = shmobj_list_lookup(shmid, &obj);
|
|
} else { // SHM_STAT
|
|
error = shmobj_list_lookup_by_index(shmid, &obj);
|
|
}
|
|
if (error) {
|
|
shmobj_list_unlock();
|
|
dkprintf("shmctl(%#x,%d,%p): lookup: %d\n", shmid, cmd, buf, error);
|
|
return error;
|
|
}
|
|
|
|
if (cmd == IPC_STAT) {
|
|
if (!proc->euid) {
|
|
req = 0;
|
|
} else if ((proc->euid == obj->ds.shm_perm.uid) ||
|
|
(proc->euid == obj->ds.shm_perm.cuid)) {
|
|
req = 0400;
|
|
} else if ((proc->egid == obj->ds.shm_perm.gid) ||
|
|
(proc->egid == obj->ds.shm_perm.cgid)) {
|
|
req = 0040;
|
|
} else {
|
|
req = 0004;
|
|
}
|
|
if (req & ~obj->ds.shm_perm.mode) {
|
|
shmobj_list_unlock();
|
|
memobj_unref(&obj->memobj);
|
|
dkprintf("shmctl(%#x,%d,%p): -EACCES\n", shmid,
|
|
cmd, buf);
|
|
return -EACCES;
|
|
}
|
|
}
|
|
|
|
/* This could potentially be higher than required if some other
|
|
* thread holds a ref at this point.
|
|
* Minus one here is because we hold a ref...
|
|
*/
|
|
obj->ds.shm_nattch = ihk_atomic_read(&obj->memobj.refcnt) - 1;
|
|
/* ... And one for sentinel unless RMID has been called */
|
|
if (!(obj->ds.shm_perm.mode & SHM_DEST)) {
|
|
obj->ds.shm_nattch--;
|
|
}
|
|
|
|
error = copy_to_user(buf, &obj->ds, sizeof(*buf));
|
|
if (error) {
|
|
shmobj_list_unlock();
|
|
memobj_unref(&obj->memobj);
|
|
dkprintf("shmctl(%#x,%d,%p): %d\n", shmid, cmd, buf, error);
|
|
return error;
|
|
}
|
|
shmobj_list_unlock();
|
|
memobj_unref(&obj->memobj);
|
|
dkprintf("shmctl(%#x,%d,%p): 0\n", shmid, cmd, buf);
|
|
return 0;
|
|
case IPC_INFO:
|
|
shmobj_list_lock();
|
|
error = shmobj_list_lookup(shmid, &obj);
|
|
if (error) {
|
|
shmobj_list_unlock();
|
|
dkprintf("shmctl(%#x,%d,%p): lookup: %d\n", shmid, cmd, buf, error);
|
|
return error;
|
|
}
|
|
error = copy_to_user(buf, &the_shminfo, sizeof(the_shminfo));
|
|
if (error) {
|
|
shmobj_list_unlock();
|
|
memobj_unref(&obj->memobj);
|
|
dkprintf("shmctl(%#x,%d,%p): %d\n", shmid, cmd, buf, error);
|
|
return error;
|
|
}
|
|
|
|
maxi = get_shmid_max_index();
|
|
if (maxi < 0) {
|
|
maxi = 0;
|
|
}
|
|
shmobj_list_unlock();
|
|
memobj_unref(&obj->memobj);
|
|
dkprintf("shmctl(%#x,%d,%p): %d\n", shmid, cmd, buf, maxi);
|
|
return maxi;
|
|
case SHM_LOCK:
|
|
shmobj_list_lock();
|
|
error = shmobj_list_lookup(shmid, &obj);
|
|
if (error) {
|
|
shmobj_list_unlock();
|
|
dkprintf("shmctl(%#x,%d,%p): lookup: %d\n", shmid, cmd, buf, error);
|
|
return error;
|
|
}
|
|
if (!has_cap_ipc_lock(thread)
|
|
&& (obj->ds.shm_perm.cuid != proc->euid)
|
|
&& (obj->ds.shm_perm.uid != proc->euid)) {
|
|
shmobj_list_unlock();
|
|
memobj_unref(&obj->memobj);
|
|
dkprintf("shmctl(%#x,%d,%p): perm shm: %d\n", shmid, cmd, buf, error);
|
|
return -EPERM;
|
|
}
|
|
rlim = &proc->rlimit[MCK_RLIMIT_MEMLOCK];
|
|
if (!rlim->rlim_cur && !has_cap_ipc_lock(thread)) {
|
|
shmobj_list_unlock();
|
|
memobj_unref(&obj->memobj);
|
|
dkprintf("shmctl(%#x,%d,%p): perm proc: %d\n", shmid, cmd, buf, error);
|
|
return -EPERM;
|
|
}
|
|
if (!(obj->ds.shm_perm.mode & SHM_LOCKED)
|
|
&& ((obj->pgshift == 0)
|
|
|| (obj->pgshift == PAGE_SHIFT))) {
|
|
shmlock_users_lock();
|
|
error = shmlock_user_get(ruid, &user);
|
|
if (error) {
|
|
shmlock_users_unlock();
|
|
memobj_unref(&obj->memobj);
|
|
shmobj_list_unlock();
|
|
ekprintf("shmctl(%#x,%d,%p): user lookup: %d\n", shmid, cmd, buf, error);
|
|
return -ENOMEM;
|
|
}
|
|
size = obj->real_segsz;
|
|
if (!has_cap_ipc_lock(thread)
|
|
&& (rlim->rlim_cur != (rlim_t)-1)
|
|
&& ((rlim->rlim_cur < user->locked)
|
|
|| ((rlim->rlim_cur - user->locked) < size))) {
|
|
shmlock_users_unlock();
|
|
memobj_unref(&obj->memobj);
|
|
shmobj_list_unlock();
|
|
dkprintf("shmctl(%#x,%d,%p): too large: %d\n", shmid, cmd, buf, error);
|
|
return -ENOMEM;
|
|
}
|
|
obj->ds.shm_perm.mode |= SHM_LOCKED;
|
|
obj->user = user;
|
|
user->locked += size;
|
|
shmlock_users_unlock();
|
|
}
|
|
shmobj_list_unlock();
|
|
memobj_unref(&obj->memobj);
|
|
|
|
dkprintf("shmctl(%#x,%d,%p): 0\n", shmid, cmd, buf);
|
|
return 0;
|
|
case SHM_UNLOCK:
|
|
shmobj_list_lock();
|
|
error = shmobj_list_lookup(shmid, &obj);
|
|
if (error) {
|
|
shmobj_list_unlock();
|
|
dkprintf("shmctl(%#x,%d,%p): lookup: %d\n", shmid, cmd, buf, error);
|
|
return error;
|
|
}
|
|
if (!has_cap_ipc_lock(thread)
|
|
&& (obj->ds.shm_perm.cuid != proc->euid)
|
|
&& (obj->ds.shm_perm.uid != proc->euid)) {
|
|
shmobj_list_unlock();
|
|
memobj_unref(&obj->memobj);
|
|
dkprintf("shmctl(%#x,%d,%p): perm shm: %d\n", shmid, cmd, buf, error);
|
|
return -EPERM;
|
|
}
|
|
if ((obj->ds.shm_perm.mode & SHM_LOCKED)
|
|
&& ((obj->pgshift == 0)
|
|
|| (obj->pgshift == PAGE_SHIFT))) {
|
|
size = obj->real_segsz;
|
|
shmlock_users_lock();
|
|
user = obj->user;
|
|
obj->user = NULL;
|
|
user->locked -= size;
|
|
if (!user->locked) {
|
|
shmlock_user_free(user);
|
|
}
|
|
shmlock_users_unlock();
|
|
obj->ds.shm_perm.mode &= ~SHM_LOCKED;
|
|
}
|
|
shmobj_list_unlock();
|
|
memobj_unref(&obj->memobj);
|
|
dkprintf("shmctl(%#x,%d,%p): 0\n", shmid, cmd, buf);
|
|
return 0;
|
|
case SHM_INFO:
|
|
shmobj_list_lock();
|
|
error = copy_to_user(buf, &the_shm_info, sizeof(the_shm_info));
|
|
if (error) {
|
|
shmobj_list_unlock();
|
|
dkprintf("shmctl(%#x,%d,%p): %d\n", shmid, cmd, buf, error);
|
|
return error;
|
|
}
|
|
|
|
maxi = get_shmid_max_index();
|
|
if (maxi < 0) {
|
|
maxi = 0;
|
|
}
|
|
shmobj_list_unlock();
|
|
dkprintf("shmctl(%#x,%d,%p): %d\n", shmid, cmd, buf, maxi);
|
|
return maxi;
|
|
default:
|
|
dkprintf("shmctl(%#x,%d,%p): EINVAL\n", shmid, cmd, buf);
|
|
return -EINVAL;
|
|
}
|
|
} /* sys_shmctl() */
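/*
 * Illustrative sketch of the shm_nattch accounting performed for
 * IPC_STAT/SHM_STAT above: the attach count is derived from the memobj
 * reference count, minus the reference taken by the lookup itself and,
 * unless IPC_RMID has already marked the segment (SHM_DEST), minus the
 * list's sentinel reference.  Hypothetical helper, not used by the code.
 */
static int __attribute__((unused))
example_shm_nattch(int refcnt, int shm_dest_set)
{
	int nattch = refcnt - 1;	/* drop our own lookup reference */

	if (!shm_dest_set)
		nattch--;		/* drop the sentinel reference */
	return nattch;
}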
|
|
|
|
SYSCALL_DECLARE(shmdt)
|
|
{
|
|
void * const shmaddr = (void *)ihk_mc_syscall_arg0(ctx);
|
|
struct thread *thread = cpu_local_var(current);
|
|
struct process_vm *vm = thread->vm;
|
|
struct vm_range *range;
|
|
int error;
|
|
|
|
dkprintf("shmdt(%p)\n", shmaddr);
|
|
ihk_rwspinlock_write_lock_noirq(&vm->memory_range_lock);
|
|
range = lookup_process_memory_range(vm, (uintptr_t)shmaddr, (uintptr_t)shmaddr+1);
|
|
if (!range || (range->start != (uintptr_t)shmaddr) || !range->memobj
|
|
|| !(range->memobj->flags & MF_SHMDT_OK)) {
|
|
ihk_rwspinlock_write_unlock_noirq(&vm->memory_range_lock);
|
|
dkprintf("shmdt(%p): -EINVAL\n", shmaddr);
|
|
return -EINVAL;
|
|
}
|
|
|
|
error = do_munmap((void *)range->start, (range->end - range->start), 1/* holding memory_range_lock */);
|
|
if (error) {
|
|
ihk_rwspinlock_write_unlock_noirq(&vm->memory_range_lock);
|
|
dkprintf("shmdt(%p): %d\n", shmaddr, error);
|
|
return error;
|
|
}
|
|
|
|
ihk_rwspinlock_write_unlock_noirq(&vm->memory_range_lock);
|
|
dkprintf("shmdt(%p): 0\n", shmaddr);
|
|
return 0;
|
|
} /* sys_shmdt() */
|
|
|
|
long do_futex(int n, unsigned long arg0, unsigned long arg1,
|
|
unsigned long arg2, unsigned long arg3,
|
|
unsigned long arg4, unsigned long arg5,
|
|
unsigned long _uti_clv,
|
|
void *uti_futex_resp,
|
|
void *_linux_wait_event,
|
|
void *_linux_printk,
|
|
void *_linux_clock_gettime)
|
|
{
|
|
struct cpu_local_var *uti_clv = (struct cpu_local_var *)_uti_clv;
|
|
uint64_t timeout = 0; // No timeout
|
|
uint32_t val2 = 0;
|
|
// Only one clock is used, ignore FUTEX_CLOCK_REALTIME
|
|
//int futex_clock_realtime = 0;
|
|
int fshared = 1;
|
|
int ret = 0;
|
|
|
|
uint32_t *uaddr = (uint32_t *)arg0;
|
|
int op = (int)arg1;
|
|
uint32_t val = (uint32_t)arg2;
|
|
struct timespec *utime = (struct timespec*)arg3;
|
|
uint32_t *uaddr2 = (uint32_t *)arg4;
|
|
uint32_t val3 = (uint32_t)arg5;
|
|
int flags = op;
|
|
|
|
|
|
/* TODO: replace these with passing via struct smp_boot_param */
|
|
if (_linux_printk && !linux_printk) {
|
|
linux_printk = (int (*)(const char *fmt, ...))_linux_printk;
|
|
}
|
|
if (_linux_wait_event && !linux_wait_event) {
|
|
linux_wait_event = (long (*)(void *_resp, unsigned long nsec_timeout))_linux_wait_event;
|
|
}
|
|
if (_linux_clock_gettime && !linux_clock_gettime) {
|
|
linux_clock_gettime = (int (*)(clockid_t clk_id, struct timespec *tp))_linux_clock_gettime;
|
|
}
|
|
|
|
/* Fill in clv */
|
|
if (uti_clv) {
|
|
uti_clv->uti_futex_resp = uti_futex_resp;
|
|
}
|
|
|
|
/* monitor is per-cpu object */
|
|
if (!uti_clv) {
|
|
struct ihk_os_cpu_monitor *monitor = cpu_local_var(monitor);
|
|
monitor->status = IHK_OS_MONITOR_KERNEL_HEAVY;
|
|
}
|
|
|
|
/* Cross-address space futex? */
|
|
if (op & FUTEX_PRIVATE_FLAG) {
|
|
fshared = 0;
|
|
}
|
|
op = (op & FUTEX_CMD_MASK);
|
|
|
|
dkprintf("futex op=[%x, %s],uaddr=%lx, val=%x, utime=%lx, uaddr2=%lx, val3=%x, []=%x, shared: %d\n",
|
|
flags,
|
|
(op == FUTEX_WAIT) ? "FUTEX_WAIT" :
|
|
(op == FUTEX_WAIT_BITSET) ? "FUTEX_WAIT_BITSET" :
|
|
(op == FUTEX_WAKE) ? "FUTEX_WAKE" :
|
|
(op == FUTEX_WAKE_OP) ? "FUTEX_WAKE_OP" :
|
|
(op == FUTEX_WAKE_BITSET) ? "FUTEX_WAKE_BITSET" :
|
|
(op == FUTEX_CMP_REQUEUE) ? "FUTEX_CMP_REQUEUE" :
|
|
(op == FUTEX_REQUEUE) ? "FUTEX_REQUEUE (NOT IMPL!)" : "unknown",
|
|
(unsigned long)uaddr, val, utime, uaddr2, val3, *uaddr, fshared);
|
|
|
|
if ((op == FUTEX_WAIT || op == FUTEX_WAIT_BITSET) && utime) {
|
|
dkprintf("%s: utime=%ld.%09ld\n",
|
|
__func__, utime->tv_sec, utime->tv_nsec);
|
|
}
|
|
if (utime && (op == FUTEX_WAIT_BITSET || op == FUTEX_WAIT)) {
|
|
unsigned long nsec_timeout;
|
|
if (!uti_clv) {
|
|
/* Use cycles for non-UTI case */
|
|
|
|
/* As per the Linux implementation FUTEX_WAIT specifies the duration of
|
|
* the timeout, while FUTEX_WAIT_BITSET specifies the absolute timestamp */
|
|
if (op == FUTEX_WAIT_BITSET) {
|
|
struct timespec ats;
|
|
|
|
if (!gettime_local_support ||
|
|
!(flags & FUTEX_CLOCK_REALTIME)) {
|
|
struct syscall_request request IHK_DMA_ALIGN;
|
|
struct timespec tv[2];
|
|
struct timespec *tv_now = tv;
|
|
|
|
if ((((unsigned long)tv) ^
|
|
((unsigned long)(tv + 1))) & PAGE_MASK)
|
|
tv_now = tv + 1;
|
|
|
|
request.number = n;
|
|
request.args[0] = virt_to_phys(tv_now);
|
|
request.args[1] = (flags & FUTEX_CLOCK_REALTIME)?
|
|
CLOCK_REALTIME: CLOCK_MONOTONIC;
|
|
|
|
int r = do_syscall(&request,
|
|
ihk_mc_get_processor_id());
|
|
|
|
if (r < 0) {
|
|
return -EFAULT;
|
|
}
|
|
|
|
ats.tv_sec = tv_now->tv_sec;
|
|
ats.tv_nsec = tv_now->tv_nsec;
|
|
}
|
|
/* Compute timeout based on TSC/nanosec ratio */
|
|
else {
|
|
calculate_time_from_tsc(&ats);
|
|
}
|
|
|
|
nsec_timeout = (utime->tv_sec * NS_PER_SEC + utime->tv_nsec) -
|
|
(ats.tv_sec * NS_PER_SEC + ats.tv_nsec);
|
|
}
|
|
else {
|
|
nsec_timeout = (utime->tv_sec * NS_PER_SEC + utime->tv_nsec);
|
|
}
|
|
timeout = nsec_timeout * 1000 / ihk_mc_get_ns_per_tsc();
|
|
|
|
}
|
|
else{
|
|
if (op == FUTEX_WAIT_BITSET) { /* User passed absolute time */
|
|
struct timespec ats;
|
|
ret = (*linux_clock_gettime)((flags & FUTEX_CLOCK_REALTIME) ? CLOCK_REALTIME: CLOCK_MONOTONIC, &ats);
|
|
if (ret) {
|
|
return ret;
|
|
}
|
|
dkprintf("%s: ats=%ld.%09ld\n",
|
|
__func__, ats.tv_sec, ats.tv_nsec);
|
|
/* Use nsec for UTI case */
|
|
timeout = (utime->tv_sec * NS_PER_SEC + utime->tv_nsec) -
|
|
(ats.tv_sec * NS_PER_SEC + ats.tv_nsec);
|
|
} else { /* User passed relative time */
|
|
/* Use nsec for UTI case */
|
|
timeout = (utime->tv_sec * NS_PER_SEC + utime->tv_nsec);
|
|
}
|
|
}
|
|
}
|
|
|
|
/* The 'utime' argument carries the requeue limit when op == FUTEX_CMP_REQUEUE
 * and the number of waiters to wake when op == FUTEX_WAKE_OP. */
|
|
if (op == FUTEX_CMP_REQUEUE || op == FUTEX_WAKE_OP)
|
|
val2 = (uint32_t) (unsigned long) arg3;
|
|
|
|
ret = futex(uaddr, op, val, timeout, uaddr2, val2, val3, fshared);
|
|
|
|
dkprintf("futex op=[%x, %s],uaddr=%lx, val=%x, utime=%lx, uaddr2=%lx, val3=%x, []=%x, shared: %d, ret: %d\n",
|
|
op,
|
|
(op == FUTEX_WAIT) ? "FUTEX_WAIT" :
|
|
(op == FUTEX_WAIT_BITSET) ? "FUTEX_WAIT_BITSET" :
|
|
(op == FUTEX_WAKE) ? "FUTEX_WAKE" :
|
|
(op == FUTEX_WAKE_OP) ? "FUTEX_WAKE_OP" :
|
|
(op == FUTEX_WAKE_BITSET) ? "FUTEX_WAKE_BITSET" :
|
|
(op == FUTEX_CMP_REQUEUE) ? "FUTEX_CMP_REQUEUE" :
|
|
(op == FUTEX_REQUEUE) ? "FUTEX_REQUEUE (NOT IMPL!)" : "unknown",
|
|
(unsigned long)uaddr, val, utime, uaddr2, val3, *uaddr, fshared, ret);
|
|
|
|
return ret;
|
|
}
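/*
 * Illustrative sketch of the timeout handling in do_futex() above:
 * FUTEX_WAIT carries a relative timeout while FUTEX_WAIT_BITSET carries an
 * absolute timestamp, so the latter is first converted to a remaining
 * duration in nanoseconds (which the non-UTI path then scales to TSC ticks
 * via ihk_mc_get_ns_per_tsc()).  Hypothetical helper, not called by
 * do_futex() itself.
 */
static uint64_t __attribute__((unused))
example_futex_nsec_timeout(int absolute, const struct timespec *utime,
			   const struct timespec *now)
{
	uint64_t nsec = utime->tv_sec * NS_PER_SEC + utime->tv_nsec;

	if (absolute)
		nsec -= now->tv_sec * NS_PER_SEC + now->tv_nsec;
	return nsec;
}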
|
|
|
|
SYSCALL_DECLARE(futex)
|
|
{
|
|
return do_futex(n, ihk_mc_syscall_arg0(ctx), ihk_mc_syscall_arg1(ctx),
|
|
ihk_mc_syscall_arg2(ctx), ihk_mc_syscall_arg3(ctx),
|
|
ihk_mc_syscall_arg4(ctx), ihk_mc_syscall_arg5(ctx),
|
|
0UL, NULL, NULL, NULL, NULL);
|
|
}
|
|
|
|
static void
|
|
do_exit(int code)
|
|
{
|
|
struct thread *thread = cpu_local_var(current);
|
|
struct thread *child;
|
|
struct process *proc = thread->proc;
|
|
struct mcs_rwlock_node_irqsave lock;
|
|
int nproc;
|
|
int exit_status = (code >> 8) & 255;
|
|
int sig = code & 255;
|
|
struct timespec ats;
|
|
|
|
dkprintf("sys_exit,pid=%d\n", proc->pid);
|
|
|
|
/* XXX: if all threads terminate via exit(2) rather than exit_group(2),
 * the exit(2) path should also be delegated.
 */
|
|
/* If there is a clear_child_tid address set, clear it and wake it.
|
|
* This unblocks any pthread_join() waiters. */
|
|
if (thread->clear_child_tid) {
|
|
|
|
dkprintf("exit clear_child!\n");
|
|
|
|
setint_user((int*)thread->clear_child_tid, 0);
|
|
barrier();
|
|
futex((uint32_t *)thread->clear_child_tid,
|
|
FUTEX_WAKE, 1, 0, NULL, 0, 0, 1);
|
|
thread->clear_child_tid = NULL;
|
|
}
|
|
|
|
mcs_rwlock_reader_lock(&proc->threads_lock, &lock);
|
|
nproc = 0;
|
|
list_for_each_entry(child, &proc->threads_list, siblings_list) {
|
|
if (child->status != PS_EXITED &&
|
|
child->status != PS_ZOMBIE)
|
|
nproc++;
|
|
}
|
|
|
|
if (nproc == 1) { // process has only one thread
|
|
mcs_rwlock_reader_unlock(&proc->threads_lock, &lock);
|
|
terminate(exit_status, sig);
|
|
return;
|
|
}
|
|
|
|
#ifdef DCFA_KMOD
|
|
do_mod_exit((int)ihk_mc_syscall_arg0(ctx));
|
|
#endif
|
|
|
|
if(proc->status == PS_EXITED){
|
|
mcs_rwlock_writer_unlock(&proc->threads_lock, &lock);
|
|
terminate(exit_status, 0);
|
|
return;
|
|
}
|
|
preempt_disable();
|
|
thread->exit_status = code;
|
|
thread->status = PS_EXITED;
|
|
tsc_to_ts(thread->user_tsc, &ats);
|
|
ts_add(&proc->utime, &ats);
|
|
tsc_to_ts(thread->system_tsc, &ats);
|
|
ts_add(&proc->stime, &ats);
|
|
thread->user_tsc = 0;
|
|
thread->system_tsc = 0;
|
|
thread_exit_signal(thread);
|
|
sync_child_event(thread->proc->monitoring_event);
|
|
mcs_rwlock_writer_unlock(&proc->threads_lock, &lock);
|
|
release_thread(thread);
|
|
preempt_enable();
|
|
|
|
schedule();
|
|
|
|
return;
|
|
}
|
|
|
|
SYSCALL_DECLARE(exit)
|
|
{
|
|
int exit_status = ((int)ihk_mc_syscall_arg0(ctx)) & 255;
|
|
|
|
do_exit(exit_status << 8);
|
|
return 0;
|
|
}
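/*
 * Illustrative sketch of the exit code encoding used by do_exit() above:
 * the low byte carries the terminating signal (0 for a plain exit) and the
 * next byte carries the exit status, which is why sys_exit() shifts the
 * user-supplied status left by 8.  Hypothetical helper, unused.
 */
static int __attribute__((unused))
example_encode_exit_code(int exit_status, int sig)
{
	/* e.g. exit(3) -> 0x0300; killed by SIGKILL (9) -> 0x0009 */
	return ((exit_status & 255) << 8) | (sig & 255);
}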
|
|
|
|
static int rlimits[] = {
|
|
#ifdef RLIMIT_AS
|
|
RLIMIT_AS, MCK_RLIMIT_AS,
|
|
#endif
|
|
#ifdef RLIMIT_CORE
|
|
RLIMIT_CORE, MCK_RLIMIT_CORE,
|
|
#endif
|
|
#ifdef RLIMIT_CPU
|
|
RLIMIT_CPU, MCK_RLIMIT_CPU,
|
|
#endif
|
|
#ifdef RLIMIT_DATA
|
|
RLIMIT_DATA, MCK_RLIMIT_DATA,
|
|
#endif
|
|
#ifdef RLIMIT_FSIZE
|
|
RLIMIT_FSIZE, MCK_RLIMIT_FSIZE,
|
|
#endif
|
|
#ifdef RLIMIT_LOCKS
|
|
RLIMIT_LOCKS, MCK_RLIMIT_LOCKS,
|
|
#endif
|
|
#ifdef RLIMIT_MEMLOCK
|
|
RLIMIT_MEMLOCK, MCK_RLIMIT_MEMLOCK,
|
|
#endif
|
|
#ifdef RLIMIT_MSGQUEUE
|
|
RLIMIT_MSGQUEUE,MCK_RLIMIT_MSGQUEUE,
|
|
#endif
|
|
#ifdef RLIMIT_NICE
|
|
RLIMIT_NICE, MCK_RLIMIT_NICE,
|
|
#endif
|
|
#ifdef RLIMIT_NOFILE
|
|
RLIMIT_NOFILE, MCK_RLIMIT_NOFILE,
|
|
#endif
|
|
#ifdef RLIMIT_NPROC
|
|
RLIMIT_NPROC, MCK_RLIMIT_NPROC,
|
|
#endif
|
|
#ifdef RLIMIT_RSS
|
|
RLIMIT_RSS, MCK_RLIMIT_RSS,
|
|
#endif
|
|
#ifdef RLIMIT_RTPRIO
|
|
RLIMIT_RTPRIO, MCK_RLIMIT_RTPRIO,
|
|
#endif
|
|
#ifdef RLIMIT_RTTIME
|
|
RLIMIT_RTTIME, MCK_RLIMIT_RTTIME,
|
|
#endif
|
|
#ifdef RLIMIT_SIGPENDING
|
|
RLIMIT_SIGPENDING,MCK_RLIMIT_SIGPENDING,
|
|
#endif
|
|
#ifdef RLIMIT_STACK
|
|
RLIMIT_STACK, MCK_RLIMIT_STACK,
|
|
#endif
|
|
};
|
|
|
|
static int do_prlimit64(int pid, int resource, struct rlimit *_new_limit,
|
|
struct rlimit *old_limit)
|
|
{
|
|
struct rlimit new_limit;
|
|
int resource_found;
|
|
int i;
|
|
int mcresource;
|
|
struct process *proc;
|
|
struct resource_set *rset = cpu_local_var(resource_set);
|
|
int hash;
|
|
struct process_hash *phash = rset->process_hash;
|
|
struct mcs_rwlock_node exist_lock;
|
|
struct mcs_rwlock_node update_lock;
|
|
unsigned long irqstate;
|
|
int found;
|
|
int ret;
|
|
ihk_mc_user_context_t ctx;
|
|
|
|
if (resource < 0 || resource >= RLIMIT_NLIMITS) {
|
|
return -EINVAL;
|
|
}
|
|
|
|
if (_new_limit) {
|
|
if (copy_from_user(&new_limit, _new_limit,
|
|
sizeof(struct rlimit))) {
|
|
return -EFAULT;
|
|
}
|
|
|
|
if (new_limit.rlim_cur > new_limit.rlim_max) {
|
|
return -EINVAL;
|
|
}
|
|
|
|
/* update Linux side value as well */
|
|
switch (resource) {
|
|
case RLIMIT_FSIZE:
|
|
case RLIMIT_NOFILE:
|
|
case RLIMIT_LOCKS:
|
|
case RLIMIT_MSGQUEUE:
|
|
ihk_mc_syscall_arg0(&ctx) = pid;
|
|
ihk_mc_syscall_arg1(&ctx) = resource;
|
|
ihk_mc_syscall_arg2(&ctx) =
|
|
(unsigned long)_new_limit;
|
|
ihk_mc_syscall_arg3(&ctx) =
|
|
(unsigned long)old_limit;
|
|
ret = syscall_generic_forwarding(__NR_prlimit64, &ctx);
|
|
if (ret < 0)
|
|
return ret;
|
|
break;
|
|
}
|
|
}
|
|
|
|
/* translate resource */
|
|
resource_found = 0;
|
|
for (i = 0; i < sizeof(rlimits) / sizeof(int); i += 2) {
|
|
if (rlimits[i] == resource) {
|
|
mcresource = rlimits[i + 1];
|
|
resource_found = 1;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (!resource_found) {
|
|
ihk_mc_syscall_arg0(&ctx) = pid;
|
|
ihk_mc_syscall_arg1(&ctx) = resource;
|
|
ihk_mc_syscall_arg2(&ctx) =
|
|
(unsigned long)_new_limit;
|
|
ihk_mc_syscall_arg3(&ctx) =
|
|
(unsigned long)old_limit;
|
|
return syscall_generic_forwarding(__NR_prlimit64, &ctx);
|
|
}
|
|
|
|
/* find process */
|
|
found = 0;
|
|
|
|
if (pid == 0) {
|
|
struct thread *thread = cpu_local_var(current);
|
|
|
|
pid = thread->proc->pid;
|
|
}
|
|
|
|
irqstate = cpu_disable_interrupt_save();
|
|
hash = process_hash(pid);
|
|
mcs_rwlock_reader_lock_noirq(&phash->lock[hash], &exist_lock);
|
|
|
|
list_for_each_entry(proc, &phash->list[hash], hash_list) {
|
|
if (proc->pid == pid) {
|
|
found = 1;
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (!found) {
|
|
mcs_rwlock_reader_unlock_noirq(&phash->lock[hash], &exist_lock);
|
|
cpu_restore_interrupt(irqstate);
|
|
return -ESRCH;
|
|
}
|
|
|
|
if (_new_limit) {
|
|
mcs_rwlock_writer_lock_noirq(&proc->update_lock, &update_lock);
|
|
} else {
|
|
mcs_rwlock_reader_lock_noirq(&proc->update_lock, &update_lock);
|
|
}
|
|
|
|
if (old_limit) {
|
|
if (copy_to_user(old_limit, proc->rlimit + mcresource,
|
|
sizeof(struct rlimit))) {
|
|
ret = -EFAULT;
|
|
goto out;
|
|
}
|
|
}
|
|
|
|
if (_new_limit) {
|
|
memcpy(proc->rlimit + mcresource, &new_limit,
|
|
sizeof(struct rlimit));
|
|
}
|
|
|
|
ret = 0;
|
|
out:
|
|
if (_new_limit) {
|
|
mcs_rwlock_writer_unlock_noirq(&proc->update_lock,
|
|
&update_lock);
|
|
} else {
|
|
mcs_rwlock_reader_unlock_noirq(&proc->update_lock,
|
|
&update_lock);
|
|
}
|
|
|
|
mcs_rwlock_reader_unlock_noirq(&phash->lock[hash], &exist_lock);
|
|
cpu_restore_interrupt(irqstate);
|
|
|
|
return ret;
|
|
}
|
|
|
|
SYSCALL_DECLARE(setrlimit)
|
|
{
|
|
int resource = ihk_mc_syscall_arg0(ctx);
|
|
struct rlimit *new_limit = (struct rlimit *)ihk_mc_syscall_arg1(ctx);
|
|
|
|
return do_prlimit64(0, resource, new_limit, NULL);
|
|
}
|
|
|
|
SYSCALL_DECLARE(getrlimit)
|
|
{
|
|
int resource = ihk_mc_syscall_arg0(ctx);
|
|
struct rlimit *old_limit = (struct rlimit *)ihk_mc_syscall_arg1(ctx);
|
|
|
|
return do_prlimit64(0, resource, NULL, old_limit);
|
|
}
|
|
|
|
SYSCALL_DECLARE(prlimit64)
|
|
{
|
|
int pid = ihk_mc_syscall_arg0(ctx);
|
|
int resource = ihk_mc_syscall_arg1(ctx);
|
|
struct rlimit *new_limit = (struct rlimit *)ihk_mc_syscall_arg2(ctx);
|
|
struct rlimit *old_limit = (struct rlimit *)ihk_mc_syscall_arg3(ctx);
|
|
|
|
return do_prlimit64(pid, resource, new_limit, old_limit);
|
|
}
|
|
|
|
SYSCALL_DECLARE(getrusage)
|
|
{
|
|
int who = ihk_mc_syscall_arg0(ctx);
|
|
struct rusage *usage = (struct rusage *)ihk_mc_syscall_arg1(ctx);
|
|
struct rusage kusage;
|
|
struct thread *thread = cpu_local_var(current);
|
|
struct process *proc = thread->proc;
|
|
struct timespec utime;
|
|
struct timespec stime;
|
|
struct mcs_rwlock_node lock;
|
|
struct timespec ats;
|
|
|
|
if(who != RUSAGE_SELF &&
|
|
who != RUSAGE_CHILDREN &&
|
|
who != RUSAGE_THREAD)
|
|
return -EINVAL;
|
|
|
|
memset(&kusage, '\0', sizeof kusage);
|
|
|
|
if(who == RUSAGE_SELF){
|
|
struct thread *child;
|
|
|
|
memset(&utime, '\0', sizeof utime);
|
|
memset(&stime, '\0', sizeof stime);
|
|
mcs_rwlock_reader_lock_noirq(&proc->threads_lock, &lock);
|
|
list_for_each_entry(child, &proc->threads_list, siblings_list){
|
|
if(child != thread &&
|
|
child->status == PS_RUNNING &&
|
|
!child->in_kernel){
|
|
child->times_update = 0;
|
|
ihk_mc_interrupt_cpu(child->cpu_id,
|
|
ihk_mc_get_vector(IHK_GV_IKC));
|
|
}
|
|
else
|
|
child->times_update = 1;
|
|
}
|
|
utime.tv_sec = proc->utime.tv_sec;
|
|
utime.tv_nsec = proc->utime.tv_nsec;
|
|
stime.tv_sec = proc->stime.tv_sec;
|
|
stime.tv_nsec = proc->stime.tv_nsec;
|
|
list_for_each_entry(child, &proc->threads_list, siblings_list){
|
|
while(!child->times_update)
|
|
cpu_pause();
|
|
tsc_to_ts(child->user_tsc, &ats);
|
|
ts_add(&utime, &ats);
|
|
tsc_to_ts(child->system_tsc, &ats);
|
|
ts_add(&stime, &ats);
|
|
}
|
|
mcs_rwlock_reader_unlock_noirq(&proc->threads_lock, &lock);
|
|
ts_to_tv(&kusage.ru_utime, &utime);
|
|
ts_to_tv(&kusage.ru_stime, &stime);
|
|
|
|
kusage.ru_maxrss = proc->maxrss / 1024;
|
|
}
|
|
else if(who == RUSAGE_CHILDREN){
|
|
ts_to_tv(&kusage.ru_utime, &proc->utime_children);
|
|
ts_to_tv(&kusage.ru_stime, &proc->stime_children);
|
|
|
|
kusage.ru_maxrss = proc->maxrss_children / 1024;
|
|
}
|
|
else if(who == RUSAGE_THREAD){
|
|
tsc_to_ts(thread->user_tsc, &ats);
|
|
ts_to_tv(&kusage.ru_utime, &ats);
|
|
tsc_to_ts(thread->system_tsc, &ats);
|
|
ts_to_tv(&kusage.ru_stime, &ats);
|
|
|
|
kusage.ru_maxrss = proc->maxrss / 1024;
|
|
}
|
|
|
|
if(copy_to_user(usage, &kusage, sizeof kusage))
|
|
return -EFAULT;
|
|
|
|
return 0;
|
|
}
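/*
 * Illustrative sketch of the accumulation done for RUSAGE_SELF above: each
 * thread's user/system TSC counters are converted to a timespec and added
 * into the process-wide totals.  Hypothetical helper re-using tsc_to_ts()
 * and ts_add() in the same way as the loop in sys_getrusage(); it is not
 * called by the handler itself.
 */
static void __attribute__((unused))
example_accumulate_cpu_time(unsigned long user_tsc, unsigned long system_tsc,
			    struct timespec *utime, struct timespec *stime)
{
	struct timespec ats;

	tsc_to_ts(user_tsc, &ats);
	ts_add(utime, &ats);
	tsc_to_ts(system_tsc, &ats);
	ts_add(stime, &ats);
}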
|
|
|
|
SYSCALL_DECLARE(sysinfo)
|
|
{
|
|
struct sysinfo *sysinfo = (struct sysinfo *)ihk_mc_syscall_arg0(ctx);
|
|
struct sysinfo __sysinfo;
|
|
int ret = 0;
|
|
|
|
memset(&__sysinfo, '\0', sizeof(struct sysinfo));
|
|
|
|
__sysinfo.totalram = rusage_get_total_memory();
|
|
__sysinfo.freeram = rusage_get_free_memory();
|
|
__sysinfo.mem_unit = 1; // always one unit for McKernel
|
|
|
|
if (copy_to_user(sysinfo, &__sysinfo, sizeof(struct sysinfo))) {
|
|
ret = -EFAULT;
|
|
goto out;
|
|
}
|
|
|
|
out:
|
|
return ret;
|
|
}
|
|
|
|
extern int ptrace_traceme(void);
|
|
extern void set_single_step(struct thread *thread);
|
|
|
|
static int ptrace_wakeup_sig(int pid, long request, long data) {
|
|
dkprintf("ptrace_wakeup_sig,pid=%d,data=%08x\n", pid, data);
|
|
int error = 0;
|
|
struct thread *child;
|
|
struct siginfo info;
|
|
struct mcs_rwlock_node_irqsave lock;
|
|
struct thread *thread = cpu_local_var(current);
|
|
|
|
child = find_thread(pid, pid);
|
|
if (!child) {
|
|
error = -ESRCH;
|
|
goto out;
|
|
}
|
|
|
|
if (data > 64 || data < 0) {
|
|
error = -EINVAL;
|
|
goto out;
|
|
}
|
|
|
|
switch (request) {
|
|
case PTRACE_KILL:
|
|
memset(&info, '\0', sizeof info);
|
|
info.si_signo = SIGKILL;
|
|
error = do_kill(thread, pid, -1, SIGKILL, &info, 0);
|
|
if (error < 0) {
|
|
goto out;
|
|
}
|
|
break;
|
|
case PTRACE_CONT:
|
|
case PTRACE_SINGLESTEP:
|
|
case PTRACE_SYSCALL:
|
|
if (request == PTRACE_SINGLESTEP) {
|
|
set_single_step(child);
|
|
}
|
|
mcs_rwlock_writer_lock(&child->proc->update_lock, &lock);
|
|
child->ptrace &= ~PT_TRACE_SYSCALL;
|
|
if (request == PTRACE_SYSCALL) {
|
|
child->ptrace |= PT_TRACE_SYSCALL;
|
|
}
|
|
mcs_rwlock_writer_unlock(&child->proc->update_lock, &lock);
|
|
if(data != 0 && data != SIGSTOP) {
|
|
|
|
/* TODO: Tracing process replace the original
|
|
signal with "data" */
|
|
if (request == PTRACE_CONT && child->ptrace_sendsig) {
|
|
memcpy(&info, &child->ptrace_sendsig->info, sizeof info);
|
|
kfree(child->ptrace_sendsig);
|
|
child->ptrace_sendsig = NULL;
|
|
}
|
|
else if (request == PTRACE_CONT && child->ptrace_recvsig) {
|
|
memcpy(&info, &child->ptrace_recvsig->info, sizeof info);
|
|
kfree(child->ptrace_recvsig);
|
|
child->ptrace_recvsig = NULL;
|
|
}
|
|
else {
|
|
memset(&info, '\0', sizeof info);
|
|
info.si_signo = data;
|
|
info.si_code = SI_USER;
|
|
info._sifields._kill.si_pid = thread->proc->pid;
|
|
}
|
|
error = do_kill(thread, pid, -1, data, &info, 1);
|
|
if (error < 0) {
|
|
goto out;
|
|
}
|
|
}
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
|
|
sched_wakeup_thread(child, PS_TRACED | PS_STOPPED);
|
|
out:
|
|
if(child)
|
|
thread_unlock(child);
|
|
return error;
|
|
}
|
|
|
|
extern long ptrace_read_user(struct thread *thread, long addr, unsigned long *value);
|
|
extern long ptrace_write_user(struct thread *thread, long addr, unsigned long value);
|
|
|
|
static long ptrace_pokeuser(int pid, long addr, long data)
|
|
{
|
|
long rc = -EIO;
|
|
struct thread *child;
|
|
|
|
if(addr > sizeof(struct user) - 8 || addr < 0)
|
|
return -EFAULT;
|
|
child = find_thread(0, pid);
|
|
if (!child)
|
|
return -ESRCH;
|
|
if(child->status & (PS_STOPPED | PS_TRACED)){
|
|
rc = ptrace_write_user(child, addr, (unsigned long)data);
|
|
}
|
|
thread_unlock(child);
|
|
|
|
return rc;
|
|
}
|
|
|
|
static long ptrace_peekuser(int pid, long addr, long data)
|
|
{
|
|
long rc = -EIO;
|
|
struct thread *child;
|
|
unsigned long *p = (unsigned long *)data;
|
|
|
|
if(addr > sizeof(struct user) - 8 || addr < 0)
|
|
return -EFAULT;
|
|
child = find_thread(0, pid);
|
|
if (!child)
|
|
return -ESRCH;
|
|
if(child->status & (PS_STOPPED | PS_TRACED)){
|
|
unsigned long value;
|
|
rc = ptrace_read_user(child, addr, &value);
|
|
if (rc == 0) {
|
|
rc = copy_to_user(p, (char *)&value, sizeof(value));
|
|
}
|
|
}
|
|
thread_unlock(child);
|
|
|
|
return rc;
|
|
}
|
|
|
|
static long ptrace_getregs(int pid, long data)
|
|
{
|
|
struct user_regs_struct *regs = (struct user_regs_struct *)data;
|
|
long rc = -EIO;
|
|
struct thread *child;
|
|
|
|
child = find_thread(0, pid);
|
|
if (!child)
|
|
return -ESRCH;
|
|
if(child->status & (PS_STOPPED | PS_TRACED)){
|
|
struct user_regs_struct user_regs;
|
|
long addr;
|
|
unsigned long *p;
|
|
memset(&user_regs, '\0', sizeof(struct user_regs_struct));
|
|
for (addr = 0, p = (unsigned long *)&user_regs;
|
|
addr < sizeof(struct user_regs_struct);
|
|
addr += sizeof(*p), p++) {
|
|
rc = ptrace_read_user(child, addr, p);
|
|
if (rc) break;
|
|
}
|
|
if (rc == 0) {
|
|
rc = copy_to_user(regs, &user_regs, sizeof(struct user_regs_struct));
|
|
}
|
|
}
|
|
thread_unlock(child);
|
|
|
|
return rc;
|
|
}
|
|
|
|
static long ptrace_setregs(int pid, long data)
|
|
{
|
|
struct user_regs_struct *regs = (struct user_regs_struct *)data;
|
|
long rc = -EIO;
|
|
struct thread *child;
|
|
|
|
child = find_thread(0, pid);
|
|
if (!child)
|
|
return -ESRCH;
|
|
if(child->status & (PS_STOPPED | PS_TRACED)){
|
|
struct user_regs_struct user_regs;
|
|
rc = copy_from_user(&user_regs, regs, sizeof(struct user_regs_struct));
|
|
if (rc == 0) {
|
|
long addr;
|
|
unsigned long *p;
|
|
for (addr = 0, p = (unsigned long *)&user_regs;
|
|
addr < sizeof(struct user_regs_struct);
|
|
addr += sizeof(*p), p++) {
|
|
rc = ptrace_write_user(child, addr, *p);
|
|
if (rc) {
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
thread_unlock(child);
|
|
|
|
return rc;
|
|
}
|
|
|
|
extern long ptrace_read_fpregs(struct thread *thread, void *fpregs);
|
|
extern long ptrace_write_fpregs(struct thread *thread, void *fpregs);
|
|
|
|
static long ptrace_getfpregs(int pid, long data)
|
|
{
|
|
long rc = -EIO;
|
|
struct thread *child;
|
|
|
|
child = find_thread(0, pid);
|
|
if (!child)
|
|
return -ESRCH;
|
|
if(child->status & (PS_STOPPED | PS_TRACED)){
|
|
rc = ptrace_read_fpregs(child, (void *)data);
|
|
}
|
|
thread_unlock(child);
|
|
|
|
return rc;
|
|
}
|
|
|
|
static long ptrace_setfpregs(int pid, long data)
|
|
{
|
|
long rc = -EIO;
|
|
struct thread *child;
|
|
|
|
child = find_thread(0, pid);
|
|
if (!child)
|
|
return -ESRCH;
|
|
if(child->status & (PS_STOPPED | PS_TRACED)){
|
|
rc = ptrace_write_fpregs(child, (void *)data);
|
|
}
|
|
thread_unlock(child);
|
|
|
|
return rc;
|
|
}
|
|
|
|
extern long ptrace_read_regset(struct thread *thread, long type, struct iovec *iov);
|
|
extern long ptrace_write_regset(struct thread *thread, long type, struct iovec *iov);
|
|
|
|
static long ptrace_getregset(int pid, long type, long data)
|
|
{
|
|
long rc = -EIO;
|
|
struct thread *child;
|
|
|
|
child = find_thread(0, pid);
|
|
if (!child)
|
|
return -ESRCH;
|
|
if(child->status & (PS_STOPPED | PS_TRACED)){
|
|
struct iovec iov;
|
|
|
|
rc = copy_from_user(&iov, (struct iovec *)data, sizeof(iov));
|
|
if (rc == 0) {
|
|
rc = ptrace_read_regset(child, type, &iov);
|
|
}
|
|
if (rc == 0) {
|
|
rc = copy_to_user(&((struct iovec *)data)->iov_len,
|
|
&iov.iov_len, sizeof(iov.iov_len));
|
|
}
|
|
}
|
|
thread_unlock(child);
|
|
|
|
return rc;
|
|
}
|
|
|
|
static long ptrace_setregset(int pid, long type, long data)
|
|
{
|
|
long rc = -EIO;
|
|
struct thread *child;
|
|
|
|
child = find_thread(0, pid);
|
|
if (!child)
|
|
return -ESRCH;
|
|
if(child->status & (PS_STOPPED | PS_TRACED)){
|
|
struct iovec iov;
|
|
|
|
rc = copy_from_user(&iov, (struct iovec *)data, sizeof(iov));
|
|
if (rc == 0) {
|
|
rc = ptrace_write_regset(child, type, &iov);
|
|
}
|
|
if (rc == 0) {
|
|
rc = copy_to_user(&((struct iovec *)data)->iov_len,
|
|
&iov.iov_len, sizeof(iov.iov_len));
|
|
}
|
|
}
|
|
thread_unlock(child);
|
|
|
|
return rc;
|
|
}
|
|
|
|
static long ptrace_peektext(int pid, long addr, long data)
|
|
{
|
|
long rc = -EIO;
|
|
struct thread *child;
|
|
unsigned long *p = (unsigned long *)data;
|
|
|
|
child = find_thread(0, pid);
|
|
if (!child)
|
|
return -ESRCH;
|
|
if(child->status & (PS_STOPPED | PS_TRACED)){
|
|
unsigned long value;
|
|
rc = read_process_vm(child->vm, &value, (void *)addr, sizeof(value));
|
|
if (rc != 0) {
|
|
dkprintf("ptrace_peektext: bad area addr=0x%llx\n", addr);
|
|
} else {
|
|
rc = copy_to_user(p, &value, sizeof(value));
|
|
}
|
|
}
|
|
thread_unlock(child);
|
|
|
|
return rc;
|
|
}
|
|
|
|
static long ptrace_poketext(int pid, long addr, long data)
|
|
{
|
|
long rc = -EIO;
|
|
struct thread *child;
|
|
|
|
child = find_thread(0, pid);
|
|
if (!child)
|
|
return -ESRCH;
|
|
if(child->status & (PS_STOPPED | PS_TRACED)){
|
|
rc = patch_process_vm(child->vm, (void *)addr, &data, sizeof(data));
|
|
if (rc) {
|
|
dkprintf("ptrace_poketext: bad address 0x%llx\n", addr);
|
|
}
|
|
}
|
|
thread_unlock(child);
|
|
|
|
return rc;
|
|
}
|
|
|
|
static int ptrace_setoptions(int pid, int flags)
|
|
{
|
|
int ret;
|
|
struct thread *child;
|
|
|
|
/* Only supported options are enabled.
 * The following options are merely accepted (pretended to be supported)
 * for the time being:
|
|
* PTRACE_O_TRACESYSGOOD
|
|
* PTRACE_O_TRACEFORK
|
|
* PTRACE_O_TRACEVFORK
|
|
* PTRACE_O_TRACECLONE
|
|
* PTRACE_O_TRACEEXEC
|
|
* PTRACE_O_TRACEVFORKDONE
|
|
*/
|
|
if (flags & ~(PTRACE_O_TRACESYSGOOD|
|
|
PTRACE_O_TRACEFORK|
|
|
PTRACE_O_TRACEVFORK|
|
|
PTRACE_O_TRACECLONE|
|
|
PTRACE_O_TRACEEXEC|
|
|
PTRACE_O_TRACEVFORKDONE|
|
|
PTRACE_O_TRACEEXIT)) {
|
|
kprintf("ptrace_setoptions: not supported flag %x\n", flags);
|
|
ret = -EINVAL;
|
|
goto out;
|
|
}
|
|
|
|
child = find_thread(0, pid);
|
|
if (!child || !child->proc || !(child->ptrace & PT_TRACED)) {
|
|
ret = -ESRCH;
|
|
goto unlockout;
|
|
}
|
|
|
|
child->ptrace &= ~PTRACE_O_MASK; /* PT_TRACE_EXEC remains */
|
|
child->ptrace |= flags;
|
|
dkprintf("%s: (PT_TRACED%s%s%s%s%s%s)\n",
|
|
__func__,
|
|
flags & PTRACE_O_TRACESYSGOOD ? "|PTRACE_O_TRACESYSGOOD" : "",
|
|
flags & PTRACE_O_TRACEFORK ? "|PTRACE_O_TRACEFORK" : "",
|
|
flags & PTRACE_O_TRACEVFORK ? "|PTRACE_O_TRACEVFORK" : "",
|
|
flags & PTRACE_O_TRACECLONE ? "|PTRACE_O_TRACECLONE" : "",
|
|
flags & PTRACE_O_TRACEEXEC ? "|PTRACE_O_TRACEEXEC" : "",
|
|
flags & PTRACE_O_TRACEVFORKDONE ? "|PTRACE_O_TRACEVFORKDONE" : "",
|
|
flags & PTRACE_O_TRACEEXIT ? "|PTRACE_O_TRACEEXIT" : "");
|
|
|
|
ret = 0;
|
|
|
|
unlockout:
|
|
if(child)
|
|
thread_unlock(child);
|
|
out:
|
|
return ret;
|
|
}
|
|
|
|
static int ptrace_attach(int pid)
|
|
{
|
|
int error = 0;
|
|
struct thread *thread;
|
|
struct thread *mythread = cpu_local_var(current);
|
|
struct process *proc = mythread->proc;
|
|
struct siginfo info;
|
|
|
|
thread = find_thread(0, pid);
|
|
if (!thread) {
|
|
error = -ESRCH;
|
|
goto out;
|
|
}
|
|
|
|
if (proc->pid == pid) {
|
|
thread_unlock(thread);
|
|
error = -EPERM;
|
|
goto out;
|
|
}
|
|
|
|
if ((thread->ptrace & PT_TRACED) ||
|
|
thread->proc == proc) {
|
|
thread_unlock(thread);
|
|
error = -EPERM;
|
|
goto out;
|
|
}
|
|
|
|
thread->ptrace = PT_TRACED | PT_TRACE_EXEC;
|
|
error = ptrace_attach_thread(thread, proc);
|
|
|
|
thread_unlock(thread);
|
|
|
|
memset(&info, '\0', sizeof info);
|
|
info.si_signo = SIGSTOP;
|
|
info.si_code = SI_USER;
|
|
info._sifields._kill.si_pid = proc->pid;
|
|
error = do_kill(mythread, -1, pid, SIGSTOP, &info, 2);
|
|
|
|
out:
|
|
dkprintf("ptrace_attach,returning,error=%d\n", error);
|
|
return error;
|
|
}
|
|
|
|
|
|
int ptrace_detach(int pid, int data)
|
|
{
|
|
int error = 0;
|
|
struct thread *thread;
|
|
struct thread *mythread = cpu_local_var(current);
|
|
struct process *proc = mythread->proc;
|
|
|
|
if (data > 64 || data < 0) {
|
|
return -EIO;
|
|
}
|
|
|
|
thread = find_thread(0, pid);
|
|
if (!thread) {
|
|
error = -ESRCH;
|
|
goto out;
|
|
}
|
|
|
|
if (!(thread->ptrace & PT_TRACED) || thread->report_proc != proc) {
|
|
thread_unlock(thread);
|
|
error = -ESRCH;
|
|
goto out;
|
|
}
|
|
|
|
ptrace_detach_thread(thread, data);
|
|
|
|
thread_unlock(thread);
|
|
out:
|
|
return error;
|
|
}
|
|
|
|
static long ptrace_geteventmsg(int pid, long data)
|
|
{
|
|
unsigned long *msg_p = (unsigned long *)data;
|
|
long rc = -ESRCH;
|
|
struct thread *child;
|
|
|
|
child = find_thread(0, pid);
|
|
if (!child) {
|
|
return -ESRCH;
|
|
}
|
|
if(child->status & (PS_STOPPED | PS_TRACED)){
|
|
if (copy_to_user(msg_p, &child->ptrace_eventmsg,
|
|
sizeof(*msg_p))) {
|
|
rc = -EFAULT;
|
|
}
|
|
else {
|
|
rc = 0;
|
|
}
|
|
}
|
|
thread_unlock(child);
|
|
|
|
return rc;
|
|
}
|
|
|
|
static long
|
|
ptrace_getsiginfo(int pid, siginfo_t *data)
|
|
{
|
|
struct thread *child;
|
|
int rc = 0;
|
|
|
|
child = find_thread(0, pid);
|
|
if (!child) {
|
|
return -ESRCH;
|
|
}
|
|
|
|
if(!(child->status & (PS_STOPPED | PS_TRACED))){
|
|
rc = -ESRCH;
|
|
}
|
|
else if (child->ptrace_recvsig) {
|
|
if (copy_to_user(data, &child->ptrace_recvsig->info, sizeof(siginfo_t))) {
|
|
rc = -EFAULT;
|
|
}
|
|
}
|
|
else {
|
|
rc = -ESRCH;
|
|
}
|
|
thread_unlock(child);
|
|
return rc;
|
|
}
|
|
|
|
static long
|
|
ptrace_setsiginfo(int pid, siginfo_t *data)
|
|
{
|
|
struct thread *child;
|
|
int rc = 0;
|
|
|
|
child = find_thread(0, pid);
|
|
if (!child) {
|
|
return -ESRCH;
|
|
}
|
|
|
|
if(!(child->status & (PS_STOPPED | PS_TRACED))){
|
|
rc = -ESRCH;
|
|
}
|
|
else {
|
|
if (child->ptrace_sendsig == NULL) {
|
|
child->ptrace_sendsig = kmalloc(sizeof(struct sig_pending), IHK_MC_AP_NOWAIT);
|
|
if (child->ptrace_sendsig == NULL) {
|
|
rc = -ENOMEM;
|
|
}
|
|
}
|
|
|
|
if (!rc &&
|
|
copy_from_user(&child->ptrace_sendsig->info, data, sizeof(siginfo_t))) {
|
|
rc = -EFAULT;
|
|
}
|
|
if (!rc &&
|
|
child->ptrace_recvsig){
|
|
if(copy_from_user(&child->ptrace_recvsig->info, data, sizeof(siginfo_t))) {
|
|
rc = -EFAULT;
|
|
}
|
|
}
|
|
}
|
|
thread_unlock(child);
|
|
return rc;
|
|
}
|
|
|
|
SYSCALL_DECLARE(ptrace)
|
|
{
|
|
const long request = (long)ihk_mc_syscall_arg0(ctx);
|
|
const int pid = (int)ihk_mc_syscall_arg1(ctx);
|
|
const long addr = (long)ihk_mc_syscall_arg2(ctx);
|
|
const long data = (long)ihk_mc_syscall_arg3(ctx);
|
|
long error = -EOPNOTSUPP;
|
|
|
|
switch(request) {
|
|
case PTRACE_TRACEME:
|
|
dkprintf("ptrace: PTRACE_TRACEME\n");
|
|
error = ptrace_traceme();
|
|
break;
|
|
case PTRACE_KILL:
|
|
dkprintf("ptrace: PTRACE_KILL\n");
|
|
error = ptrace_wakeup_sig(pid, request, data);
|
|
break;
|
|
case PTRACE_CONT:
|
|
dkprintf("ptrace: PTRACE_CONT: data=%d\n", data);
|
|
error = ptrace_wakeup_sig(pid, request, data);
|
|
break;
|
|
case PTRACE_GETREGS:
|
|
error = ptrace_getregs(pid, data);
|
|
dkprintf("PTRACE_GETREGS: data=%p return=%p\n", data, error);
|
|
break;
|
|
case PTRACE_PEEKUSER:
|
|
error = ptrace_peekuser(pid, addr, data);
|
|
dkprintf("PTRACE_PEEKUSER: addr=%p return=%p\n", addr, error);
|
|
break;
|
|
case PTRACE_POKEUSER:
|
|
error = ptrace_pokeuser(pid, addr, data);
|
|
dkprintf("PTRACE_POKEUSER: addr=%p data=%p return=%p\n", addr, data, error);
|
|
break;
|
|
case PTRACE_SETOPTIONS:
|
|
error = ptrace_setoptions(pid, data);
|
|
dkprintf("PTRACE_SETOPTIONS: flags=%d return=%p\n", data, error);
|
|
break;
|
|
case PTRACE_PEEKTEXT:
|
|
error = ptrace_peektext(pid, addr, data);
|
|
dkprintf("PTRACE_PEEKTEXT: addr=%p return=%p\n", addr, error);
|
|
break;
|
|
case PTRACE_PEEKDATA:
|
|
error = ptrace_peektext(pid, addr, data);
|
|
dkprintf("PTRACE_PEEKDATA: addr=%p return=%p\n", addr, error);
|
|
break;
|
|
case PTRACE_POKETEXT:
|
|
error = ptrace_poketext(pid, addr, data);
|
|
dkprintf("PTRACE_POKETEXT: addr=%p data=%p\n", addr, data);
|
|
break;
|
|
case PTRACE_POKEDATA:
|
|
error = ptrace_poketext(pid, addr, data);
|
|
dkprintf("PTRACE_POKEDATA: addr=%p data=%p\n", addr, data);
|
|
break;
|
|
case PTRACE_SINGLESTEP:
|
|
dkprintf("ptrace: PTRACE_SINGLESTEP: data=%d\n", data);
|
|
error = ptrace_wakeup_sig(pid, request, data);
|
|
break;
|
|
case PTRACE_GETFPREGS:
|
|
dkprintf("ptrace: PTRACE_GETFPREGS: data=%p\n", data);
|
|
error = ptrace_getfpregs(pid, data);
|
|
break;
|
|
case PTRACE_SETFPREGS:
|
|
dkprintf("ptrace: PTRACE_SETFPREGS: data=%p\n", data);
|
|
error = ptrace_setfpregs(pid, data);
|
|
break;
|
|
case PTRACE_SETREGS:
|
|
error = ptrace_setregs(pid, data);
|
|
dkprintf("PTRACE_SETREGS: data=%p return=%p\n", data, error);
|
|
break;
|
|
case PTRACE_ATTACH:
|
|
dkprintf("ptrace: PTRACE_ATTACH: pid=%d\n", pid);
|
|
error = ptrace_attach(pid);
|
|
break;
|
|
case PTRACE_DETACH:
|
|
dkprintf("ptrace: PTRACE_DETACH: data=%d\n", data);
|
|
error = ptrace_detach(pid, data);
|
|
break;
|
|
case PTRACE_SYSCALL:
|
|
dkprintf("ptrace: PTRACE_SYSCALL: data=%d\n", data);
|
|
error = ptrace_wakeup_sig(pid, request, data);
|
|
break;
|
|
case PTRACE_GETSIGINFO:
|
|
dkprintf("ptrace: PTRACE_GETSIGINFO: data=%p\n", data);
|
|
error = ptrace_getsiginfo(pid, (siginfo_t *)data);
|
|
break;
|
|
case PTRACE_SETSIGINFO:
|
|
dkprintf("ptrace: PTRACE_SETSIGINFO: data=%p\n", data);
|
|
error = ptrace_setsiginfo(pid, (siginfo_t *)data);
|
|
break;
|
|
case PTRACE_GETREGSET:
|
|
dkprintf("ptrace: PTRACE_GETREGSET: addr=0x%x, data=%p\n", addr, data);
|
|
error = ptrace_getregset(pid, addr, data);
|
|
break;
|
|
case PTRACE_SETREGSET:
|
|
dkprintf("ptrace: PTRACE_SETREGSET: addr=0x%x, data=%p\n", addr, data);
|
|
error = ptrace_setregset(pid, addr, data);
|
|
break;
|
|
case PTRACE_GETEVENTMSG:
|
|
dkprintf("ptrace: PTRACE_GETEVENTMSG: data=%p\n", data);
|
|
error = ptrace_geteventmsg(pid, data);
|
|
break;
|
|
default:
|
|
error = arch_ptrace(request, pid, addr, data);
|
|
break;
|
|
}
|
|
|
|
dkprintf("ptrace(%d,%ld,%p,%p): returning %d\n", request, pid, addr, data, error);
|
|
return error;
|
|
}
|
|
|
|
/* We do not have actual scheduling classes so we just make sure we store
|
|
* policies and priorities in a POSIX/Linux compliant manner */
|
|
static int setscheduler(struct thread *thread, int policy, struct sched_param *param)
|
|
{
|
|
if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
|
|
((param->sched_priority < 1) ||
|
|
(param->sched_priority > MAX_USER_RT_PRIO - 1))) {
|
|
return -EINVAL;
|
|
}
|
|
|
|
if ((policy == SCHED_NORMAL || policy == SCHED_BATCH || policy == SCHED_IDLE) &&
|
|
(param->sched_priority != 0)) {
|
|
return -EINVAL;
|
|
}
|
|
|
|
memcpy(&thread->sched_param, param, sizeof(*param));
|
|
thread->sched_policy = policy;
|
|
|
|
return 0;
|
|
}
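/*
 * Illustrative sketch of the priority ranges enforced by setscheduler()
 * above: SCHED_FIFO/SCHED_RR require a priority in 1..MAX_USER_RT_PRIO-1,
 * while every other policy accepted here requires a priority of 0.
 * Hypothetical helper, not called by the syscall handlers.
 */
static int __attribute__((unused))
example_sched_priority_ok(int policy, int priority)
{
	if (policy == SCHED_FIFO || policy == SCHED_RR)
		return priority >= 1 && priority <= MAX_USER_RT_PRIO - 1;
	return priority == 0;
}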
|
|
|
|
#define SCHED_CHECK_SAME_OWNER 0x01
|
|
#define SCHED_CHECK_ROOT 0x02
|
|
|
|
SYSCALL_DECLARE(sched_setparam)
|
|
{
|
|
int retval = 0;
|
|
int pid = (int)ihk_mc_syscall_arg0(ctx);
|
|
struct sched_param *uparam = (struct sched_param *)ihk_mc_syscall_arg1(ctx);
|
|
struct sched_param param;
|
|
struct thread *thread = cpu_local_var(current);
|
|
struct syscall_request request1 IHK_DMA_ALIGN;
|
|
int other_thread = 0;
|
|
|
|
dkprintf("sched_setparam: pid: %d, uparam: 0x%lx\n", pid, uparam);
|
|
|
|
if (!uparam || pid < 0) {
|
|
return -EINVAL;
|
|
}
|
|
|
|
if (pid == 0)
|
|
pid = thread->proc->pid;
|
|
|
|
if (thread->proc->pid != pid) {
|
|
other_thread = 1;
|
|
thread = find_thread(0, pid);
|
|
if (!thread) {
|
|
return -ESRCH;
|
|
}
|
|
thread_unlock(thread);
|
|
|
|
/* Ask Linux about ownership.. */
|
|
request1.number = __NR_sched_setparam;
|
|
request1.args[0] = SCHED_CHECK_SAME_OWNER;
|
|
request1.args[1] = pid;
|
|
|
|
retval = do_syscall(&request1, ihk_mc_get_processor_id());
|
|
if (retval != 0) {
|
|
return retval;
|
|
}
|
|
}
|
|
|
|
retval = copy_from_user(¶m, uparam, sizeof(param));
|
|
if (retval < 0) {
|
|
return -EFAULT;
|
|
}
|
|
|
|
if (other_thread) {
|
|
thread = find_thread(0, pid);
|
|
if (!thread) {
|
|
return -ESRCH;
|
|
}
|
|
}
|
|
retval = setscheduler(thread, thread->sched_policy, ¶m);
|
|
if (other_thread) {
|
|
thread_unlock(thread);
|
|
}
|
|
return retval;
|
|
}
|
|
|
|
SYSCALL_DECLARE(sched_getparam)
|
|
{
|
|
int retval = 0;
|
|
int pid = (int)ihk_mc_syscall_arg0(ctx);
|
|
struct sched_param *param = (struct sched_param *)ihk_mc_syscall_arg1(ctx);
|
|
struct thread *thread = cpu_local_var(current);
|
|
|
|
if (!param || pid < 0) {
|
|
return -EINVAL;
|
|
}
|
|
|
|
if (pid == 0)
|
|
pid = thread->proc->pid;
|
|
|
|
if (thread->proc->pid != pid) {
|
|
thread = find_thread(0, pid);
|
|
if (!thread) {
|
|
return -ESRCH;
|
|
}
|
|
thread_unlock(thread);
|
|
}
|
|
|
|
retval = copy_to_user(param, &thread->sched_param, sizeof(*param)) ? -EFAULT : 0;
|
|
|
|
return retval;
|
|
}
|
|
|
|
SYSCALL_DECLARE(sched_setscheduler)
|
|
{
|
|
int retval;
|
|
int pid = (int)ihk_mc_syscall_arg0(ctx);
|
|
int policy = ihk_mc_syscall_arg1(ctx);
|
|
struct sched_param *uparam = (struct sched_param *)ihk_mc_syscall_arg2(ctx);
|
|
struct sched_param param;
|
|
struct thread *thread = cpu_local_var(current);
|
|
|
|
struct syscall_request request1 IHK_DMA_ALIGN;
|
|
|
|
if (!uparam || pid < 0) {
|
|
return -EINVAL;
|
|
}
|
|
|
|
if (policy != SCHED_DEADLINE &&
|
|
policy != SCHED_FIFO && policy != SCHED_RR &&
|
|
policy != SCHED_NORMAL && policy != SCHED_BATCH &&
|
|
policy != SCHED_IDLE) {
|
|
return -EINVAL;
|
|
}
|
|
|
|
if (policy != SCHED_NORMAL) {
|
|
|
|
/* Ask Linux about permissions */
|
|
request1.number = __NR_sched_setparam;
|
|
request1.args[0] = SCHED_CHECK_ROOT;
|
|
|
|
retval = do_syscall(&request1, ihk_mc_get_processor_id());
|
|
if (retval != 0) {
|
|
return retval;
|
|
}
|
|
}
|
|
|
|
retval = copy_from_user(¶m, uparam, sizeof(param));
|
|
if (retval < 0) {
|
|
return -EFAULT;
|
|
}
|
|
|
|
if (pid == 0)
|
|
pid = thread->proc->pid;
|
|
|
|
if (thread->proc->pid != pid) {
|
|
thread = find_thread(0, pid);
|
|
if (!thread) {
|
|
return -ESRCH;
|
|
}
|
|
thread_unlock(thread);
|
|
|
|
/* Ask Linux about ownership.. */
|
|
request1.number = __NR_sched_setparam;
|
|
request1.args[0] = SCHED_CHECK_SAME_OWNER;
|
|
request1.args[1] = pid;
|
|
|
|
retval = do_syscall(&request1, ihk_mc_get_processor_id());
|
|
if (retval != 0) {
|
|
return retval;
|
|
}
|
|
}
|
|
|
|
return setscheduler(thread, policy, ¶m);
|
|
}
|
|
|
|
SYSCALL_DECLARE(sched_getscheduler)
{
        int pid = (int)ihk_mc_syscall_arg0(ctx);
        struct thread *thread = cpu_local_var(current);

        if (pid < 0) {
                return -EINVAL;
        }

        if (pid == 0)
                pid = thread->proc->pid;

        if (thread->proc->pid != pid) {
                thread = find_thread(0, pid);
                if (!thread) {
                        return -ESRCH;
                }
                thread_unlock(thread);
        }

        return thread->sched_policy;
}

SYSCALL_DECLARE(sched_get_priority_max)
{
        int ret = -EINVAL;
        int policy = ihk_mc_syscall_arg0(ctx);

        switch (policy) {
        case SCHED_FIFO:
        case SCHED_RR:
                ret = MAX_USER_RT_PRIO - 1;
                break;
        case SCHED_DEADLINE:
        case SCHED_NORMAL:
        case SCHED_BATCH:
        case SCHED_IDLE:
                ret = 0;
                break;
        }
        return ret;
}

SYSCALL_DECLARE(sched_get_priority_min)
{
        int ret = -EINVAL;
        int policy = ihk_mc_syscall_arg0(ctx);

        switch (policy) {
        case SCHED_FIFO:
        case SCHED_RR:
                ret = 1;
                break;
        case SCHED_DEADLINE:
        case SCHED_NORMAL:
        case SCHED_BATCH:
        case SCHED_IDLE:
                ret = 0;
        }
        return ret;
}

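/*
 * Summary of the two helpers above: SCHED_FIFO and SCHED_RR report a static
 * priority range of [1, MAX_USER_RT_PRIO - 1]; SCHED_DEADLINE, SCHED_NORMAL,
 * SCHED_BATCH and SCHED_IDLE report 0 for both the minimum and the maximum,
 * matching the Linux convention that only the real-time classes use
 * sched_priority.
 */
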
SYSCALL_DECLARE(sched_rr_get_interval)
{
        int pid = ihk_mc_syscall_arg0(ctx);
        struct timespec *utime = (struct timespec *)ihk_mc_syscall_arg1(ctx);
        struct timespec t;
        struct thread *thread = cpu_local_var(current);
        int retval = 0;

        if (pid < 0)
                return -EINVAL;

        if (pid == 0)
                pid = thread->proc->pid;

        if (thread->proc->pid != pid) {
                thread = find_thread(0, pid);
                if (!thread) {
                        return -ESRCH;
                }
                thread_unlock(thread);
        }

        t.tv_sec = 0;
        t.tv_nsec = 0;
        if (thread->sched_policy == SCHED_RR) {
                t.tv_nsec = 10000;
        }

        retval = copy_to_user(utime, &t, sizeof(t)) ? -EFAULT : 0;

        return retval;
}

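/*
 * sched_rr_get_interval above does not consult the actual scheduler tick:
 * it reports a fixed 10000 ns quantum for SCHED_RR threads and 0 for every
 * other policy, which is presumably enough for callers that only test for a
 * non-zero interval.
 */
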
#define MIN2(x, y) ((x) < (y) ? (x) : (y))
SYSCALL_DECLARE(sched_setaffinity)
|
|
{
|
|
int tid = (int)ihk_mc_syscall_arg0(ctx);
|
|
size_t len = (size_t)ihk_mc_syscall_arg1(ctx);
|
|
cpu_set_t *u_cpu_set = (cpu_set_t *)ihk_mc_syscall_arg2(ctx);
|
|
cpu_set_t k_cpu_set, cpu_set;
|
|
struct thread *thread;
|
|
int cpu_id;
|
|
int empty_set = 1;
|
|
|
|
if (!u_cpu_set) {
|
|
return -EFAULT;
|
|
}
|
|
|
|
if (sizeof(k_cpu_set) > len) {
|
|
memset(&k_cpu_set, 0, sizeof(k_cpu_set));
|
|
}
|
|
|
|
len = MIN2(len, sizeof(k_cpu_set));
|
|
|
|
if (copy_from_user(&k_cpu_set, u_cpu_set, len)) {
|
|
dkprintf("%s: error: copy_from_user failed for %p:%d\n",
|
|
__FUNCTION__, u_cpu_set, len);
|
|
return -EFAULT;
|
|
}
|
|
|
|
/* Find thread */
|
|
if (tid == 0) {
|
|
tid = cpu_local_var(current)->tid;
|
|
thread = cpu_local_var(current);
|
|
cpu_id = ihk_mc_get_processor_id();
|
|
hold_thread(thread);
|
|
}
|
|
else {
|
|
struct thread *mythread = cpu_local_var(current);
|
|
|
|
thread = find_thread(0, tid);
|
|
|
|
if (!thread)
|
|
return -ESRCH;
|
|
|
|
if (mythread->proc->euid != 0 &&
|
|
mythread->proc->euid != thread->proc->ruid &&
|
|
mythread->proc->euid != thread->proc->euid) {
|
|
thread_unlock(thread);
|
|
return -EPERM;
|
|
}
|
|
|
|
hold_thread(thread);
|
|
thread_unlock(thread);
|
|
cpu_id = thread->cpu_id;
|
|
}
|
|
|
|
/* Only allow cores that are also in process' cpu_set */
|
|
CPU_ZERO(&cpu_set);
|
|
for (cpu_id = 0; cpu_id < num_processors; cpu_id++) {
|
|
if (CPU_ISSET(cpu_id, &k_cpu_set) &&
|
|
CPU_ISSET(cpu_id, &thread->proc->cpu_set)) {
|
|
CPU_SET(cpu_id, &cpu_set);
|
|
dkprintf("sched_setaffinity(): tid %d: setting target core %d\n",
|
|
cpu_local_var(current)->tid, cpu_id);
|
|
empty_set = 0;
|
|
}
|
|
}
|
|
|
|
/* Empty target set? */
|
|
if (empty_set) {
|
|
release_thread(thread);
|
|
return -EINVAL;
|
|
}
|
|
|
|
/* Update new affinity mask */
|
|
memcpy(&thread->cpu_set, &cpu_set, sizeof(cpu_set));
|
|
|
|
/* Current core not part of new mask? */
|
|
cpu_id = thread->cpu_id;
|
|
if (!CPU_ISSET(cpu_id, &thread->cpu_set)) {
|
|
dkprintf("sched_setaffinity(): tid %d sched_request_migrate: %d\n",
|
|
cpu_local_var(current)->tid, cpu_id);
|
|
sched_request_migrate(cpu_id, thread);
|
|
}
|
|
release_thread(thread);
|
|
return 0;
|
|
}
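
/*
 * Summary of sched_setaffinity above: the user-supplied mask is first
 * intersected with the owning process' cpu_set (the cores McKernel actually
 * manages for this process); an empty intersection yields -EINVAL. The
 * thread's cpu_set is then replaced with the intersection, and if the core
 * the thread currently runs on is no longer allowed, sched_request_migrate()
 * asks that core to move the thread away.
 */
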
// see linux-2.6.34.13/kernel/sched.c
SYSCALL_DECLARE(sched_getaffinity)
{
        int tid = (int)ihk_mc_syscall_arg0(ctx);
        size_t len = (size_t)ihk_mc_syscall_arg1(ctx);
        cpu_set_t k_cpu_set, *u_cpu_set = (cpu_set_t *)ihk_mc_syscall_arg2(ctx);
        struct thread *thread;
        int ret;

        dkprintf("%s() len: %d, mask: %p\n", __FUNCTION__, len, u_cpu_set);
        if (len * 8 < num_processors) {
                dkprintf("%s: Too small buffer.\n", __func__);
                return -EINVAL;
        }
        if (len & (sizeof(unsigned long) - 1)) {
                dkprintf("%s: Size not align to unsigned long.\n", __func__);
                return -EINVAL;
        }

        len = MIN2(len, sizeof(k_cpu_set));

        if (tid == 0) {
                thread = cpu_local_var(current);
                hold_thread(thread);
        }
        else {
                struct thread *mythread = cpu_local_var(current);

                thread = find_thread(0, tid);
                if (!thread)
                        return -ESRCH;
                if (mythread->proc->euid != 0 &&
                    mythread->proc->euid != thread->proc->ruid &&
                    mythread->proc->euid != thread->proc->euid) {
                        thread_unlock(thread);
                        return -EPERM;
                }
                hold_thread(thread);
                thread_unlock(thread);
        }

        ret = copy_to_user(u_cpu_set, &thread->cpu_set, len);
        release_thread(thread);
        if (ret < 0) {
                ret = -EFAULT;
        }
        else {
                ret = len;
        }

        dkprintf("%s() len: %d, ret: %d\n", __FUNCTION__, len, ret);

        return ret;
}

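/*
 * Note on the return value convention above: on success sched_getaffinity
 * returns the number of bytes copied into the user mask (at most the
 * kernel-side cpu_set_t size), mirroring the raw Linux syscall; the glibc
 * wrapper is what converts this to 0 for applications.
 */
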
SYSCALL_DECLARE(get_cpu_id)
{
        return ihk_mc_get_processor_id();
}

SYSCALL_DECLARE(setitimer)
|
|
{
|
|
int which = (int)ihk_mc_syscall_arg0(ctx);
|
|
struct itimerval *new = (struct itimerval *)ihk_mc_syscall_arg1(ctx);
|
|
struct itimerval *old = (struct itimerval *)ihk_mc_syscall_arg2(ctx);
|
|
struct syscall_request request IHK_DMA_ALIGN;
|
|
struct thread *thread = cpu_local_var(current);
|
|
int timer_start = 1;
|
|
struct itimerval wkval;
|
|
struct timeval tv;
|
|
|
|
if(which != ITIMER_REAL &&
|
|
which != ITIMER_VIRTUAL &&
|
|
which != ITIMER_PROF)
|
|
return -EINVAL;
|
|
|
|
if(which == ITIMER_REAL){
|
|
request.number = __NR_setitimer;
|
|
request.args[0] = ihk_mc_syscall_arg0(ctx);
|
|
request.args[1] = ihk_mc_syscall_arg1(ctx);
|
|
request.args[2] = ihk_mc_syscall_arg2(ctx);
|
|
|
|
return do_syscall(&request, ihk_mc_get_processor_id());
|
|
}
|
|
else if(which == ITIMER_VIRTUAL){
|
|
if(old){
|
|
memcpy(&wkval, &thread->itimer_virtual, sizeof wkval);
|
|
if(wkval.it_value.tv_sec != 0 ||
|
|
wkval.it_value.tv_usec != 0){
|
|
ts_to_tv(&tv, &thread->itimer_virtual_value);
|
|
tv_sub(&wkval.it_value, &tv);
|
|
}
|
|
if(copy_to_user(old, &wkval, sizeof wkval))
|
|
return -EFAULT;
|
|
}
|
|
if(!new){
|
|
return 0;
|
|
}
|
|
if (copy_from_user(&thread->itimer_virtual, new, sizeof(struct itimerval))) {
|
|
return -EFAULT;
|
|
}
|
|
thread->itimer_virtual_value.tv_sec = 0;
|
|
thread->itimer_virtual_value.tv_nsec = 0;
|
|
if(thread->itimer_virtual.it_value.tv_sec == 0 &&
|
|
thread->itimer_virtual.it_value.tv_usec == 0)
|
|
timer_start = 0;
|
|
}
|
|
else if(which == ITIMER_PROF){
|
|
if(old){
|
|
memcpy(&wkval, &thread->itimer_prof, sizeof wkval);
|
|
if(wkval.it_value.tv_sec != 0 ||
|
|
wkval.it_value.tv_usec != 0){
|
|
ts_to_tv(&tv, &thread->itimer_prof_value);
|
|
tv_sub(&wkval.it_value, &tv);
|
|
}
|
|
if(copy_to_user(old, &wkval, sizeof wkval))
|
|
return -EFAULT;
|
|
}
|
|
if(!new){
|
|
return 0;
|
|
}
|
|
if (copy_from_user(&thread->itimer_prof, new, sizeof(struct itimerval))) {
|
|
return -EFAULT;
|
|
}
|
|
thread->itimer_prof_value.tv_sec = 0;
|
|
thread->itimer_prof_value.tv_nsec = 0;
|
|
if(thread->itimer_prof.it_value.tv_sec == 0 &&
|
|
thread->itimer_prof.it_value.tv_usec == 0)
|
|
timer_start = 0;
|
|
}
|
|
thread->itimer_enabled = timer_start;
|
|
set_timer(0);
|
|
return 0;
|
|
}
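
/*
 * Summary of setitimer above: ITIMER_REAL is forwarded to Linux unchanged,
 * since wall-clock delivery of SIGALRM is handled there. ITIMER_VIRTUAL and
 * ITIMER_PROF are kept per-thread in McKernel (itimer_virtual / itimer_prof
 * plus the *_value fields that track already-consumed time); the old value
 * returned to the caller is the programmed value minus the time accounted so
 * far, and thread->itimer_enabled together with set_timer() arms the local
 * tick. getitimer below reads the same state without modifying it.
 */
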
|
|
|
|
SYSCALL_DECLARE(getitimer)
|
|
{
|
|
int which = (int)ihk_mc_syscall_arg0(ctx);
|
|
struct itimerval *old = (struct itimerval *)ihk_mc_syscall_arg1(ctx);
|
|
struct syscall_request request IHK_DMA_ALIGN;
|
|
struct thread *thread = cpu_local_var(current);
|
|
struct itimerval wkval;
|
|
struct timeval tv;
|
|
|
|
if(which != ITIMER_REAL &&
|
|
which != ITIMER_VIRTUAL &&
|
|
which != ITIMER_PROF)
|
|
return -EINVAL;
|
|
|
|
if(which == ITIMER_REAL){
|
|
request.number = __NR_getitimer;
|
|
request.args[0] = ihk_mc_syscall_arg0(ctx);
|
|
request.args[1] = ihk_mc_syscall_arg1(ctx);
|
|
|
|
return do_syscall(&request, ihk_mc_get_processor_id());
|
|
}
|
|
else if(which == ITIMER_VIRTUAL){
|
|
if(old){
|
|
memcpy(&wkval, &thread->itimer_virtual, sizeof wkval);
|
|
if(wkval.it_value.tv_sec != 0 ||
|
|
wkval.it_value.tv_usec != 0){
|
|
ts_to_tv(&tv, &thread->itimer_virtual_value);
|
|
tv_sub(&wkval.it_value, &tv);
|
|
}
|
|
if(copy_to_user(old, &wkval, sizeof wkval))
|
|
return -EFAULT;
|
|
}
|
|
}
|
|
else if(which == ITIMER_PROF){
|
|
if(old){
|
|
memcpy(&wkval, &thread->itimer_prof, sizeof wkval);
|
|
if(wkval.it_value.tv_sec != 0 ||
|
|
wkval.it_value.tv_usec != 0){
|
|
ts_to_tv(&tv, &thread->itimer_prof_value);
|
|
tv_sub(&wkval.it_value, &tv);
|
|
}
|
|
if(copy_to_user(old, &wkval, sizeof wkval))
|
|
return -EFAULT;
|
|
}
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
SYSCALL_DECLARE(clock_gettime)
|
|
{
|
|
/* TODO: handle clock_id */
|
|
struct timespec *ts = (struct timespec *)ihk_mc_syscall_arg1(ctx);
|
|
int clock_id = (int)ihk_mc_syscall_arg0(ctx);
|
|
struct syscall_request request IHK_DMA_ALIGN;
|
|
int error;
|
|
struct timespec ats;
|
|
|
|
if (!ts) {
|
|
/* nothing to do */
|
|
return 0;
|
|
}
|
|
|
|
/* Do it locally if supported */
|
|
if (gettime_local_support && clock_id == CLOCK_REALTIME) {
|
|
calculate_time_from_tsc(&ats);
|
|
|
|
error = copy_to_user(ts, &ats, sizeof(ats));
|
|
|
|
dkprintf("clock_gettime(): %d\n", error);
|
|
return error;
|
|
}
|
|
else if(clock_id == CLOCK_PROCESS_CPUTIME_ID){
|
|
struct thread *thread = cpu_local_var(current);
|
|
struct process *proc = thread->proc;
|
|
struct thread *child;
|
|
struct mcs_rwlock_node lock;
|
|
|
|
mcs_rwlock_reader_lock_noirq(&proc->threads_lock, &lock);
|
|
list_for_each_entry(child, &proc->threads_list, siblings_list){
|
|
if(child != thread &&
|
|
child->status == PS_RUNNING &&
|
|
!child->in_kernel){
|
|
child->times_update = 0;
|
|
ihk_mc_interrupt_cpu(child->cpu_id,
|
|
ihk_mc_get_vector(IHK_GV_IKC));
|
|
}
|
|
}
|
|
ats.tv_sec = proc->utime.tv_sec;
|
|
ats.tv_nsec = proc->utime.tv_nsec;
|
|
ts_add(&ats, &proc->stime);
|
|
list_for_each_entry(child, &proc->threads_list, siblings_list){
|
|
struct timespec wts;
|
|
while(!child->times_update)
|
|
cpu_pause();
|
|
tsc_to_ts(child->user_tsc + child->system_tsc, &wts);
|
|
ts_add(&ats, &wts);
|
|
}
|
|
mcs_rwlock_reader_unlock_noirq(&proc->threads_lock, &lock);
|
|
return copy_to_user(ts, &ats, sizeof ats);
|
|
}
|
|
else if(clock_id == CLOCK_THREAD_CPUTIME_ID){
|
|
struct thread *thread = cpu_local_var(current);
|
|
|
|
tsc_to_ts(thread->user_tsc + thread->system_tsc, &ats);
|
|
return copy_to_user(ts, &ats, sizeof ats);
|
|
}
|
|
|
|
/* Otherwise offload */
|
|
request.number = __NR_clock_gettime;
|
|
request.args[0] = ihk_mc_syscall_arg0(ctx);
|
|
request.args[1] = ihk_mc_syscall_arg1(ctx);
|
|
|
|
return do_syscall(&request, ihk_mc_get_processor_id());
|
|
}
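
/*
 * Summary of clock_gettime above: CLOCK_REALTIME is computed locally from the
 * TSC when gettime_local_support is set; CLOCK_PROCESS_CPUTIME_ID clears
 * times_update on running sibling threads, interrupts their CPUs so they
 * refresh their accounting, then sums proc->utime/stime and each thread's
 * user_tsc + system_tsc; CLOCK_THREAD_CPUTIME_ID converts only the calling
 * thread's counters. Every other clock id is offloaded to Linux.
 */
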
SYSCALL_DECLARE(gettimeofday)
{
        struct timeval *tv = (struct timeval *)ihk_mc_syscall_arg0(ctx);
        struct syscall_request request IHK_DMA_ALIGN;
        struct timezone *tz = (struct timezone *)ihk_mc_syscall_arg1(ctx);
        struct timeval atv;
        int error;
        struct timespec ats;

        if (!tv && !tz) {
                /* nothing to do */
                return 0;
        }

        /* Do it locally if supported */
        if (!tz && gettime_local_support) {
                calculate_time_from_tsc(&ats);

                atv.tv_sec = ats.tv_sec;
                atv.tv_usec = ats.tv_nsec / 1000;

                error = copy_to_user(tv, &atv, sizeof(atv));

                dkprintf("gettimeofday(): %d\n", error);
                return error;
        }

        /* Otherwise offload */
        request.number = __NR_gettimeofday;
        request.args[0] = (unsigned long)tv;
        request.args[1] = (unsigned long)tz;

        return do_syscall(&request, ihk_mc_get_processor_id());
}

SYSCALL_DECLARE(settimeofday)
{
        long error;
        struct timeval * const utv = (void *)ihk_mc_syscall_arg0(ctx);
        struct timezone * const utz = (void *)ihk_mc_syscall_arg1(ctx);
        struct timeval tv;
        struct timespec newts;
        unsigned long tsc;

        dkprintf("sys_settimeofday(%p,%p)\n", utv, utz);
        ihk_mc_spinlock_lock_noirq(&tod_data_lock);
        if (ihk_atomic64_read(&tod_data.version) & 1) {
                panic("settimeofday");
        }

        if (utv && gettime_local_support) {
                if (copy_from_user(&tv, utv, sizeof(tv))) {
                        error = -EFAULT;
                        goto out;
                }
                newts.tv_sec = tv.tv_sec;
                newts.tv_nsec = (long)tv.tv_usec * 1000;

                tsc = rdtsc();
                newts.tv_sec -= tsc / tod_data.clocks_per_sec;
                newts.tv_nsec -= NS_PER_SEC * (tsc % tod_data.clocks_per_sec)
                        / tod_data.clocks_per_sec;
                if (newts.tv_nsec < 0) {
                        --newts.tv_sec;
                        newts.tv_nsec += NS_PER_SEC;
                }
        }

        error = syscall_generic_forwarding(n, ctx);

        if (!error && utv && gettime_local_support) {
                dkprintf("sys_settimeofday(%p,%p):origin <-- %ld.%ld\n",
                         utv, utz, newts.tv_sec, newts.tv_nsec);
                ihk_atomic64_inc(&tod_data.version);
                wmb();
                tod_data.origin = newts;
                wmb();
                ihk_atomic64_inc(&tod_data.version);
        }

out:
        ihk_mc_spinlock_unlock_noirq(&tod_data_lock);
        dkprintf("sys_settimeofday(%p,%p): %ld\n", utv, utz, error);
        return error;
}

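/*
 * The tod_data.origin maintained above is effectively the wall-clock value at
 * TSC == 0, so that calculate_time_from_tsc() can later reconstruct
 * origin + tsc / clocks_per_sec. Worked example with made-up numbers: a
 * requested time of 1000.000000000 s while tsc equals 25 * clocks_per_sec
 * yields an origin of 975.000000000 s, and a reader sampling the TSC one
 * second later recovers 1001 s. The version counter is bumped to an odd value
 * before and an even value after the update so that concurrent readers can
 * detect a torn update, seqlock-style.
 */
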
SYSCALL_DECLARE(nanosleep)
{
        struct timespec *tv = (struct timespec *)ihk_mc_syscall_arg0(ctx);
        struct timespec *rem = (struct timespec *)ihk_mc_syscall_arg1(ctx);
        struct syscall_request request IHK_DMA_ALIGN;
        struct ihk_os_cpu_monitor *monitor = cpu_local_var(monitor);

        monitor->status = IHK_OS_MONITOR_KERNEL_HEAVY;

        /* Do it locally if supported */
        if (gettime_local_support) {
                unsigned long nanosecs;
                unsigned long nanosecs_rem;
                unsigned long tscs;
                long tscs_rem;
                struct timespec _tv;
                struct timespec _rem;
                int ret = 0;

                unsigned long ts = rdtsc();

                if (copy_from_user(&_tv, tv, sizeof(*tv))) {
                        return -EFAULT;
                }

                if (_tv.tv_sec < 0 || _tv.tv_nsec >= NS_PER_SEC) {
                        return -EINVAL;
                }

                nanosecs = _tv.tv_sec * NS_PER_SEC + _tv.tv_nsec;
                tscs = nanosecs * 1000 / ihk_mc_get_ns_per_tsc();

                /* Spin wait */
                while (rdtsc() - ts < tscs) {
                        if (hassigpending(cpu_local_var(current))) {
                                ret = -EINTR;
                                break;
                        }
                        cpu_pause();
                }

                if ((ret == -EINTR) && rem) {
                        tscs_rem = tscs - (rdtsc() - ts);
                        if (tscs_rem < 0) {
                                tscs_rem = 0;
                        }
                        nanosecs_rem = tscs_rem * ihk_mc_get_ns_per_tsc() / 1000;

                        _rem.tv_sec = nanosecs_rem / NS_PER_SEC;
                        _rem.tv_nsec = nanosecs_rem % NS_PER_SEC;

                        if (copy_to_user(rem, &_rem, sizeof(*rem))) {
                                ret = -EFAULT;
                        }
                }

                return ret;
        }

        /* Otherwise offload */
        request.number = __NR_nanosleep;
        request.args[0] = (unsigned long)tv;
        request.args[1] = (unsigned long)rem;

        return do_syscall(&request, ihk_mc_get_processor_id());
}

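/*
 * The local nanosleep path above busy-waits on the TSC instead of blocking.
 * ihk_mc_get_ns_per_tsc() appears to return nanoseconds-per-tick scaled by
 * 1000, hence tscs = nanosecs * 1000 / ns_per_tsc. Example with assumed
 * numbers: a 2 GHz clock gives ns_per_tsc == 500 (0.5 ns * 1000), so a
 * request of 1 ms (1,000,000 ns) spins for 1,000,000 * 1000 / 500 =
 * 2,000,000 ticks. A pending signal breaks the loop with -EINTR, and the
 * unelapsed part is converted back to a timespec and stored in *rem.
 */
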
//#define DISABLE_SCHED_YIELD
SYSCALL_DECLARE(sched_yield)
{
        int do_schedule = 0;
        long runq_irqstate;
        struct cpu_local_var *v = get_this_cpu_local_var();

#ifdef DISABLE_SCHED_YIELD
        return 0;
#endif

        runq_irqstate = ihk_mc_spinlock_lock(&v->runq_lock);

        if (v->flags & CPU_FLAG_NEED_RESCHED || v->runq_len > 1) {
                v->flags &= ~CPU_FLAG_NEED_RESCHED;
                do_schedule = 1;
        }

        ihk_mc_spinlock_unlock(&v->runq_lock, runq_irqstate);

        if (do_schedule) {
                schedule();
        }

        return 0;
}

SYSCALL_DECLARE(mlock)
|
|
{
|
|
const uintptr_t start0 = ihk_mc_syscall_arg0(ctx);
|
|
const size_t len0 = ihk_mc_syscall_arg1(ctx);
|
|
struct thread *thread = cpu_local_var(current);
|
|
struct vm_regions *region = &thread->vm->region;
|
|
uintptr_t start;
|
|
size_t len;
|
|
uintptr_t end;
|
|
struct vm_range *first;
|
|
uintptr_t addr;
|
|
struct vm_range *range;
|
|
int error;
|
|
struct vm_range *changed;
|
|
|
|
dkprintf("[%d]sys_mlock(%lx,%lx)\n",
|
|
ihk_mc_get_processor_id(), start0, len0);
|
|
|
|
start = start0 & PAGE_MASK;
|
|
len = (start & (PAGE_SIZE - 1)) + len0;
|
|
len = (len + PAGE_SIZE - 1) & PAGE_MASK;
|
|
end = start + len;
|
|
|
|
if (end < start) {
|
|
error = -EINVAL;
|
|
goto out2;
|
|
}
|
|
|
|
if ((start < region->user_start)
|
|
|| (region->user_end <= start)
|
|
|| (len > (region->user_end - region->user_start))
|
|
|| ((region->user_end - len) < start)) {
|
|
error = -ENOMEM;
|
|
goto out2;
|
|
}
|
|
|
|
if (start == end) {
|
|
error = 0;
|
|
goto out2;
|
|
}
|
|
|
|
ihk_rwspinlock_write_lock_noirq(&thread->vm->memory_range_lock);
|
|
|
|
/* check contiguous map */
|
|
first = NULL;
|
|
for (addr = start; addr < end; addr = range->end) {
|
|
if (first == NULL) {
|
|
range = lookup_process_memory_range(thread->vm, start, start+PAGE_SIZE);
|
|
first = range;
|
|
}
|
|
else {
|
|
range = next_process_memory_range(thread->vm, range);
|
|
}
|
|
|
|
if (!range || (addr < range->start)) {
|
|
/* not contiguous */
|
|
dkprintf("[%d]sys_mlock(%lx,%lx):not contiguous."
|
|
" %lx [%lx-%lx)\n",
|
|
ihk_mc_get_processor_id(), start0,
|
|
len0, addr, range->start, range->end);
|
|
error = -ENOMEM;
|
|
goto out;
|
|
}
|
|
|
|
if (range->flag & (VR_REMOTE | VR_RESERVED | VR_IO_NOCACHE)) {
|
|
ekprintf("[%d]sys_mlock(%lx,%lx):cannot change."
|
|
" [%lx-%lx) %lx\n",
|
|
ihk_mc_get_processor_id(), start0,
|
|
len0, range->start, range->end,
|
|
range->flag);
|
|
error = -EINVAL;
|
|
goto out;
|
|
}
|
|
}
|
|
|
|
/* do the mlock */
|
|
changed = NULL;
|
|
for (addr = start; addr < end; addr = changed->end) {
|
|
if (!changed) {
|
|
range = first;
|
|
}
|
|
else {
|
|
range = next_process_memory_range(thread->vm, changed);
|
|
}
|
|
|
|
if (!range || (addr < range->start)) {
|
|
/* not contiguous */
|
|
dkprintf("[%d]sys_mlock(%lx,%lx):not contiguous."
|
|
" %lx [%lx-%lx)\n",
|
|
ihk_mc_get_processor_id(), start0,
|
|
len0, addr, range?range->start:0,
|
|
range?range->end:0);
|
|
error = -ENOMEM;
|
|
goto out;
|
|
}
|
|
|
|
if (range->start < addr) {
|
|
error = split_process_memory_range(thread->vm, range, addr, &range);
|
|
if (error) {
|
|
ekprintf("[%d]sys_mlock(%lx,%lx):split failed. "
|
|
" [%lx-%lx) %lx %d\n",
|
|
ihk_mc_get_processor_id(),
|
|
start0, len0, range->start,
|
|
range->end, addr, error);
|
|
goto out;
|
|
}
|
|
}
|
|
if (end < range->end) {
|
|
error = split_process_memory_range(thread->vm, range, end, NULL);
|
|
if (error) {
|
|
ekprintf("[%d]sys_mlock(%lx,%lx):split failed. "
|
|
" [%lx-%lx) %lx %d\n",
|
|
ihk_mc_get_processor_id(),
|
|
start0, len0, range->start,
|
|
range->end, addr, error);
|
|
goto out;
|
|
}
|
|
}
|
|
|
|
range->flag |= VR_LOCKED;
|
|
|
|
if (!changed) {
|
|
changed = range;
|
|
}
|
|
else {
|
|
error = join_process_memory_range(thread->vm, changed, range);
|
|
if (error) {
|
|
dkprintf("[%d]sys_mlock(%lx,%lx):join failed. %d",
|
|
ihk_mc_get_processor_id(),
|
|
start0, len0, error);
|
|
dkprintf("LHS: %p [%lx-%lx) %lx %p\n",
|
|
changed, changed->start,
|
|
changed->end, changed->flag,
|
|
changed->memobj);
|
|
dkprintf("RHS: %p [%lx-%lx) %lx %p\n",
|
|
range, range->start,
|
|
range->end, range->flag,
|
|
range->memobj);
|
|
changed = range;
|
|
/* through */
|
|
}
|
|
}
|
|
}
|
|
|
|
error = 0;
|
|
out:
|
|
ihk_rwspinlock_write_unlock_noirq(&thread->vm->memory_range_lock);
|
|
|
|
if (!error) {
|
|
error = populate_process_memory(thread->vm, (void *)start, len);
|
|
if (error) {
|
|
ekprintf("sys_mlock(%lx,%lx):populate failed. %d\n",
|
|
start0, len0, error);
|
|
/*
|
|
* In this case,
|
|
* the region locked by this call should be unlocked
|
|
* before mlock() returns with error.
|
|
*
|
|
* However, the region cannot be unlocked simply,
|
|
* because the region can be modified by other thread
|
|
* because memory_range_lock has been released.
|
|
*
|
|
* For the time being, like a linux-2.6.38-8,
|
|
* the physical page allocation failure is ignored.
|
|
*/
|
|
error = 0;
|
|
}
|
|
}
|
|
|
|
out2:
|
|
dkprintf("[%d]sys_mlock(%lx,%lx): %d\n",
|
|
ihk_mc_get_processor_id(), start0, len0, error);
|
|
return error;
|
|
}
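
/*
 * Summary of sys_mlock above: after page-aligning the request, the first
 * range walk only verifies that [start, end) is covered by contiguous VM
 * ranges that are not VR_REMOTE/VR_RESERVED/VR_IO_NOCACHE; the second walk
 * splits the first and last ranges at the boundaries, sets VR_LOCKED and
 * re-joins neighbouring ranges. populate_process_memory() failures after the
 * range lock has been dropped are deliberately ignored, as the in-line
 * comment explains. sys_munlock below is the mirror image, clearing
 * VR_LOCKED with the same walk.
 */
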
|
|
|
|
SYSCALL_DECLARE(munlock)
|
|
{
|
|
const uintptr_t start0 = ihk_mc_syscall_arg0(ctx);
|
|
const size_t len0 = ihk_mc_syscall_arg1(ctx);
|
|
struct thread *thread = cpu_local_var(current);
|
|
struct vm_regions *region = &thread->vm->region;
|
|
uintptr_t start;
|
|
size_t len;
|
|
uintptr_t end;
|
|
struct vm_range *first;
|
|
uintptr_t addr;
|
|
struct vm_range *range;
|
|
int error;
|
|
struct vm_range *changed;
|
|
|
|
dkprintf("[%d]sys_munlock(%lx,%lx)\n",
|
|
ihk_mc_get_processor_id(), start0, len0);
|
|
|
|
start = start0 & PAGE_MASK;
|
|
len = (start & (PAGE_SIZE - 1)) + len0;
|
|
len = (len + PAGE_SIZE - 1) & PAGE_MASK;
|
|
end = start + len;
|
|
|
|
if (end < start) {
|
|
error = -EINVAL;
|
|
goto out2;
|
|
}
|
|
|
|
if ((start < region->user_start)
|
|
|| (region->user_end <= start)
|
|
|| (len > (region->user_end - region->user_start))
|
|
|| ((region->user_end - len) < start)) {
|
|
error = -ENOMEM;
|
|
goto out2;
|
|
}
|
|
|
|
if (start == end) {
|
|
error = 0;
|
|
goto out2;
|
|
}
|
|
|
|
ihk_rwspinlock_write_lock_noirq(&thread->vm->memory_range_lock);
|
|
|
|
/* check contiguous map */
|
|
first = NULL;
|
|
for (addr = start; addr < end; addr = range->end) {
|
|
if (first == NULL) {
|
|
range = lookup_process_memory_range(thread->vm, start, start+PAGE_SIZE);
|
|
first = range;
|
|
}
|
|
else {
|
|
range = next_process_memory_range(thread->vm, range);
|
|
}
|
|
|
|
if (!range || (addr < range->start)) {
|
|
/* not contiguous */
|
|
dkprintf("[%d]sys_munlock(%lx,%lx):not contiguous."
|
|
" %lx [%lx-%lx)\n",
|
|
ihk_mc_get_processor_id(), start0,
|
|
len0, addr, range->start, range->end);
|
|
error = -ENOMEM;
|
|
goto out;
|
|
}
|
|
|
|
if (range->flag & (VR_REMOTE | VR_RESERVED | VR_IO_NOCACHE)) {
|
|
ekprintf("[%d]sys_munlock(%lx,%lx):cannot change."
|
|
" [%lx-%lx) %lx\n",
|
|
ihk_mc_get_processor_id(), start0,
|
|
len0, range->start, range->end,
|
|
range->flag);
|
|
error = -EINVAL;
|
|
goto out;
|
|
}
|
|
}
|
|
|
|
/* do the munlock */
|
|
changed = NULL;
|
|
for (addr = start; addr < end; addr = changed->end) {
|
|
if (!changed) {
|
|
range = first;
|
|
}
|
|
else {
|
|
range = next_process_memory_range(thread->vm, changed);
|
|
}
|
|
|
|
if (!range || (addr < range->start)) {
|
|
/* not contiguous */
|
|
dkprintf("[%d]sys_munlock(%lx,%lx):not contiguous."
|
|
" %lx [%lx-%lx)\n",
|
|
ihk_mc_get_processor_id(), start0,
|
|
len0, addr, range?range->start:0,
|
|
range?range->end:0);
|
|
error = -ENOMEM;
|
|
goto out;
|
|
}
|
|
|
|
if (range->start < addr) {
|
|
error = split_process_memory_range(thread->vm, range, addr, &range);
|
|
if (error) {
|
|
ekprintf("[%d]sys_munlock(%lx,%lx):split failed. "
|
|
" [%lx-%lx) %lx %d\n",
|
|
ihk_mc_get_processor_id(),
|
|
start0, len0, range->start,
|
|
range->end, addr, error);
|
|
goto out;
|
|
}
|
|
}
|
|
if (end < range->end) {
|
|
error = split_process_memory_range(thread->vm, range, end, NULL);
|
|
if (error) {
|
|
ekprintf("[%d]sys_munlock(%lx,%lx):split failed. "
|
|
" [%lx-%lx) %lx %d\n",
|
|
ihk_mc_get_processor_id(),
|
|
start0, len0, range->start,
|
|
range->end, addr, error);
|
|
goto out;
|
|
}
|
|
}
|
|
|
|
range->flag &= ~VR_LOCKED;
|
|
|
|
if (!changed) {
|
|
changed = range;
|
|
}
|
|
else {
|
|
error = join_process_memory_range(thread->vm, changed, range);
|
|
if (error) {
|
|
dkprintf("[%d]sys_munlock(%lx,%lx):join failed. %d",
|
|
ihk_mc_get_processor_id(),
|
|
start0, len0, error);
|
|
dkprintf("LHS: %p [%lx-%lx) %lx %p\n",
|
|
changed, changed->start,
|
|
changed->end, changed->flag,
|
|
changed->memobj);
|
|
dkprintf("RHS: %p [%lx-%lx) %lx %p\n",
|
|
range, range->start,
|
|
range->end, range->flag,
|
|
range->memobj);
|
|
changed = range;
|
|
/* through */
|
|
}
|
|
}
|
|
}
|
|
|
|
error = 0;
|
|
out:
|
|
ihk_rwspinlock_write_unlock_noirq(&thread->vm->memory_range_lock);
|
|
out2:
|
|
dkprintf("[%d]sys_munlock(%lx,%lx): %d\n",
|
|
ihk_mc_get_processor_id(), start0, len0, error);
|
|
return error;
|
|
}
|
|
|
|
SYSCALL_DECLARE(mlockall)
{
        const int flags = ihk_mc_syscall_arg0(ctx);
        struct thread *thread = cpu_local_var(current);
        struct process *proc = thread->proc;

        if (!flags || (flags & ~(MCL_CURRENT|MCL_FUTURE))) {
                kprintf("mlockall(0x%x):invalid flags: EINVAL\n", flags);
                return -EINVAL;
        }

        if (!proc->euid) {
                kprintf("mlockall(0x%x):priv user: 0\n", flags);
                return 0;
        }

        if (proc->rlimit[MCK_RLIMIT_MEMLOCK].rlim_cur != 0) {
                kprintf("mlockall(0x%x):limits exists: ENOMEM\n", flags);
                return -ENOMEM;
        }

        kprintf("mlockall(0x%x):no lock permitted: EPERM\n", flags);
        return -EPERM;
} /* sys_mlockall() */

SYSCALL_DECLARE(munlockall)
{
        kprintf("munlockall(): 0\n");
        return 0;
} /* sys_munlockall() */

SYSCALL_DECLARE(remap_file_pages)
|
|
{
|
|
const uintptr_t start0 = ihk_mc_syscall_arg0(ctx);
|
|
const size_t size = ihk_mc_syscall_arg1(ctx);
|
|
const int prot = ihk_mc_syscall_arg2(ctx);
|
|
const size_t pgoff = ihk_mc_syscall_arg3(ctx);
|
|
const int flags = ihk_mc_syscall_arg4(ctx);
|
|
int error;
|
|
const uintptr_t start = start0 & PAGE_MASK;
|
|
const uintptr_t end = start + size;
|
|
const off_t off = (off_t)pgoff << PAGE_SHIFT;
|
|
struct thread * const thread = cpu_local_var(current);
|
|
struct vm_range *range;
|
|
int er;
|
|
int need_populate = 0;
|
|
|
|
dkprintf("sys_remap_file_pages(%#lx,%#lx,%#x,%#lx,%#x)\n",
|
|
start0, size, prot, pgoff, flags);
|
|
ihk_rwspinlock_write_lock_noirq(&thread->vm->memory_range_lock);
|
|
#define PGOFF_LIMIT ((off_t)1 << ((8*sizeof(off_t) - 1) - PAGE_SHIFT))
|
|
if ((size <= 0) || (size & (PAGE_SIZE - 1)) || (prot != 0)
|
|
|| (PGOFF_LIMIT <= pgoff)
|
|
|| ((PGOFF_LIMIT - pgoff) < (size / PAGE_SIZE))
|
|
|| !((start < end) || (end == 0))) {
|
|
ekprintf("sys_remap_file_pages(%#lx,%#lx,%#x,%#lx,%#x):"
|
|
"invalid args\n",
|
|
start0, size, prot, pgoff, flags);
|
|
error = -EINVAL;
|
|
goto out;
|
|
}
|
|
|
|
range = lookup_process_memory_range(thread->vm, start, end);
|
|
if (!range || (start < range->start) || (range->end < end)
|
|
|| (range->flag & VR_PRIVATE)
|
|
|| (range->flag & (VR_REMOTE|VR_IO_NOCACHE|VR_RESERVED))
|
|
|| !is_callable_remap_file_pages(range->memobj)) {
|
|
ekprintf("sys_remap_file_pages(%#lx,%#lx,%#x,%#lx,%#x):"
|
|
"invalid VMR:[%#lx-%#lx) %#lx %p\n",
|
|
start0, size, prot, pgoff, flags,
|
|
range?range->start:0, range?range->end:0,
|
|
range?range->flag:0, range?range->memobj:NULL);
|
|
error = -EINVAL;
|
|
goto out;
|
|
}
|
|
|
|
flush_nfo_tlb();
|
|
|
|
range->flag |= VR_FILEOFF;
|
|
error = remap_process_memory_range(thread->vm, range, start, end, off);
|
|
if (error) {
|
|
ekprintf("sys_remap_file_pages(%#lx,%#lx,%#x,%#lx,%#x):"
|
|
"remap failed %d\n",
|
|
start0, size, prot, pgoff, flags, error);
|
|
goto out;
|
|
}
|
|
clear_host_pte(start, size, 1 /* memory range lock */); /* XXX: workaround */
|
|
|
|
if (range->flag & VR_LOCKED) {
|
|
need_populate = 1;
|
|
}
|
|
error = 0;
|
|
out:
|
|
ihk_rwspinlock_write_unlock_noirq(&thread->vm->memory_range_lock);
|
|
|
|
if (need_populate
|
|
&& (er = populate_process_memory(
|
|
thread->vm, (void *)start, size))) {
|
|
ekprintf("sys_remap_file_pages(%#lx,%#lx,%#x,%#lx,%#x):"
|
|
"populate failed %d\n",
|
|
start0, size, prot, pgoff, flags, er);
|
|
/* ignore populate error */
|
|
}
|
|
|
|
dkprintf("sys_remap_file_pages(%#lx,%#lx,%#x,%#lx,%#x): %d\n",
|
|
start0, size, prot, pgoff, flags, error);
|
|
return error;
|
|
}
|
|
|
|
SYSCALL_DECLARE(mremap)
|
|
{
|
|
const uintptr_t oldaddr = ihk_mc_syscall_arg0(ctx);
|
|
const size_t oldsize0 = ihk_mc_syscall_arg1(ctx);
|
|
const size_t newsize0 = ihk_mc_syscall_arg2(ctx);
|
|
const int flags = ihk_mc_syscall_arg3(ctx);
|
|
const uintptr_t newaddr = ihk_mc_syscall_arg4(ctx);
|
|
const size_t oldsize = (oldsize0 + PAGE_SIZE - 1) & PAGE_MASK;
|
|
const size_t newsize = (newsize0 + PAGE_SIZE - 1) & PAGE_MASK;
|
|
const uintptr_t oldstart = oldaddr;
|
|
const uintptr_t oldend = oldstart + oldsize;
|
|
struct thread *thread = cpu_local_var(current);
|
|
struct process_vm *vm = thread->vm;
|
|
int error;
|
|
struct vm_range *range;
|
|
int need_relocate;
|
|
uintptr_t newstart;
|
|
uintptr_t newend;
|
|
size_t size;
|
|
uintptr_t ret;
|
|
uintptr_t lckstart = -1;
|
|
uintptr_t lckend = -1;
|
|
|
|
dkprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx)\n",
|
|
oldaddr, oldsize0, newsize0, flags, newaddr);
|
|
|
|
if (vm->proc->straight_va &&
|
|
(void *)oldaddr >= vm->proc->straight_va &&
|
|
(void *)oldaddr < vm->proc->straight_va + vm->proc->straight_len) {
|
|
kprintf("%s: reject for straight range 0x%lx\n",
|
|
__FUNCTION__, oldaddr);
|
|
return -EINVAL;
|
|
}
|
|
|
|
ihk_rwspinlock_write_lock_noirq(&vm->memory_range_lock);
|
|
|
|
/* check arguments */
|
|
if ((oldaddr & ~PAGE_MASK)
|
|
|| (newsize == 0)
|
|
|| (flags & ~(MREMAP_MAYMOVE | MREMAP_FIXED))
|
|
|| ((flags & MREMAP_FIXED)
|
|
&& !(flags & MREMAP_MAYMOVE))
|
|
|| ((flags & MREMAP_FIXED)
|
|
&& (newaddr & ~PAGE_MASK))) {
|
|
error = -EINVAL;
|
|
ekprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx):invalid. %d\n",
|
|
oldaddr, oldsize0, newsize0, flags, newaddr,
|
|
error);
|
|
goto out;
|
|
}
|
|
|
|
/* check necessity of remap */
|
|
if (!(flags & MREMAP_FIXED) && oldsize == newsize) {
|
|
/* Nothing to do */
|
|
error = 0;
|
|
newstart = oldaddr;
|
|
goto out;
|
|
}
|
|
|
|
if (oldend < oldstart) {
|
|
error = -EINVAL;
|
|
ekprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx):"
|
|
"old range overflow. %d\n",
|
|
oldaddr, oldsize0, newsize0,
|
|
flags, newaddr, error);
|
|
goto out;
|
|
}
|
|
|
|
if (newsize > (vm->region.user_end - vm->region.user_start)) {
|
|
error = -ENOMEM;
|
|
ekprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx):"
|
|
"cannot allocate. %d\n",
|
|
oldaddr, oldsize0, newsize0,
|
|
flags, newaddr, error);
|
|
goto out;
|
|
}
|
|
|
|
/* check original mapping */
|
|
range = lookup_process_memory_range(vm, oldstart, oldstart+PAGE_SIZE);
|
|
if (!range || (oldstart < range->start) || (range->end < oldend)
|
|
|| (range->flag & (VR_FILEOFF))
|
|
|| (range->flag & (VR_REMOTE|VR_IO_NOCACHE|VR_RESERVED))) {
|
|
error = -EFAULT;
|
|
ekprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx):"
|
|
"lookup failed. %d %p %#lx-%#lx %#lx\n",
|
|
oldaddr, oldsize0, newsize0, flags, newaddr,
|
|
error, range, range?range->start:0,
|
|
range?range->end:0, range?range->flag:0);
|
|
goto out;
|
|
}
|
|
|
|
/* determine new mapping range */
|
|
need_relocate = 0;
|
|
if (flags & MREMAP_FIXED) {
|
|
need_relocate = 1;
|
|
newstart = newaddr;
|
|
newend = newstart + newsize;
|
|
if (newstart < vm->region.user_start) {
|
|
error = -EPERM;
|
|
ekprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx):"
|
|
"mmap_min_addr %#lx. %d\n",
|
|
oldaddr, oldsize0, newsize0, flags,
|
|
newaddr, vm->region.user_start,
|
|
error);
|
|
goto out;
|
|
}
|
|
if ((newstart < oldend) && (oldstart < newend)) {
|
|
error = -EINVAL;
|
|
ekprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx):"
|
|
"fixed:overlapped. %d\n",
|
|
oldaddr, oldsize0, newsize0, flags,
|
|
newaddr, error);
|
|
goto out;
|
|
}
|
|
}
|
|
else if (!(flags & MREMAP_FIXED) && (oldsize < newsize)) {
|
|
if (oldend == range->end) {
|
|
newstart = oldstart;
|
|
newend = newstart + newsize;
|
|
error = extend_up_process_memory_range(vm, range,
|
|
newend);
|
|
flush_nfo_tlb();
|
|
if (!error) {
|
|
if (range->flag & VR_LOCKED) {
|
|
lckstart = oldend;
|
|
lckend = newend;
|
|
}
|
|
goto out;
|
|
}
|
|
}
|
|
if (!(flags & MREMAP_MAYMOVE)) {
|
|
error = -ENOMEM;
|
|
ekprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx):"
|
|
"cannot relocate. %d\n",
|
|
oldaddr, oldsize0, newsize0, flags,
|
|
newaddr, error);
|
|
goto out;
|
|
}
|
|
need_relocate = 1;
|
|
error = search_free_space(newsize, range->pgshift, &newstart);
|
|
if (error) {
|
|
ekprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx):"
|
|
"search failed. %d\n",
|
|
oldaddr, oldsize0, newsize0, flags,
|
|
newaddr, error);
|
|
goto out;
|
|
}
|
|
newend = newstart + newsize;
|
|
}
|
|
else {
|
|
newstart = oldstart;
|
|
newend = newstart + newsize;
|
|
}
|
|
|
|
/* do the remap */
|
|
if (need_relocate) {
|
|
if (flags & MREMAP_FIXED) {
|
|
error = do_munmap((void *)newstart, newsize, 1/* holding memory_range_lock */);
|
|
if (error) {
|
|
ekprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx):"
|
|
"fixed:munmap failed. %d\n",
|
|
oldaddr, oldsize0, newsize0,
|
|
flags, newaddr, error);
|
|
goto out;
|
|
}
|
|
}
|
|
if (range->memobj) {
|
|
memobj_ref(range->memobj);
|
|
}
|
|
error = add_process_memory_range(thread->vm, newstart, newend, -1,
|
|
range->flag, range->memobj,
|
|
range->objoff + (oldstart - range->start),
|
|
0, NULL, NULL);
|
|
if (error) {
|
|
ekprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx):"
|
|
"add failed. %d\n",
|
|
oldaddr, oldsize0, newsize0, flags,
|
|
newaddr, error);
|
|
if (range->memobj) {
|
|
memobj_unref(range->memobj);
|
|
}
|
|
goto out;
|
|
}
|
|
flush_nfo_tlb();
|
|
if (range->flag & VR_LOCKED) {
|
|
lckstart = newstart;
|
|
lckend = newend;
|
|
}
|
|
|
|
if (oldsize > 0) {
|
|
size = (oldsize < newsize)? oldsize: newsize;
|
|
ihk_mc_spinlock_lock_noirq(&vm->page_table_lock);
|
|
if (range->start != oldstart) {
|
|
error = split_process_memory_range(vm,
|
|
range, oldstart, &range);
|
|
if (error) {
|
|
ekprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx):"
|
|
"split range failed. %d\n",
|
|
oldaddr, oldsize0, newsize0,
|
|
flags, newaddr, error);
|
|
goto out;
|
|
}
|
|
}
|
|
if (range->end != oldstart + size) {
|
|
error = split_process_memory_range(vm,
|
|
range, oldstart + size, NULL);
|
|
if (error) {
|
|
ekprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx):"
|
|
"split range failed. %d\n",
|
|
oldaddr, oldsize0, newsize0,
|
|
flags, newaddr, error);
|
|
goto out;
|
|
}
|
|
}
|
|
|
|
error = move_pte_range(vm->address_space->page_table, vm,
|
|
(void *)oldstart, (void *)newstart,
|
|
size, range);
|
|
ihk_mc_spinlock_unlock_noirq(&vm->page_table_lock);
|
|
if (error) {
|
|
ekprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx):"
|
|
"move failed. %d\n",
|
|
oldaddr, oldsize0, newsize0,
|
|
flags, newaddr, error);
|
|
goto out;
|
|
}
|
|
|
|
error = do_munmap((void *)oldstart, oldsize, 1/* holding memory_range_lock */);
|
|
if (error) {
|
|
ekprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx):"
|
|
"relocate:munmap failed. %d\n",
|
|
oldaddr, oldsize0, newsize0,
|
|
flags, newaddr, error);
|
|
goto out;
|
|
}
|
|
}
|
|
}
|
|
else if (newsize < oldsize) {
|
|
error = do_munmap((void *)newend, (oldend - newend), 1/* holding memory_range_lock */);
|
|
if (error) {
|
|
ekprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx):"
|
|
"shrink:munmap failed. %d\n",
|
|
oldaddr, oldsize0, newsize0, flags,
|
|
newaddr, error);
|
|
goto out;
|
|
}
|
|
}
|
|
else {
|
|
/* nothing to do */
|
|
}
|
|
|
|
error = 0;
|
|
out:
|
|
ihk_rwspinlock_write_unlock_noirq(&vm->memory_range_lock);
|
|
if (!error && (lckstart < lckend)) {
|
|
error = populate_process_memory(thread->vm, (void *)lckstart, (lckend - lckstart));
|
|
if (error) {
|
|
ekprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx):"
|
|
"populate failed. %d %#lx-%#lx\n",
|
|
oldaddr, oldsize0, newsize0, flags,
|
|
newaddr, error, lckstart, lckend);
|
|
error = 0; /* ignore error */
|
|
}
|
|
}
|
|
ret = (error)? error: newstart;
|
|
dkprintf("sys_mremap(%#lx,%#lx,%#lx,%#x,%#lx):%d %#lx\n",
|
|
oldaddr, oldsize0, newsize0, flags, newaddr, error,
|
|
ret);
|
|
return ret;
|
|
}
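
/*
 * Summary of sys_mremap above: an in-place grow is attempted first via
 * extend_up_process_memory_range() when the old mapping ends its VM range;
 * otherwise (or for MREMAP_FIXED) a new range is added, the existing PTEs are
 * moved with move_pte_range(), and the old range is unmapped. VR_LOCKED
 * ranges are re-populated after the range lock is released, and populate
 * errors are ignored just as in mlock().
 */
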
|
|
|
|
SYSCALL_DECLARE(msync)
|
|
{
|
|
const uintptr_t start0 = ihk_mc_syscall_arg0(ctx);
|
|
const size_t len0 = ihk_mc_syscall_arg1(ctx);
|
|
const int flags = ihk_mc_syscall_arg2(ctx);
|
|
const size_t len = (len0 + PAGE_SIZE - 1) & PAGE_MASK;
|
|
const uintptr_t start = start0;
|
|
const uintptr_t end = start + len;
|
|
struct thread *thread = cpu_local_var(current);
|
|
struct process_vm *vm = thread->vm;
|
|
int error;
|
|
uintptr_t addr;
|
|
struct vm_range *range;
|
|
uintptr_t s;
|
|
uintptr_t e;
|
|
|
|
dkprintf("sys_msync(%#lx,%#lx,%#x)\n", start0, len0, flags);
|
|
ihk_rwspinlock_read_lock_noirq(&vm->memory_range_lock);
|
|
|
|
if ((start0 & ~PAGE_MASK)
|
|
|| (flags & ~(MS_ASYNC|MS_INVALIDATE|MS_SYNC))
|
|
|| ((flags & MS_ASYNC) && (flags & MS_SYNC))) {
|
|
error = -EINVAL;
|
|
ekprintf("sys_msync(%#lx,%#lx,%#x):invalid args. %d\n",
|
|
start0, len0, flags, error);
|
|
goto out;
|
|
}
|
|
if (end < start) {
|
|
error = -ENOMEM;
|
|
ekprintf("sys_msync(%#lx,%#lx,%#x):invalid args. %d\n",
|
|
start0, len0, flags, error);
|
|
goto out;
|
|
}
|
|
|
|
/* check ranges */
|
|
range = NULL;
|
|
for (addr = start; addr < end; addr = range->end) {
|
|
if (!range) {
|
|
range = lookup_process_memory_range(vm, addr,
|
|
addr+PAGE_SIZE);
|
|
}
|
|
else {
|
|
range = next_process_memory_range(vm, range);
|
|
}
|
|
|
|
if (!range || (addr < range->start)) {
|
|
error = -ENOMEM;
|
|
ekprintf("sys_msync(%#lx,%#lx,%#x):"
|
|
"invalid VMR %d %#lx-%#lx %#lx\n",
|
|
start0, len0, flags, error,
|
|
range?range->start:0,
|
|
range?range->end:0,
|
|
range?range->flag:0);
|
|
goto out;
|
|
}
|
|
if ((flags & MS_INVALIDATE) && (range->flag & VR_LOCKED)) {
|
|
error = -EBUSY;
|
|
ekprintf("sys_msync(%#lx,%#lx,%#x):"
|
|
"locked VMR %d %#lx-%#lx %#lx\n",
|
|
start0, len0, flags, error,
|
|
range->start, range->end, range->flag);
|
|
goto out;
|
|
}
|
|
}
|
|
|
|
/* do the sync */
|
|
range = NULL;
|
|
for (addr = start; addr < end; addr = range->end) {
|
|
if (!range) {
|
|
range = lookup_process_memory_range(vm, addr,
|
|
addr+PAGE_SIZE);
|
|
}
|
|
else {
|
|
range = next_process_memory_range(vm, range);
|
|
}
|
|
|
|
if ((range->flag & VR_PRIVATE) || !range->memobj
|
|
|| !memobj_has_pager(range->memobj)) {
|
|
dkprintf("sys_msync(%#lx,%#lx,%#x):unsyncable VMR %#lx-%#lx %#lx\n",
|
|
start0, len0, flags,
|
|
range->start, range->end, range->flag);
|
|
/* nothing to do */
|
|
continue;
|
|
}
|
|
|
|
s = addr;
|
|
e = (range->end < end)? range->end: end;
|
|
|
|
if (flags & (MS_ASYNC | MS_SYNC)) {
|
|
error = sync_process_memory_range(vm, range, s, e);
|
|
if (error) {
|
|
ekprintf("sys_msync(%#lx,%#lx,%#x):sync failed. %d\n",
|
|
start0, len0, flags, error);
|
|
goto out;
|
|
}
|
|
}
|
|
|
|
if (flags & MS_INVALIDATE) {
|
|
error = invalidate_process_memory_range(
|
|
vm, range, s, e);
|
|
if (error) {
|
|
ekprintf("sys_msync(%#lx,%#lx,%#x):"
|
|
"invalidate failed. %d\n",
|
|
start0, len0, flags, error);
|
|
goto out;
|
|
}
|
|
}
|
|
}
|
|
|
|
error = 0;
|
|
out:
|
|
ihk_rwspinlock_read_unlock_noirq(&vm->memory_range_lock);
|
|
dkprintf("sys_msync(%#lx,%#lx,%#x):%d\n", start0, len0, flags, error);
|
|
return error;
|
|
} /* sys_msync() */
|
|
|
|
SYSCALL_DECLARE(getcpu)
{
        const uintptr_t cpup = ihk_mc_syscall_arg0(ctx);
        const uintptr_t nodep = ihk_mc_syscall_arg1(ctx);
        const int cpu = ihk_mc_get_processor_id();
        const int node = ihk_mc_get_numa_id();
        int error;

        if (cpup) {
                error = copy_to_user((void *)cpup, &cpu, sizeof(cpu));
                if (error) {
                        goto out;
                }
        }

        if (nodep) {
                error = copy_to_user((void *)nodep, &node, sizeof(node));
                if (error) {
                        goto out;
                }
        }

        error = 0;
out:
        return error;
} /* sys_getcpu() */

SYSCALL_DECLARE(mbind)
|
|
{
|
|
unsigned long addr = ihk_mc_syscall_arg0(ctx);
|
|
unsigned long len = ihk_mc_syscall_arg1(ctx);
|
|
int mode = ihk_mc_syscall_arg2(ctx);
|
|
unsigned long *nodemask =
|
|
(unsigned long *)ihk_mc_syscall_arg3(ctx);
|
|
unsigned long maxnode = ihk_mc_syscall_arg4(ctx);
|
|
unsigned flags = ihk_mc_syscall_arg5(ctx);
|
|
struct process_vm *vm = cpu_local_var(current)->vm;
|
|
unsigned long nodemask_bits = 0;
|
|
int mode_flags = 0;
|
|
int error = 0;
|
|
int bit;
|
|
struct vm_range *range;
|
|
struct vm_range_numa_policy *range_policy, *range_policy_iter = NULL;
|
|
DECLARE_BITMAP(numa_mask, PROCESS_NUMA_MASK_BITS);
|
|
|
|
dkprintf("%s: addr: 0x%lx, len: %lu, mode: 0x%x, "
|
|
"nodemask: 0x%lx, flags: %lx\n",
|
|
__FUNCTION__,
|
|
addr, len, mode, nodemask, flags);
|
|
|
|
/* No bind support for straight mapped processes */
|
|
if (cpu_local_var(current)->proc->straight_va) {
|
|
return 0;
|
|
}
|
|
|
|
/* Validate arguments */
|
|
if (addr & ~PAGE_MASK) {
|
|
return -EINVAL;
|
|
}
|
|
|
|
len = (len + PAGE_SIZE - 1) & PAGE_MASK;
|
|
if (addr + len < addr || addr == (addr + len)) {
|
|
return -EINVAL;
|
|
}
|
|
|
|
#ifdef ENABLE_FUGAKU_HACKS
|
|
return 0;
|
|
#endif
|
|
|
|
memset(numa_mask, 0, sizeof(numa_mask));
|
|
|
|
if (maxnode) {
|
|
nodemask_bits = ALIGN(maxnode, 8);
|
|
if (maxnode > (PAGE_SIZE << 3)) {
|
|
dkprintf("%s: ERROR: nodemask_bits bigger than PAGE_SIZE bits\n",
|
|
__FUNCTION__);
|
|
error = -EINVAL;
|
|
goto out;
|
|
}
|
|
|
|
if (nodemask_bits > PROCESS_NUMA_MASK_BITS) {
|
|
dkprintf("%s: WARNING: process NUMA mask bits is insufficient\n",
|
|
__FUNCTION__);
|
|
nodemask_bits = PROCESS_NUMA_MASK_BITS;
|
|
}
|
|
}
|
|
|
|
if ((mode & MPOL_F_STATIC_NODES) && (mode & MPOL_F_RELATIVE_NODES)) {
|
|
dkprintf("%s: error: MPOL_F_STATIC_NODES & MPOL_F_RELATIVE_NODES\n",
|
|
__FUNCTION__);
|
|
error = -EINVAL;
|
|
goto out;
|
|
}
|
|
|
|
if ((flags & MPOL_MF_STRICT) && (flags & MPOL_MF_MOVE)) {
|
|
dkprintf("%s: error: MPOL_MF_STRICT & MPOL_MF_MOVE\n",
|
|
__FUNCTION__);
|
|
/*
|
|
* XXX: man page claims the correct error code is EIO,
|
|
* but LTP tests for EINVAL.
|
|
*/
|
|
error = -EINVAL;
|
|
goto out;
|
|
}
|
|
|
|
mode_flags = (mode & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES));
|
|
mode &= ~(MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
|
|
|
|
if (mode_flags & MPOL_F_RELATIVE_NODES) {
|
|
/* Not supported.. */
|
|
dkprintf("%s: error: MPOL_F_RELATIVE_NODES not supported\n",
|
|
__FUNCTION__);
|
|
error = -EINVAL;
|
|
goto out;
|
|
}
|
|
|
|
switch (mode) {
|
|
case MPOL_DEFAULT:
|
|
if (nodemask && nodemask_bits) {
|
|
error = copy_from_user(numa_mask, nodemask,
|
|
(nodemask_bits >> 3));
|
|
if (error) {
|
|
dkprintf("%s: error: copy_from_user numa_mask\n",
|
|
__FUNCTION__);
|
|
error = -EFAULT;
|
|
goto out;
|
|
}
|
|
|
|
if (!bitmap_empty(numa_mask, nodemask_bits)) {
|
|
dkprintf("%s: ERROR: nodemask not empty for MPOL_DEFAULT\n",
|
|
__FUNCTION__);
|
|
error = -EINVAL;
|
|
goto out;
|
|
}
|
|
}
|
|
break;
|
|
|
|
case MPOL_BIND:
|
|
case MPOL_INTERLEAVE:
|
|
case MPOL_PREFERRED:
|
|
/* Special case for MPOL_PREFERRED with empty nodemask */
|
|
if (mode == MPOL_PREFERRED && !nodemask) {
|
|
error = 0;
|
|
break;
|
|
}
|
|
|
|
if (flags & MPOL_MF_STRICT) {
|
|
error = -EIO;
|
|
goto out;
|
|
}
|
|
|
|
error = copy_from_user(numa_mask, nodemask,
|
|
(nodemask_bits >> 3));
|
|
if (error) {
|
|
error = -EFAULT;
|
|
goto out;
|
|
}
|
|
|
|
if (!nodemask || bitmap_empty(numa_mask, nodemask_bits)) {
|
|
dkprintf("%s: ERROR: nodemask not specified\n",
|
|
__FUNCTION__);
|
|
error = -EINVAL;
|
|
goto out;
|
|
}
|
|
|
|
/* Verify NUMA mask */
|
|
for_each_set_bit(bit, numa_mask,
|
|
maxnode < PROCESS_NUMA_MASK_BITS ?
|
|
maxnode : PROCESS_NUMA_MASK_BITS) {
|
|
if (bit >= ihk_mc_get_nr_numa_nodes()) {
|
|
dkprintf("%s: %d is bigger than # of NUMA nodes\n",
|
|
__FUNCTION__, bit);
|
|
error = -EINVAL;
|
|
goto out;
|
|
}
|
|
}
|
|
|
|
break;
|
|
|
|
default:
|
|
error = -EINVAL;
|
|
goto out;
|
|
}
|
|
|
|
/* Validate address range */
|
|
ihk_rwspinlock_write_lock_noirq(&vm->memory_range_lock);
|
|
|
|
range = lookup_process_memory_range(vm, addr, addr + len);
|
|
if (!range) {
|
|
dkprintf("%s: ERROR: range is invalid\n", __FUNCTION__);
|
|
error = -EFAULT;
|
|
goto unlock_out;
|
|
}
|
|
|
|
/* Do the actual policy setting */
|
|
switch (mode) {
|
|
/*
|
|
* Man page claims MPOL_DEFAULT should remove any range specific
|
|
* policies so that process wise policy will be used. LTP on the
|
|
* other hand seems to test if MPOL_DEFAULT is set as a range policy.
|
|
* MPOL_DEFAULT thus behaves the same as the rest of the policies
|
|
* for now.
|
|
*/
|
|
#if 0
|
|
case MPOL_DEFAULT:
|
|
/* Delete or adjust any overlapping range settings */
|
|
list_for_each_entry_safe(range_policy_iter, range_policy_next,
|
|
&vm->vm_range_numa_policy_list, list) {
|
|
int keep = 0;
|
|
unsigned long orig_end = range_policy_iter->end;
|
|
|
|
if (range_policy_iter->end < addr ||
|
|
range_policy_iter->start > addr + len) {
|
|
continue;
|
|
}
|
|
|
|
/* Do we need to keep the front? */
|
|
if (range_policy_iter->start < addr) {
|
|
range_policy_iter->end = addr;
|
|
keep = 1;
|
|
}
|
|
|
|
/* Do we need to keep the end? */
|
|
if (orig_end > addr + len) {
|
|
/* Are we keeping front already? */
|
|
if (keep) {
|
|
/* Add a new entry after */
|
|
range_policy = kmalloc(sizeof(*range_policy),
|
|
IHK_MC_AP_NOWAIT);
|
|
if (!range_policy) {
|
|
kprintf("%s: error allocating range_policy\n",
|
|
__FUNCTION__);
|
|
error = -ENOMEM;
|
|
goto unlock_out;
|
|
}
|
|
|
|
memcpy(range_policy, range_policy_iter,
|
|
sizeof(*range_policy));
|
|
range_policy->start = addr + len;
|
|
range_policy->end = orig_end;
|
|
list_add(&range_policy->list,
|
|
&range_policy_iter->list);
|
|
}
|
|
else {
|
|
range_policy_iter->start = addr + len;
|
|
keep = 1;
|
|
}
|
|
}
|
|
|
|
if (!keep) {
|
|
list_del(&range_policy_iter->list);
|
|
kfree(range_policy_iter);
|
|
}
|
|
}
|
|
|
|
break;
|
|
#endif
|
|
case MPOL_DEFAULT:
|
|
case MPOL_BIND:
|
|
case MPOL_INTERLEAVE:
|
|
case MPOL_PREFERRED:
|
|
/* Check if same range is existing */
|
|
range_policy_iter = vm_range_policy_search(vm, addr);
|
|
if (range_policy_iter) {
|
|
if (range_policy_iter->start == addr &&
|
|
range_policy_iter->end == addr + len) {
|
|
/* same range */
|
|
range_policy = range_policy_iter;
|
|
goto mbind_update_only;
|
|
}
|
|
}
|
|
|
|
/* Clear target range */
|
|
error = vm_policy_clear_range(vm, addr, addr + len);
|
|
if (error) {
|
|
ekprintf("%s: ERROR: clear policy_range\n",
|
|
__func__);
|
|
goto unlock_out;
|
|
}
|
|
|
|
/* Add a new entry */
|
|
range_policy = kmalloc(sizeof(struct vm_range_numa_policy),
|
|
IHK_MC_AP_NOWAIT);
|
|
if (!range_policy) {
|
|
dkprintf("%s: error allocating range_policy\n",
|
|
__FUNCTION__);
|
|
error = -ENOMEM;
|
|
goto unlock_out;
|
|
}
|
|
|
|
RB_CLEAR_NODE(&range_policy->policy_rb_node);
|
|
range_policy->start = addr;
|
|
range_policy->end = addr + len;
|
|
|
|
error = vm_policy_insert(vm, range_policy);
|
|
if (error) {
|
|
kprintf("%s: ERROR: could not insert range: %d\n",__FUNCTION__, error);
|
|
goto unlock_out;
|
|
}
|
|
|
|
mbind_update_only:
|
|
if (mode == MPOL_DEFAULT) {
|
|
memset(range_policy->numa_mask, 0, sizeof(numa_mask));
|
|
for (bit = 0; bit < ihk_mc_get_nr_numa_nodes(); ++bit) {
|
|
set_bit(bit, range_policy->numa_mask);
|
|
}
|
|
}
|
|
else {
|
|
memcpy(range_policy->numa_mask, &numa_mask,
|
|
sizeof(numa_mask));
|
|
}
|
|
range_policy->numa_mem_policy = mode;
|
|
if (mode == MPOL_INTERLEAVE) {
|
|
range_policy->il_prev =
|
|
PROCESS_NUMA_MASK_BITS - 1;
|
|
}
|
|
|
|
break;
|
|
|
|
default:
|
|
error = -EINVAL;
|
|
goto unlock_out;
|
|
}
|
|
|
|
error = 0;
|
|
|
|
unlock_out:
|
|
ihk_rwspinlock_write_unlock_noirq(&vm->memory_range_lock);
|
|
out:
|
|
return error;
|
|
} /* sys_mbind() */
|
|
|
|
SYSCALL_DECLARE(set_mempolicy)
|
|
{
|
|
int mode = ihk_mc_syscall_arg0(ctx);
|
|
unsigned long *nodemask =
|
|
(unsigned long *)ihk_mc_syscall_arg1(ctx);
|
|
unsigned long maxnode = ihk_mc_syscall_arg2(ctx);
|
|
unsigned long nodemask_bits = 0;
|
|
struct process_vm *vm = cpu_local_var(current)->vm;
|
|
int error = 0;
|
|
int bit, valid_mask;
|
|
DECLARE_BITMAP(numa_mask, PROCESS_NUMA_MASK_BITS);
|
|
|
|
memset(numa_mask, 0, sizeof(numa_mask));
|
|
|
|
if (maxnode) {
|
|
nodemask_bits = ALIGN(maxnode, 8);
|
|
if (maxnode > (PAGE_SIZE << 3)) {
|
|
dkprintf("%s: ERROR: nodemask_bits bigger than PAGE_SIZE bits\n",
|
|
__FUNCTION__);
|
|
error = -EINVAL;
|
|
goto out;
|
|
}
|
|
|
|
if (nodemask_bits > PROCESS_NUMA_MASK_BITS) {
|
|
dkprintf("%s: WARNING: process NUMA mask bits is insufficient\n",
|
|
__FUNCTION__);
|
|
nodemask_bits = PROCESS_NUMA_MASK_BITS;
|
|
}
|
|
}
|
|
|
|
if ((mode & MPOL_F_STATIC_NODES) &&
|
|
(mode & MPOL_F_RELATIVE_NODES)) {
|
|
error = -EINVAL;
|
|
goto out;
|
|
}
|
|
mode &= ~MPOL_MODE_FLAGS;
|
|
|
|
switch (mode) {
|
|
case MPOL_DEFAULT:
|
|
if (nodemask && nodemask_bits) {
|
|
error = copy_from_user(numa_mask, nodemask,
|
|
(nodemask_bits >> 3));
|
|
if (error) {
|
|
error = -EFAULT;
|
|
goto out;
|
|
}
|
|
|
|
if (!bitmap_empty(numa_mask, nodemask_bits)) {
|
|
dkprintf("%s: ERROR: nodemask not empty for MPOL_DEFAULT\n",
|
|
__FUNCTION__);
|
|
error = -EINVAL;
|
|
goto out;
|
|
}
|
|
}
|
|
|
|
memset(vm->numa_mask, 0, sizeof(numa_mask));
|
|
for (bit = 0; bit < ihk_mc_get_nr_numa_nodes(); ++bit) {
|
|
set_bit(bit, vm->numa_mask);
|
|
}
|
|
|
|
#if 0
|
|
/* In man, "MPOL_DEFAULT mode deletes a process memory policy
|
|
other than the default and interprets that the memory policy"
|
|
falls back to the system default policy ", but not to delete
|
|
the NUMA memory policy.
|
|
There was no processing of Linux's same name command. */
|
|
|
|
/* Delete all range settings */
|
|
ihk_mc_spinlock_lock_noirq(&vm->memory_range_lock);
|
|
list_for_each_entry_safe(range_policy_iter, range_policy_next,
|
|
&vm->vm_range_numa_policy_list, list) {
|
|
list_del(&range_policy_iter->list);
|
|
kfree(range_policy_iter);
|
|
}
|
|
ihk_mc_spinlock_unlock_noirq(&vm->memory_range_lock);
|
|
#endif
|
|
|
|
vm->numa_mem_policy = mode;
|
|
error = 0;
|
|
break;
|
|
|
|
case MPOL_BIND:
|
|
case MPOL_INTERLEAVE:
|
|
case MPOL_PREFERRED:
|
|
/* Special case for MPOL_PREFERRED with empty nodemask */
|
|
if (mode == MPOL_PREFERRED && !nodemask) {
|
|
memset(vm->numa_mask, 0, sizeof(numa_mask));
|
|
for (bit = 0; bit < ihk_mc_get_nr_numa_nodes(); ++bit) {
|
|
set_bit(bit, vm->numa_mask);
|
|
}
|
|
|
|
vm->numa_mem_policy = mode;
|
|
error = 0;
|
|
break;
|
|
}
|
|
|
|
if (!nodemask) {
|
|
dkprintf("%s: ERROR: nodemask not specified\n",
|
|
__FUNCTION__);
|
|
error = -EINVAL;
|
|
goto out;
|
|
}
|
|
|
|
error = copy_from_user(numa_mask, nodemask,
|
|
(nodemask_bits >> 3));
|
|
if (error) {
|
|
error = -EFAULT;
|
|
goto out;
|
|
}
|
|
|
|
/* Verify NUMA mask */
|
|
valid_mask = 0;
|
|
for_each_set_bit(bit, numa_mask,
|
|
maxnode < PROCESS_NUMA_MASK_BITS ?
|
|
maxnode : PROCESS_NUMA_MASK_BITS) {
|
|
if (bit >= ihk_mc_get_nr_numa_nodes()) {
|
|
dkprintf("%s: %d is bigger than # of NUMA nodes\n",
|
|
__FUNCTION__, bit);
|
|
error = -EINVAL;
|
|
goto out;
|
|
}
|
|
|
|
/* Is there at least one node which is allowed
|
|
* in current mask? */
|
|
if (test_bit(bit, vm->numa_mask)) {
|
|
valid_mask = 1;
|
|
}
|
|
}
|
|
|
|
if (!valid_mask) {
|
|
dkprintf("%s: ERROR: invalid nodemask\n", __FUNCTION__);
|
|
error = -EINVAL;
|
|
goto out;
|
|
}
|
|
|
|
/* Update current mask by clearing non-requested nodes */
|
|
for_each_set_bit(bit, vm->numa_mask,
|
|
maxnode < PROCESS_NUMA_MASK_BITS ?
|
|
maxnode : PROCESS_NUMA_MASK_BITS) {
|
|
if (!test_bit(bit, numa_mask)) {
|
|
clear_bit(bit, vm->numa_mask);
|
|
}
|
|
}
|
|
|
|
vm->numa_mem_policy = mode;
|
|
if (mode == MPOL_INTERLEAVE) {
|
|
vm->il_prev = PROCESS_NUMA_MASK_BITS - 1;
|
|
}
|
|
error = 0;
|
|
break;
|
|
|
|
default:
|
|
error = -EINVAL;
|
|
}
|
|
|
|
dkprintf("%s: %s set for PID %d\n",
|
|
__FUNCTION__,
|
|
mode == MPOL_DEFAULT ? "MPOL_DEFAULT" :
|
|
mode == MPOL_INTERLEAVE ? "MPOL_INTERLEAVE" :
|
|
mode == MPOL_BIND ? "MPOL_BIND" :
|
|
mode == MPOL_PREFERRED ? "MPOL_PREFERRED" :
|
|
"unknown",
|
|
cpu_local_var(current)->proc->pid);
|
|
|
|
out:
|
|
return error;
|
|
} /* sys_set_mempolicy() */
|
|
|
|
SYSCALL_DECLARE(get_mempolicy)
|
|
{
|
|
int *mode = (int *)ihk_mc_syscall_arg0(ctx);
|
|
unsigned long *nodemask =
|
|
(unsigned long *)ihk_mc_syscall_arg1(ctx);
|
|
unsigned long nodemask_bits = 0;
|
|
unsigned long maxnode = ihk_mc_syscall_arg2(ctx);
|
|
unsigned long addr = ihk_mc_syscall_arg3(ctx);
|
|
unsigned long flags = ihk_mc_syscall_arg4(ctx);
|
|
struct process_vm *vm = cpu_local_var(current)->vm;
|
|
struct vm_range_numa_policy *range_policy = NULL;
|
|
int error = 0;
|
|
int policy;
|
|
|
|
if ((!(flags & MPOL_F_ADDR) && addr) ||
|
|
(flags & ~(MPOL_F_ADDR | MPOL_F_NODE | MPOL_F_MEMS_ALLOWED)) ||
|
|
((flags & MPOL_F_NODE) && !(flags & MPOL_F_ADDR) &&
|
|
vm->numa_mem_policy == MPOL_INTERLEAVE)) {
|
|
return -EINVAL;
|
|
}
|
|
|
|
/*
|
|
* XXX: man page claims the correct error code is EINVAL,
|
|
* but LTP tests for EFAULT.
|
|
*/
|
|
if ((flags & MPOL_F_ADDR) && !addr) {
|
|
return -EFAULT;
|
|
}
|
|
|
|
if (maxnode) {
|
|
if (maxnode < ihk_mc_get_nr_numa_nodes()) {
|
|
return -EINVAL;
|
|
}
|
|
|
|
nodemask_bits = ALIGN(maxnode, 8);
|
|
if (nodemask_bits > PROCESS_NUMA_MASK_BITS) {
|
|
dkprintf("%s: WARNING: process NUMA mask bits is insufficient\n",
|
|
__FUNCTION__);
|
|
nodemask_bits = PROCESS_NUMA_MASK_BITS;
|
|
}
|
|
}
|
|
|
|
/* case of MPOL_F_NODE and MPOL_F_ADDR are specified */
|
|
if (flags & MPOL_F_NODE && flags & MPOL_F_ADDR) {
|
|
/* return the node ID which addr is allocated by mode */
|
|
int nid;
|
|
|
|
nid = lookup_node(vm, (void *)addr);
|
|
error = copy_to_user(mode, &nid, sizeof(int));
|
|
if (error) {
|
|
error = -EFAULT;
|
|
goto out;
|
|
}
|
|
goto out;
|
|
}
|
|
|
|
/* Special case of MPOL_F_MEMS_ALLOWED */
|
|
if (flags == MPOL_F_MEMS_ALLOWED) {
|
|
if (nodemask) {
|
|
error = copy_to_user(nodemask,
|
|
cpu_local_var(current)->vm->numa_mask,
|
|
(nodemask_bits >> 3));
|
|
if (error) {
|
|
error = -EFAULT;
|
|
}
|
|
}
|
|
|
|
goto out;
|
|
}
|
|
|
|
/* Address range specific? */
|
|
if (flags & MPOL_F_ADDR) {
|
|
struct vm_range *range;
|
|
|
|
ihk_rwspinlock_read_lock_noirq(&vm->memory_range_lock);
|
|
range = lookup_process_memory_range(vm, addr, addr + 1);
|
|
if (!range) {
|
|
dkprintf("%s: ERROR: range is invalid\n", __FUNCTION__);
|
|
error = -EFAULT;
|
|
ihk_rwspinlock_read_unlock_noirq(&vm->memory_range_lock);
|
|
goto out;
|
|
}
|
|
|
|
range_policy = vm_range_policy_search(vm, addr);
|
|
|
|
ihk_rwspinlock_read_unlock_noirq(&vm->memory_range_lock);
|
|
}
|
|
|
|
/* Return policy */
|
|
policy = range_policy ? range_policy->numa_mem_policy :
|
|
vm->numa_mem_policy;
|
|
|
|
if (mode) {
|
|
error = copy_to_user(mode, &policy, sizeof(int));
|
|
if (error) {
|
|
error = -EFAULT;
|
|
goto out;
|
|
}
|
|
}
|
|
|
|
if (nodemask && (policy != MPOL_DEFAULT)) {
|
|
error = copy_to_user(nodemask,
|
|
range_policy ? range_policy->numa_mask :
|
|
cpu_local_var(current)->vm->numa_mask,
|
|
(nodemask_bits >> 3));
|
|
if (error) {
|
|
error = -EFAULT;
|
|
goto out;
|
|
}
|
|
}
|
|
|
|
out:
|
|
return error;
|
|
} /* sys_get_mempolicy() */
|
|
|
|
SYSCALL_DECLARE(migrate_pages)
{
        dkprintf("sys_migrate_pages\n");
        return -ENOSYS;
} /* sys_migrate_pages() */

SYSCALL_DECLARE(move_pages)
{
	extern int move_pages_smp_handler(int cpu_index, int nr_cpus, void *arg);
	int pid = ihk_mc_syscall_arg0(ctx);
	unsigned long count = ihk_mc_syscall_arg1(ctx);
	const void **user_virt_addr = (const void **)ihk_mc_syscall_arg2(ctx);
	const int *user_nodes = (const int *)ihk_mc_syscall_arg3(ctx);
	int *user_status = (int *)ihk_mc_syscall_arg4(ctx);
	int flags = ihk_mc_syscall_arg5(ctx);

	void **virt_addr = NULL;
	int *nodes = NULL, *status = NULL;
	int *nr_pages = NULL;
	unsigned long *dst_phys = NULL;
	pte_t **ptep = NULL;
	struct move_pages_smp_req mpsr;

	struct process_vm *vm = cpu_local_var(current)->vm;
	int i, ret = 0;

	unsigned long t_s, t_e;

	t_s = rdtsc();

	/* Only self is supported for now */
	if (pid) {
		kprintf("%s: ERROR: only self (pid == 0)"
			" is supported\n", __FUNCTION__);
		ret = -EINVAL;
		goto out;
	}

	/* Check flags */
	if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) {
		ret = -EINVAL;
		goto out;
	}
	if (flags & MPOL_MF_MOVE_ALL) {
		kprintf("%s: ERROR: MPOL_MF_MOVE_ALL"
			" not supported\n", __func__);
		ret = -EINVAL;
		goto out;
	}

	/* Allocate kernel arrays */
	virt_addr = kmalloc(sizeof(void *) * count, IHK_MC_AP_NOWAIT);
	if (!virt_addr) {
		ret = -ENOMEM;
		goto dealloc_out;
	}

	nr_pages = kmalloc(sizeof(int) * count, IHK_MC_AP_NOWAIT);
	if (!nr_pages) {
		ret = -ENOMEM;
		goto dealloc_out;
	}

	nodes = kmalloc(sizeof(int) * count, IHK_MC_AP_NOWAIT);
	if (!nodes) {
		ret = -ENOMEM;
		goto dealloc_out;
	}

	status = kmalloc(sizeof(int) * count, IHK_MC_AP_NOWAIT);
	if (!status) {
		ret = -ENOMEM;
		goto dealloc_out;
	}

	ptep = kmalloc(sizeof(pte_t) * count, IHK_MC_AP_NOWAIT);
	if (!ptep) {
		ret = -ENOMEM;
		goto dealloc_out;
	}

	dst_phys = kmalloc(sizeof(unsigned long) * count, IHK_MC_AP_NOWAIT);
	if (!dst_phys) {
		ret = -ENOMEM;
		goto dealloc_out;
	}
	t_e = rdtsc(); kprintf("%s: init malloc: %lu \n", __FUNCTION__, t_e - t_s); t_s = t_e;

	/* Get virt addresses and NUMA node numbers from user */
	if (verify_process_vm(cpu_local_var(current)->vm,
				user_virt_addr, sizeof(void *) * count)) {
		ret = -EFAULT;
		goto dealloc_out;
	}

	if (user_nodes && verify_process_vm(cpu_local_var(current)->vm,
				user_nodes, sizeof(int) * count)) {
		ret = -EFAULT;
		goto dealloc_out;
	}

	if (verify_process_vm(cpu_local_var(current)->vm,
				user_status, sizeof(int) * count)) {
		ret = -EFAULT;
		goto dealloc_out;
	}

	/* Check node ID */
	if (user_nodes) {
		copy_from_user(nodes, user_nodes, sizeof(int) * count);
		for (i = 0; i < count; i++) {
			if (nodes[i] < 0 || nodes[i] >= ihk_mc_get_nr_numa_nodes()) {
				ret = -ENODEV;
				goto dealloc_out;
			}
		}
	}

	t_e = rdtsc(); kprintf("%s: init verify: %lu \n", __FUNCTION__, t_e - t_s); t_s = t_e;

#if 0
	memcpy(virt_addr, user_virt_addr, sizeof(void *) * count);
	memcpy(status, user_status, sizeof(int) * count);
	memcpy(nodes, user_nodes, sizeof(int) * count);

	if (copy_from_user(virt_addr, user_virt_addr,
				sizeof(void *) * count)) {
		ret = -EFAULT;
		goto dealloc_out;
	}

	if (copy_from_user(nodes, user_nodes, sizeof(int) * count)) {
		ret = -EFAULT;
		goto dealloc_out;
	}

	/* Won't use it but better to verify the user buffer before
	 * doing anything.. */
	if (copy_from_user(status, user_status, sizeof(int) * count)) {
		ret = -EFAULT;
		goto dealloc_out;
	}

	t_e = rdtsc(); kprintf("%s: init copy: %lu \n", __FUNCTION__, t_e - t_s); t_s = t_e;

	/* Verify target NUMA nodes are valid */
	for (i = 0; i < count; i++) {
		if (nodes[i] < 0 ||
				nodes[i] >= ihk_mc_get_nr_numa_nodes() ||
				!test_bit(nodes[i], vm->numa_mask)) {
			ret = -EINVAL;
			goto dealloc_out;
		}
	}

	t_e = rdtsc(); kprintf("%s: init NUMAver: %lu \n", __FUNCTION__, t_e - t_s); t_s = t_e;

	memset(ptep, 0, sizeof(pte_t) * count);
	memset(status, 0, sizeof(int) * count);
	memset(nr_pages, 0, sizeof(int) * count);
	memset(dst_phys, 0, sizeof(unsigned long) * count);

	t_e = rdtsc(); kprintf("%s: init memset: %lu \n", __FUNCTION__, t_e - t_s); t_s = t_e;
#endif

	ihk_mc_spinlock_lock_noirq(&vm->page_table_lock);

	/* Do the arg init, NUMA verification, copy,
	 * update PTEs, free original memory */
	mpsr.count = count;
	mpsr.user_virt_addr = user_virt_addr;
	mpsr.user_status = user_status;
	mpsr.user_nodes = user_nodes;
	mpsr.virt_addr = virt_addr;
	mpsr.status = status;
	mpsr.nodes = nodes;
	mpsr.nodes_ready = 0;
	mpsr.ptep = ptep;
	mpsr.dst_phys = dst_phys;
	mpsr.nr_pages = nr_pages;
	mpsr.proc = cpu_local_var(current)->proc;
	ihk_atomic_set(&mpsr.phase_done, 0);
	mpsr.phase_ret = 0;
	ret = smp_call_func(&cpu_local_var(current)->cpu_set,
			move_pages_smp_handler, &mpsr);

	ihk_mc_spinlock_unlock_noirq(&vm->page_table_lock);

	if (ret != 0) {
		goto dealloc_out;
	}

	t_e = rdtsc(); kprintf("%s: parallel: %lu \n", __FUNCTION__, t_e - t_s); t_s = t_e;

	/* This shouldn't fail (verified above) */
	if (copy_to_user(user_status, status, sizeof(int) * count)) {
		ret = -EFAULT;
		goto dealloc_out;
	}

	ret = 0;

dealloc_out:
	kfree(virt_addr);
	kfree(nr_pages);
	kfree(nodes);
	kfree(status);
	kfree(ptep);
	kfree(dst_phys);

out:
	return ret;
}

extern int do_process_vm_read_writev(int pid,
		const struct iovec *local_iov,
		unsigned long liovcnt,
		const struct iovec *remote_iov,
		unsigned long riovcnt,
		unsigned long flags,
		int op);

SYSCALL_DECLARE(process_vm_writev)
{
	int pid = ihk_mc_syscall_arg0(ctx);
	const struct iovec *local_iov =
		(const struct iovec *)ihk_mc_syscall_arg1(ctx);
	unsigned long liovcnt = ihk_mc_syscall_arg2(ctx);
	const struct iovec *remote_iov =
		(const struct iovec *)ihk_mc_syscall_arg3(ctx);
	unsigned long riovcnt = ihk_mc_syscall_arg4(ctx);
	unsigned long flags = ihk_mc_syscall_arg5(ctx);

	return do_process_vm_read_writev(pid, local_iov, liovcnt,
			remote_iov, riovcnt, flags, PROCESS_VM_WRITE);
}

SYSCALL_DECLARE(process_vm_readv)
{
	int pid = ihk_mc_syscall_arg0(ctx);
	const struct iovec *local_iov =
		(const struct iovec *)ihk_mc_syscall_arg1(ctx);
	unsigned long liovcnt = ihk_mc_syscall_arg2(ctx);
	const struct iovec *remote_iov =
		(const struct iovec *)ihk_mc_syscall_arg3(ctx);
	unsigned long riovcnt = ihk_mc_syscall_arg4(ctx);
	unsigned long flags = ihk_mc_syscall_arg5(ctx);

	return do_process_vm_read_writev(pid, local_iov, liovcnt,
			remote_iov, riovcnt, flags, PROCESS_VM_READ);
}

#ifdef DCFA_KMOD

#ifdef CMD_DCFA
extern int ibmic_cmd_syscall(char *uargs);
extern void ibmic_cmd_exit(int status);
#endif

#ifdef CMD_DCFAMPI
extern int dcfampi_cmd_syscall(char *uargs);
#endif

static int (*mod_call_table[]) (char *) = {
#ifdef CMD_DCFA
	[1] = ibmic_cmd_syscall,
#endif
#ifdef CMD_DCFAMPI
	[2] = dcfampi_cmd_syscall,
#endif
};

static void (*mod_exit_table[]) (int) = {
#ifdef CMD_DCFA
	[1] = ibmic_cmd_exit,
#endif
#ifdef CMD_DCFAMPI
	[2] = NULL,
#endif
};

SYSCALL_DECLARE(mod_call) {
	int mod_id;
	unsigned long long uargs;

	mod_id = ihk_mc_syscall_arg0(ctx);
	uargs = ihk_mc_syscall_arg1(ctx);

	dkprintf("mod_call id:%d, uargs=0x%llx, type=%s, command=%x\n", mod_id, uargs, mod_id==1?"ibmic":"dcfampi", *((uint32_t*)(((char*)uargs)+0)));

	if (mod_call_table[mod_id])
		return mod_call_table[mod_id]((char *)uargs);

	kprintf("ERROR! undefined mod_call id:%d\n", mod_id);

	return -ENOSYS;
}

static void do_mod_exit(int status)
{
	int i;
	for (i = 1; i <= 2; i++) {
		if (mod_exit_table[i])
			mod_exit_table[i](status);
	}
}
#endif

extern void save_uctx(void *, void *);

/* TODO: use copy_from_user() */
int util_show_syscall_profile(void)
{
	int i;
	struct uti_desc *desc = (struct uti_desc *)uti_desc;

	kprintf("Syscall stats for offloaded thread:\n");
	for (i = 0; i < 512; i++) {
		if (desc->syscalls[i]) {
			kprintf("nr=%d #called=%ld\n", i, desc->syscalls[i]);
		}
	}

	kprintf("Syscall stats for other threads:\n");
	for (i = 0; i < 512; i++) {
		if (desc->syscalls2[i]) {
			kprintf("nr=%d #called=%ld\n", i, desc->syscalls2[i]);
		}
	}

	return 0;
}
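/*
 * util_thread(): offload the calling thread to a Linux "utility thread".
 * The current user context is saved into a uti_ctx buffer and handed to
 * the Linux side, together with a uti_info descriptor holding thread,
 * futex and IKC bookkeeping addresses, through an offloaded
 * __NR_sched_setaffinity request. The call returns when the thread
 * finishes on the Linux side; the return code encodes exit_group/exit/
 * killed-by-signal conditions, which are handled below.
 */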
int util_thread(struct uti_attr *arg)
{
	struct uti_ctx *rctx = NULL;
	unsigned long rp_rctx;
	struct uti_info *uti_info = NULL;
	struct syscall_request request IHK_DMA_ALIGN;
	long rc;
	struct thread *thread = cpu_local_var(current);
	struct kuti_attr {
		long parent_cpuid;
		struct uti_attr attr;
	} kattr;

	thread->uti_state = UTI_STATE_PROLOGUE;

	rctx = kmalloc(sizeof(struct uti_ctx), IHK_MC_AP_NOWAIT);
	if (!rctx) {
		rc = -ENOMEM;
		goto out;
	}
	rp_rctx = virt_to_phys((void *)rctx);
	save_uctx((void *)rctx->ctx, NULL);

	/* Create the information passed to the Linux thread */
	uti_info = kmalloc(sizeof(struct uti_info), IHK_MC_AP_NOWAIT);
	if (!uti_info) {
		rc = -ENOMEM;
		goto out;
	}
	/* clv info */
	uti_info->thread_va = (unsigned long)cpu_local_var(current);
	uti_info->uti_futex_resp_pa = virt_to_phys((void *)cpu_local_var(uti_futex_resp));
	uti_info->ikc2linux_pa = virt_to_phys((void *)cpu_local_var(ikc2linux));

	/* thread info */
	uti_info->tid = thread->tid;
	uti_info->cpu = ihk_mc_get_processor_id();
	uti_info->status_pa = virt_to_phys((void *)&thread->status);
	uti_info->spin_sleep_lock_pa = virt_to_phys((void *)&thread->spin_sleep_lock);
	uti_info->spin_sleep_pa = virt_to_phys((void *)&thread->spin_sleep);
	uti_info->vm_pa = virt_to_phys((void *)thread->vm);
	uti_info->futex_q_pa = virt_to_phys((void *)&thread->futex_q);

	/* global info */
	uti_info->mc_idle_halt = idle_halt;
	uti_info->futex_queue_pa = virt_to_phys((void *)get_futex_queues());

	request.number = __NR_sched_setaffinity;
	request.args[0] = 0;
	request.args[1] = rp_rctx;
	request.args[2] = 0;
	if (arg) {
		memcpy(&kattr.attr, arg, sizeof(struct uti_attr));
		kattr.parent_cpuid = thread->parent_cpuid;
		request.args[2] = virt_to_phys(&kattr);
	}
	request.args[3] = (unsigned long)uti_info;
	request.args[4] = uti_desc;
	thread->uti_state = UTI_STATE_RUNNING_IN_LINUX;
	rc = do_syscall(&request, ihk_mc_get_processor_id());
	dkprintf("%s: returned from do_syscall,tid=%d,rc=%lx\n", __FUNCTION__, thread->tid, rc);

	thread->uti_state = UTI_STATE_EPILOGUE;

	util_show_syscall_profile();

	/* Save it before it is freed */
	thread->uti_refill_tid = rctx->uti_refill_tid;
	dkprintf("%s: mcexec worker tid=%d\n", __FUNCTION__, thread->uti_refill_tid);

	kfree(rctx);
	rctx = NULL;

	kfree(uti_info);
	uti_info = NULL;

	if (rc >= 0) {
		if (rc & 0x100000000) { /* exit_group */
			dkprintf("%s: exit_group, tid=%d,rc=%lx\n", __FUNCTION__, thread->tid, rc);
			thread->proc->nohost = 1;
			terminate((rc >> 8) & 255, rc & 255);
		} else {
			/* exit or killed-by-signal detected */
			dkprintf("%s: exit or killed by signal, pid=%d,tid=%d,rc=%lx\n", __FUNCTION__, thread->proc->pid, thread->tid, rc);
			do_exit(rc);
		}
	} else if (rc == -ERESTARTSYS) {
		/* tracer is not working and /dev/mcosX has detected exit of the mcexec process */
		kprintf("%s: release_handler,pid=%d,tid=%d,rc=%lx\n", __FUNCTION__, thread->proc->pid, thread->tid, rc);
		thread->proc->nohost = 1;
		do_exit(rc);
	} else {
		kprintf("%s: ERROR: do_syscall() failed (%ld)\n", __FUNCTION__, rc);
	}

out:
	kfree(rctx);
	kfree(uti_info);

	return rc;
}

void
utilthr_migrate(void)
{
	struct thread *thread = cpu_local_var(current);

	/* Don't inherit mod_clone */
	if (thread->mod_clone == SPAWNING_TO_REMOTE) {
		thread->mod_clone = SPAWN_TO_LOCAL;
		util_thread(thread->mod_clone_arg);
	}
}

SYSCALL_DECLARE(util_migrate_inter_kernel)
{
	struct uti_attr *arg = (void *)ihk_mc_syscall_arg0(ctx);
	struct uti_attr kattr;

	if (arg) {
		if (copy_from_user(&kattr, arg, sizeof(struct uti_attr))) {
			return -EFAULT;
		}
	}

	return util_thread(arg ? &kattr : NULL);
}

SYSCALL_DECLARE(util_indicate_clone)
{
	int mod = (int)ihk_mc_syscall_arg0(ctx);
	struct uti_attr *arg = (void *)ihk_mc_syscall_arg1(ctx);
	struct thread *thread = cpu_local_var(current);
	struct uti_attr *kattr = NULL;

	if (!thread->proc->enable_uti) {
		kprintf("%s: error: --enable-uti mcexec option not specified\n",
			__func__);
		return -EINVAL;
	}

	if (mod != SPAWN_TO_LOCAL &&
	    mod != SPAWN_TO_REMOTE)
		return -EINVAL;
	if (arg) {
		if (!(kattr = kmalloc(sizeof(struct uti_attr), IHK_MC_AP_NOWAIT))) {
			kprintf("%s: error: allocating kattr\n", __func__);
			return -ENOMEM;
		}

		if (copy_from_user(kattr, arg, sizeof(struct uti_attr))) {
			kfree(kattr);
			return -EFAULT;
		}
	}
	thread->mod_clone = mod;
	if (thread->mod_clone_arg) {
		kfree(thread->mod_clone_arg);
		thread->mod_clone_arg = NULL;
	}
	if (kattr) {
		thread->mod_clone_arg = kattr;
	}
	return 0;
}

SYSCALL_DECLARE(get_system)
{
	return 0;
}
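/*
 * swapout flag handling as implemented below: a NULL filename or flag
 * 0x01 skips the McKernel-side pageout and only forwards the request
 * to Linux (development aid); flag 0x02 performs the pageout but skips
 * the Linux-side swapout call. After the pageout path, do_pagein() is
 * always invoked to restore the memory image, even if the Linux-side
 * swapout failed.
 */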
/*
 * swapout(const char *filename, void *workarea, size_t size, int flag)
 */
SYSCALL_DECLARE(swapout)
{
	extern int do_pageout(const char*, void*, size_t, int);
	extern int do_pagein(int);
	char *fname = (char *)ihk_mc_syscall_arg0(ctx);
	char *buf = (char *)ihk_mc_syscall_arg1(ctx);
	size_t size = (size_t)ihk_mc_syscall_arg2(ctx);
	int flag = (int)ihk_mc_syscall_arg3(ctx);
	ihk_mc_user_context_t ctx0;
	int cc;

	dkprintf("[%d]swapout(%lx,%lx,%lx,%ld)\n",
		 ihk_mc_get_processor_id(), fname, buf, size, flag);

	if (fname == NULL || flag == 0x01) { /* for development purposes */
		kprintf("swapout: skipping real swap\n");
		cc = syscall_generic_forwarding(__NR_swapout, &ctx0);
		kprintf("swapout: return from Linux\n");
		return cc;
	}
	/* pageout */
	cc = do_pageout(fname, buf, size, flag);
	if (cc < 0) return cc;
	if (flag == 0x02) {
		kprintf("swapout: skipping calling swapout in Linux\n");
	} else {
		kprintf("swapout: before calling swapout in Linux\n");
		cc = syscall_generic_forwarding(__NR_swapout, &ctx0);
		kprintf("swapout: after calling swapout in Linux cc(%d)\n", cc);
	}
	/* Even if swapout on the Linux side returns an error, pagein
	 * must be called to recover the memory image */
	cc = do_pagein(flag);
	kprintf("swapout: after calling do_pagein cc(%d)\n", cc);
	return cc;
}

SYSCALL_DECLARE(linux_mlock)
{
	ihk_mc_user_context_t ctx0;
	const uintptr_t addr = ihk_mc_syscall_arg0(ctx);
	const size_t len = ihk_mc_syscall_arg1(ctx);
	int cc;

	kprintf("linux_mlock: %p %ld\n", (void *)addr, len);
	ihk_mc_syscall_arg0(&ctx0) = addr;
	ihk_mc_syscall_arg1(&ctx0) = len;
	cc = syscall_generic_forwarding(802, &ctx0);
	return cc;
}

SYSCALL_DECLARE(linux_spawn)
{
	int rc;

	rc = syscall_generic_forwarding(__NR_linux_spawn, ctx);
	return rc;
}
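/*
 * suspend_threads()/resume_threads(): stop or restart all sibling
 * threads of the calling process. suspend_threads() sends SIGSTOP to
 * every other thread and busy-waits until each one reaches PS_STOPPED;
 * resume_threads() sends SIGCONT to every other thread.
 */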
SYSCALL_DECLARE(suspend_threads)
{
	struct thread *mythread = cpu_local_var(current);
	struct thread *thread;
	struct process *proc = mythread->proc;

	list_for_each_entry(thread, &proc->threads_list, siblings_list) {
		if (thread == mythread)
			continue;
		do_kill(mythread, proc->pid, thread->tid, SIGSTOP, NULL, 0);
	}
	list_for_each_entry(thread, &proc->threads_list, siblings_list) {
		if (thread == mythread)
			continue;
		while (thread->status != PS_STOPPED)
			cpu_pause();
	}
	return 0;
}

SYSCALL_DECLARE(resume_threads)
{
	struct thread *mythread = cpu_local_var(current);
	struct thread *thread;
	struct process *proc = mythread->proc;

	list_for_each_entry(thread, &proc->threads_list, siblings_list) {
		if (thread == mythread)
			continue;
		do_kill(mythread, proc->pid, thread->tid, SIGCONT, NULL, 0);
	}
	return 0;
}

SYSCALL_DECLARE(util_register_desc)
{
	struct thread *thread = cpu_local_var(current);
	uti_desc = ihk_mc_syscall_arg0(ctx);
	dkprintf("%s: tid=%d,uti_desc=%lx\n", __FUNCTION__, thread->tid, uti_desc);
	return 0;
}

void
reset_cputime(void)
{
	struct thread *thread;

	if (clv == NULL)
		return;

	if (!(thread = cpu_local_var(current)))
		return;

	thread->base_tsc = 0;
}
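/*
 * set_cputime(): account CPU time at user<->kernel transitions. The
 * elapsed TSC since the last transition is charged to user or system
 * time depending on the direction (CPUTIME_MODE_U2K charges user time,
 * otherwise system time), the per-CPU monitor state is updated, and
 * ITIMER_VIRTUAL/ITIMER_PROF expirations raise SIGVTALRM/SIGPROF.
 */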
void
set_cputime(enum set_cputime_mode mode)
{
	struct thread *thread;
	unsigned long tsc;
	struct cpu_local_var *v;
	struct ihk_os_cpu_monitor *monitor;
	unsigned long irq_flags = 0;

	if (clv == NULL)
		return;

	v = get_this_cpu_local_var();
	if (!(thread = v->current))
		return;
	if (thread == &v->idle)
		return;
	monitor = v->monitor;
	if (mode == CPUTIME_MODE_K2U) {
		monitor->status = IHK_OS_MONITOR_USER;
	}
	else if (mode == CPUTIME_MODE_U2K) {
		monitor->counter++;
		monitor->status = IHK_OS_MONITOR_KERNEL;
	}

	if (!gettime_local_support) {
		thread->times_update = 1;
		return;
	}

	irq_flags = cpu_disable_interrupt_save();
	tsc = rdtsc();
	if (thread->base_tsc != 0) {
		unsigned long dtsc = tsc - thread->base_tsc;
		struct timespec dts;

		tsc_to_ts(dtsc, &dts);
		if (mode == CPUTIME_MODE_U2K) {
			thread->user_tsc += dtsc;
			v->rusage->user_tsc += dtsc;
			ts_add(&thread->itimer_virtual_value, &dts);
			ts_add(&thread->itimer_prof_value, &dts);
		}
		else {
			thread->system_tsc += dtsc;
			v->rusage->system_tsc += dtsc;
			ts_add(&thread->itimer_prof_value, &dts);
		}
	}

	thread->base_tsc = tsc;

	thread->times_update = 1;
	thread->in_kernel = (int)mode;

	if (thread->itimer_enabled) {
		struct timeval tv;
		int ev = 0;

		if (thread->itimer_virtual.it_value.tv_sec != 0 ||
		    thread->itimer_virtual.it_value.tv_usec) {
			ts_to_tv(&tv, &thread->itimer_virtual_value);
			tv_sub(&tv, &thread->itimer_virtual.it_value);
			if (tv.tv_sec > 0 ||
			    (tv.tv_sec == 0 &&
			     tv.tv_usec > 0)) {
				thread->itimer_virtual_value.tv_sec = 0;
				thread->itimer_virtual_value.tv_nsec = 0;
				thread->itimer_virtual.it_value.tv_sec =
					thread->itimer_virtual.it_interval.tv_sec;
				thread->itimer_virtual.it_value.tv_usec =
					thread->itimer_virtual.it_interval.tv_usec;
				do_kill(thread, thread->proc->pid, thread->tid,
					SIGVTALRM, NULL, 0);
				ev = 1;
			}
		}

		if (thread->itimer_prof.it_value.tv_sec != 0 ||
		    thread->itimer_prof.it_value.tv_usec) {
			ts_to_tv(&tv, &thread->itimer_prof_value);
			tv_sub(&tv, &thread->itimer_prof.it_value);
			if (tv.tv_sec > 0 ||
			    (tv.tv_sec == 0 &&
			     tv.tv_usec > 0)) {
				thread->itimer_prof_value.tv_sec = 0;
				thread->itimer_prof_value.tv_nsec = 0;
				thread->itimer_prof.it_value.tv_sec =
					thread->itimer_prof.it_interval.tv_sec;
				thread->itimer_prof.it_value.tv_usec =
					thread->itimer_prof.it_interval.tv_usec;
				do_kill(thread, thread->proc->pid, thread->tid,
					SIGPROF, NULL, 0);
				ev = 1;
			}
		}
		if (ev) {
			if (thread->itimer_virtual.it_value.tv_sec == 0 &&
			    thread->itimer_virtual.it_value.tv_usec == 0 &&
			    thread->itimer_prof.it_value.tv_sec == 0 &&
			    thread->itimer_prof.it_value.tv_usec == 0) {
				thread->itimer_enabled = 0;
				set_timer(0);
			}
		}
	}
	cpu_restore_interrupt(irq_flags);
}
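/*
 * syscall(): top-level system call dispatcher. Handlers present in
 * syscall_table[] are executed locally; everything else is forwarded
 * to the Linux side via syscall_generic_forwarding(). The function
 * also accounts user/system time, reports ptrace syscall events,
 * checks for pending reschedules and signals, and terminates the
 * process if the corresponding mcexec has disappeared.
 */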
long syscall(int num, ihk_mc_user_context_t *ctx)
{
	long l;
	struct cpu_local_var *v = get_this_cpu_local_var();
	struct thread *thread = v->current;

#ifdef DISABLE_SCHED_YIELD
	if (num != __NR_sched_yield)
#endif // DISABLE_SCHED_YIELD
		set_cputime(CPUTIME_MODE_U2K);

	//kprintf("syscall=%d\n", num);
#ifdef PROFILE_ENABLE
	if (thread->profile && thread->profile_start_ts) {
		unsigned long ts = rdtsc();
		thread->profile_elapsed_ts += (ts - thread->profile_start_ts);
		thread->profile_start_ts = ts;
	}
#endif // PROFILE_ENABLE

	if (cpu_local_var(current)->proc->status == PS_EXITED &&
	    (num != __NR_exit && num != __NR_exit_group)) {
		/* x86_64: Setting -EINVAL to rax is done in the
		 * following return.
		 */
		save_syscall_return_value(num, -EINVAL);
		check_signal(-EINVAL, NULL, -1);
		set_cputime(CPUTIME_MODE_K2U);
		return -EINVAL;
	}

	cpu_enable_interrupt();

	if (cpu_local_var(current)->ptrace) {
		/*
		 * XXX: After PTRACE_EVENT_EXEC we need to report an extra SIGTRAP.
		 * This is a tmp fix and should be moved into ptrace_report_exec()
		 */
		if (cpu_local_var(current)->ptrace & PT_TRACED_AFTER_EXEC) {
			arch_ptrace_syscall_event(cpu_local_var(current), ctx, 0);
			cpu_local_var(current)->ptrace &= ~(PT_TRACED_AFTER_EXEC);
		}

		arch_ptrace_syscall_event(cpu_local_var(current),
					  ctx, -ENOSYS);
		num = ihk_mc_syscall_number(ctx);
	}

#if 0
	if (num != 24) // if not sched_yield
#endif
	dkprintf("SC(%d:%d)[%3d=%s](%lx, %lx,%lx, %lx, %lx, %lx)@%lx,sp:%lx",
		 ihk_mc_get_processor_id(),
		 ihk_mc_get_hardware_processor_id(),
		 num, syscall_name[num],
		 ihk_mc_syscall_arg0(ctx), ihk_mc_syscall_arg1(ctx),
		 ihk_mc_syscall_arg2(ctx), ihk_mc_syscall_arg3(ctx),
		 ihk_mc_syscall_arg4(ctx), ihk_mc_syscall_arg5(ctx),
		 ihk_mc_syscall_pc(ctx), ihk_mc_syscall_sp(ctx));
#if 1
#if 0
	if (num != 24) // if not sched_yield
#endif
	dkprintf(",*sp:%lx,*(sp+8):%lx,*(sp+16):%lx,*(sp+24):%lx",
		 *((unsigned long*)ihk_mc_syscall_sp(ctx)),
		 *((unsigned long*)(ihk_mc_syscall_sp(ctx)+8)),
		 *((unsigned long*)(ihk_mc_syscall_sp(ctx)+16)),
		 *((unsigned long*)(ihk_mc_syscall_sp(ctx)+24)));
#endif
#if 0
	if (num != 24) // if not sched_yield
#endif
	dkprintf("\n");

	if ((0 <= num) && (num < (sizeof(syscall_table) / sizeof(syscall_table[0])))
	    && (syscall_table[num] != NULL)) {
		l = syscall_table[num](num, ctx);

		dkprintf("SC(%d)[%3d] ret: %lx\n",
			 ihk_mc_get_processor_id(), num, l);
	} else {
		dkprintf("USC[%3d](%lx, %lx, %lx, %lx, %lx) @ %lx | %lx\n", num,
			 ihk_mc_syscall_arg0(ctx), ihk_mc_syscall_arg1(ctx),
			 ihk_mc_syscall_arg2(ctx), ihk_mc_syscall_arg3(ctx),
			 ihk_mc_syscall_arg4(ctx), ihk_mc_syscall_pc(ctx),
			 ihk_mc_syscall_sp(ctx));
		l = syscall_generic_forwarding(num, ctx);
	}

	/* Store return value so that PTRACE_GETREGSET will see it */
	save_syscall_return_value(num, l);

	if (cpu_local_var(current)->ptrace) {
		/* arm64: The return value modified by the tracer is
		 * stored to x0 in the following check_signal().
		 */
		l = arch_ptrace_syscall_event(cpu_local_var(current), ctx, l);
	}

#ifdef PROFILE_ENABLE
	{
		unsigned long ts = rdtsc();

		/*
		 * futex_wait() and schedule() will internally reset
		 * thread->profile_start_ts so that actual wait time
		 * is not accounted for.
		 */
		if (num < PROFILE_SYSCALL_MAX) {
			profile_event_add(num, (ts - thread->profile_start_ts));
			thread->profile_start_ts = rdtsc();
		}
		else {
			if (num != __NR_profile) {
				dkprintf("%s: syscall > %d ?? : %d\n",
					 __FUNCTION__, PROFILE_SYSCALL_MAX, num);
			}
		}
	}
#endif // PROFILE_ENABLE

#ifdef ENABLE_FUGAKU_HACKS
	/* Do not deschedule when returning from an event (e.g., MPI) */
	if (!(num == __NR_epoll_wait ||
	      num == __NR_epoll_pwait ||
	      num == __NR_ppoll) &&
	    smp_load_acquire(&v->flags) & CPU_FLAG_NEED_RESCHED)
#else
	if (smp_load_acquire(&v->flags) & CPU_FLAG_NEED_RESCHED)
#endif
	{
		check_need_resched();
	}

	if (!list_empty(&thread->sigpending) ||
	    !list_empty(&thread->sigcommon->sigpending)) {
		check_signal(l, NULL, num);
	}

#ifdef DISABLE_SCHED_YIELD
	if (num != __NR_sched_yield)
#endif // DISABLE_SCHED_YIELD
		set_cputime(CPUTIME_MODE_K2U);

	if (thread->proc->nohost) { // mcexec termination was detected
		terminate(0, SIGKILL);
	}
	//kprintf("syscall=%d returns %lx(%ld)\n", num, l, l);

	return l;
}
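/*
 * check_sig_pending(): called while a system call is offloaded to Linux.
 * It scans the shared and per-thread pending signal lists and, if a
 * deliverable signal is found, interrupts the offloaded system call; if
 * the signal has no handler installed (and is not SIGCHLD/SIGURG/SIGCONT),
 * the corresponding mcexec is terminated instead.
 */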
void
check_sig_pending(void)
{
	int found = 0;
	struct list_head *head;
	mcs_rwlock_lock_t *lock;
	struct mcs_rwlock_node_irqsave mcs_rw_node;
	struct sig_pending *next;
	struct sig_pending *pending;
	__sigset_t w;
	__sigset_t x;
	int sig = 0;
	struct k_sigaction *k;
	struct thread *thread;

	if (clv == NULL)
		return;

	thread = cpu_local_var(current);
	if (thread == NULL || thread == &cpu_local_var(idle)) {
		return;
	}
	if (thread->in_syscall_offload == 0) {
		return;
	}
	if (thread->proc->group_exit_status & 0x0000000100000000L) {
		return;
	}

	w = thread->sigmask.__val[0];

	lock = &thread->sigcommon->lock;
	head = &thread->sigcommon->sigpending;
	for (;;) {
		mcs_rwlock_reader_lock(lock, &mcs_rw_node);

		list_for_each_entry_safe(pending, next, head, list) {
			for (x = pending->sigmask.__val[0], sig = 0; x;
			     sig++, x >>= 1) {
			}
			k = thread->sigcommon->action + sig - 1;
			if ((sig != SIGCHLD &&
			     sig != SIGURG &&
			     sig != SIGCONT) ||
			    (k->sa.sa_handler != SIG_IGN &&
			     k->sa.sa_handler != NULL)) {
				if (!(pending->sigmask.__val[0] & w)) {
					if (pending->interrupted == 0) {
						pending->interrupted = 1;
						found = 1;
						if (sig != SIGCHLD &&
						    sig != SIGURG &&
						    sig != SIGCONT &&
						    !k->sa.sa_handler) {
							found = 2;
							break;
						}
					}
				}
			}
		}

		mcs_rwlock_reader_unlock(lock, &mcs_rw_node);

		if (found == 2) {
			break;
		}

		if (lock == &thread->sigpendinglock) {
			break;
		}

		lock = &thread->sigpendinglock;
		head = &thread->sigpending;
	}

	if (found == 2) {
		terminate_mcexec(0, sig);
		return;
	}
	else if (found == 1) {
		interrupt_syscall(thread, 0);
		return;
	}
	return;
}
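/*
 * getsigpending(): find the next deliverable pending signal for the
 * thread, scanning the shared (sigcommon) list first and then the
 * per-thread list, skipping signals blocked by the current signal mask.
 * With delflag set the entry is unlinked from its list and the lists
 * are scanned under the writer lock; otherwise only the reader lock is
 * taken and the entry is left in place.
 */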
struct sig_pending *
getsigpending(struct thread *thread, int delflag)
{
	struct list_head *head;
	mcs_rwlock_lock_t *lock;
	struct mcs_rwlock_node_irqsave mcs_rw_node;
	struct sig_pending *next;
	struct sig_pending *pending;
	__sigset_t w;
	__sigset_t x;
	int sig;
	struct k_sigaction *k;

	w = thread->sigmask.__val[0];

	lock = &thread->sigcommon->lock;
	head = &thread->sigcommon->sigpending;
	for (;;) {
		if (delflag) {
			mcs_rwlock_writer_lock(lock, &mcs_rw_node);
		}
		else {
			mcs_rwlock_reader_lock(lock, &mcs_rw_node);
		}

		list_for_each_entry_safe(pending, next, head, list) {
			for (x = pending->sigmask.__val[0], sig = 0; x;
			     sig++, x >>= 1) {
			}
			k = thread->sigcommon->action + sig - 1;
			if (delflag ||
			    (sig != SIGCHLD &&
			     sig != SIGURG &&
			     sig != SIGCONT) ||
			    (k->sa.sa_handler != (void *)1 &&
			     k->sa.sa_handler != NULL)) {
				if (!(pending->sigmask.__val[0] & w)) {
					if (delflag)
						list_del(&pending->list);

					if (delflag) {
						mcs_rwlock_writer_unlock(
							lock,
							&mcs_rw_node);
					}
					else {
						mcs_rwlock_reader_unlock(
							lock,
							&mcs_rw_node);
					}
					return pending;
				}
			}
		}

		if (delflag) {
			mcs_rwlock_writer_unlock(lock, &mcs_rw_node);
		}
		else {
			mcs_rwlock_reader_unlock(lock, &mcs_rw_node);
		}

		if (lock == &thread->sigpendinglock) {
			return NULL;
		}

		lock = &thread->sigpendinglock;
		head = &thread->sigpending;
	}

	return NULL;
}

struct sig_pending *
hassigpending(struct thread *thread)
{
	if (list_empty(&thread->sigpending) &&
	    list_empty(&thread->sigcommon->sigpending)) {
		return NULL;
	}

	return getsigpending(thread, 0);
}

static void
__check_signal(unsigned long rc, void *regs0, int num, int irq_disabled)
{
	ihk_mc_user_context_t *regs = regs0;
	struct thread *thread;
	struct sig_pending *pending;
	int irqstate;

	if (clv == NULL) {
		return;
	}
	thread = cpu_local_var(current);

	if (thread == NULL || thread->proc->pid == 0) {
		struct thread *t;

		irqstate = cpu_disable_interrupt_save();
		ihk_mc_spinlock_lock_noirq(&(cpu_local_var(runq_lock)));
		list_for_each_entry(t, &(cpu_local_var(runq)), sched_list) {
			if (t->proc->pid <= 0) {
				continue;
			}
			if (t->status == PS_INTERRUPTIBLE &&
			    hassigpending(t)) {
				t->status = PS_RUNNING;
				break;
			}
		}
		ihk_mc_spinlock_unlock_noirq(&(cpu_local_var(runq_lock)));
		cpu_restore_interrupt(irqstate);
		goto out;
	}

	if (regs != NULL && !interrupt_from_user(regs)) {
		goto out;
	}

	if (list_empty(&thread->sigpending) &&
	    list_empty(&thread->sigcommon->sigpending)) {
		goto out;
	}

	for (;;) {
		/* When this function is called from check_signal_irq_disabled(),
		 * interrupts are left disabled when returning with no pending
		 * signal. This is to eliminate signal loss.
		 */
		if (irq_disabled == 1) {
			irqstate = cpu_disable_interrupt_save();
		}
		pending = getsigpending(thread, 1);
		if (!pending) {
			dkprintf("check_signal,queue is empty\n");
			goto out;
		}
		if (irq_disabled == 1) {
			cpu_restore_interrupt(irqstate);
		}
		if (do_signal(rc, regs, thread, pending, num)) {
			num = -1;
		}
	}

out:
	return;
}

void
check_signal(unsigned long rc, void *regs0, int num)
{
	__check_signal(rc, regs0, num, 0);
}

void
check_signal_irq_disabled(unsigned long rc, void *regs0, int num)
{
	__check_signal(rc, regs0, num, 1);
}