/* syscall.c COPYRIGHT FUJITSU LIMITED 2016-2018 */ /** * \file executer/kernel/syscall.c * License details are found in the file LICENSE. * \brief * provide system calls * \author Taku Shimosawa \par * Copyright (C) 2011 - 2012 Taku Shimosawa * \author Balazs Gerofi \par * Copyright (C) 2012 RIKEN AICS * \author Gou Nakamura \par * Copyright (C) 2012 - 2013 Hitachi, Ltd. * \author Tomoki Shirasawa \par * Copyright (C) 2012 - 2013 Hitachi, Ltd. * \author Balazs Gerofi \par * Copyright (C) 2013 The University of Tokyo * \author Gou Nakamura \par * Copyright (C) 2014 RIKEN AICS */ /* * HISTORY: * 2013/11/06 nakamura add shared mapped file * 2013/11/06 nakamura refuse the write to a read-only memory * 2013/09/05 nakamura add mcexec's PTE cleaning to munmap()/mmap(MAP_FIXED) * 2013/08/28 mcexec: upgrade CAP_SYS_RAWIO while do_mmap_pgoff() * 2013/08/09 nakamura support private mapped file * 2013/08/07 nakamura add page fault forwarding * 2013/07/10 rus_vm_fault(): add handling of page absence * 2013/04/17 nakamura add generic system call forwarding */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "config.h" #include "mcctrl.h" #include #include #include /* Compatibility function for vfs_fstat which is not exported in newer kernels */ static inline int mcctrl_vfs_fstat(int fd, struct kstat *stat) { struct file *file; int error; file = fget(fd); if (!file) return -EBADF; error = vfs_getattr(&file->f_path, stat, STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT); fput(file); return error; } #define ALIGN_WAIT_BUF(z) (((z + 63) >> 6) << 6) //#define SC_DEBUG #ifdef SC_DEBUG #define dprintk(...) printk(__VA_ARGS__) #else #define dprintk(...) 
#endif //#define DEBUG_PTD #ifdef DEBUG_PTD #define pr_ptd(msg, tid, ptd) do { printk("%s: " msg ",tid=%d,refc=%d\n", __FUNCTION__, tid, atomic_read(&ptd->refcount)); } while(0) #else #define pr_ptd(msg, tid, ptd) do { } while(0) #endif //#define DEBUG_PPD #ifdef DEBUG_PPD #define pr_ppd(msg, tid, ppd) do { printk("%s: " msg ",tid=%d,refc=%d\n", __FUNCTION__, tid, atomic_read(&ppd->refcount)); } while(0) #else #define pr_ppd(msg, tid, ppd) do { } while(0) #endif static long pager_call_irq(ihk_os_t os, struct syscall_request *req); static long pager_call(ihk_os_t os, struct syscall_request *req); #ifdef SC_DEBUG static struct ihk_dma_request last_request; static void print_dma_lastreq(void) { printk("SRC OS : %p | %lx\nDESTOS : %p | %lx\n", last_request.src_os, last_request.src_phys, last_request.dest_os, last_request.dest_phys); printk("SIZE : %lx | NOTIFY : %p | PRIV : %p\n", last_request.size, last_request.notify, last_request.priv); } #endif void mcctrl_put_per_thread_data_unsafe(struct mcctrl_per_thread_data *ptd) { if (!atomic_dec_and_test(&ptd->refcount)) { int ret = atomic_read(&ptd->refcount); if (ret < 0) { printk("%s: ERROR: invalid refcount=%d\n", __FUNCTION__, ret); } return; } list_del(&ptd->hash); kfree(ptd); } void mcctrl_put_per_thread_data(struct mcctrl_per_thread_data* _ptd) { struct mcctrl_per_proc_data *ppd = _ptd->ppd; struct mcctrl_per_thread_data *ptd_iter, *ptd = NULL; int hash = (((uint64_t)_ptd->task >> 4) & MCCTRL_PER_THREAD_DATA_HASH_MASK); unsigned long flags; /* Check if data for this thread exists and delete it */ write_lock_irqsave(&ppd->per_thread_data_hash_lock[hash], flags); list_for_each_entry(ptd_iter, &ppd->per_thread_data_hash[hash], hash) { if (ptd_iter->task == _ptd->task) { ptd = ptd_iter; break; } } if (!ptd) { printk("%s: ERROR: ptd not found\n", __FUNCTION__); goto out; } mcctrl_put_per_thread_data_unsafe(ptd); out: write_unlock_irqrestore(&ppd->per_thread_data_hash_lock[hash], flags); } int mcctrl_add_per_thread_data(struct mcctrl_per_proc_data *ppd, void *data) { struct mcctrl_per_thread_data *ptd_iter, *ptd = NULL; struct mcctrl_per_thread_data *ptd_alloc = NULL; int hash = (((uint64_t)current >> 4) & MCCTRL_PER_THREAD_DATA_HASH_MASK); int ret = 0; unsigned long flags; ptd_alloc = kmalloc(sizeof(struct mcctrl_per_thread_data), GFP_ATOMIC); if (!ptd_alloc) { kprintf("%s: error allocate per thread data\n", __FUNCTION__); ret = -ENOMEM; goto out_noalloc; } memset(ptd_alloc, 0, sizeof(struct mcctrl_per_thread_data)); /* Check if data for this thread exists and add if not */ write_lock_irqsave(&ppd->per_thread_data_hash_lock[hash], flags); list_for_each_entry(ptd_iter, &ppd->per_thread_data_hash[hash], hash) { if (ptd_iter->task == current) { ptd = ptd_iter; break; } } if (unlikely(ptd)) { kprintf("%s: WARNING: ptd of tid: %d exists\n", __FUNCTION__, task_pid_vnr(current)); ret = -EBUSY; kfree(ptd_alloc); goto out; } ptd = ptd_alloc; ptd->ppd = ppd; ptd->task = current; ptd->tid = task_pid_vnr(current); ptd->data = data; atomic_set(&ptd->refcount, 1); list_add_tail(&ptd->hash, &ppd->per_thread_data_hash[hash]); out: write_unlock_irqrestore(&ppd->per_thread_data_hash_lock[hash], flags); out_noalloc: return ret; } struct mcctrl_per_thread_data *mcctrl_get_per_thread_data(struct mcctrl_per_proc_data *ppd, struct task_struct *task) { struct mcctrl_per_thread_data *ptd_iter, *ptd = NULL; int hash = (((uint64_t)task >> 4) & MCCTRL_PER_THREAD_DATA_HASH_MASK); unsigned long flags; /* Check if data for this thread exists */ 
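	/* A reference is taken on the entry before it is returned, so the
	 * caller must pair every successful lookup with a call to
	 * mcctrl_put_per_thread_data() */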
read_lock_irqsave(&ppd->per_thread_data_hash_lock[hash], flags); list_for_each_entry(ptd_iter, &ppd->per_thread_data_hash[hash], hash) { if (ptd_iter->task == task) { ptd = ptd_iter; break; } } if (ptd) { if (atomic_read(&ptd->refcount) <= 0) { printk("%s: ERROR: use-after-free detected (%d)", __FUNCTION__, atomic_read(&ptd->refcount)); ptd = NULL; goto out; } atomic_inc(&ptd->refcount); } out: read_unlock_irqrestore(&ppd->per_thread_data_hash_lock[hash], flags); return ptd; } static int __notify_syscall_requester(ihk_os_t os, struct ikc_scd_packet *packet, struct syscall_response *res) { struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os); struct ihk_ikc_channel_desc *c; struct ikc_scd_packet r_packet; int ret = 0; if (!usrdata) { pr_err("%s: error: mcctrl_usrdata not found\n", __func__); return -EINVAL; } c = (usrdata->channels + packet->ref)->c; /* If spinning, no need for IKC message */ if (cmpxchg(&res->req_thread_status, IHK_SCD_REQ_THREAD_SPINNING, IHK_SCD_REQ_THREAD_TO_BE_WOKEN) == IHK_SCD_REQ_THREAD_SPINNING) { dprintk("%s: no need to send IKC message for PID %d\n", __FUNCTION__, packet->pid); return ret; } /* Wait until the status goes back to IHK_SCD_REQ_THREAD_SPINNING or IHK_SCD_REQ_THREAD_DESCHEDULED because two wake-up attempts are competing. Note that mcexec_terminate_thread() and returning EINTR would compete. */ if (smp_load_acquire(&res->req_thread_status) == IHK_SCD_REQ_THREAD_TO_BE_WOKEN) { printk("%s: INFO: someone else is waking up the McKernel thread, " "pid: %d, req status: %lu, syscall nr: %lu\n", __FUNCTION__, packet->pid, res->req_thread_status, packet->req.number); } /* The thread is not spinning any more, make sure it's descheduled */ if (cmpxchg(&res->req_thread_status, IHK_SCD_REQ_THREAD_DESCHEDULED, IHK_SCD_REQ_THREAD_TO_BE_WOKEN) != IHK_SCD_REQ_THREAD_DESCHEDULED) { printk("%s: WARNING: inconsistent requester status, " "pid: %d, req status: %lu, syscall nr: %lu\n", __FUNCTION__, packet->pid, res->req_thread_status, packet->req.number); dump_stack(); return -EINVAL; } r_packet.msg = SCD_MSG_WAKE_UP_SYSCALL_THREAD; r_packet.ttid = packet->req.rtid; ret = ihk_ikc_send(c, &r_packet, 0); return ret; } long syscall_backward(struct mcctrl_usrdata *usrdata, int num, unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5, unsigned long arg6, unsigned long *ret) { struct ikc_scd_packet *packet; struct ikc_scd_packet *free_packet = NULL; struct syscall_request *req; struct syscall_response *resp; unsigned long syscall_ret; struct wait_queue_head_list_node *wqhln; unsigned long irqflags; struct mcctrl_per_proc_data *ppd; struct mcctrl_per_thread_data *ptd; unsigned long phys; struct syscall_request *request = NULL; int retry; request = kmalloc(sizeof(struct syscall_request), GFP_ATOMIC); if (!request) { printk("%s: ERROR: allocating request\n", __func__); syscall_ret = -ENOMEM; goto no_ppd; } request->number = num; request->args[0] = arg1; request->args[1] = arg2; request->args[2] = arg3; request->args[3] = arg4; request->args[4] = arg5; request->args[5] = arg6; /* Look up per-process structure */ ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(current)); if (!ppd) { kprintf("%s: ERROR: no per-process structure for PID %d??\n", __func__, task_tgid_vnr(current)); syscall_ret = -EINVAL; goto no_ppd; } ptd = mcctrl_get_per_thread_data(ppd, current); if (!ptd) { printk("%s: ERROR: mcctrl_get_per_thread_data failed\n", __FUNCTION__); syscall_ret = -ENOENT; goto no_ptd; } pr_ptd("get", task_pid_vnr(current), ptd); 
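	/* ptd->data records the IKC packet of the system call offload this
	 * thread is currently serving; its response area on the McKernel
	 * side is reused below to deliver the backward request */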
	packet = (struct ikc_scd_packet *)ptd->data;
	if (!packet) {
		syscall_ret = -ENOENT;
		printk("%s: no packet registered for TID %d\n",
			__FUNCTION__, task_pid_vnr(current));
		goto out_put_ppd;
	}

	req = &packet->req;

	/* Map response structure */
	phys = ihk_device_map_memory(ihk_os_to_dev(usrdata->os),
			packet->resp_pa, sizeof(*resp));
	resp = ihk_device_map_virtual(ihk_os_to_dev(usrdata->os), phys,
			sizeof(*resp), NULL, 0);

retry_alloc:
	wqhln = kmalloc(sizeof(*wqhln), GFP_ATOMIC);
	if (!wqhln) {
		printk("WARNING: couldn't alloc wait queue head, retrying..\n");
		goto retry_alloc;
	}
	memset(wqhln, 0, sizeof(struct wait_queue_head_list_node));

	/* Prepare per-thread wait queue head */
	wqhln->task = current;
	/* Save the TID explicitly, because mcexec_syscall(), where the request
	 * will be matched, is in IRQ context and can't call task_pid_vnr() */
	wqhln->rtid = task_pid_vnr(current);
	wqhln->req = 0;
	init_waitqueue_head(&wqhln->wq_syscall);

	irqflags = ihk_ikc_spinlock_lock(&ppd->wq_list_lock);
	/* Add to exact list */
	list_add_tail(&wqhln->list, &ppd->wq_list_exact);
	ihk_ikc_spinlock_unlock(&ppd->wq_list_lock, irqflags);

	resp->stid = task_pid_vnr(current);
	resp->fault_address = virt_to_phys(request);

#define STATUS_IN_PROGRESS	0
#define STATUS_SYSCALL		4
#define __NR_syscall_response	8001
	req->valid = 0;

	if (__notify_syscall_requester(usrdata->os, packet, resp) < 0) {
		printk("%s: WARNING: failed to notify PID %d\n",
			__FUNCTION__, packet->pid);
	}

	mb();
	resp->status = STATUS_SYSCALL;

	retry = 0;
retry_offload:
	dprintk("%s: tid: %d, syscall: %d SLEEPING\n",
		__FUNCTION__, task_pid_vnr(current), num);
	/* wait for response */
	syscall_ret = wait_event_interruptible(wqhln->wq_syscall, wqhln->req);

	/* debug */
	if (syscall_ret == -ERESTARTSYS) {
		printk("%s: INFO: interrupted by signal\n", __FUNCTION__);
		retry++;
		if (retry < 5) {
			printk("%s: INFO: retry=%d\n", __FUNCTION__, retry);
			goto retry_offload;
		}
	}

	/* Remove per-thread wait queue head */
	irqflags = ihk_ikc_spinlock_lock(&ppd->wq_list_lock);
	list_del(&wqhln->list);
	ihk_ikc_spinlock_unlock(&ppd->wq_list_lock, irqflags);

	dprintk("%s: tid: %d, syscall: %d WOKEN UP\n",
		__FUNCTION__, task_pid_vnr(current), num);

	if (retry >= 5) {
		kfree(wqhln);
		kprintf("%s: INFO: mcexec is gone or retry count exceeded,pid=%d,ppd=%p,retry=%d\n",
			__FUNCTION__, task_tgid_vnr(current), ppd, retry);
		syscall_ret = -EINVAL;
		goto out;
	}

	if (syscall_ret) {
		kfree(wqhln);
		printk("%s: ERROR: wait_event_interruptible returned %ld\n",
			__FUNCTION__, syscall_ret);
		goto out;
	}
	else {
		unsigned long phys2;
		struct syscall_response *resp2;

		/* Note that wqhln->packet is a new packet */
		packet = wqhln->packet;
		free_packet = packet;

		req = &packet->req;
		phys2 = ihk_device_map_memory(ihk_os_to_dev(usrdata->os),
				packet->resp_pa, sizeof(*resp));
		resp2 = ihk_device_map_virtual(ihk_os_to_dev(usrdata->os),
				phys2, sizeof(*resp), NULL, 0);

		if (resp != resp2) {
			resp = resp2;
			phys = phys2;
			printk("%s: updated new remote PA for resp\n",
				__FUNCTION__);
		}
	}

	if (!req->valid) {
		printk("%s: request not valid\n", __FUNCTION__);
	}
	req->valid = 0;

	/* check result */
	if (req->number != __NR_syscall_response) {
		printk("%s: unexpected response.
%lx %lx\n", __FUNCTION__, req->number, req->args[0]); syscall_ret = -EIO; goto out; } *ret = req->args[1]; kfree(wqhln); syscall_ret = 0; out: /* Release packet sent from McKernel */ if (free_packet) { ihk_ikc_release_packet((struct ihk_ikc_free_packet *)free_packet); } ihk_device_unmap_virtual(ihk_os_to_dev(usrdata->os), resp, sizeof(*resp)); ihk_device_unmap_memory(ihk_os_to_dev(usrdata->os), phys, sizeof(*resp)); out_put_ppd: mcctrl_put_per_thread_data(ptd); pr_ptd("put", task_pid_vnr(current), ptd); no_ptd: dprintk("%s: tid: %d, syscall: %d, syscall_ret: %lx\n", __FUNCTION__, task_pid_vnr(current), num, syscall_ret); mcctrl_put_per_proc_data(ppd); no_ppd: kfree(request); return syscall_ret; } #if 0 /* debug */ /* Info of Linux counterpart of migrated-to-Linux thread */ struct host_thread { struct host_thread *next; struct mcos_handler_info *handler; int pid; int tid; unsigned long usp; unsigned long lfs; unsigned long rfs; struct task_struct *task; }; extern struct host_thread *host_threads; extern rwlock_t host_thread_lock; #endif int remote_page_fault(struct mcctrl_usrdata *usrdata, void *fault_addr, uint64_t reason, struct mcctrl_per_proc_data *ppd, struct ikc_scd_packet *packet) { int error; dprintk("%s: tid: %d, fault_addr: %p, reason: %lu\n", __FUNCTION__, task_pid_vnr(current), fault_addr, (unsigned long)reason); /* Request page fault */ packet->msg = SCD_MSG_REMOTE_PAGE_FAULT; packet->fault_address = (unsigned long)fault_addr; packet->fault_reason = reason; /* packet->target_cpu was set in rus_vm_fault if a thread was found */ error = mcctrl_ikc_send_wait(usrdata->os, packet->target_cpu, packet, 0, NULL, NULL, 0); if (error < 0) { pr_warn("%s: WARNING: failed to request remote page fault PID %d: %d\n", __func__, packet->pid, error); } dprintk("%s: tid: %d, fault_addr: %p, reason: %lu, error: %d\n", __func__, task_pid_vnr(current), fault_addr, (unsigned long)reason, error); return error; } /* * By remap_pfn_range(), VM_PFN_AT_MMAP may be raised. * VM_PFN_AT_MMAP cause the following problems. * * 1) vm_pgoff is changed. As a result, i_mmap tree is corrupted. * 2) duplicate free_memtype() calls occur. * * These problems may be solved in linux-3.7. * It uses vm_insert_pfn() until it is fixed. 
*/ #define USE_VM_INSERT_PFN 1 #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0) #if defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 2) static vm_fault_t rus_vm_fault(struct vm_fault *vmf) #else static int rus_vm_fault(struct vm_fault *vmf) #endif { struct vm_area_struct *vma = vmf->vma; #else static int rus_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { #endif struct mcctrl_usrdata *usrdata = vma->vm_file->private_data; ihk_device_t dev = ihk_os_to_dev(usrdata->os); unsigned long rpa; unsigned long phys; int error; int try; uint64_t reason; unsigned long pgsize; unsigned long rva; unsigned long pfn; #if USE_VM_INSERT_PFN size_t pix; #endif struct mcctrl_per_proc_data *ppd; struct mcctrl_per_thread_data *ptd; struct task_struct *task = current; struct ikc_scd_packet packet = { }; unsigned long rsysnum = 0; int ret = 0; #if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) unsigned long addr = vmf->address; #else /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */ void __user *addr = vmf->virtual_address; #endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */ /* Look up per-process structure */ ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(task)); if (!ppd) { pr_err("%s: INFO: no per-process structure for " "pid %d (tid %d), trying to use pid %d\n", __func__, task_tgid_vnr(task), task_pid_vnr(task), vma->vm_mm->owner->pid); task = vma->vm_mm->owner; ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(task)); } if (!ppd) { pr_err("%s: ERROR: no per-process structure for PID %d??\n", __func__, task_tgid_vnr(task)); ret = VM_FAULT_SIGBUS; goto no_ppd; } packet.fault_tid = ppd->pid; ptd = mcctrl_get_per_thread_data(ppd, task); if (ptd) { struct ikc_scd_packet *ptd_packet; pr_ptd("get", task_pid_vnr(task), ptd); ptd_packet = (struct ikc_scd_packet *)ptd->data; if (ptd_packet) { packet.target_cpu = ptd_packet->ref; packet.fault_tid = ptd_packet->req.rtid; rsysnum = ptd_packet->req.number; } mcctrl_put_per_thread_data(ptd); pr_ptd("put", task_pid_vnr(task), ptd); } /* Don't even bother looking up NULL */ if (!addr) { pr_warn("%s: WARNING: attempted NULL pointer access\n", __func__); ret = VM_FAULT_SIGBUS; goto put_and_out; } for (try = 1; ; ++try) { error = translate_rva_to_rpa(usrdata->os, ppd->rpgtable, (unsigned long)addr, &rpa, &pgsize); #define NTRIES 2 if (!error || (try >= NTRIES)) { if (error) { pr_err("%s: error translating 0x%#lx " "(req: TID: %u, syscall: %lu)\n", __func__, (unsigned long)addr, packet.fault_tid, rsysnum); } break; } reason = 0; if (vmf->flags & FAULT_FLAG_WRITE) { #define PF_WRITE 0x02 reason |= PF_WRITE; } error = remote_page_fault(usrdata, (void *)addr, reason, ppd, &packet); if (error) { pr_err("%s: error forwarding PF for 0x%#lx " "(req: TID: %d, syscall: %lu)\n", __func__, (unsigned long)addr, packet.fault_tid, rsysnum); break; } } if (error) { ret = VM_FAULT_SIGBUS; goto put_and_out; } // Force regular page size pgsize = PAGE_SIZE; rva = (unsigned long)addr & ~(pgsize - 1); rpa = rpa & ~(pgsize - 1); phys = ihk_device_map_memory(dev, rpa, pgsize); pfn = phys >> PAGE_SHIFT; #if USE_VM_INSERT_PFN for (pix = 0; pix < (pgsize / PAGE_SIZE); ++pix) { struct page *page; /* LWK may hold large page based mappings that align rva outside * Linux' VMA, make sure we don't try to map to those pages */ if (rva + (pix * PAGE_SIZE) < vma->vm_start || rva + (pix * PAGE_SIZE) > vma->vm_end) { continue; } if (pfn_valid(pfn+pix)) { page = pfn_to_page(pfn+pix); error = vm_insert_page(vma, rva+(pix*PAGE_SIZE), page); if (error) { 
pr_err("%s: error inserting mapping for 0x%#lx " "(req: TID: %d, syscall: %lu) error: %d," " vm_start: 0x%lx, vm_end: 0x%lx, pgsize: %lu, ind: %lu\n", __func__, (unsigned long)addr, packet.fault_tid, rsysnum, error, vma->vm_start, vma->vm_end, pgsize, pix); } } else { #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0) error = vmf_insert_pfn(vma, rva+(pix*PAGE_SIZE), pfn+pix); if (error == VM_FAULT_NOPAGE) { dprintk("%s: vmf_insert_pfn returned %d\n", __func__, error); error = 0; } #else error = vm_insert_pfn(vma, rva+(pix*PAGE_SIZE), pfn+pix); #endif } if (error) { pr_err("%s: vm_insert_pfn returned %d\n", __func__, error); if (error == -EBUSY) { error = 0; } else { break; } } } #else error = remap_pfn_range(vma, rva, pfn, pgsize, vma->vm_page_prot); #endif ihk_device_unmap_memory(dev, phys, pgsize); if (error) { pr_err("%s: remote PF failed for 0x%#lx, pgoff: %lu" " (req: TID: %d, syscall: %lu)\n", __func__, (unsigned long)addr, vmf->pgoff, packet.fault_tid, rsysnum); ret = VM_FAULT_SIGBUS; goto put_and_out; } ret = VM_FAULT_NOPAGE; put_and_out: mcctrl_put_per_proc_data(ppd); no_ppd: return ret; } static struct vm_operations_struct rus_vmops = { .fault = &rus_vm_fault, }; static int rus_mmap(struct file *file, struct vm_area_struct *vma) { vm_flags_set(vma, arch_rus_vm_flags); vma->vm_ops = &rus_vmops; return 0; } static struct file_operations rus_fops = { .mmap = &rus_mmap, }; unsigned long reserve_user_space_common(struct mcctrl_usrdata *usrdata, unsigned long start, unsigned long end) { struct file *file; struct cred *promoted; const struct cred *original; file = anon_inode_getfile("[mckernel]", &rus_fops, usrdata, O_RDWR); if (IS_ERR(file)) { return PTR_ERR(file); } promoted = prepare_creds(); if (!promoted) { printk("mcctrl:user space reservation failed. 
ENOMEM\n"); fput(file); return -ENOMEM; } /* * CAP_SYS_RAWIO for mmap_min_addr check avoidance */ cap_raise(promoted->cap_effective, CAP_SYS_RAWIO); original = override_creds(promoted); #if LINUX_VERSION_CODE < KERNEL_VERSION(3,5,0) start = vm_mmap_pgoff(file, start, end, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_FIXED|MAP_SHARED, 0); #else start = vm_mmap(file, start, end, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_FIXED|MAP_SHARED, 0); #endif #if 0 { /* debug */ struct vm_area_struct *vma; mmap_write_lock(current->mm); vma = find_vma(current->mm, start); vm_flags_set(vma, VM_DONTCOPY); mmap_write_unlock(current->mm); } #endif revert_creds(original); put_cred(promoted); fput(file); if (IS_ERR_VALUE(start)) { printk("mcctrl:user space reservation failed.\n"); } return start; } struct pager { struct list_head list; struct inode * inode; uint64_t ref; /* needs same type as fileobj->sref */ struct file * rofile; struct file * rwfile; uintptr_t map_uaddr; size_t map_len; off_t map_off; }; static DEFINE_SPINLOCK(pager_lock); static struct list_head pager_list = LIST_HEAD_INIT(pager_list); int pager_nr_processes = 0; void pager_add_process(void) { unsigned long flags; spin_lock_irqsave(&pager_lock, flags); ++pager_nr_processes; spin_unlock_irqrestore(&pager_lock, flags); } void pager_remove_process(struct mcctrl_per_proc_data *ppd) { int error; struct pager *pager_next, *pager; unsigned long flags; if (in_atomic() || in_interrupt()) { printk("%s: WARNING: shouldn't be called in IRQ context..\n", __FUNCTION__); return; } /* Clean up device file mappings of this process */ error = down_interruptible(&ppd->devobj_pager_lock); if (error) { return; } list_for_each_entry_safe(pager, pager_next, &ppd->devobj_pager_list, list) { dprintk("%s: devobj pager 0x%p removed\n", __FUNCTION__, pager); list_del(&pager->list); kfree(pager); } up(&ppd->devobj_pager_lock); /* Clean up global pagers for regular file mappings if this * was the last process */ spin_lock_irqsave(&pager_lock, flags); --pager_nr_processes; spin_unlock_irqrestore(&pager_lock, flags); } void pager_cleanup(void) { unsigned long flags; struct pager *pager_next, *pager; spin_lock_irqsave(&pager_lock, flags); list_for_each_entry_safe(pager, pager_next, &pager_list, list) { list_del(&pager->list); if (pager->rofile) { fput(pager->rofile); } if (pager->rwfile) { fput(pager->rwfile); } dprintk("%s: pager 0x%p removed\n", __FUNCTION__, pager); kfree(pager); } spin_unlock_irqrestore(&pager_lock, flags); } struct pager_create_result { uintptr_t handle; int maxprot; uint32_t flags; size_t size; int pgshift; char path[PATH_MAX]; }; enum { /* for memobj.flags */ MF_HAS_PAGER = 0x0001, MF_SHMDT_OK = 0x0002, MF_IS_REMOVABLE = 0x0004, MF_PREFETCH = 0x0008, MF_ZEROFILL = 0x0010, MF_REG_FILE = 0x1000, MF_DEV_FILE = 0x2000, MF_PREMAP = 0x8000, MF_XPMEM = 0x10000, /* To identify XPMEM attachment pages for rusage accounting */ MF_ZEROOBJ = 0x20000, /* To identify pages of anonymous, on-demand paging ranges for rusage accounting */ MF_SHM = 0x40000, MF_HUGETLBFS = 0x100000, MF_PRIVATE = 0x200000, /* To prevent flush in clear_range_* */ }; static int pager_get_path(struct file *file, char *path) { int error = 0; char *pathbuf, *fullpath; pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); if (!pathbuf) { printk("%s: ERROR: allocating path\n", __FUNCTION__); error = -ENOMEM; goto out; } fullpath = d_path(&file->f_path, pathbuf, PATH_MAX); if (!IS_ERR(fullpath)) { memcpy(path, fullpath, strlen(fullpath) + 1); } else { path[0] = 0; } out: if (pathbuf) { kfree(pathbuf); } return error; } 
static int pager_req_create(ihk_os_t os, int fd, uintptr_t result_pa) { ihk_device_t dev = ihk_os_to_dev(os); int error; struct pager_create_result *resp; int maxprot = 0; struct file *file = NULL; struct inode *inode; struct pager *pager = NULL; struct pager *newpager = NULL; uintptr_t phys; struct kstat st; int mf_flags = 0; unsigned long irqflags; int pgshift = 0; dprintk("pager_req_create(%d,%lx)\n", fd, (long)result_pa); error = mcctrl_vfs_fstat(fd, &st); if (error) { printk("pager_req_create(%d,%lx):vfs_stat failed. %d\n", fd, (long)result_pa, error); goto out; } if (S_ISCHR(st.mode) && (MAJOR(st.rdev) == 1) && (MINOR(st.rdev) == 1 || // /dev/mem MINOR(st.rdev) == 5)) { // /dev/zero /* treat memory devices and zero devices as regular files */ } else if (S_ISCHR(st.mode) && (MAJOR(st.rdev) == 1)) { error = -ENODEV; dprintk("%s(%d,%lx):unmappable device %x\n", __func__, fd, (long)result_pa, st.mode); goto out; } else if (!S_ISREG(st.mode)) { error = -ESRCH; dprintk("pager_req_create(%d,%lx):not VREG. %x\n", fd, (long)result_pa, st.mode); goto out; } file = fget(fd); if (!file) { error = -EBADF; printk("pager_req_create(%d,%lx):file not found. %d\n", fd, (long)result_pa, error); goto out; } /* Shared memory hack */ { char *pathbuf, *fullpath; pathbuf = kmalloc(PATH_MAX, GFP_ATOMIC); if (pathbuf) { fullpath = d_path(&file->f_path, pathbuf, PATH_MAX); if (!IS_ERR(fullpath)) { if (!strncmp("/tmp/ompi.", fullpath, 10) || !strncmp("/dev/shm/", fullpath, 9) || (!strncmp("/var/opt/FJSVtcs/ple/daemonif/", fullpath, 30) && !strstr(fullpath, "dstore_sm.lock"))) { printk("%s: treating %s as a device file..\n", __func__, fullpath); kfree(pathbuf); error = -ESRCH; goto out; } kfree(pathbuf); } } } inode = file->f_path.dentry->d_inode; if (!inode) { error = -EBADF; printk("pager_req_create(%d,%lx):inode not found. %d\n", fd, (long)result_pa, error); goto out; } if (!strcmp(inode->i_sb->s_type->name, "tmpfs")) { mf_flags = MF_IS_REMOVABLE; } if (!strcmp(inode->i_sb->s_type->name, "proc")) { error = -ESRCH; goto out; } if ((file->f_mode & FMODE_READ) && (file->f_mode & FMODE_PREAD)) { maxprot |= PROT_READ; } if ((file->f_mode & FMODE_WRITE) && (file->f_mode & FMODE_PWRITE)) { maxprot |= PROT_WRITE; } if (!(file->f_path.mnt->mnt_flags & MNT_NOEXEC)) { maxprot |= PROT_EXEC; } if (!(maxprot & PROT_READ)) { error = -EACCES; printk("pager_req_create(%d,%lx):cannot read file. 
%d\n", fd, (long)result_pa, error); goto out; } if (inode->i_op == mcctrl_hugetlbfs_inode_operations) { struct hstate *h = hstate_file(file); pgshift = PAGE_SHIFT + huge_page_order(h); mf_flags = MF_HUGETLBFS; /* pager is used as handle id on mckernel side, use inode */ pager = (void *)st.ino; /* file size is not used */ st.size = 0; goto out_reply; } for (;;) { spin_lock_irqsave(&pager_lock, irqflags); list_for_each_entry(pager, &pager_list, list) { if (pager->inode == inode) { goto found; } } if (newpager) { newpager->inode = inode; newpager->ref = 0; list_add(&newpager->list, &pager_list); pager = newpager; newpager = NULL; /* Shared libraries prefetch */ { char *pathbuf, *fullpath; pathbuf = kmalloc(PATH_MAX, GFP_ATOMIC); if (pathbuf) { fullpath = d_path(&file->f_path, pathbuf, PATH_MAX); if (!IS_ERR(fullpath)) { if (strstr(fullpath, ".so")) { mf_flags = MF_PREFETCH; dprintk("%s: filename: %s, prefetch\n", __FUNCTION__, fullpath); } } kfree(pathbuf); } } break; } spin_unlock_irqrestore(&pager_lock, irqflags); newpager = kzalloc(sizeof(*newpager), GFP_ATOMIC); if (!newpager) { error = -ENOMEM; printk("pager_req_create(%d,%lx):kzalloc failed. %d\n", fd, (long)result_pa, error); goto out; } } found: ++pager->ref; if (!pager->rwfile && (maxprot & PROT_WRITE)) { get_file(file); pager->rwfile = file; } else if (!pager->rofile && !(maxprot & PROT_WRITE)) { get_file(file); pager->rofile = file; } spin_unlock_irqrestore(&pager_lock, irqflags); out_reply: phys = ihk_device_map_memory(dev, result_pa, sizeof(*resp)); resp = ihk_device_map_virtual(dev, phys, sizeof(*resp), NULL, 0); if (!resp) { ihk_device_unmap_memory(dev, phys, sizeof(*resp)); printk("%s: ERROR: invalid response structure address\n", __FUNCTION__); error = -EINVAL; goto out; } resp->handle = (uintptr_t)pager; resp->maxprot = maxprot; resp->flags = mf_flags; resp->size = st.size; resp->pgshift = pgshift; error = pager_get_path(file, resp->path); ihk_device_unmap_virtual(dev, resp, sizeof(*resp)); ihk_device_unmap_memory(dev, phys, sizeof(*resp)); out: if (newpager) { kfree(newpager); } if (file) { fput(file); } dprintk("pager_req_create(%d,%lx): %d %p %x\n", fd, (long)result_pa, error, pager, maxprot); return error; } static int pager_req_release(ihk_os_t os, uintptr_t handle, uint64_t sref) { int error; struct pager *p; struct pager *free_pager = NULL; unsigned long flags; dprintk("%s(%p,%lx)\n", __func__, os, handle); spin_lock_irqsave(&pager_lock, flags); error = -EBADF; list_for_each_entry(p, &pager_list, list) { if ((uintptr_t)p == handle) { error = 0; p->ref -= sref; if (p->ref > 0) break; list_del(&p->list); free_pager = p; break; } } spin_unlock_irqrestore(&pager_lock, flags); if (error) { pr_err("%s(%p,%lx):pager not found. %d\n", __func__, os, handle, error); goto out; } if (free_pager) { if (free_pager->rofile) { fput(free_pager->rofile); } if (free_pager->rwfile) { fput(free_pager->rwfile); } kfree(free_pager); } error = 0; out: dprintk("%s(%p,%lx): %d\n", __func__, os, handle, error); return error; } static int pager_req_read(ihk_os_t os, uintptr_t handle, off_t off, size_t size, uintptr_t rpa) { ssize_t ss, n; struct pager *pager; struct file *file = NULL; uintptr_t phys = -1; ihk_device_t dev = ihk_os_to_dev(os); void *buf = NULL; loff_t pos, fsize; unsigned long flags; unsigned int major, minor; dprintk("pager_req_read(%lx,%lx,%lx,%lx)\n", handle, off, size, rpa); spin_lock_irqsave(&pager_lock, flags); list_for_each_entry(pager, &pager_list, list) { if ((uintptr_t)pager == handle) { file = (pager->rofile)? 
pager->rofile: pager->rwfile; get_file(file); break; } } spin_unlock_irqrestore(&pager_lock, flags); if (!file) { ss = -EBADF; pr_warn("%s(%lx,%lx,%lx,%lx):pager not found. %ld\n", __func__, handle, off, size, rpa, ss); goto out; } major = MAJOR(file->f_mapping->host->i_rdev); minor = MINOR(file->f_mapping->host->i_rdev); if ((major == 1 && minor == 1) || // /dev/mem (major == 1 && minor == 5)) { // /dev/zero /* Nothing to check */ } else { /* Check if the target page fits in the file */ fsize = i_size_read(file->f_mapping->host); if (off > fsize) { ss = 0; goto out; } } phys = ihk_device_map_memory(dev, rpa, size); buf = ihk_device_map_virtual(dev, phys, size, NULL, 0); if (!buf) { pr_warn("%s: ERROR: invalid buffer address\n", __func__); ss = -EINVAL; goto out; } pos = off; n = 0; while (n < size) { if (pos != off + n) { pr_warn("%s: pos wrong? got %lld, expected %ld\n", __func__, pos, off+n); pos = off + n; } #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0) ss = kernel_read(file, buf + n, size - n, &pos); #else ss = kernel_read(file, pos, buf + n, size - n); pos += ss; #endif if (ss < 0) { break; } if (ss == 0) { memset(buf + n, 0, size - n); n = size; break; } n += ss; } if (ss < 0) { pr_warn("%s(%lx,%lx,%lx,%lx):pread failed. %ld\n", __func__, handle, off, size, rpa, ss); goto out; } ss = n; out: if (buf) { ihk_device_unmap_virtual(dev, buf, size); } if (phys != (uintptr_t)-1) { ihk_device_unmap_memory(dev, phys, size); } if (file) { fput(file); } dprintk("pager_req_read(%lx,%lx,%lx,%lx): %ld\n", handle, off, size, rpa, ss); return ss; } static int pager_req_write(ihk_os_t os, uintptr_t handle, off_t off, size_t size, uintptr_t rpa) { ssize_t ss; struct pager *pager; struct file *file = NULL; uintptr_t phys = -1; ihk_device_t dev = ihk_os_to_dev(os); void *buf = NULL; loff_t pos; loff_t fsize; size_t len; unsigned long flags; dprintk("pager_req_write(%lx,%lx,%lx,%lx)\n", handle, off, size, rpa); spin_lock_irqsave(&pager_lock, flags); list_for_each_entry(pager, &pager_list, list) { if ((uintptr_t)pager == handle) { file = pager->rwfile; break; } } if (file) { get_file(file); } spin_unlock_irqrestore(&pager_lock, flags); if (!file) { ss = -EBADF; printk("pager_req_write(%lx,%lx,%lx,%lx):pager not found. %ld\n", handle, off, size, rpa, ss); goto out; } /* * XXX: Find a way to avoid changing the file size * by using a function in the same abstraction level as kernel_write(). */ fsize = i_size_read(file->f_mapping->host); if (off >= fsize) { ss = 0; goto out; } phys = ihk_device_map_memory(dev, rpa, size); buf = ihk_device_map_virtual(dev, phys, size, NULL, 0); if (!buf) { printk("%s: ERROR: invalid buffer address\n", __FUNCTION__); ss = -EINVAL; goto out; } pos = off; len = size; if ((off + size) > fsize) { len = fsize - off; } #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0) ss = kernel_write(file, buf, len, &pos); #else ss = kernel_write(file, buf, len, pos); #endif if (ss < 0) { printk("pager_req_write(%lx,%lx,%lx,%lx):pwrite failed. 
%ld\n", handle, off, size, rpa, ss); goto out; } out: if (buf) { ihk_device_unmap_virtual(dev, buf, size); } if (phys != (uintptr_t)-1) { ihk_device_unmap_memory(dev, phys, size); } if (file) { fput(file); } dprintk("pager_req_write(%lx,%lx,%lx,%lx): %ld\n", handle, off, size, rpa, ss); return ss; } struct pager_map_result { uintptr_t handle; int maxprot; int8_t padding[4]; char path[PATH_MAX]; }; static int pager_req_map(ihk_os_t os, int fd, size_t len, off_t off, uintptr_t result_rpa, int prot_and_flags) { const ihk_device_t dev = ihk_os_to_dev(os); const off_t pgoff = off / PAGE_SIZE; int error; struct file *file = NULL; uintptr_t va = -1; int maxprot; struct pager *pager = NULL; struct pager_map_result *resp; uintptr_t phys; struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os); struct mcctrl_per_proc_data *ppd = NULL; dprintk("pager_req_map(%p,%d,%lx,%lx,%lx)\n", os, fd, len, off, result_rpa); if (!usrdata) { pr_err("%s: error: mcctrl_usrdata not found\n", __func__); return -EINVAL; } ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(current)); if (unlikely(!ppd)) { kprintf("%s: ERROR: no per-process structure for PID %d??\n", __FUNCTION__, task_tgid_vnr(current)); return -1; } pager = kzalloc(sizeof(*pager), GFP_ATOMIC); if (!pager) { error = -ENOMEM; printk("pager_req_map(%p,%d,%lx,%lx,%lx):kzalloc failed. %d\n", os, fd, len, off, result_rpa, error); goto out; } file = fget(fd); if (!file) { error = -EBADF; printk("pager_req_map(%p,%d,%lx,%lx,%lx):fget failed. %d\n", os, fd, len, off, result_rpa, error); goto out; } maxprot = 0; if ((file->f_mode & FMODE_READ) && (prot_and_flags ? (prot_and_flags & PROT_READ) : 1)) { maxprot |= PROT_READ; } if ((file->f_mode & FMODE_WRITE) && (prot_and_flags ? (prot_and_flags & PROT_WRITE) : 1)) { maxprot |= PROT_WRITE; } if (!(file->f_path.mnt->mnt_flags & MNT_NOEXEC) && (prot_and_flags ? (prot_and_flags & PROT_EXEC) : 1)) { maxprot |= PROT_EXEC; } prot_and_flags = MAP_SHARED | (prot_and_flags & (MAP_POPULATE | MAP_LOCKED)); #define ANY_WHERE 0 if (prot_and_flags & MAP_LOCKED) prot_and_flags |= MAP_POPULATE; /* Shared memory hack */ { char *pathbuf, *fullpath; pathbuf = kmalloc(PATH_MAX, GFP_ATOMIC); if (pathbuf) { fullpath = d_path(&file->f_path, pathbuf, PATH_MAX); if (!IS_ERR(fullpath)) { if (!strncmp("/tmp/ompi.", fullpath, 10) || !strncmp("/dev/shm/", fullpath, 9) || !strncmp("/var/opt/FJSVtcs/ple/daemonif/", fullpath, 30)) { dprintk("%s: pre-populating %s..\n", __func__, fullpath); prot_and_flags |= MAP_POPULATE; } kfree(pathbuf); } } } #if LINUX_VERSION_CODE < KERNEL_VERSION(3,5,0) mmap_write_lock(current->mm); va = do_mmap_pgoff(file, ANY_WHERE, len, maxprot, prot_and_flags, pgoff); mmap_write_unlock(current->mm); #else va = vm_mmap(file, ANY_WHERE, len, maxprot, prot_and_flags, pgoff << PAGE_SHIFT); #endif if (IS_ERR_VALUE(va)) { if ((int)va != -ENOTSUPP) { pr_err("%s(%p,%d,%lx,%lx,%lx): " "do_mmap_pgoff failed. 
%d\n", __func__, os, fd, len, off, result_rpa, (int)va); } error = va; goto out; } pager->ref = 1; pager->map_uaddr = va; pager->map_len = len; pager->map_off = off; dprintk("pager_req_map(%s): 0x%lx - 0x%lx (len: %lu), map_off: %lu\n", file->f_dentry->d_name.name, va, va + len, len, off); phys = ihk_device_map_memory(dev, result_rpa, sizeof(*resp)); resp = ihk_device_map_virtual(dev, phys, sizeof(*resp), NULL, 0); if (!resp) { ihk_device_unmap_memory(dev, phys, sizeof(*resp)); printk("%s: ERROR: invalid response structure address\n", __FUNCTION__); error = -EINVAL; goto out; } resp->handle = (uintptr_t)pager; resp->maxprot = maxprot; error = pager_get_path(file, resp->path); if (error) { goto out_unmap; } error = down_interruptible(&ppd->devobj_pager_lock); if (error) { error = -EINTR; goto out_unmap; } list_add_tail(&pager->list, &ppd->devobj_pager_list); up(&ppd->devobj_pager_lock); pager = 0; error = 0; out_unmap: ihk_device_unmap_virtual(dev, resp, sizeof(*resp)); ihk_device_unmap_memory(dev, phys, sizeof(*resp)); out: if (file) { fput(file); } if (pager) { kfree(pager); } mcctrl_put_per_proc_data(ppd); dprintk("pager_req_map(%p,%d,%lx,%lx,%lx): %d\n", os, fd, len, off, result_rpa, error); return error; } static int pager_req_pfn(ihk_os_t os, uintptr_t handle, off_t off, uintptr_t ppfn_rpa) { const ihk_device_t dev = ihk_os_to_dev(os); struct pager * const pager = (void *)handle; int error; uintptr_t pfn; uintptr_t va; pgd_t *pgd; #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 12, 0) && defined(CONFIG_X86_64_SMP) p4d_t *p4d; #endif pud_t *pud; pmd_t *pmd; pte_t *pte; uintptr_t phys; uintptr_t *ppfn; int page_fault_attempted = 0; dprintk("pager_req_pfn(%p,%lx,%lx)\n", os, handle, off); if ((off < pager->map_off) || ((pager->map_off+pager->map_len) < (off + PAGE_SIZE))) { error = -ERANGE; pfn = 0; printk("pager_req_pfn(%p,%lx,%lx):out of range. 
%d [%lx..%lx)\n", os, handle, off, error, pager->map_off, pager->map_off+pager->map_len); goto out; } va = pager->map_uaddr + (off - pager->map_off); #define PFN_VALID ((uintptr_t)1 << 63) pfn = PFN_VALID; /* Use "not present" as the default setting */ mmap_read_lock(current->mm); retry: pgd = pgd_offset(current->mm, va); if (!pgd_none(*pgd) && !pgd_bad(*pgd) && pgd_present(*pgd)) { #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 12, 0) && defined(CONFIG_X86_64_SMP) p4d = p4d_offset(pgd, va); if (!p4d_none(*p4d) && !p4d_bad(*p4d) && p4d_present(*p4d)) { pud = pud_offset(p4d, va); #else pud = pud_offset(pgd, va); #endif if (!pud_none(*pud) && !pud_bad(*pud) && pud_present(*pud)) { pmd = pmd_offset(pud, va); if (!pmd_none(*pmd) && !pmd_bad(*pmd) && pmd_present(*pmd)) { pte = pte_offset_kernel(pmd, va); if (!pte_none(*pte) && pte_present(*pte)) { pfn = (uintptr_t)pte_pfn(*pte) << PAGE_SHIFT; #define PFN_PRESENT ((uintptr_t)1 << 0) pfn |= PFN_VALID | PFN_PRESENT; /* Check if mapping is write-combined */ if (pte_is_write_combined(*pte)) { pfn |= PFN_WRITE_COMBINED; } } pte_unmap(pte); } } #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 12, 0) && defined(CONFIG_X86_64_SMP) } #endif } /* If not present, try to fault it */ if (!(pfn & PFN_PRESENT) && !page_fault_attempted) { unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; struct vm_area_struct *vma; int fault; #if defined(FAULT_FLAG_USER) flags |= FAULT_FLAG_USER; #endif vma = find_vma(current->mm, va); if (!vma || (va < vma->vm_start)) { printk("%s: couldn't find VMA for va %lx\n", __FUNCTION__, va); error = -EINVAL; goto out_release; } #if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 1, 0) fault = handle_mm_fault(vma, va, flags, NULL); #elif LINUX_VERSION_CODE >= KERNEL_VERSION(4, 8, 0) || \ (defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7, 5)) fault = handle_mm_fault(vma, va, flags); #else fault = handle_mm_fault(current->mm, vma, va, flags); #endif #ifdef SC_DEBUG if (fault != 0) { char *pathbuf = NULL; char *fullpath; if (vma->vm_file) { pathbuf = kmalloc(PATH_MAX, GFP_ATOMIC); if (pathbuf) { fullpath = d_path(&vma->vm_file->f_path, pathbuf, PATH_MAX); if (!IS_ERR(fullpath)) { printk("%s: WARNING: couldn't fault 0x%lx" " at off: %lu in %s\n", __FUNCTION__, va, off, fullpath); } kfree(pathbuf); } } } #endif page_fault_attempted = 1; goto retry; } out_release: mmap_read_unlock(current->mm); phys = ihk_device_map_memory(dev, ppfn_rpa, sizeof(*ppfn)); ppfn = ihk_device_map_virtual(dev, phys, sizeof(*ppfn), NULL, 0); if (!ppfn) { printk("%s: ERROR: invalid PFN address\n", __FUNCTION__); error = -EINVAL; goto out; } *ppfn = pfn; ihk_device_unmap_virtual(dev, ppfn, sizeof(*ppfn)); ihk_device_unmap_memory(dev, phys, sizeof(*ppfn)); error = 0; out: dprintk("pager_req_pfn(%p,%lx,%lx): %d %lx\n", os, handle, off, error, pfn); return error; } static int __pager_unmap(struct pager *pager) { int error; #if LINUX_VERSION_CODE < KERNEL_VERSION(3,5,0) mmap_write_lock(current->mm); error = do_munmap(current->mm, pager->map_uaddr, pager->map_len); mmap_write_unlock(current->mm); #else error = vm_munmap(pager->map_uaddr, pager->map_len); #endif if (error) { printk("%s: WARNING: munmap failed for pager 0x%lx: %d\n", __FUNCTION__, (uintptr_t)pager, error); } return error; } static int pager_req_unmap(ihk_os_t os, uintptr_t handle) { struct pager * const pager = (void *)handle; int error; struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os); struct mcctrl_per_proc_data *ppd = NULL; dprintk("pager_req_unmap(%p,%lx)\n", os, 
handle); if (!usrdata) { pr_err("%s: error: mcctrl_usrdata not found\n", __func__); return -EINVAL; } ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(current)); if (unlikely(!ppd)) { kprintf("%s: ERROR: no per-process structure for PID %d??\n", __FUNCTION__, task_tgid_vnr(current)); return -1; } error = down_interruptible(&ppd->devobj_pager_lock); if (error) { error = -EINTR; goto out; } list_del(&pager->list); up(&ppd->devobj_pager_lock); error = __pager_unmap(pager); kfree(pager); out: mcctrl_put_per_proc_data(ppd); return error; } static long pager_req_mlock_list(ihk_os_t os, unsigned long start, unsigned long end, void *addr, int nent) { struct addrpair { unsigned long start; unsigned long end; unsigned long flag; } *addrpair = (struct addrpair *) addr; int cnt = 0; struct mm_struct *mm = current->mm; struct vm_area_struct *vma; kprintf("pager_req_mlock_list: addr(%p)\n", addr); /* Use find_vma to iterate through VMAs */ vma = find_vma(mm, start); while (vma != NULL) { if (vma->vm_start > end) break; kprintf("\t%p: %p -- %p\t%lx\n", vma, (void*)vma->vm_start, (void*)vma->vm_end, vma->vm_flags & VM_LOCKED); if (vma->vm_flags & VM_LOCKED) { kprintf("\t locked\n"); if (++cnt >= nent) { /* last entry is a marker */ addrpair->start = (unsigned long) -1; goto full; } addrpair->start = vma->vm_start; addrpair->end = vma->vm_end; addrpair->flag = vma->vm_flags; addrpair++; } /* Use find_vma to get next VMA */ vma = find_vma(mm, vma->vm_end); } full: return cnt; } #define PAGER_REQ_CREATE 0x0001 #define PAGER_REQ_RELEASE 0x0002 #define PAGER_REQ_READ 0x0003 #define PAGER_REQ_WRITE 0x0004 #define PAGER_REQ_MAP 0x0005 #define PAGER_REQ_PFN 0x0006 #define PAGER_REQ_UNMAP 0x0007 #define PAGER_REQ_MLOCK_LIST 0x0008 static long pager_call_irq(ihk_os_t os, struct syscall_request *req) { long ret = -ENOSYS; switch (req->args[0]) { case PAGER_REQ_RELEASE: ret = pager_req_release(os, req->args[1], req->args[2]); break; } return ret; } static long pager_call(ihk_os_t os, struct syscall_request *req) { long ret; dprintk("pager_call(%#lx)\n", req->args[0]); switch (req->args[0]) { case PAGER_REQ_CREATE: ret = pager_req_create(os, req->args[1], req->args[2]); break; case PAGER_REQ_READ: ret = pager_req_read(os, req->args[1], req->args[2], req->args[3], req->args[4]); break; case PAGER_REQ_WRITE: ret = pager_req_write(os, req->args[1], req->args[2], req->args[3], req->args[4]); break; case PAGER_REQ_MAP: ret = pager_req_map(os, req->args[1], req->args[2], req->args[3], req->args[4], req->args[5]); break; case PAGER_REQ_PFN: ret = pager_req_pfn(os, req->args[1], req->args[2], req->args[3]); break; case PAGER_REQ_UNMAP: ret = pager_req_unmap(os, req->args[1]); break; case PAGER_REQ_MLOCK_LIST: ret = pager_req_mlock_list(os, (unsigned long) req->args[1], (unsigned long) req->args[2], (void*) req->args[3], (int) req->args[4]); break; default: ret = -ENOSYS; printk("pager_call(%#lx):unknown req %ld\n", req->args[0], ret); break; } dprintk("pager_call(%#lx): %ld\n", req->args[0], ret); return ret; } #ifdef ENABLE_TOFU struct list_head mcctrl_file_to_pidfd_hash[MCCTRL_FILE_2_PIDFD_HASH_SIZE]; spinlock_t mcctrl_file_to_pidfd_hash_lock; void mcctrl_file_to_pidfd_hash_init(void) { int hash; spin_lock_init(&mcctrl_file_to_pidfd_hash_lock); for (hash = 0; hash < MCCTRL_FILE_2_PIDFD_HASH_SIZE; ++hash) { INIT_LIST_HEAD(&mcctrl_file_to_pidfd_hash[hash]); } } int mcctrl_file_to_pidfd_hash_insert(struct file *filp, ihk_os_t os, int pid, struct task_struct *group_leader, int fd, char *path, void *pde_data) { unsigned 
	long irqflags;
	struct mcctrl_file_to_pidfd *file2pidfd_iter;
	struct mcctrl_file_to_pidfd *file2pidfd;
	int hash = (int)((unsigned long)filp &
			(unsigned long)MCCTRL_FILE_2_PIDFD_HASH_MASK);
	int ret = 0;

	file2pidfd = kmalloc(sizeof(*file2pidfd), GFP_ATOMIC);
	if (!file2pidfd)
		return -ENOMEM;

	file2pidfd->filp = filp;
	file2pidfd->os = os;
	file2pidfd->pid = pid;
	file2pidfd->group_leader = group_leader;
	file2pidfd->fd = fd;
	/* Only copy the name under /proc/tofu/dev/ */
	strncpy(file2pidfd->tofu_dev_path, path + 15, 128);
	file2pidfd->pde_data = pde_data;

	spin_lock_irqsave(&mcctrl_file_to_pidfd_hash_lock, irqflags);
	list_for_each_entry(file2pidfd_iter,
			&mcctrl_file_to_pidfd_hash[hash], hash) {
		if (file2pidfd_iter->filp == filp) {
			printk("%s: WARNING: filp: %p, pid: %d, fd: %d exists\n",
				__func__, filp, pid, fd);
			ret = -EBUSY;
			goto free_out;
		}
	}

	list_add_tail(&file2pidfd->hash, &mcctrl_file_to_pidfd_hash[hash]);
	dprintk("%s: filp: %p, pid: %d, fd: %d added\n",
		__func__, filp, pid, fd);
	spin_unlock_irqrestore(&mcctrl_file_to_pidfd_hash_lock, irqflags);

	return ret;

free_out:
	kfree(file2pidfd);
	spin_unlock_irqrestore(&mcctrl_file_to_pidfd_hash_lock, irqflags);

	return ret;
}

/*
 * XXX: lookup relies on group_leader to identify the process
 * because PIDs might be different across name spaces (e.g.,
 * when using Docker)
 */
struct mcctrl_file_to_pidfd *mcctrl_file_to_pidfd_hash_lookup(
	struct file *filp, struct task_struct *group_leader)
{
	unsigned long irqflags;
	struct mcctrl_file_to_pidfd *file2pidfd_iter;
	struct mcctrl_file_to_pidfd *file2pidfd = NULL;
	int hash = (int)((unsigned long)filp &
			(unsigned long)MCCTRL_FILE_2_PIDFD_HASH_MASK);

	spin_lock_irqsave(&mcctrl_file_to_pidfd_hash_lock, irqflags);
	list_for_each_entry(file2pidfd_iter,
			&mcctrl_file_to_pidfd_hash[hash], hash) {
		if (file2pidfd_iter->filp == filp &&
				file2pidfd_iter->group_leader == group_leader) {
			file2pidfd = file2pidfd_iter;
			dprintk("%s: filp: %p, pid: %d, fd: %d found\n",
				__func__, filp,
				file2pidfd->pid, file2pidfd->fd);
			break;
		}
	}
	spin_unlock_irqrestore(&mcctrl_file_to_pidfd_hash_lock, irqflags);

	return file2pidfd;
}

int mcctrl_file_to_pidfd_hash_remove(struct file *filp,
	ihk_os_t os, struct task_struct *group_leader, int fd)
{
	unsigned long irqflags;
	struct mcctrl_file_to_pidfd *file2pidfd_iter;
	int hash = (int)((unsigned long)filp &
			(unsigned long)MCCTRL_FILE_2_PIDFD_HASH_MASK);
	int ret = 0;

	spin_lock_irqsave(&mcctrl_file_to_pidfd_hash_lock, irqflags);
	list_for_each_entry(file2pidfd_iter,
			&mcctrl_file_to_pidfd_hash[hash], hash) {
		if (file2pidfd_iter->filp != filp)
			continue;
		if (file2pidfd_iter->os != os)
			continue;
		if (file2pidfd_iter->group_leader != group_leader)
			continue;
		if (file2pidfd_iter->fd != fd)
			continue;

		list_del(&file2pidfd_iter->hash);
		dprintk("%s: filp: %p, pid: %d, fd: %d removed\n",
			__func__, filp, file2pidfd_iter->pid, fd);
		kfree(file2pidfd_iter);
		goto unlock_out;
	}

	dprintk("%s: filp: %p, fd: %d couldn't be found\n",
		__func__, filp, fd);
	ret = -ENOENT;

unlock_out:
	spin_unlock_irqrestore(&mcctrl_file_to_pidfd_hash_lock, irqflags);
	return ret;
}
#endif

void __return_syscall(ihk_os_t os, struct mcctrl_per_proc_data *ppd,
		struct ikc_scd_packet *packet, long ret, int stid)
{
	unsigned long phys;
	struct syscall_response *res;

	if (!os || ihk_host_validate_os(os) || !packet) {
		return;
	}

	phys = ihk_device_map_memory(ihk_os_to_dev(os),
			packet->resp_pa, sizeof(*res));
	if (!phys) {
		return;
	}

	res = ihk_device_map_virtual(ihk_os_to_dev(os), phys,
			sizeof(*res), NULL, 0);
	if (!res) {
		printk("%s: ERROR: invalid response
structure address\n", __FUNCTION__); ihk_device_unmap_memory(ihk_os_to_dev(os), phys, sizeof(*res)); return; } /* Map response structure and notify offloading thread */ res->ret = ret; res->stid = stid; #ifdef ENABLE_TOFU /* Tofu enabled process? */ if (ppd && ppd->enable_tofu) { char *pathbuf, *fullpath; /* Record PDE_DATA after open() calls for Tofu driver */ if (packet->req.number == __NR_openat && ret > 1) { struct fd f; int fd; fd = ret; f = fdget(fd); if (!f.file) { goto out_notify; } pathbuf = (char *)__get_free_page(GFP_ATOMIC); if (!pathbuf) { goto out_fdput_open; } fullpath = d_path(&f.file->f_path, pathbuf, PAGE_SIZE); if (IS_ERR(fullpath)) { goto out_free_open; } if (!strncmp("/proc/tofu/dev/", fullpath, 15)) { res->pde_data = PDE_DATA(file_inode(f.file)); dprintk("%s: fd: %d, path: %s, PDE_DATA: 0x%lx\n", __func__, fd, fullpath, (unsigned long)res->pde_data); dprintk("%s: pgd_index: %ld, pmd_index: %ld, pte_index: %ld\n", __func__, pgd_index((unsigned long)res->pde_data), pmd_index((unsigned long)res->pde_data), pte_index((unsigned long)res->pde_data)); dprintk("MAX_USER_VA_BITS: %d, PGDIR_SHIFT: %d\n", MAX_USER_VA_BITS, PGDIR_SHIFT); mcctrl_file_to_pidfd_hash_insert(f.file, os, task_tgid_vnr(current), current->group_leader, fd, fullpath, res->pde_data); } out_free_open: free_page((unsigned long)pathbuf); out_fdput_open: fdput(f); } /* Ioctl on Tofu CQ? */ else if (packet->req.number == __NR_ioctl && packet->req.args[0] > 0 && ret == 0) { struct fd f; int fd; int tni, cq; long __ret; fd = packet->req.args[0]; f = fdget(fd); if (!f.file) { goto out_notify; } pathbuf = (char *)__get_free_page(GFP_ATOMIC); if (!pathbuf) { goto out_fdput_ioctl; } fullpath = d_path(&f.file->f_path, pathbuf, PAGE_SIZE); if (IS_ERR(fullpath)) { goto out_free_ioctl; } /* Looking for /proc/tofu/dev/tniXcqY pattern */ __ret = sscanf(fullpath, "/proc/tofu/dev/tni%dcq%d", &tni, &cq); if (__ret == 2) { extern long __mcctrl_tof_utofu_unlocked_ioctl_cq(void *pde_data, unsigned int cmd, unsigned long arg); dprintk("%s: ioctl(): fd: %d, path: %s\n", __func__, fd, fullpath); __ret = __mcctrl_tof_utofu_unlocked_ioctl_cq( PDE_DATA(file_inode(f.file)), packet->req.args[1], packet->req.args[2]); } out_free_ioctl: free_page((unsigned long)pathbuf); out_fdput_ioctl: fdput(f); } } out_notify: #endif if (__notify_syscall_requester(os, packet, res) < 0) { printk("%s: WARNING: failed to notify PID %d\n", __FUNCTION__, packet->pid); } mb(); res->status = 1; ihk_device_unmap_virtual(ihk_os_to_dev(os), res, sizeof(*res)); ihk_device_unmap_memory(ihk_os_to_dev(os), phys, sizeof(*res)); } static int remap_user_space(uintptr_t rva, size_t len, int prot) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; struct file *file; uintptr_t start; pgoff_t pgoff; uintptr_t map; dprintk("remap_user_space(%lx,%lx,%x)\n", rva, len, prot); mmap_write_lock(mm); vma = find_vma(mm, rva); if (!vma || (rva < vma->vm_start)) { printk("remap_user_space(%lx,%lx,%x):find_vma failed. %p %lx %lx\n", rva, len, prot, vma, (vma)? vma->vm_start: -1, (vma)? 
vma->vm_end: 0); mmap_write_unlock(mm); map = -ENOMEM; goto out; } file = vma->vm_file; start = rva; pgoff = vma->vm_pgoff + ((rva - vma->vm_start) >> PAGE_SHIFT); #if LINUX_VERSION_CODE < KERNEL_VERSION(3,5,0) map = do_mmap_pgoff(file, start, len, prot, MAP_FIXED|MAP_SHARED, pgoff); #endif mmap_write_unlock(mm); #if LINUX_VERSION_CODE >= KERNEL_VERSION(3,5,0) map = vm_mmap(file, start, len, prot, MAP_FIXED|MAP_SHARED, pgoff << PAGE_SHIFT); #endif out: dprintk("remap_user_space(%lx,%lx,%x): %lx (%ld)\n", rva, len, prot, (long)map, (long)map); return (IS_ERR_VALUE(map))? (int)map: 0; } int mcctrl_clear_pte_range(uintptr_t start, uintptr_t len) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; uintptr_t addr; uintptr_t end; #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 18, 0) int error; #endif int ret; ret = 0; mmap_read_lock(mm); addr = start; while (addr < (start + len)) { vma = find_vma(mm, addr); if (!vma) { break; } if (addr < vma->vm_start) { addr = vma->vm_start; } end = start + len; if (vma->vm_end < end) { end = vma->vm_end; } if (addr < end) { #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 18, 0) /* Revert permission */ vm_flags_set(vma, VM_READ | VM_WRITE | VM_EXEC); error = zap_vma_ptes(vma, addr, end-addr); if (error) { mcctrl_zap_page_range(vma, addr, end-addr, NULL); error = 0; } if (ret == 0) { ret = error; } #else /* LINUX_VERSION_CODE < KERNEL_VERSION(4, 18, 0) */ if (addr < vma->vm_start || addr + end-addr > vma->vm_end || !(vma->vm_flags & VM_PFNMAP)) { mcctrl_zap_page_range(vma, addr, end-addr, NULL); } else { /* Revert permission */ vm_flags_set(vma, VM_READ | VM_WRITE | VM_EXEC); zap_vma_ptes(vma, addr, end-addr); } #endif /* LINUX_VERSION_CODE < KERNEL_VERSION(4, 18, 0) */ } addr = end; } mmap_read_unlock(mm); return ret; } int release_user_space(uintptr_t start, uintptr_t len) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; uintptr_t addr; uintptr_t end; int error; int ret; ret = 0; //down_read(&mm->mmap_sem); addr = start; while (addr < (start + len)) { vma = find_vma(mm, addr); if (!vma) { break; } if (addr < vma->vm_start) { addr = vma->vm_start; } end = vma->vm_end; if (addr < end) { if ((error = vm_munmap(addr, end - addr))) { printk("%s: ERROR: vm_munmap failed (%d)\n", __func__, error); } if (ret == 0) { ret = error; } } addr = vma->vm_end; } //up_read(&mm->mmap_sem); return ret; } /** * \brief Write out the core file image to a core file. * * \param os An ihk_os_t structure. * \param rcoretable The physical address of remote's coretable. * \param chunks The number of chunks which make a core file image in the whole. */ static int writecore(ihk_os_t os, unsigned long rcoretable, int chunks, unsigned long cmdline_rphys, unsigned long cmdline_len) { char *fn = NULL; struct file *file; struct coretable *coretable; int i, tablesize, error = 0; loff_t size; ssize_t ret; unsigned long phys, tablephys, rphys; ihk_device_t dev = ihk_os_to_dev(os); char *pt; unsigned long cmdline_phys; char *cmdline; dprintk("coredump called as a pseudo syscall\n"); fn = kmalloc(PATH_MAX, GFP_ATOMIC); if (!fn) { dprintk("%s: ERROR: allocating file name\n", __func__); error = -ENOMEM; goto fail; } if (chunks <= 0) { dprintk("no core data found!(%d)\n", chunks); error = -EINVAL; goto fail; } cmdline_phys = ihk_device_map_memory(dev, cmdline_rphys, cmdline_len); cmdline = ihk_device_map_virtual(dev, cmdline_phys, cmdline_len, NULL, 0); sprintf(fn, "mccore-%s.%d", strrchr(cmdline, '/') ? 
		strrchr(cmdline, '/') + 1 : cmdline,
		task_tgid_vnr(current));
	pr_info("%s: fn=%s\n", __func__, fn);
	ihk_device_unmap_virtual(dev, cmdline, cmdline_len);
	ihk_device_unmap_memory(dev, cmdline_phys, cmdline_len);

	/* Every piece of Linux documentation insists we should not
	 * open a file in a kernel module, but our karma
	 * leads us here. Precisely, here we emulate the core
	 * dump routine of the Linux kernel in linux/fs/exec.c.
	 * So we have a legitimate reason to do this. */
	file = filp_open(fn, O_CREAT | O_RDWR | O_LARGEFILE | O_TRUNC, 0600);
	if (IS_ERR(file) || !file->f_op) {
		dprintk("cannot open core file\n");
		error = PTR_ERR(file);
		goto fail;
	}

	/* first we map the chunk table */
	tablesize = sizeof(struct coretable) * chunks;
	tablephys = ihk_device_map_memory(dev, rcoretable, tablesize);
	coretable = ihk_device_map_virtual(dev, tablephys, tablesize, NULL, 0);
	for (i = 0; i < chunks; i++) {
		/* map and write the chunk out */
		rphys = coretable[i].addr;
		size = coretable[i].len;
		if (rphys != 0) {
			dprintk("mapping remote %llx@%lx -> ", size, rphys);
			phys = ihk_device_map_memory(dev, rphys, size);
			dprintk("physical %lx, ", phys);
			pt = ihk_device_map_virtual(dev, phys, size, NULL, 0);
			if (pt == NULL) {
				pt = phys_to_virt(phys);
			}
			dprintk("virtual %p\n", pt);
			if (pt != NULL) {
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0)
				ret = kernel_write(file, pt, size,
						&file->f_pos);
#else
				ret = kernel_write(file, pt, size,
						file->f_pos);
				file->f_pos += ret;
#endif
			} else {
				dprintk("cannot map physical memory(%lx) to virtual memory.\n",
					phys);
				ihk_device_unmap_memory(dev, phys, size);
				break;
			}
			/* unmap the chunk */
			ihk_device_unmap_virtual(dev, pt, size);
			ihk_device_unmap_memory(dev, phys, size);
			if (ret != size) {
				dprintk("core file write failed(%ld).\n", ret);
				error = -EIO;
				break;
			}
		} else {
			/* We skip if the physical address is NULL
			   and make the core file sparse. */
			if (!file->f_op->llseek ||
					(file->f_op->llseek == no_llseek)) {
				dprintk("We have no llseek. The core file is truncated.\n");
				error = -EINVAL;
				break;
			}
			ret = file->f_op->llseek(file, size, SEEK_CUR);
			if (ret < 0) {
				dprintk("core file seek failed(%ld).\n", ret);
				error = ret;
				break;
			}
		}
	}
	/* unmap the chunk table */
	ihk_device_unmap_virtual(dev, coretable, tablesize);
	ihk_device_unmap_memory(dev, tablephys, tablesize);
	filp_close(file, NULL);
fail:
	if (error == -ENOSYS) {
		/* make sure we do not travel to user land */
		error = -EINVAL;
	}
	kfree(fn);
	return error;
}

#define SCHED_CHECK_SAME_OWNER	0x01
#define SCHED_CHECK_ROOT	0x02

int __do_in_kernel_irq_syscall(ihk_os_t os, struct ikc_scd_packet *packet)
{
	struct syscall_request *sc = &packet->req;
	int ret;

	switch (sc->number) {
	case __NR_mmap:
		ret = pager_call_irq(os, sc);
		break;
	default:
		ret = -ENOSYS;
	}

	if (ret == -ENOSYS)
		return -ENOSYS;

	__return_syscall(os, NULL, packet, ret, 0);
	return 0;
}

/*
 * Memory clearing helpers.
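 *
 * McKernel queues freed memory chunks on the per-NUMA-node to_zero_list;
 * mcctrl_zero_mckernel_pages() below drains that lock-free list on the
 * Linux side, zeroes each chunk (sparing the embedded free_chunk header),
 * and moves it to zeroed_list, decrementing nr_to_zero_pages as it goes.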
*/ struct node_distance; #define IHK_RBTREE_ALLOCATOR #ifdef IHK_RBTREE_ALLOCATOR struct free_chunk { unsigned long addr, size; struct rb_node node; struct llist_node list; }; #endif typedef struct mcs_lock_node { #ifndef SPIN_LOCK_IN_MCS unsigned long locked; struct mcs_lock_node *next; #endif unsigned long irqsave; #ifdef SPIN_LOCK_IN_MCS ihk_spinlock_t spinlock; #endif #ifndef ENABLE_UBSAN } __aligned(64) mcs_lock_node_t; #else } mcs_lock_node_t; #endif struct ihk_mc_numa_node { int id; int linux_numa_id; int type; struct list_head allocators; struct node_distance *nodes_by_distance; #ifdef IHK_RBTREE_ALLOCATOR atomic_t zeroing_workers; atomic_t nr_to_zero_pages; struct llist_head zeroed_list; struct llist_head to_zero_list; struct rb_root free_chunks; mcs_lock_node_t lock; unsigned long nr_pages; /* * nr_free_pages: all freed pages, zeroed if zero_at_free */ unsigned long nr_free_pages; unsigned long min_addr; unsigned long max_addr; #endif }; void mcctrl_zero_mckernel_pages(unsigned long arg) { struct llist_node *llnode; struct ihk_mc_numa_node *node = (struct ihk_mc_numa_node *)arg; /* Iterate free chunks */ while ((llnode = llist_del_first(&node->to_zero_list))) { unsigned long addr; unsigned long size; struct free_chunk *chunk = container_of(llnode, struct free_chunk, list); addr = chunk->addr; size = chunk->size; memset(phys_to_virt(addr) + sizeof(*chunk), 0, chunk->size - sizeof(*chunk)); llist_add(&chunk->list, &node->zeroed_list); dprintk("%s: zeroed %lu pages @ McKernel NUMA %d (chunk: 0x%lx:%lu)\n", __func__, size >> PAGE_SHIFT, node->id, addr, size); barrier(); atomic_sub((int)(size >> PAGE_SHIFT), &node->nr_to_zero_pages); } atomic_dec(&node->zeroing_workers); } int __do_in_kernel_syscall(ihk_os_t os, struct ikc_scd_packet *packet) { struct syscall_request *sc = &packet->req; int error; long ret = -1; dprintk("%s: system call: %lx\n", __FUNCTION__, sc->args[0]); switch (sc->number) { #ifdef ENABLE_TOFU case __NR_close: { struct fd f; int fd; fd = (int)sc->args[0]; if (fd > 2) { f = fdget(fd); if (f.file) { mcctrl_file_to_pidfd_hash_remove(f.file, os, current->group_leader, fd); fdput(f); } } error = -ENOSYS; goto out; break; } #endif case __NR_mmap: ret = pager_call(os, sc); break; case __NR_munmap: ret = mcctrl_clear_pte_range(sc->args[0], sc->args[1]); break; case __NR_mprotect: ret = remap_user_space(sc->args[0], sc->args[1], sc->args[2]); break; case __NR_move_pages: /* * move pages is used for zeroing McKernel side memory, * this call is NOT offloaded by applications. 
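		 * McKernel repurposes the request: args[0] carries the
		 * target NUMA node descriptor, and no syscall response is
		 * sent back (hence the jump to out_no_syscall_return).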
*/ mcctrl_zero_mckernel_pages(sc->args[0]); goto out_no_syscall_return; case __NR_exit_group: { /* Make sure the user space handler will be called as well */ error = -ENOSYS; goto out; } case __NR_coredump: ret = writecore(os, sc->args[1], sc->args[0], sc->args[2], sc->args[3]); break; case __NR_sched_setparam: { switch (sc->args[0]) { case SCHED_CHECK_SAME_OWNER: { const struct cred *cred = current_cred(); const struct cred *pcred; bool match; struct task_struct *p; int pid = sc->args[1]; rcu_read_lock(); p = pid_task(find_get_pid(pid), PIDTYPE_PID); if (!p) { rcu_read_unlock(); ret = -ESRCH; goto sched_setparam_out; } rcu_read_unlock(); rcu_read_lock(); pcred = __task_cred(p); #if LINUX_VERSION_CODE > KERNEL_VERSION(3,4,0) match = (uid_eq(cred->euid, pcred->euid) || uid_eq(cred->euid, pcred->uid)); #else match = ((cred->euid == pcred->euid) || (cred->euid == pcred->uid)); #endif rcu_read_unlock(); if (match) { ret = 0; } else { ret = -EPERM; } break; } case SCHED_CHECK_ROOT: { const struct cred *cred = current_cred(); bool match; #if LINUX_VERSION_CODE > KERNEL_VERSION(3,4,0) match = uid_eq(cred->euid, GLOBAL_ROOT_UID); #else match = (cred->euid == 0); #endif if (match) { ret = 0; } else { ret = -EPERM; } break; } } sched_setparam_out: break; } default: error = -ENOSYS; goto out; break; } __return_syscall(os, NULL, packet, ret, 0); out_no_syscall_return: ihk_ikc_release_packet((struct ihk_ikc_free_packet *)packet); error = 0; out: dprintk("%s: system call: %ld, args[0]: %lx, error: %d, ret: %ld\n", __FUNCTION__, sc->number, sc->args[0], error, ret); return error; }
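
/*
 * For reference, a sketch of how a Linux-side component would offload a
 * call to the user-space proxy via syscall_backward() (hypothetical and
 * compiled out; assumes the caller runs in the context of a McKernel
 * process with valid per-process and per-thread data, and that
 * __NR_getpid is visible here):
 */
#if 0 /* example */
static long example_backward_getpid(struct mcctrl_usrdata *ud)
{
	unsigned long proxy_ret = 0;
	long error;

	/* number and the six arguments mirror the McKernel syscall ABI;
	 * the proxy's return value is stored through the last argument */
	error = syscall_backward(ud, __NR_getpid,
				 0, 0, 0, 0, 0, 0, &proxy_ret);

	return error ? error : (long)proxy_ret;
}
#endif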