diff --git a/arch/x86/kernel/cpu.c b/arch/x86/kernel/cpu.c index 00ffd3dd..0a65e782 100644 --- a/arch/x86/kernel/cpu.c +++ b/arch/x86/kernel/cpu.c @@ -1054,9 +1054,8 @@ unhandled_page_fault(struct thread *thread, void *fault_addr, void *regs) unsigned long error = ((struct x86_user_context *)regs)->gpr.error; irqflags = kprintf_lock(); - dkprintf("[%d] Page fault for 0x%lX\n", - ihk_mc_get_processor_id(), address); - dkprintf("%s for %s access in %s mode (reserved bit %s set), " + __kprintf("Page fault for 0x%lx\n", address); + __kprintf("%s for %s access in %s mode (reserved bit %s set), " "it %s an instruction fetch\n", (error & PF_PROT ? "protection fault" : "no page found"), (error & PF_WRITE ? "write" : "read"), @@ -1068,14 +1067,14 @@ unhandled_page_fault(struct thread *thread, void *fault_addr, void *regs) list_for_each_entry(range, &vm->vm_range_list, list) { if (range->start <= address && range->end > address) { found = 1; - dkprintf("address is in range, flag: 0x%X! \n", + __kprintf("address is in range, flag: 0x%lx\n", range->flag); ihk_mc_pt_print_pte(vm->address_space->page_table, (void*)address); break; } } if (!found) { - dkprintf("address is out of range! \n"); + __kprintf("address is out of range! \n"); } kprintf_unlock(irqflags); diff --git a/arch/x86/kernel/include/arch-memory.h b/arch/x86/kernel/include/arch-memory.h index 2279720e..47ebbb40 100644 --- a/arch/x86/kernel/include/arch-memory.h +++ b/arch/x86/kernel/include/arch-memory.h @@ -318,5 +318,5 @@ extern unsigned long ap_trampoline; #define AP_TRAMPOLINE_SIZE 0x2000 /* Local is cachable */ -#define IHK_IKC_QUEUE_PT_ATTR (PTATTR_NO_EXECUTE | PTATTR_WRITABLE | PTATTR_UNCACHABLE) +#define IHK_IKC_QUEUE_PT_ATTR (PTATTR_NO_EXECUTE | PTATTR_WRITABLE) #endif diff --git a/arch/x86/kernel/memory.c b/arch/x86/kernel/memory.c index 79aac667..08c23cee 100644 --- a/arch/x86/kernel/memory.c +++ b/arch/x86/kernel/memory.c @@ -23,6 +23,7 @@ #include #include #include +#include #define dkprintf(...) do { if (0) kprintf(__VA_ARGS__); } while (0) #define ekprintf(...) 
kprintf(__VA_ARGS__) @@ -84,20 +85,22 @@ void ihk_mc_free_pages(void *p, int npages) pa_ops->free_page(p, npages); } -void *ihk_mc_allocate(int size, enum ihk_mc_ap_flag flag) +void *ihk_mc_allocate(int size, int flag) { - if (pa_ops && pa_ops->alloc) - return pa_ops->alloc(size, flag); - else - return ihk_mc_alloc_pages(1, flag); + if (!cpu_local_var(kmalloc_initialized)) { + kprintf("%s: error, kmalloc not yet initialized\n", __FUNCTION__); + return NULL; + } + return kmalloc(size, IHK_MC_AP_NOWAIT); } void ihk_mc_free(void *p) { - if (pa_ops && pa_ops->free) - return pa_ops->free(p); - else - return ihk_mc_free_pages(p, 1); + if (!cpu_local_var(kmalloc_initialized)) { + kprintf("%s: error, kmalloc not yet initialized\n", __FUNCTION__); + return; + } + kfree(p); } void *get_last_early_heap(void) @@ -1111,6 +1114,7 @@ static int clear_range_l1(void *args0, pte_t *ptep, uint64_t base, if (!(old & PFL1_FILEOFF) && args->free_physical) { if (page && page_unmap(page)) { ihk_mc_free_pages(phys_to_virt(phys), 1); + dkprintf("%s: freeing regular page at 0x%lx\n", __FUNCTION__, base); } args->vm->currss -= PTL1_SIZE; } @@ -1159,6 +1163,7 @@ static int clear_range_l2(void *args0, pte_t *ptep, uint64_t base, if (!(old & PFL2_FILEOFF) && args->free_physical) { if (page && page_unmap(page)) { ihk_mc_free_pages(phys_to_virt(phys), PTL2_SIZE/PTL1_SIZE); + dkprintf("%s: freeing large page at 0x%lx\n", __FUNCTION__, base); } args->vm->currss -= PTL2_SIZE; } @@ -2273,6 +2278,9 @@ int read_process_vm(struct process_vm *vm, void *kdst, const void *usrc, size_t reason = PF_USER; /* page not present */ for (addr = ustart & PAGE_MASK; addr < uend; addr += PAGE_SIZE) { + if (!addr) + return -EINVAL; + error = page_fault_process_vm(vm, (void *)addr, reason); if (error) { kprintf("%s: error: PF for %p failed\n", __FUNCTION__, addr); diff --git a/arch/x86/kernel/mikc.c b/arch/x86/kernel/mikc.c index cfde96da..eff3616d 100644 --- a/arch/x86/kernel/mikc.c +++ b/arch/x86/kernel/mikc.c @@ -38,7 +38,7 @@ int ihk_mc_ikc_init_first_local(struct ihk_ikc_channel_desc *channel, arch_master_channel_packet_handler = packet_handler; ihk_ikc_init_desc(channel, IKC_OS_HOST, 0, rq, wq, - ihk_ikc_master_channel_packet_handler); + ihk_ikc_master_channel_packet_handler, channel); ihk_ikc_enable_channel(channel); /* Set boot parameter */ diff --git a/arch/x86/kernel/perfctr.c b/arch/x86/kernel/perfctr.c index b2c2cd09..1b1ab99d 100644 --- a/arch/x86/kernel/perfctr.c +++ b/arch/x86/kernel/perfctr.c @@ -105,7 +105,7 @@ static int set_perfctr_x86_direct(int counter, int mode, unsigned int value) wrmsr(MSR_IA32_PERFEVTSEL0 + counter, value); //kprintf("wrmsr: %d <= %x\n", MSR_PERF_GLOBAL_CTRL, 0); - kprintf("wrmsr: %d <= %x\n", MSR_IA32_PERFEVTSEL0 + counter, value); + //kprintf("wrmsr: %d <= %x\n", MSR_IA32_PERFEVTSEL0 + counter, value); return 0; } diff --git a/arch/x86/kernel/syscall.c b/arch/x86/kernel/syscall.c index e84618e5..4f4b2c8f 100644 --- a/arch/x86/kernel/syscall.c +++ b/arch/x86/kernel/syscall.c @@ -293,7 +293,7 @@ SYSCALL_DECLARE(rt_sigreturn) extern struct cpu_local_var *clv; extern unsigned long do_kill(struct thread *thread, int pid, int tid, int sig, struct siginfo *info, int ptracecont); -extern void interrupt_syscall(int all, int pid); +extern void interrupt_syscall(int pid, int tid); extern int num_processors; #define RFLAGS_MASK (RFLAGS_CF | RFLAGS_PF | RFLAGS_AF | RFLAGS_ZF | \ @@ -1290,7 +1290,7 @@ done: cpu_restore_interrupt(irqstate); if (doint && !(mask & tthread->sigmask.__val[0])) { - int cpuid = 
tthread->cpu_id; + int tid = tthread->tid; int pid = tproc->pid; int status = tthread->status; @@ -1301,7 +1301,7 @@ done: } if(!tthread->proc->nohost) - interrupt_syscall(pid, cpuid); + interrupt_syscall(pid, tid); if (status != PS_RUNNING) { if(sig == SIGKILL){ @@ -1437,9 +1437,8 @@ SYSCALL_DECLARE(mmap) goto out; } - if ((addr < region->user_start) - || (region->user_end <= addr) - || ((region->user_end - addr) < len)) { + if ((flags & MAP_FIXED) && ((addr < region->user_start) + || (region->user_end <= addr))) { ekprintf("sys_mmap(%lx,%lx,%x,%x,%x,%lx):ENOMEM\n", addr0, len0, prot, flags0, fd, off0); error = -ENOMEM; diff --git a/arch/x86/tools/mcstop+release-smp-x86.sh.in b/arch/x86/tools/mcstop+release-smp-x86.sh.in index 7b727174..83a98cbd 100644 --- a/arch/x86/tools/mcstop+release-smp-x86.sh.in +++ b/arch/x86/tools/mcstop+release-smp-x86.sh.in @@ -21,10 +21,12 @@ cpus="" if [ "`lsmod | grep ihk_smp_x86`" == "" ]; then exit 0; fi # Destroy all LWK instances +if ls /dev/mcos* 1>/dev/null 2>&1; then for i in /dev/mcos*; do ind=`echo $i|cut -c10-`; if ! ${SBINDIR}/ihkconfig 0 destroy $ind; then echo "error: destroying LWK instance $ind failed" >&2; exit 1; fi done +fi # Query IHK-SMP resources and release them if ! ${SBINDIR}/ihkconfig 0 query cpu > /dev/null; then echo "error: querying cpus" >&2; exit 1; fi diff --git a/executer/include/uprotocol.h b/executer/include/uprotocol.h index 6d297bdf..6c94ad86 100644 --- a/executer/include/uprotocol.h +++ b/executer/include/uprotocol.h @@ -110,6 +110,13 @@ struct program_load_desc { }; struct syscall_request { + /* TID of requesting thread */ + int rtid; + /* + * TID of target thread. Remote page fault response needs to designate the + * thread that must serve the request, 0 indicates any thread from the pool + */ + int ttid; unsigned long valid; unsigned long number; unsigned long args[6]; @@ -128,8 +135,17 @@ struct syscall_load_desc { unsigned long size; }; +#define IHK_SCD_REQ_THREAD_SPINNING 0 +#define IHK_SCD_REQ_THREAD_TO_BE_WOKEN 1 +#define IHK_SCD_REQ_THREAD_DESCHEDULED 2 + struct syscall_response { + /* TID of the thread that requested the service */ + int ttid; + /* TID of the mcexec thread that is serving or has served the request */ + int stid; unsigned long status; + unsigned long req_thread_status; long ret; unsigned long fault_address; unsigned long fault_reason; diff --git a/executer/kernel/mcctrl/control.c b/executer/kernel/mcctrl/control.c index ba1aab7d..ca8fca9e 100644 --- a/executer/kernel/mcctrl/control.c +++ b/executer/kernel/mcctrl/control.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -81,7 +82,6 @@ static long mcexec_prepare_image(ihk_os_t os, void *args, *envs; long ret = 0; struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os); - unsigned long flags; struct mcctrl_per_proc_data *ppd = NULL; if (copy_from_user(&desc, udesc, @@ -124,52 +124,48 @@ static long mcexec_prepare_image(ihk_os_t os, } pdesc->args = (void*)virt_to_phys(args); - printk("args: 0x%lX\n", (unsigned long)pdesc->args); - printk("argc: %ld\n", *(long *)args); + dprintk("args: 0x%lX\n", (unsigned long)pdesc->args); + dprintk("argc: %ld\n", *(long *)args); pdesc->envs = (void*)virt_to_phys(envs); - printk("envs: 0x%lX\n", (unsigned long)pdesc->envs); - printk("envc: %ld\n", *(long *)envs); + dprintk("envs: 0x%lX\n", (unsigned long)pdesc->envs); + dprintk("envc: %ld\n", *(long *)envs); isp.msg = SCD_MSG_PREPARE_PROCESS; isp.ref = pdesc->cpu; isp.arg = virt_to_phys(pdesc); - printk("# of sections: %d\n", 
pdesc->num_sections); - printk("%p (%lx)\n", pdesc, isp.arg); + dprintk("# of sections: %d\n", pdesc->num_sections); + dprintk("%p (%lx)\n", pdesc, isp.arg); pdesc->status = 0; mcctrl_ikc_send(os, pdesc->cpu, &isp); - wait_event_interruptible(usrdata->wq_prepare, pdesc->status); + while (wait_event_interruptible(usrdata->wq_prepare, pdesc->status) != 0); if(pdesc->err < 0){ ret = pdesc->err; goto free_out; } - ppd = kmalloc(sizeof(*ppd), GFP_ATOMIC); + ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(current)); if (!ppd) { - printk("ERROR: allocating per process data\n"); - ret = -ENOMEM; + printk("ERROR: no per process data for PID %d\n", task_tgid_vnr(current)); + ret = -EINVAL; goto free_out; } - ppd->pid = pdesc->pid; + /* Update rpgtable */ ppd->rpgtable = pdesc->rpgtable; - - flags = ihk_ikc_spinlock_lock(&usrdata->per_proc_list_lock); - list_add_tail(&ppd->list, &usrdata->per_proc_list); - ihk_ikc_spinlock_unlock(&usrdata->per_proc_list_lock, flags); - dprintk("pid %d, rpgtable: 0x%lx added\n", - ppd->pid, ppd->rpgtable); - if (copy_to_user(udesc, pdesc, sizeof(struct program_load_desc) + sizeof(struct program_image_section) * desc.num_sections)) { ret = -EFAULT; goto free_out; } + dprintk("%s: pid %d, rpgtable: 0x%lx added\n", + __FUNCTION__, ppd->pid, ppd->rpgtable); + ret = 0; free_out: @@ -417,19 +413,200 @@ static long mcexec_get_cpu(ihk_os_t os) return info->n_cpus; } -int mcexec_syscall(struct mcctrl_channel *c, int pid, unsigned long arg) +int mcctrl_add_per_proc_data(struct mcctrl_usrdata *ud, int pid, + struct mcctrl_per_proc_data *ppd) +{ + struct mcctrl_per_proc_data *ppd_iter; + int hash = (pid & MCCTRL_PER_PROC_DATA_HASH_MASK); + int ret = 0; + unsigned long flags; + + /* Check if data for this thread exists and add if not */ + write_lock_irqsave(&ud->per_proc_data_hash_lock[hash], flags); + list_for_each_entry(ppd_iter, &ud->per_proc_data_hash[hash], hash) { + if (ppd_iter->pid == pid) { + ret = -EBUSY; + goto out; + } + } + + list_add_tail(&ppd->hash, &ud->per_proc_data_hash[hash]); + +out: + write_unlock_irqrestore(&ud->per_proc_data_hash_lock[hash], flags); + return ret; +} + +int mcctrl_delete_per_proc_data(struct mcctrl_usrdata *ud, int pid) +{ + struct mcctrl_per_proc_data *ppd_iter, *ppd = NULL; + int hash = (pid & MCCTRL_PER_PROC_DATA_HASH_MASK); + int ret = 0; + unsigned long flags; + + write_lock_irqsave(&ud->per_proc_data_hash_lock[hash], flags); + list_for_each_entry(ppd_iter, &ud->per_proc_data_hash[hash], hash) { + if (ppd_iter->pid == pid) { + ppd = ppd_iter; + break; + } + } + + if (!ppd) { + ret = -EINVAL; + goto out; + } + + list_del(&ppd->hash); + +out: + write_unlock_irqrestore(&ud->per_proc_data_hash_lock[hash], flags); + return ret; +} + +inline struct mcctrl_per_proc_data *mcctrl_get_per_proc_data( + struct mcctrl_usrdata *ud, int pid) +{ + struct mcctrl_per_proc_data *ppd_iter, *ppd = NULL; + int hash = (pid & MCCTRL_PER_PROC_DATA_HASH_MASK); + unsigned long flags; + + /* Check if data for this process exists and return it */ + read_lock_irqsave(&ud->per_proc_data_hash_lock[hash], flags); + + list_for_each_entry(ppd_iter, &ud->per_proc_data_hash[hash], hash) { + if (ppd_iter->pid == pid) { + ppd = ppd_iter; + break; + } + } + + read_unlock_irqrestore(&ud->per_proc_data_hash_lock[hash], flags); + return ppd; +} + +/* + * Called indirectly from the IKC message handler. 
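 *
 * The three helpers above replace the old per_proc_list with a PID-keyed
 * hash table (MCCTRL_PER_PROC_DATA_HASH_SIZE buckets, see mcctrl.h). A
 * minimal sketch of the intended lookup-or-create pattern, using only the
 * helpers introduced by this patch (error handling abbreviated):
 *
 *	struct mcctrl_per_proc_data *ppd;
 *
 *	ppd = mcctrl_get_per_proc_data(ud, pid);
 *	if (!ppd) {
 *		ppd = kzalloc(sizeof(*ppd), GFP_KERNEL);
 *		if (!ppd)
 *			return -ENOMEM;
 *		ppd->pid = pid;
 *		if (mcctrl_add_per_proc_data(ud, pid, ppd)) {
 *			kfree(ppd);	// -EBUSY: an entry already exists
 *			ppd = mcctrl_get_per_proc_data(ud, pid);
 *		}
 *	}
 *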
+ */ +int mcexec_syscall(struct mcctrl_usrdata *ud, struct ikc_scd_packet *packet) { struct wait_queue_head_list_node *wqhln = NULL; struct wait_queue_head_list_node *wqhln_iter; + struct wait_queue_head_list_node *wqhln_alloc = NULL; + int pid = packet->pid; unsigned long flags; + struct mcctrl_per_proc_data *ppd; + + /* Look up per-process structure */ + ppd = mcctrl_get_per_proc_data(ud, pid); + + if (unlikely(!ppd)) { + kprintf("%s: ERROR: no per-process structure for PID %d??\n", + __FUNCTION__, task_tgid_vnr(current)); + return 0; + } + + dprintk("%s: (packet_handler) rtid: %d, ttid: %d, sys nr: %lu\n", + __FUNCTION__, + packet->req.rtid, + packet->req.ttid, + packet->req.number); + /* + * Three scenarios are possible: + * - Find the designated thread if req->ttid is specified. + * - Find any available thread if req->ttid is zero. + * - Add a request element if no threads are available. + */ + flags = ihk_ikc_spinlock_lock(&ppd->wq_list_lock); + + /* Is this a request for a specific thread? See if it's waiting */ + if (unlikely(packet->req.ttid)) { + list_for_each_entry(wqhln_iter, &ppd->wq_list_exact, list) { + if (packet->req.ttid != task_pid_vnr(wqhln_iter->task)) + continue; - /* Look up per-process wait queue head with pid */ - flags = ihk_ikc_spinlock_lock(&c->wq_list_lock); - list_for_each_entry(wqhln_iter, &c->wq_list, list) { - if (wqhln_iter->pid == pid) { wqhln = wqhln_iter; break; } + if (!wqhln) { + printk("%s: WARNING: no target thread found for exact request??\n", + __FUNCTION__); + } + } + /* Is there any thread available? */ + else { + list_for_each_entry(wqhln_iter, &ppd->wq_list, list) { + if (wqhln_iter->task && !wqhln_iter->req) { + wqhln = wqhln_iter; + break; + } + } + } + + /* If no match found, add request to pending request list */ + if (unlikely(!wqhln)) { +retry_alloc: + wqhln_alloc = kmalloc(sizeof(*wqhln), GFP_ATOMIC); + if (!wqhln_alloc) { + printk("WARNING: couldn't alloc wait queue head, retrying..\n"); + goto retry_alloc; + } + + wqhln = wqhln_alloc; + wqhln->req = 0; + wqhln->task = NULL; + init_waitqueue_head(&wqhln->wq_syscall); + list_add_tail(&wqhln->list, &ppd->wq_req_list); + } + + wqhln->packet = packet; + wqhln->req = 1; + wake_up(&wqhln->wq_syscall); + ihk_ikc_spinlock_unlock(&ppd->wq_list_lock, flags); + + return 0; +} + +/* + * Called from an mcexec thread via ioctl().
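 *
 * This pairs with mcexec_syscall() above as a rendezvous: whichever side
 * arrives first parks itself, the packet on ppd->wq_req_list, the thread
 * on ppd->wq_list (or ppd->wq_list_exact for targeted wake-ups). A hedged
 * sketch of the consumer-side handoff; find_parked_request() and
 * alloc_wqhln() are hypothetical helpers standing in for the inline code
 * below, and cleanup paths are abbreviated:
 *
 *	irqflags = ihk_ikc_spinlock_lock(&ppd->wq_list_lock);
 *	wqhln = find_parked_request(ppd);	// scan wq_req_list for an
 *						// unclaimed entry (task == NULL)
 *	if (wqhln) {
 *		wqhln->task = current;		// claim the parked request
 *		list_del(&wqhln->list);
 *		ihk_ikc_spinlock_unlock(&ppd->wq_list_lock, irqflags);
 *	} else {
 *		wqhln = alloc_wqhln(current);	// task = current, req = 0
 *		list_add(&wqhln->list, &ppd->wq_list);
 *		ihk_ikc_spinlock_unlock(&ppd->wq_list_lock, irqflags);
 *		wait_event_interruptible(wqhln->wq_syscall, wqhln->req);
 *	}
 *	packet = wqhln->packet;			// filled in by mcexec_syscall()
 *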
+ */ +int mcexec_wait_syscall(ihk_os_t os, struct syscall_wait_desc *__user req) +{ + struct ikc_scd_packet *packet; + struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os); + struct wait_queue_head_list_node *wqhln = NULL; + struct wait_queue_head_list_node *wqhln_iter; + int ret = 0; + unsigned long irqflags; + struct mcctrl_per_proc_data *ppd; + + /* Look up per-process structure */ + ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(current)); + + if (unlikely(!ppd)) { + kprintf("%s: ERROR: no per-process structure for PID %d??\n", + __FUNCTION__, task_tgid_vnr(current)); + return -EINVAL; + } + + packet = (struct ikc_scd_packet *)mcctrl_get_per_thread_data(ppd, current); + if (packet) { + printk("%s: ERROR: packet %p is already registered for thread %d\n", + __FUNCTION__, packet, task_pid_vnr(current)); + return -EBUSY; + } + +retry: + /* Prepare per-thread wait queue head or find a valid request */ + irqflags = ihk_ikc_spinlock_lock(&ppd->wq_list_lock); + /* First see if there is a valid request already that is not yet taken */ + list_for_each_entry(wqhln_iter, &ppd->wq_req_list, list) { + if (wqhln_iter->task == NULL && wqhln_iter->req) { + wqhln = wqhln_iter; + wqhln->task = current; + list_del(&wqhln->list); + break; + } } if (!wqhln) { @@ -440,180 +617,86 @@ retry_alloc: goto retry_alloc; } - wqhln->pid = pid; + wqhln->task = current; wqhln->req = 0; init_waitqueue_head(&wqhln->wq_syscall); - list_add_tail(&wqhln->list, &c->wq_list); + + /* Wait for a request.. */ + list_add(&wqhln->list, &ppd->wq_list); + ihk_ikc_spinlock_unlock(&ppd->wq_list_lock, irqflags); + + ret = wait_event_interruptible(wqhln->wq_syscall, wqhln->req); + + /* Remove per-thread wait queue head */ + irqflags = ihk_ikc_spinlock_lock(&ppd->wq_list_lock); + list_del(&wqhln->list); } + ihk_ikc_spinlock_unlock(&ppd->wq_list_lock, irqflags); - wqhln->req = 1; - wake_up(&wqhln->wq_syscall); - ihk_ikc_spinlock_unlock(&c->wq_list_lock, flags); - - return 0; -} - -#ifndef DO_USER_MODE -// static int remaining_job, base_cpu, job_pos; -#endif - -// extern int num_channels; -// extern int mcctrl_dma_abort; - -int mcexec_wait_syscall(ihk_os_t os, struct syscall_wait_desc *__user req) -{ - struct syscall_wait_desc swd; - struct mcctrl_channel *c; - struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os); - struct wait_queue_head_list_node *wqhln; - struct wait_queue_head_list_node *wqhln_iter; - int ret = 0; - unsigned long irqflags; -#ifndef DO_USER_MODE - unsigned long s, w, d; -#endif - -//printk("mcexec_wait_syscall swd=%p req=%p size=%d\n", &swd, req, sizeof(swd.cpu)); - if (copy_from_user(&swd, req, sizeof(swd))) { - return -EFAULT; - } - - if (swd.cpu >= usrdata->num_channels) - return -EINVAL; - - c = get_peer_channel(usrdata, current); - if (c) { - printk("mcexec_wait_syscall:already registered. 
task %p ch %p\n", - current, c); - return -EBUSY; - } - c = usrdata->channels + swd.cpu; - -#ifdef DO_USER_MODE -retry: - /* Prepare per-process wait queue head */ -retry_alloc: - wqhln = kmalloc(sizeof(*wqhln), GFP_KERNEL); - if (!wqhln) { - printk("WARNING: coudln't alloc wait queue head, retrying..\n"); - goto retry_alloc; - } - - wqhln->pid = swd.pid; - wqhln->req = 0; - init_waitqueue_head(&wqhln->wq_syscall); - - irqflags = ihk_ikc_spinlock_lock(&c->wq_list_lock); - /* First see if there is one wait queue already */ - list_for_each_entry(wqhln_iter, &c->wq_list, list) { - if (wqhln_iter->pid == task_tgid_vnr(current)) { - kfree(wqhln); - wqhln = wqhln_iter; - list_del(&wqhln->list); - break; - } - } - list_add_tail(&wqhln->list, &c->wq_list); - ihk_ikc_spinlock_unlock(&c->wq_list_lock, irqflags); - - ret = wait_event_interruptible(wqhln->wq_syscall, wqhln->req); - - - /* Remove per-process wait queue head */ - irqflags = ihk_ikc_spinlock_lock(&c->wq_list_lock); - list_del(&wqhln->list); - ihk_ikc_spinlock_unlock(&c->wq_list_lock, irqflags); if (ret && !wqhln->req) { kfree(wqhln); + wqhln = NULL; return -EINTR; } + + packet = wqhln->packet; kfree(wqhln); + wqhln = NULL; - if (c->param.request_va->number == 61 && - c->param.request_va->args[0] == swd.pid) { + dprintk("%s: tid: %d request from CPU %d\n", + __FUNCTION__, task_pid_vnr(current), packet->ref); - dprintk("pid: %d, tid: %d: SC %d, swd.cpu: %d, WARNING: wait4() for self?\n", - task_tgid_vnr(current), - task_pid_vnr(current); - c->param.request_va->number, - swd.cpu); - - return -EINTR; - } - -#if 1 mb(); - if (!c->param.request_va->valid) { -printk("mcexec_wait_syscall:stray wakeup\n"); + if (!packet->req.valid) { + printk("%s: ERROR: stray wakeup pid: %d, tid: %d: SC %lu\n", + __FUNCTION__, + task_tgid_vnr(current), + task_pid_vnr(current), + packet->req.number); + ihk_ikc_release_packet((struct ihk_ikc_free_packet *)packet, + (usrdata->channels + packet->ref)->c); goto retry; } -#endif -#else - while (1) { - c = usrdata->channels + swd.cpu; - ihk_get_tsc(s); - if (!usrdata->remaining_job) { - while (!(*c->param.doorbell_va)) { - mb(); - cpu_relax(); - ihk_get_tsc(w); - if (w > s + 1024UL * 1024 * 1024 * 10) { - return -EINTR; - } - } - d = (*c->param.doorbell_va) - 1; - *c->param.doorbell_va = 0; - if (d < 0 || d >= usrdata->num_channels) { - d = 0; - } - usrdata->base_cpu = d; - usrdata->job_pos = 0; - usrdata->remaining_job = 1; - } else { - usrdata->job_pos++; - } - - for (; usrdata->job_pos < usrdata->num_channels; usrdata->job_pos++) { - if (base_cpu + job_pos >= num_channels) { - c = usrdata->channels + - (usrdata->base_cpu + usrdata->job_pos - usrdata->num_channels); - } else { - c = usrdata->channels + usrdata->base_cpu + usrdata->job_pos; - } - if (!c) { - continue; - } - if (c->param.request_va && - c->param.request_va->valid) { -#endif - c->param.request_va->valid = 0; /* ack */ - dprintk("SC #%lx, %lx\n", - c->param.request_va->number, - c->param.request_va->args[0]); - register_peer_channel(usrdata, current, c); - if (__do_in_kernel_syscall(os, c, c->param.request_va)) { - if (copy_to_user(&req->sr, c->param.request_va, - sizeof(struct syscall_request))) { - deregister_peer_channel(usrdata, current, c); - return -EFAULT; - } - return 0; - } - deregister_peer_channel(usrdata, current, c); -#ifdef DO_USER_MODE - goto retry; -#endif -#ifndef DO_USER_MODE - if (usrdata->mcctrl_dma_abort) { - return -2; - } - } - } - usrdata->remaining_job = 0; + packet->req.valid = 0; /* ack */ + dprintk("%s: system call: %d, 
args[0]: %lu, args[1]: %lu, args[2]: %lu, " + "args[3]: %lu, args[4]: %lu, args[5]: %lu\n", + __FUNCTION__, + packet->req.number, + packet->req.args[0], + packet->req.args[1], + packet->req.args[2], + packet->req.args[3], + packet->req.args[4], + packet->req.args[5]); + + if (mcctrl_add_per_thread_data(ppd, current, packet) < 0) { + kprintf("%s: error adding per-thread data\n", __FUNCTION__); + return -EINVAL; } -#endif - return 0; + + if (__do_in_kernel_syscall(os, packet)) { + if (copy_to_user(&req->sr, &packet->req, + sizeof(struct syscall_request))) { + + if (mcctrl_delete_per_thread_data(ppd, current) < 0) { + kprintf("%s: error deleting per-thread data\n", __FUNCTION__); + return -EINVAL; + } + return -EFAULT; + } + return 0; + } + + ihk_ikc_release_packet((struct ihk_ikc_free_packet *)packet, + (usrdata->channels + packet->ref)->c); + + if (mcctrl_delete_per_thread_data(ppd, current) < 0) { + kprintf("%s: error deleting per-thread data\n", __FUNCTION__); + return -EINVAL; + } + + goto retry; } long mcexec_pin_region(ihk_os_t os, unsigned long *__user arg) @@ -696,33 +779,6 @@ long mcexec_load_syscall(ihk_os_t os, struct syscall_load_desc *__user arg) #endif ihk_device_unmap_memory(ihk_os_to_dev(os), phys, desc.size); - -/* - ihk_dma_channel_t channel; - struct ihk_dma_request request; - unsigned long dma_status = 0; - - channel = ihk_device_get_dma_channel(ihk_os_to_dev(os), 0); - if (!channel) { - return -EINVAL; - } - - memset(&request, 0, sizeof(request)); - request.src_os = os; - request.src_phys = desc.src; - request.dest_os = NULL; - request.dest_phys = desc.dest; - request.size = desc.size; - request.notify = (void *)virt_to_phys(&dma_status); - request.priv = (void *)1; - - ihk_dma_request(channel, &request); - - while (!dma_status) { - mb(); - udelay(1); - } -*/ return 0; } @@ -730,74 +786,60 @@ long mcexec_load_syscall(ihk_os_t os, struct syscall_load_desc *__user arg) long mcexec_ret_syscall(ihk_os_t os, struct syscall_ret_desc *__user arg) { struct syscall_ret_desc ret; - struct mcctrl_channel *mc; + struct ikc_scd_packet *packet; struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os); -#if 0 - ihk_dma_channel_t channel; - struct ihk_dma_request request; - - channel = ihk_device_get_dma_channel(ihk_os_to_dev(os), 0); - if (!channel) { - return -EINVAL; - } -#endif + struct mcctrl_per_proc_data *ppd; if (copy_from_user(&ret, arg, sizeof(struct syscall_ret_desc))) { return -EFAULT; } - mc = usrdata->channels + ret.cpu; - if (!mc) { + + /* Look up per-process structure */ + ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(current)); + if (!ppd) { + kprintf("%s: ERROR: no per-process structure for PID %d??\n", + __FUNCTION__, task_tgid_vnr(current)); return -EINVAL; } - deregister_peer_channel(usrdata, current, mc); - mc->param.response_va->ret = ret.ret; + packet = (struct ikc_scd_packet *)mcctrl_get_per_thread_data(ppd, current); + if (!packet) { + kprintf("%s: ERROR: no packet registered for TID %d\n", + __FUNCTION__, task_pid_vnr(current)); + return -EINVAL; + } + + mcctrl_delete_per_thread_data(ppd, current); if (ret.size > 0) { /* Host => Accel. Write is fast. 
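 *
 * The copy below follows the usual IHK mapping discipline: map the LWK
 * physical range into the host, obtain a kernel virtual mapping, copy,
 * then undo both mappings in reverse order. A standalone sketch of that
 * pattern (dev, remote_pa, size and src are illustrative placeholders,
 * not names from this patch):
 *
 *	unsigned long phys = ihk_device_map_memory(dev, remote_pa, size);
 *	void *rpm = ihk_device_map_virtual(dev, phys, size, NULL, 0);
 *	long err = copy_from_user(rpm, src, size) ? -EFAULT : 0;
 *	ihk_device_unmap_virtual(dev, rpm, size);
 *	ihk_device_unmap_memory(dev, phys, size);
 *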
*/ unsigned long phys; void *rpm; - phys = ihk_device_map_memory(ihk_os_to_dev(os), ret.dest, - ret.size); + phys = ihk_device_map_memory(ihk_os_to_dev(os), ret.dest, ret.size); #ifdef CONFIG_MIC rpm = ioremap_wc(phys, ret.size); #else rpm = ihk_device_map_virtual(ihk_os_to_dev(os), phys, ret.size, NULL, 0); #endif - if (copy_from_user(rpm, (void *__user)ret.src, ret.size)) { return -EFAULT; } - mb(); - mc->param.response_va->status = 1; - #ifdef CONFIG_MIC iounmap(rpm); #else ihk_device_unmap_virtual(ihk_os_to_dev(os), rpm, ret.size); #endif ihk_device_unmap_memory(ihk_os_to_dev(os), phys, ret.size); + } -/* - memset(&request, 0, sizeof(request)); - request.src_os = NULL; - request.src_phys = ret.src; - request.dest_os = os; - request.dest_phys = ret.dest; - request.size = ret.size; - request.notify_os = os; - request.notify = (void *)mc->param.response_rpa; - request.priv = (void *)1; - - ihk_dma_request(channel, &request); -*/ - } else { - mb(); - mc->param.response_va->status = 1; - } + __return_syscall(os, packet, ret.ret, task_pid_vnr(current)); + + /* Free packet */ + ihk_ikc_release_packet((struct ihk_ikc_free_packet *)packet, + (usrdata->channels + packet->ref)->c); return 0; } @@ -862,14 +904,53 @@ int mcexec_open_exec(ihk_os_t os, char * __user filename) int retval; int os_ind = ihk_host_os_get_index(os); char *pathbuf, *fullpath; + struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os); + struct mcctrl_per_proc_data *ppd = NULL; + int i; if (os_ind < 0) { return EINVAL; } + ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(current)); + + if (!ppd) { + ppd = kmalloc(sizeof(*ppd), GFP_KERNEL); + if (!ppd) { + printk("ERROR: allocating per process data\n"); + return -ENOMEM; + } + + ppd->pid = task_tgid_vnr(current); + /* + * XXX: rpgtable will be updated in __do_in_kernel_syscall() + * under case __NR_munmap + */ + INIT_LIST_HEAD(&ppd->wq_list); + INIT_LIST_HEAD(&ppd->wq_req_list); + INIT_LIST_HEAD(&ppd->wq_list_exact); + spin_lock_init(&ppd->wq_list_lock); + + for (i = 0; i < MCCTRL_PER_THREAD_DATA_HASH_SIZE; ++i) { + INIT_LIST_HEAD(&ppd->per_thread_data_hash[i]); + rwlock_init(&ppd->per_thread_data_hash_lock[i]); + } + + if (mcctrl_add_per_proc_data(usrdata, ppd->pid, ppd) < 0) { + printk("%s: error adding per process data\n", __FUNCTION__); + retval = EINVAL; + goto out_free_ppd; + } + } + else { + /* Only deallocate in case of an error if we added it above */ + ppd = NULL; + } + pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY); if (!pathbuf) { - return ENOMEM; + retval = ENOMEM; + goto out_error_drop_ppd; } file = open_exec(filename); @@ -901,7 +982,7 @@ int mcexec_open_exec(ihk_os_t os, char * __user filename) break; } } - + /* Add new exec file to the list */ mcef->os = os; mcef->pid = task_tgid_vnr(current); @@ -918,12 +999,15 @@ int mcexec_open_exec(ihk_os_t os, char * __user filename) kfree(pathbuf); return 0; - + out_put_file: fput(file); - out_error_free: kfree(pathbuf); +out_error_drop_ppd: + if (ppd) mcctrl_delete_per_proc_data(usrdata, ppd->pid); +out_free_ppd: + if (ppd) kfree(ppd); return -retval; } @@ -933,6 +1017,23 @@ int mcexec_close_exec(ihk_os_t os) struct mckernel_exec_file *mcef = NULL; int found = 0; int os_ind = ihk_host_os_get_index(os); + struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os); + struct mcctrl_per_proc_data *ppd = NULL; + + ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(current)); + + if (ppd) { + mcctrl_delete_per_proc_data(usrdata, ppd->pid); + + dprintk("pid: %d, tid: %d: rpgtable for %d (0x%lx) removed\n", + 
task_tgid_vnr(current), current->pid, ppd->pid, ppd->rpgtable); + + kfree(ppd); + } + else { + printk("WARNING: no per-process data for PID %d?\n", + task_tgid_vnr(current)); + } if (os_ind < 0) { return EINVAL; diff --git a/executer/kernel/mcctrl/ikc.c b/executer/kernel/mcctrl/ikc.c index d5493db0..a05f45b8 100644 --- a/executer/kernel/mcctrl/ikc.c +++ b/executer/kernel/mcctrl/ikc.c @@ -27,6 +27,7 @@ #include #include #include +#include #include "mcctrl.h" #ifdef ATTACHED_MIC #include #endif @@ -40,16 +41,18 @@ void mcexec_prepare_ack(ihk_os_t os, unsigned long arg, int err); static void mcctrl_ikc_init(ihk_os_t os, int cpu, unsigned long rphys, struct ihk_ikc_channel_desc *c); -int mcexec_syscall(struct mcctrl_channel *c, int pid, unsigned long arg); +int mcexec_syscall(struct mcctrl_usrdata *ud, struct ikc_scd_packet *packet); void sig_done(unsigned long arg, int err); +/* XXX: this runs in atomic context! */ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c, void *__packet, void *__os) { struct ikc_scd_packet *pisp = __packet; struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(__os); + int msg = pisp->msg; - switch (pisp->msg) { + switch (msg) { case SCD_MSG_INIT_CHANNEL: mcctrl_ikc_init(__os, pisp->ref, pisp->arg, c); break; @@ -63,7 +66,7 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c, break; case SCD_MSG_SYSCALL_ONESIDE: - mcexec_syscall(usrdata->channels + pisp->ref, pisp->pid, pisp->arg); + mcexec_syscall(usrdata, pisp); break; case SCD_MSG_PROCFS_ANSWER: @@ -88,11 +91,8 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c, break; case SCD_MSG_PROCFS_TID_CREATE: - add_tid_entry(ihk_host_os_get_index(__os), pisp->pid, pisp->arg); - break; - case SCD_MSG_PROCFS_TID_DELETE: - delete_tid_entry(ihk_host_os_get_index(__os), pisp->pid, pisp->arg); + procfsm_packet_handler(__os, pisp->msg, pisp->pid, pisp->arg); break; case SCD_MSG_GET_VDSO_INFO: @@ -110,6 +110,14 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c, pisp->err, pisp->arg); break; } + + /* + * SCD_MSG_SYSCALL_ONESIDE holds on to the packet and frees it in + * mcexec_ret_syscall(); for the rest, free it here. + */ + if (msg != SCD_MSG_SYSCALL_ONESIDE) { + ihk_ikc_release_packet((struct ihk_ikc_free_packet *)__packet, c); + } return 0; } @@ -146,8 +154,6 @@ int mcctrl_ikc_set_recv_cpu(ihk_os_t os, int cpu) ihk_ikc_channel_set_cpu(usrdata->channels[cpu].c, ihk_ikc_get_processor_id()); - kprintf("Setting the target to %d\n", - ihk_ikc_get_processor_id()); return 0; } @@ -193,12 +199,13 @@ static void mcctrl_ikc_init(ihk_os_t os, int cpu, unsigned long rphys, struct ih #endif pmc->param.request_va = - (void *)__get_free_pages(GFP_KERNEL, + (void *)__get_free_pages(in_interrupt() ? GFP_ATOMIC : GFP_KERNEL, REQUEST_SHIFT - PAGE_SHIFT); pmc->param.request_pa = virt_to_phys(pmc->param.request_va); pmc->param.doorbell_va = usrdata->mcctrl_doorbell_va; pmc->param.doorbell_pa = usrdata->mcctrl_doorbell_pa; - pmc->param.post_va = (void *)__get_free_page(GFP_KERNEL); + pmc->param.post_va = (void *)__get_free_page(in_interrupt() ? + GFP_ATOMIC : GFP_KERNEL); pmc->param.post_pa = virt_to_phys(pmc->param.post_va); memset(pmc->param.doorbell_va, 0, PAGE_SIZE); memset(pmc->param.request_va, 0, PAGE_SIZE); @@ -218,8 +225,9 @@ static void mcctrl_ikc_init(ihk_os_t os, int cpu, unsigned long rphys, struct ih PAGE_SIZE, NULL, 0); #endif - pmc->dma_buf = (void *)__get_free_pages(GFP_KERNEL, - DMA_PIN_SHIFT - PAGE_SHIFT); + pmc->dma_buf = (void *)__get_free_pages(in_interrupt() ?
+ GFP_ATOMIC : GFP_KERNEL, + DMA_PIN_SHIFT - PAGE_SHIFT); rpm->request_page = pmc->param.request_pa; rpm->doorbell_page = pmc->param.doorbell_pa; @@ -265,9 +273,6 @@ static int connect_handler(struct ihk_ikc_channel_info *param) } param->packet_handler = syscall_packet_handler; - INIT_LIST_HEAD(&usrdata->channels[cpu].wq_list); - spin_lock_init(&usrdata->channels[cpu].wq_list_lock); - usrdata->channels[cpu].c = c; kprintf("syscall: MC CPU %d connected. c=%p\n", cpu, c); @@ -286,9 +291,6 @@ static int connect_handler2(struct ihk_ikc_channel_info *param) param->packet_handler = syscall_packet_handler; - INIT_LIST_HEAD(&usrdata->channels[cpu].wq_list); - spin_lock_init(&usrdata->channels[cpu].wq_list_lock); - usrdata->channels[cpu].c = c; kprintf("syscall: MC CPU %d connected. c=%p\n", cpu, c); @@ -315,7 +317,7 @@ int prepare_ikc_channels(ihk_os_t os) { struct ihk_cpu_info *info; struct mcctrl_usrdata *usrdata; - int error; + int i; usrdata = kzalloc(sizeof(struct mcctrl_usrdata), GFP_KERNEL); usrdata->mcctrl_doorbell_va = (void *)__get_free_page(GFP_KERNEL); @@ -347,17 +349,14 @@ int prepare_ikc_channels(ihk_os_t os) memcpy(&usrdata->listen_param2, &listen_param2, sizeof listen_param2); ihk_ikc_listen_port(os, &usrdata->listen_param2); - INIT_LIST_HEAD(&usrdata->per_proc_list); - spin_lock_init(&usrdata->per_proc_list_lock); + for (i = 0; i < MCCTRL_PER_PROC_DATA_HASH_SIZE; ++i) { + INIT_LIST_HEAD(&usrdata->per_proc_data_hash[i]); + rwlock_init(&usrdata->per_proc_data_hash_lock[i]); + } INIT_LIST_HEAD(&usrdata->cpu_topology_list); INIT_LIST_HEAD(&usrdata->node_topology_list); - error = init_peer_channel_registry(usrdata); - if (error) { - return error; - } - return 0; } @@ -396,7 +395,6 @@ void destroy_ikc_channels(ihk_os_t os) } free_page((unsigned long)usrdata->mcctrl_doorbell_va); - destroy_peer_channel_registry(usrdata); kfree(usrdata->channels); kfree(usrdata); } diff --git a/executer/kernel/mcctrl/mcctrl.h b/executer/kernel/mcctrl/mcctrl.h index cbf483f9..9273597f 100644 --- a/executer/kernel/mcctrl/mcctrl.h +++ b/executer/kernel/mcctrl/mcctrl.h @@ -41,6 +41,7 @@ #include #include #include +#include #include #include "sysfs.h" @@ -48,6 +49,7 @@ #define SCD_MSG_PREPARE_PROCESS_ACKED 0x2 #define SCD_MSG_PREPARE_PROCESS_NACKED 0x7 #define SCD_MSG_SCHEDULE_PROCESS 0x3 +#define SCD_MSG_WAKE_UP_SYSCALL_THREAD 0x14 #define SCD_MSG_INIT_CHANNEL 0x5 #define SCD_MSG_INIT_CHANNEL_ACKED 0x6 @@ -110,8 +112,9 @@ struct ikc_scd_packet { int ref; int osnum; int pid; - int padding; unsigned long arg; + struct syscall_request req; + unsigned long resp_pa; }; /* for SCD_MSG_SYSFS_* */ @@ -120,7 +123,13 @@ struct ikc_scd_packet { long sysfs_arg2; long sysfs_arg3; }; + + /* SCD_MSG_SCHEDULE_THREAD */ + struct { + int ttid; + }; }; + char padding[12]; }; struct mcctrl_priv { @@ -154,8 +163,11 @@ struct syscall_params { struct wait_queue_head_list_node { struct list_head list; wait_queue_head_t wq_syscall; - int pid; + struct task_struct *task; + /* Denotes an exclusive wait for requester TID rtid */ + int rtid; int req; + struct ikc_scd_packet *packet; }; struct mcctrl_channel { @@ -163,15 +175,30 @@ struct mcctrl_channel { struct syscall_params param; struct ikc_scd_init_param init; void *dma_buf; - - struct list_head wq_list; - ihk_spinlock_t wq_list_lock; }; +struct mcctrl_per_thread_data { + struct list_head hash; + struct task_struct *task; + void *data; +}; + +#define MCCTRL_PER_THREAD_DATA_HASH_SHIFT 8 +#define MCCTRL_PER_THREAD_DATA_HASH_SIZE (1 << MCCTRL_PER_THREAD_DATA_HASH_SHIFT) +#define 
MCCTRL_PER_THREAD_DATA_HASH_MASK (MCCTRL_PER_THREAD_DATA_HASH_SIZE - 1) + struct mcctrl_per_proc_data { - struct list_head list; + struct list_head hash; int pid; unsigned long rpgtable; /* per process, not per OS */ + + struct list_head wq_list; + struct list_head wq_req_list; + struct list_head wq_list_exact; + ihk_spinlock_t wq_list_lock; + + struct list_head per_thread_data_hash[MCCTRL_PER_THREAD_DATA_HASH_SIZE]; + rwlock_t per_thread_data_hash_lock[MCCTRL_PER_THREAD_DATA_HASH_SIZE]; }; struct sysfsm_req { @@ -230,6 +257,10 @@ struct node_topology { #define CPU_LONGS (((NR_CPUS) + (BITS_PER_LONG) - 1) / (BITS_PER_LONG)) +#define MCCTRL_PER_PROC_DATA_HASH_SHIFT 7 +#define MCCTRL_PER_PROC_DATA_HASH_SIZE (1 << MCCTRL_PER_PROC_DATA_HASH_SHIFT) +#define MCCTRL_PER_PROC_DATA_HASH_MASK (MCCTRL_PER_PROC_DATA_HASH_SIZE - 1) + struct mcctrl_usrdata { struct ihk_ikc_listen_param listen_param; struct ihk_ikc_listen_param listen_param2; @@ -245,8 +276,9 @@ struct mcctrl_usrdata { unsigned long last_thread_exec; wait_queue_head_t wq_prepare; - struct list_head per_proc_list; - ihk_spinlock_t per_proc_list_lock; + struct list_head per_proc_data_hash[MCCTRL_PER_PROC_DATA_HASH_SIZE]; + rwlock_t per_proc_data_hash_lock[MCCTRL_PER_PROC_DATA_HASH_SIZE]; + void **keys; struct sysfsm_data sysfsm_data; unsigned long cpu_online[CPU_LONGS]; @@ -273,12 +305,22 @@ int mcctrl_ikc_is_valid_thread(ihk_os_t os, int cpu); ihk_os_t osnum_to_os(int n); /* syscall.c */ -int init_peer_channel_registry(struct mcctrl_usrdata *ud); -void destroy_peer_channel_registry(struct mcctrl_usrdata *ud); -int register_peer_channel(struct mcctrl_usrdata *ud, void *key, struct mcctrl_channel *ch); -int deregister_peer_channel(struct mcctrl_usrdata *ud, void *key, struct mcctrl_channel *ch); -struct mcctrl_channel *get_peer_channel(struct mcctrl_usrdata *ud, void *key); -int __do_in_kernel_syscall(ihk_os_t os, struct mcctrl_channel *c, struct syscall_request *sc); +int __do_in_kernel_syscall(ihk_os_t os, struct ikc_scd_packet *packet); +int mcctrl_add_per_proc_data(struct mcctrl_usrdata *ud, int pid, + struct mcctrl_per_proc_data *ppd); +int mcctrl_delete_per_proc_data(struct mcctrl_usrdata *ud, int pid); +inline struct mcctrl_per_proc_data *mcctrl_get_per_proc_data( + struct mcctrl_usrdata *ud, int pid); + +int mcctrl_add_per_thread_data(struct mcctrl_per_proc_data* ppd, + struct task_struct *task, void *data); +int mcctrl_delete_per_thread_data(struct mcctrl_per_proc_data* ppd, + struct task_struct *task); +inline struct mcctrl_per_thread_data *mcctrl_get_per_thread_data( + struct mcctrl_per_proc_data *ppd, struct task_struct *task); + +void __return_syscall(ihk_os_t os, struct ikc_scd_packet *packet, + long ret, int stid); #define PROCFS_NAME_MAX 1000 @@ -301,6 +343,7 @@ struct procfs_file { }; void procfs_answer(unsigned int arg, int err); +int procfsm_packet_handler(void *os, int msg, int pid, unsigned long arg); void add_tid_entry(int osnum, int pid, int tid); void add_pid_entry(int osnum, int pid); void delete_tid_entry(int osnum, int pid, int tid); diff --git a/executer/kernel/mcctrl/procfs.c b/executer/kernel/mcctrl/procfs.c index 42b25e8e..96243fc4 100644 --- a/executer/kernel/mcctrl/procfs.c +++ b/executer/kernel/mcctrl/procfs.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "mcctrl.h" #include #include @@ -713,6 +714,57 @@ mckernel_procfs_lseek(struct file *file, loff_t offset, int orig) return file->f_pos; } +struct procfs_work { + void *os; + int msg; + int pid; + unsigned long arg; + struct work_struct 
work; +}; + +static void procfsm_work_main(struct work_struct *work0) +{ + struct procfs_work *work = container_of(work0, struct procfs_work, work); + + switch (work->msg) { + case SCD_MSG_PROCFS_TID_CREATE: + add_tid_entry(ihk_host_os_get_index(work->os), work->pid, work->arg); + break; + + case SCD_MSG_PROCFS_TID_DELETE: + delete_tid_entry(ihk_host_os_get_index(work->os), work->pid, work->arg); + break; + + default: + printk("%s: unknown work: msg: %d, pid: %d, arg: %lu)\n", + __FUNCTION__, work->msg, work->pid, work->arg); + break; + } + + kfree(work); + return; +} + +int procfsm_packet_handler(void *os, int msg, int pid, unsigned long arg) +{ + struct procfs_work *work = NULL; + + work = kzalloc(sizeof(*work), GFP_ATOMIC); + if (!work) { + printk("%s: kzalloc failed\n", __FUNCTION__); + return -1; + } + + work->os = os; + work->msg = msg; + work->pid = pid; + work->arg = arg; + INIT_WORK(&work->work, &procfsm_work_main); + + schedule_work(&work->work); + return 0; +} + static const struct file_operations mckernel_forward_ro = { .llseek = mckernel_procfs_lseek, .read = mckernel_procfs_read, diff --git a/executer/kernel/mcctrl/syscall.c b/executer/kernel/mcctrl/syscall.c index 8751d42e..653b2d23 100644 --- a/executer/kernel/mcctrl/syscall.c +++ b/executer/kernel/mcctrl/syscall.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include #include @@ -84,88 +85,96 @@ static void print_dma_lastreq(void) } #endif -int init_peer_channel_registry(struct mcctrl_usrdata *ud) +int mcctrl_add_per_thread_data(struct mcctrl_per_proc_data* ppd, + struct task_struct *task, void *data) { - ud->keys = kzalloc(sizeof(void *) * ud->num_channels, GFP_KERNEL); - if (!ud->keys) { - printk("Error: cannot allocate usrdata.keys[].\n"); - return -ENOMEM; + struct mcctrl_per_thread_data *ptd_iter, *ptd = NULL; + struct mcctrl_per_thread_data *ptd_alloc = NULL; + int hash = (((uint64_t)task >> 4) & MCCTRL_PER_THREAD_DATA_HASH_MASK); + int ret = 0; + unsigned long flags; + + ptd_alloc = kmalloc(sizeof(*ptd), GFP_ATOMIC); + if (!ptd_alloc) { + kprintf("%s: error allocate per thread data\n", __FUNCTION__); + ret = -ENOMEM; + goto out_noalloc; } - return 0; -} - -void destroy_peer_channel_registry(struct mcctrl_usrdata *ud) -{ - kfree(ud->keys); - ud->keys = NULL; - return; -} - -int register_peer_channel(struct mcctrl_usrdata *ud, void *key, struct mcctrl_channel *ch) -{ - int cpu; - - cpu = ch - ud->channels; - if ((cpu < 0) || (ud->num_channels <= cpu)) { - printk("register_peer_channel(%p,%p,%p):" - "not a syscall channel. cpu=%d\n", - ud, key, ch, cpu); - return -EINVAL; - } - - if (ud->keys[cpu] != NULL) { - printk("register_peer_channel(%p,%p,%p):" - "already registered. cpu=%d\n", - ud, key, ch, cpu); - /* - * When mcexec receives a signal, - * it may be finished without doing deregister_peer_channel(). - * Therefore a substitute registration is necessary. - */ -#if 0 - return -EBUSY; -#endif - } - - ud->keys[cpu] = key; - return 0; -} - -int deregister_peer_channel(struct mcctrl_usrdata *ud, void *key, struct mcctrl_channel *ch) -{ - int cpu; - - cpu = ch - ud->channels; - if ((cpu < 0) || (ud->num_channels <= cpu)) { - printk("deregister_peer_channel(%p,%p,%p):" - "not a syscall channel. cpu=%d\n", - ud, key, ch, cpu); - return -EINVAL; - } - - if (ud->keys[cpu] && (ud->keys[cpu] != key)) { - printk("deregister_peer_channel(%p,%p,%p):" - "not registered. 
cpu=%d\n", - ud, key, ch, cpu); - return -EBUSY; - } - - ud->keys[cpu] = NULL; - return 0; -} - -struct mcctrl_channel *get_peer_channel(struct mcctrl_usrdata *ud, void *key) -{ - int cpu; - - for (cpu = 0; cpu < ud->num_channels; ++cpu) { - if (ud->keys[cpu] == key) { - return &ud->channels[cpu]; + /* Check if data for this thread exists and add if not */ + write_lock_irqsave(&ppd->per_thread_data_hash_lock[hash], flags); + list_for_each_entry(ptd_iter, &ppd->per_thread_data_hash[hash], hash) { + if (ptd_iter->task == task) { + ptd = ptd_iter; + break; } } - return NULL; + if (unlikely(ptd)) { + ret = -EBUSY; + kfree(ptd_alloc); + goto out; + } + + ptd = ptd_alloc; + ptd->task = task; + ptd->data = data; + list_add_tail(&ptd->hash, &ppd->per_thread_data_hash[hash]); + +out: + write_unlock_irqrestore(&ppd->per_thread_data_hash_lock[hash], flags); +out_noalloc: + return ret; +} + +int mcctrl_delete_per_thread_data(struct mcctrl_per_proc_data* ppd, + struct task_struct *task) +{ + struct mcctrl_per_thread_data *ptd_iter, *ptd = NULL; + int hash = (((uint64_t)task >> 4) & MCCTRL_PER_THREAD_DATA_HASH_MASK); + int ret = 0; + unsigned long flags; + + /* Check if data for this thread exists and delete it */ + write_lock_irqsave(&ppd->per_thread_data_hash_lock[hash], flags); + list_for_each_entry(ptd_iter, &ppd->per_thread_data_hash[hash], hash) { + if (ptd_iter->task == task) { + ptd = ptd_iter; + break; + } + } + + if (!ptd) { + ret = -EINVAL; + goto out; + } + + list_del(&ptd->hash); + kfree(ptd); + +out: + write_unlock_irqrestore(&ppd->per_thread_data_hash_lock[hash], flags); + return ret; +} + +struct mcctrl_per_thread_data *mcctrl_get_per_thread_data(struct mcctrl_per_proc_data *ppd, struct task_struct *task) +{ + struct mcctrl_per_thread_data *ptd_iter, *ptd = NULL; + int hash = (((uint64_t)task >> 4) & MCCTRL_PER_THREAD_DATA_HASH_MASK); + unsigned long flags; + + /* Check if data for this thread exists and return it */ + read_lock_irqsave(&ppd->per_thread_data_hash_lock[hash], flags); + + list_for_each_entry(ptd_iter, &ppd->per_thread_data_hash[hash], hash) { + if (ptd_iter->task == task) { + ptd = ptd_iter; + break; + } + } + + read_unlock_irqrestore(&ppd->per_thread_data_hash_lock[hash], flags); + return ptd ? 
ptd->data : NULL; } #if 1 /* x86 depend, host OS side */ @@ -232,80 +241,156 @@ out: } #endif +static int __notify_syscall_requester(ihk_os_t os, struct ikc_scd_packet *packet, + struct syscall_response *res) +{ + struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os); + struct ihk_ikc_channel_desc *c = (usrdata->channels + packet->ref)->c; + struct ikc_scd_packet r_packet; + int ret = 0; + + /* If spinning, no need for IKC message */ + if (__sync_bool_compare_and_swap(&res->req_thread_status, + IHK_SCD_REQ_THREAD_SPINNING, + IHK_SCD_REQ_THREAD_TO_BE_WOKEN)) { + dprintk("%s: no need to send IKC message for PID %d\n", + __FUNCTION__, packet->pid); + return ret; + } + + /* The thread is not spinning any more, make sure it's descheduled */ + if (!__sync_bool_compare_and_swap(&res->req_thread_status, + IHK_SCD_REQ_THREAD_DESCHEDULED, + IHK_SCD_REQ_THREAD_TO_BE_WOKEN)) { + printk("%s: WARNING: inconsistent requester status, " + "pid: %d, req status: %lu, syscall nr: %lu\n", + __FUNCTION__, packet->pid, + res->req_thread_status, packet->req.number); + dump_stack(); + + return -EINVAL; + } + + r_packet.msg = SCD_MSG_WAKE_UP_SYSCALL_THREAD; + r_packet.ttid = packet->req.rtid; + ret = ihk_ikc_send(c, &r_packet, 0); + + return ret; +} + static int remote_page_fault(struct mcctrl_usrdata *usrdata, void *fault_addr, uint64_t reason) { - struct mcctrl_channel *channel; + struct ikc_scd_packet *packet; struct syscall_request *req; struct syscall_response *resp; int error; + struct wait_queue_head_list_node *wqhln; + unsigned long irqflags; + struct mcctrl_per_proc_data *ppd; + unsigned long phys; - dprintk("remote_page_fault(%p,%p,%llx)\n", usrdata, fault_addr, reason); + dprintk("%s: tid: %d, fault_addr: %p, reason: %lu\n", + __FUNCTION__, task_pid_vnr(current), fault_addr, reason); + + /* Look up per-process structure */ + ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(current)); - channel = get_peer_channel(usrdata, current); - if (!channel) { - error = -ENOENT; - printk("remote_page_fault(%p,%p,%llx):channel not found.
%d\n", - usrdata, fault_addr, reason, error); - goto out; + if (!ppd) { + kprintf("%s: ERROR: no per-process structure for PID %d??\n", + __FUNCTION__, task_tgid_vnr(current)); + return -EINVAL; } - req = channel->param.request_va; - resp = channel->param.response_va; + packet = (struct ikc_scd_packet *)mcctrl_get_per_thread_data(ppd, current); + if (!packet) { + error = -ENOENT; + printk("%s: no packet registered for TID %d\n", + __FUNCTION__, task_pid_vnr(current)); + goto out_no_unmap; + } - /* request page fault */ + req = &packet->req; + + /* Map response structure */ + phys = ihk_device_map_memory(ihk_os_to_dev(usrdata->os), + packet->resp_pa, sizeof(*resp)); + resp = ihk_device_map_virtual(ihk_os_to_dev(usrdata->os), + phys, sizeof(*resp), NULL, 0); + +retry_alloc: + wqhln = kmalloc(sizeof(*wqhln), GFP_ATOMIC); + if (!wqhln) { + printk("WARNING: coudln't alloc wait queue head, retrying..\n"); + goto retry_alloc; + } + + /* Prepare per-thread wait queue head */ + wqhln->task = current; + wqhln->req = 0; + init_waitqueue_head(&wqhln->wq_syscall); + + irqflags = ihk_ikc_spinlock_lock(&ppd->wq_list_lock); + /* Add to exact list */ + list_add_tail(&wqhln->list, &ppd->wq_list_exact); + ihk_ikc_spinlock_unlock(&ppd->wq_list_lock, irqflags); + + /* Request page fault */ resp->ret = -EFAULT; resp->fault_address = (unsigned long)fault_addr; resp->fault_reason = reason; + resp->stid = task_pid_vnr(current); #define STATUS_PAGER_COMPLETED 1 #define STATUS_PAGE_FAULT 3 req->valid = 0; + + if (__notify_syscall_requester(usrdata->os, packet, resp) < 0) { + printk("%s: WARNING: failed to notify PID %d\n", + __FUNCTION__, packet->pid); + } + mb(); resp->status = STATUS_PAGE_FAULT; for (;;) { - struct wait_queue_head_list_node *wqhln; - struct wait_queue_head_list_node *wqhln_iter; - unsigned long irqflags; - -retry_alloc: - wqhln = kmalloc(sizeof(*wqhln), GFP_KERNEL); - if (!wqhln) { - printk("WARNING: coudln't alloc wait queue head, retrying..\n"); - goto retry_alloc; - } - - /* Prepare per-process wait queue head */ - wqhln->pid = task_tgid_vnr(current); - wqhln->req = 0; - init_waitqueue_head(&wqhln->wq_syscall); - - irqflags = ihk_ikc_spinlock_lock(&channel->wq_list_lock); - /* First see if there is a wait queue already */ - list_for_each_entry(wqhln_iter, &channel->wq_list, list) { - if (wqhln_iter->pid == task_tgid_vnr(current)) { - kfree(wqhln); - wqhln = wqhln_iter; - list_del(&wqhln->list); - break; - } - } - list_add_tail(&wqhln->list, &channel->wq_list); - ihk_ikc_spinlock_unlock(&channel->wq_list_lock, irqflags); - + dprintk("%s: tid: %d, fault_addr: %p SLEEPING\n", + __FUNCTION__, task_pid_vnr(current), fault_addr); /* wait for response */ error = wait_event_interruptible(wqhln->wq_syscall, wqhln->req); - - /* Remove per-process wait queue head */ - irqflags = ihk_ikc_spinlock_lock(&channel->wq_list_lock); + + /* Remove per-thread wait queue head */ + irqflags = ihk_ikc_spinlock_lock(&ppd->wq_list_lock); list_del(&wqhln->list); - ihk_ikc_spinlock_unlock(&channel->wq_list_lock, irqflags); - kfree(wqhln); + ihk_ikc_spinlock_unlock(&ppd->wq_list_lock, irqflags); + + dprintk("%s: tid: %d, fault_addr: %p WOKEN UP\n", + __FUNCTION__, task_pid_vnr(current), fault_addr); if (error) { + kfree(wqhln); printk("remote_page_fault:interrupted. 
%d\n", error); goto out; } + else { + /* Update packet reference */ + packet = wqhln->packet; + req = &packet->req; + { + unsigned long phys2; + struct syscall_response *resp2; + phys2 = ihk_device_map_memory(ihk_os_to_dev(usrdata->os), + packet->resp_pa, sizeof(*resp)); + resp2 = ihk_device_map_virtual(ihk_os_to_dev(usrdata->os), + phys2, sizeof(*resp), NULL, 0); + + if (resp != resp2) { + resp = resp2; + phys = phys2; + printk("%s: updated new remote PA for resp\n", __FUNCTION__); + } + } + } + if (!req->valid) { printk("remote_page_fault:not valid\n"); } @@ -321,23 +406,37 @@ retry_alloc: #define PAGER_REQ_RESUME 0x0101 else if (req->args[0] != PAGER_REQ_RESUME) { resp->ret = pager_call(usrdata->os, (void *)req); + + if (__notify_syscall_requester(usrdata->os, packet, resp) < 0) { + printk("%s: WARNING: failed to notify PID %d\n", + __FUNCTION__, packet->pid); + } + mb(); resp->status = STATUS_PAGER_COMPLETED; - continue; + break; + //continue; } else { error = req->args[1]; if (error) { printk("remote_page_fault:response %d\n", error); + kfree(wqhln); goto out; } } break; } + kfree(wqhln); error = 0; out: - dprintk("remote_page_fault(%p,%p,%llx): %d\n", usrdata, fault_addr, reason, error); + ihk_device_unmap_virtual(ihk_os_to_dev(usrdata->os), resp, sizeof(*resp)); + ihk_device_unmap_memory(ihk_os_to_dev(usrdata->os), phys, sizeof(*resp)); + +out_no_unmap: + dprintk("%s: tid: %d, fault_addr: %lu, reason: %lu, error: %d\n", + __FUNCTION__, task_pid_vnr(current), fault_addr, reason, error); return error; } @@ -389,8 +488,9 @@ static int rus_page_hash_insert(struct page *page) { int ret = 0; struct rus_page *rp; + unsigned long flags; - spin_lock(&rus_page_hash_lock); + spin_lock_irqsave(&rus_page_hash_lock, flags); rp = _rus_page_hash_lookup(page); if (!rp) { @@ -417,7 +517,7 @@ static int rus_page_hash_insert(struct page *page) out: - spin_unlock(&rus_page_hash_lock); + spin_unlock_irqrestore(&rus_page_hash_lock, flags); return ret; } @@ -426,8 +526,9 @@ void rus_page_hash_put_pages(void) int i; struct rus_page *rp_iter; struct rus_page *rp_iter_next; + unsigned long flags; - spin_lock(&rus_page_hash_lock); + spin_lock_irqsave(&rus_page_hash_lock, flags); for (i = 0; i < RUS_PAGE_HASH_SIZE; ++i) { @@ -440,7 +541,7 @@ void rus_page_hash_put_pages(void) } } - spin_unlock(&rus_page_hash_lock); + spin_unlock_irqrestore(&rus_page_hash_lock, flags); } @@ -472,27 +573,22 @@ static int rus_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) #if USE_VM_INSERT_PFN size_t pix; #endif - struct mcctrl_per_proc_data *ppd, *ppd_iter; - unsigned long flags; + struct mcctrl_per_proc_data *ppd; dprintk("mcctrl:page fault:flags %#x pgoff %#lx va %p page %p\n", vmf->flags, vmf->pgoff, vmf->virtual_address, vmf->page); - ppd = NULL; - flags = ihk_ikc_spinlock_lock(&usrdata->per_proc_list_lock); - - list_for_each_entry(ppd_iter, &usrdata->per_proc_list, list) { - if (ppd_iter->pid == task_tgid_vnr(current) || - ppd_iter->pid == vma->vm_mm->owner->pid) { - ppd = ppd_iter; - break; - } + /* Look up per-process structure */ + ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(current)); + if (!ppd) { + ppd = mcctrl_get_per_proc_data(usrdata, vma->vm_mm->owner->pid); } - ihk_ikc_spinlock_unlock(&usrdata->per_proc_list_lock, flags); + if (!ppd) { - printk("ERROR: no per process data for pid %d\n", task_tgid_vnr(current)); - return VM_FAULT_SIGBUS; + kprintf("%s: ERROR: no per-process structure for PID %d??\n", + __FUNCTION__, task_tgid_vnr(current)); + return -EINVAL; } for (try = 1; ; ++try) { @@ 
-626,237 +722,6 @@ reserve_user_space_common(struct mcctrl_usrdata *usrdata, unsigned long start, u return start; } -//unsigned long last_thread_exec = 0; - -#ifndef DO_USER_MODE -static struct { - long (*do_sys_open)(int, const char __user *, int, int); - long (*sys_lseek)(unsigned int, off_t, unsigned int); - long (*sys_read)(unsigned int, char __user *, size_t); - long (*sys_write)(unsigned int, const char __user *, size_t); -} syscalls; - -void -mcctrl_syscall_init(void) -{ - printk("mcctrl_syscall_init\n"); - syscalls.do_sys_open = (void *)kallsyms_lookup_name("do_sys_open"); - syscalls.sys_lseek = (void *)kallsyms_lookup_name("sys_lseek"); - syscalls.sys_read = (void *)kallsyms_lookup_name("sys_read"); - syscalls.sys_write = (void *)kallsyms_lookup_name("sys_write"); - printk("syscalls.do_sys_open=%lx\n", (long)syscalls.do_sys_open); - printk("syscalls.sys_lseek=%lx\n", (long)syscalls.sys_lseek); - printk("syscalls.sys_read=%lx\n", (long)syscalls.sys_read); - printk("syscalls.sys_write=%lx\n", (long)syscalls.sys_write); -} - -static int do_async_copy(ihk_os_t os, unsigned long dest, unsigned long src, - unsigned long size, unsigned int inbound) -{ - struct ihk_dma_request request; - ihk_dma_channel_t channel; - unsigned long asize = ALIGN_WAIT_BUF(size); - - channel = ihk_device_get_dma_channel(ihk_os_to_dev(os), 0); - if (!channel) { - return -EINVAL; - } - - memset(&request, 0, sizeof(request)); - request.src_os = inbound ? os : NULL; - request.src_phys = src; - request.dest_os = inbound ? NULL : os; - request.dest_phys = dest; - request.size = size; - request.notify = (void *)(inbound ? dest + asize : src + asize); - request.priv = (void *)1; - - *(unsigned long *)phys_to_virt((unsigned long)request.notify) = 0; -#ifdef SC_DEBUG - last_request = request; -#endif - - ihk_dma_request(channel, &request); - - return 0; -} - -//int mcctrl_dma_abort; - -static void async_wait(ihk_os_t os, unsigned char *p, int size) -{ - int asize = ALIGN_WAIT_BUF(size); - unsigned long long s, w; - struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os); - - rdtscll(s); - while (!p[asize]) { - mb(); - cpu_relax(); - rdtscll(w); - if (w > s + 1024UL * 1024 * 1024 * 10) { - printk("DMA Timed out : %p (%p + %d) => %d\n", - p + asize, p, size, p[asize]); -#ifdef SC_DEBUG - print_dma_lastreq(); -#endif - usrdata->mcctrl_dma_abort = 1; - return; - } - } -} - -static void clear_wait(unsigned char *p, int size) -{ - //int asize = ALIGN_WAIT_BUF(size); - p[size] = 0; -} - -static unsigned long translate_remote_va(struct mcctrl_channel *c, - unsigned long rva) -{ - int i, n; - struct syscall_post *p; - - p = c->param.post_va; - - n = (int)p->v[0]; - if (n < 0 || n >= PAGE_SIZE / sizeof(struct syscall_post)) { - return -EINVAL; - } - for (i = 0; i < n; i++) { - if (p[i + 1].v[0] != 1) { - continue; - } - if (rva >= p[i + 1].v[1] && rva < p[i + 1].v[2]) { - return p[i + 1].v[3] + (rva - p[i + 1].v[1]); - } - } - - return -EFAULT; -} - -//extern struct mcctrl_channel *channels; - -#if 0 -int __do_in_kernel_syscall(ihk_os_t os, struct mcctrl_channel *c, - struct syscall_request *sc) -{ - int ret; - mm_segment_t fs; - unsigned long pa; - struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os); - - switch (sc->number) { - case 0: /* read */ - case 1024: - if (sc->number & 1024) { - sc->args[1] = translate_remote_va(c, sc->args[1]); - if ((long)sc->args[1] < 0) { - __return_syscall(c, -EFAULT); - return 0; - } - } - - clear_wait(c->dma_buf, sc->args[2]); - fs = get_fs(); - set_fs(KERNEL_DS); - ret = 
syscalls.sys_read(sc->args[0], c->dma_buf, sc->args[2]); - if (ret > 0) { - do_async_copy(os, sc->args[1], virt_to_phys(c->dma_buf), - sc->args[2], 0); - set_fs(fs); - - async_wait(os, c->dma_buf, sc->args[2]); - } - __return_syscall(c, ret); - return 0; - - case 1: /* write */ - case 1025: - if (sc->number & 1024) { - sc->args[1] = translate_remote_va(c, sc->args[1]); - if ((long)sc->args[1] < 0) { - __return_syscall(c, -EFAULT); - return 0; - } - } - - clear_wait(c->dma_buf, sc->args[2]); - do_async_copy(os, virt_to_phys(c->dma_buf), sc->args[1], - sc->args[2], 1); - fs = get_fs(); - set_fs(KERNEL_DS); - async_wait(os, c->dma_buf, sc->args[2]); - - ret = syscalls.sys_write(sc->args[0], c->dma_buf, sc->args[2]); - set_fs(fs); - - __return_syscall(c, ret); - return 0; - - case 2: /* open */ - case 1026: - if (sc->number & 1024) { - sc->args[0] = translate_remote_va(c, sc->args[0]); - if ((long)sc->args[0] < 0) { - __return_syscall(c, -EFAULT); - return 0; - } - } - - clear_wait(c->dma_buf, 256); - do_async_copy(os, virt_to_phys(c->dma_buf), sc->args[0], - 256, 1); - fs = get_fs(); - set_fs(KERNEL_DS); - async_wait(os, c->dma_buf, 256); - - ret = syscalls.do_sys_open(AT_FDCWD, c->dma_buf, sc->args[1], - sc->args[2]); - set_fs(fs); - - __return_syscall(c, ret); - return 0; - - case 3: /* Close */ - ret = sys_close(sc->args[0]); - __return_syscall(c, ret); - return 0; - - case 8: /* lseek */ - ret = syscalls.sys_lseek(sc->args[0], sc->args[1], sc->args[2]); - __return_syscall(c, ret); - return 0; - - case 56: /* Clone */ - usrdata->last_thread_exec++; - if (mcctrl_ikc_is_valid_thread(usrdata->last_thread_exec)) { - printk("Clone notification: %lx\n", sc->args[0]); - if (channels[usrdata->last_thread_exec].param.post_va) { - memcpy(usrdata->channels[usrdata->last_thread_exec].param.post_va, - c->param.post_va, PAGE_SIZE); - } - mcctrl_ikc_send_msg(usrdata->last_thread_exec, - SCD_MSG_SCHEDULE_PROCESS, - usrdata->last_thread_exec, sc->args[0]); - } - - __return_syscall(c, 0); - return 0; - - default: - if (sc->number & 1024) { - __return_syscall(c, -EFAULT); - return 0; - } else { - return -ENOSYS; - } - } -} -#endif -#endif /* !DO_USER_MODE */ - struct pager { struct list_head list; struct inode * inode; @@ -967,7 +832,7 @@ static int pager_req_create(ihk_os_t os, int fd, uintptr_t result_pa) up(&pager_sem); - newpager = kzalloc(sizeof(*newpager), GFP_KERNEL); + newpager = kzalloc(sizeof(*newpager), GFP_ATOMIC); if (!newpager) { error = -ENOMEM; printk("pager_req_create(%d,%lx):kzalloc failed. %d\n", fd, (long)result_pa, error); @@ -1223,7 +1088,7 @@ static int pager_req_map(ihk_os_t os, int fd, size_t len, off_t off, uintptr_t phys; dprintk("pager_req_map(%p,%d,%lx,%lx,%lx)\n", os, fd, len, off, result_rpa); - pager = kzalloc(sizeof(*pager), GFP_KERNEL); + pager = kzalloc(sizeof(*pager), GFP_ATOMIC); if (!pager) { error = -ENOMEM; printk("pager_req_map(%p,%d,%lx,%lx,%lx):kzalloc failed. 
%d\n", os, fd, len, off, result_rpa, error); @@ -1475,11 +1340,31 @@ static long pager_call(ihk_os_t os, struct syscall_request *req) return ret; } -static void __return_syscall(struct mcctrl_channel *c, int ret) +void __return_syscall(ihk_os_t os, struct ikc_scd_packet *packet, + long ret, int stid) { - c->param.response_va->ret = ret; + unsigned long phys; + struct syscall_response *res; + + phys = ihk_device_map_memory(ihk_os_to_dev(os), + packet->resp_pa, sizeof(*res)); + res = ihk_device_map_virtual(ihk_os_to_dev(os), + phys, sizeof(*res), NULL, 0); + + /* Map response structure and notify offloading thread */ + res->ret = ret; + res->stid = stid; + + if (__notify_syscall_requester(os, packet, res) < 0) { + printk("%s: WARNING: failed to notify PID %d\n", + __FUNCTION__, packet->pid); + } + mb(); - c->param.response_va->status = 1; + res->status = 1; + + ihk_device_unmap_virtual(ihk_os_to_dev(os), res, sizeof(*res)); + ihk_device_unmap_memory(ihk_os_to_dev(os), phys, sizeof(*res)); } static int remap_user_space(uintptr_t rva, size_t len, int prot) @@ -1668,13 +1553,14 @@ fail: #define SCHED_CHECK_SAME_OWNER 0x01 #define SCHED_CHECK_ROOT 0x02 -int __do_in_kernel_syscall(ihk_os_t os, struct mcctrl_channel *c, struct syscall_request *sc) +int __do_in_kernel_syscall(ihk_os_t os, struct ikc_scd_packet *packet) { + struct syscall_request *sc = &packet->req; int error; long ret = -1; struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os); - dprintk("__do_in_kernel_syscall(%p,%p,%ld %lx)\n", os, c, sc->number, sc->args[0]); + dprintk("%s: system call: %d\n", __FUNCTION__, sc->args[0]); switch (sc->number) { case __NR_mmap: ret = pager_call(os, sc); @@ -1683,25 +1569,19 @@ int __do_in_kernel_syscall(ihk_os_t os, struct mcctrl_channel *c, struct syscall case __NR_munmap: /* Set new remote page table if not zero */ if (sc->args[2]) { - unsigned long flags; struct mcctrl_per_proc_data *ppd = NULL; - ppd = kmalloc(sizeof(*ppd), GFP_ATOMIC); - if (!ppd) { - printk("ERROR: allocating per process data\n"); - error = -ENOMEM; - goto out; + ppd = mcctrl_get_per_proc_data(usrdata, sc->args[3]); + if (unlikely(!ppd)) { + kprintf("%s: ERROR: no per-process structure for PID %d??\n", + __FUNCTION__, task_tgid_vnr(current)); + return -1; } - ppd->pid = task_tgid_vnr(current); ppd->rpgtable = sc->args[2]; - flags = ihk_ikc_spinlock_lock(&usrdata->per_proc_list_lock); - list_add_tail(&ppd->list, &usrdata->per_proc_list); - ihk_ikc_spinlock_unlock(&usrdata->per_proc_list_lock, flags); - - dprintk("pid: %d, rpgtable: 0x%lx added\n", - ppd->pid, ppd->rpgtable); + dprintk("%s: pid: %d, rpgtable: 0x%lx updated\n", + __FUNCTION__, ppd->pid, ppd->rpgtable); } ret = clear_pte_range(sc->args[0], sc->args[1]); @@ -1712,33 +1592,6 @@ int __do_in_kernel_syscall(ihk_os_t os, struct mcctrl_channel *c, struct syscall break; case __NR_exit_group: { - unsigned long flags; - struct mcctrl_per_proc_data *ppd = NULL, *ppd_iter; - - ppd = NULL; - flags = ihk_ikc_spinlock_lock(&usrdata->per_proc_list_lock); - - list_for_each_entry(ppd_iter, &usrdata->per_proc_list, list) { - if (ppd_iter->pid == task_tgid_vnr(current)) { - ppd = ppd_iter; - break; - } - } - - if (ppd) { - list_del(&ppd->list); - - dprintk("pid: %d, tid: %d: rpgtable for %d (0x%lx) removed\n", - task_tgid_vnr(current), current->pid, ppd->pid, ppd->rpgtable); - - kfree(ppd); - } - else { - printk("WARNING: no per process data for pid %d ?\n", - task_tgid_vnr(current)); - } - - ihk_ikc_spinlock_unlock(&usrdata->per_proc_list_lock, flags); /* Make sure the user 
@@ -1712,33 +1592,6 @@ int __do_in_kernel_syscall(ihk_os_t os, struct mcctrl_channel *c, struct syscall
 		break;
 
 	case __NR_exit_group: {
-		unsigned long flags;
-		struct mcctrl_per_proc_data *ppd = NULL, *ppd_iter;
-
-		ppd = NULL;
-		flags = ihk_ikc_spinlock_lock(&usrdata->per_proc_list_lock);
-
-		list_for_each_entry(ppd_iter, &usrdata->per_proc_list, list) {
-			if (ppd_iter->pid == task_tgid_vnr(current)) {
-				ppd = ppd_iter;
-				break;
-			}
-		}
-
-		if (ppd) {
-			list_del(&ppd->list);
-
-			dprintk("pid: %d, tid: %d: rpgtable for %d (0x%lx) removed\n",
-				task_tgid_vnr(current), current->pid, ppd->pid, ppd->rpgtable);
-
-			kfree(ppd);
-		}
-		else {
-			printk("WARNING: no per process data for pid %d ?\n",
-				task_tgid_vnr(current));
-		}
-
-		ihk_ikc_spinlock_unlock(&usrdata->per_proc_list_lock, flags);
 
 		/* Make sure the user space handler will be called as well */
 		error = -ENOSYS;
@@ -1821,10 +1674,11 @@ sched_setparam_out:
 		break;
 	}
 
-	__return_syscall(c, ret);
+	__return_syscall(os, packet, ret, 0);
 	error = 0;
 
 out:
-	dprintk("__do_in_kernel_syscall(%p,%p,%ld %lx): %d %ld\n", os, c, sc->number, sc->args[0], error, ret);
+	dprintk("%s: system call: %ld, arg0: %#lx, error: %d, ret: %ld\n",
+		__FUNCTION__, sc->number, sc->args[0], error, ret);
 	return error;
 }
 
diff --git a/executer/kernel/mcctrl/sysfs.c b/executer/kernel/mcctrl/sysfs.c
index b446864a..230a4996 100644
--- a/executer/kernel/mcctrl/sysfs.c
+++ b/executer/kernel/mcctrl/sysfs.c
@@ -14,6 +14,7 @@
 #include
 #include
 #include
+#include
 #include "mcctrl.h"
 #include "sysfs_msg.h"
 
diff --git a/executer/user/mcexec.c b/executer/user/mcexec.c
index 32914b1a..be80aa87 100644
--- a/executer/user/mcexec.c
+++ b/executer/user/mcexec.c
@@ -870,7 +870,10 @@ struct thread_data_s {
 	pthread_mutex_t *lock;
 	pthread_barrier_t *init_ready;
 } *thread_data;
+
 int ncpu;
+int n_threads;
+
 pid_t master_tid;
 
 pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
@@ -881,7 +884,7 @@ static void *main_loop_thread_func(void *arg)
 	struct thread_data_s *td = (struct thread_data_s *)arg;
 
 	td->tid = gettid();
-	td->remote_tid = (int)td->tid;
+	td->remote_tid = -1;
 	pthread_barrier_wait(&init_ready);
 	td->ret = main_loop(td->fd, td->cpu, td->lock);
 
@@ -1108,9 +1111,9 @@ void init_worker_threads(int fd)
 	int i;
 
 	pthread_mutex_init(&lock, NULL);
-	pthread_barrier_init(&init_ready, NULL, ncpu + 2);
+	pthread_barrier_init(&init_ready, NULL, n_threads + 2);
 
-	for (i = 0; i <= ncpu; ++i) {
+	for (i = 0; i <= n_threads; ++i) {
 		int ret;
 
 		thread_data[i].fd = fd;
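The hunk below caps the worker pool at 16 threads while keeping thread_data sized ncpu + 1, as its XXX comment explains: per-CPU code indexes the array directly, so only the first n_threads slots are backed by pool threads. A small stand-alone sketch of that sizing rule (the core count and names are illustrative):

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	#define MAX_WORKERS 16

	struct worker {
		int tid;		/* local (proxy) thread id */
		int remote_tid;		/* remote thread currently served */
	};

	int main(void)
	{
		int ncpu = 68;		/* illustrative core count */
		int n_threads = ncpu > MAX_WORKERS ? MAX_WORKERS : ncpu;

		/* Slots stay CPU-indexed (ncpu + 1 of them) even though only
		 * n_threads are backed by threads, so code that writes
		 * thread_data[cpu].remote_tid keeps working unchanged */
		struct worker *pool = calloc(ncpu + 1, sizeof(*pool));

		memset(pool, -1, (ncpu + 1) * sizeof(*pool));
		printf("CPUs: %d, pool threads: %d\n", ncpu, n_threads);
		free(pool);
		return 0;
	}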
@@ -1520,6 +1523,19 @@ int main(int argc, char **argv)
 		return 1;
 	}
 
+	n_threads = ncpu;
+	if (ncpu > 16) {
+		n_threads = 16;
+	}
+
+	/*
+	 * XXX: keep thread_data ncpu sized even though there are only
+	 * n_threads worker threads in the pool so that signaling code
+	 * keeps working.
+	 *
+	 * TODO: fix signaling code to be independent of TIDs.
+	 * TODO: implement dynamic thread pool resizing.
+	 */
 	thread_data = (struct thread_data_s *)malloc(sizeof(struct thread_data_s) * (ncpu + 1));
 	memset(thread_data, '\0', sizeof(struct thread_data_s) * (ncpu + 1));
 
@@ -1604,7 +1620,7 @@ int main(int argc, char **argv)
 		return 1;
 	}
 
-	for (i = 0; i <= ncpu; ++i) {
+	for (i = 0; i <= n_threads; ++i) {
 		pthread_join(thread_data[i].thread_id, NULL);
 	}
 
@@ -1666,16 +1682,14 @@ do_generic_syscall(
 }
 
 static void
-kill_thread(unsigned long cpu)
+kill_thread(unsigned long tid)
 {
-	if(cpu >= 0 && cpu < ncpu){
-		pthread_kill(thread_data[cpu].thread_id, LOCALSIG);
-	}
-	else{
-		int i;
+	int i;
 
-		for (i = 0; i < ncpu; ++i) {
+	for (i = 0; i < n_threads; ++i) {
+		if(thread_data[i].remote_tid == tid){
 			pthread_kill(thread_data[i].thread_id, LOCALSIG);
+			break;
 		}
 	}
 }
@@ -1834,6 +1848,8 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock)
 
 		//pthread_mutex_lock(lock);
 
+		thread_data[cpu].remote_tid = w.sr.rtid;
+
 		switch (w.sr.number) {
 		case __NR_open:
 			ret = do_strncpy_from_user(fd, pathbuf, (void *)w.sr.args[0], PATH_MAX);
@@ -1872,13 +1888,13 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock)
 			sig = 0;
 			term = 0;
 
+			do_syscall_return(fd, cpu, 0, 0, 0, 0, 0);
+
 			/* Drop executable file */
 			if ((ret = ioctl(fd, MCEXEC_UP_CLOSE_EXEC)) != 0) {
 				fprintf(stderr, "WARNING: close_exec() couldn't find exec file?\n");
 			}
 
-			do_syscall_return(fd, cpu, 0, 0, 0, 0, 0);
-
 			__dprintf("__NR_exit/__NR_exit_group: %ld (cpu_id: %d)\n",
 				w.sr.args[0], cpu);
 			if(w.sr.number == __NR_exit_group){
@@ -1946,6 +1962,39 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock)
 				thread_data[oldcpuid].remote_tid = wtid;
 			}
 
+			/*
+			 * Number of TIDs and the remote physical address where TIDs are
+			 * expected are passed in arg 4 and 5, respectively.
+			 */
+			if (w.sr.args[4] > 0) {
+				struct remote_transfer trans;
+				int i = 0;
+				int *tids = malloc(sizeof(int) * w.sr.args[4]);
+				if (!tids) {
+					fprintf(stderr, "__NR_gettid(): error allocating TIDs\n");
+					goto gettid_out;
+				}
+
+				for (i = 0; i < ncpu && i < w.sr.args[4]; ++i) {
+					tids[i] = thread_data[i].tid;
+				}
+
+				for (; i < w.sr.args[4]; ++i) {
+					tids[i] = 0;
+				}
+
+				trans.userp = (void*)tids;
+				trans.rphys = w.sr.args[5];
+				trans.size = sizeof(int) * w.sr.args[4];
+				trans.direction = MCEXEC_UP_TRANSFER_TO_REMOTE;
+
+				if (ioctl(fd, MCEXEC_UP_TRANSFER, &trans) != 0) {
+					fprintf(stderr, "__NR_gettid(): error transferring TIDs\n");
+				}
+
+				free(tids);
+			}
+gettid_out:
 			do_syscall_return(fd, cpu, thread_data[newcpuid].remote_tid, 0, 0, 0, 0);
 			break;
 		}
@@ -2041,7 +2090,6 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock)
 
 			/* Reinit signals and syscall threads */
 			init_sigaction();
-			init_worker_threads(fd);
 
 			__dprintf("pid(%d): signals and syscall threads OK\n",
 				getpid());
@@ -2055,6 +2103,8 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock)
 				goto fork_child_sync_pipe;
 			}
 
+			init_worker_threads(fd);
+
 fork_child_sync_pipe:
 			sem_post(&fs->sem);
 			if (fs->status)
@@ -2313,6 +2363,53 @@ return_execve2:
 			do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
 			break;
 
+		case __NR_setresuid:
+			ret = setresuid(w.sr.args[0], w.sr.args[1], w.sr.args[2]);
+			if(ret == -1)
+				ret = -errno;
+			do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
+			break;
+
+		case __NR_setreuid:
+			ret = setreuid(w.sr.args[0], w.sr.args[1]);
+			if(ret == -1)
+				ret = -errno;
+			do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
+			break;
+
+		case __NR_setuid:
+			ret = setuid(w.sr.args[0]);
+			if(ret == -1)
+				ret = -errno;
+			do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
+			break;
+
+		case __NR_setresgid:
+			ret = setresgid(w.sr.args[0], w.sr.args[1], w.sr.args[2]);
+			if(ret == -1)
+
ret = -errno; + do_syscall_return(fd, cpu, ret, 0, 0, 0, 0); + break; + + case __NR_setregid: + ret = setregid(w.sr.args[0], w.sr.args[1]); + if(ret == -1) + ret = -errno; + do_syscall_return(fd, cpu, ret, 0, 0, 0, 0); + break; + + case __NR_setgid: + ret = setgid(w.sr.args[0]); + if(ret == -1) + ret = -errno; + do_syscall_return(fd, cpu, ret, 0, 0, 0, 0); + break; + + case __NR_setfsgid: + ret = setfsgid(w.sr.args[0]); + do_syscall_return(fd, cpu, ret, 0, 0, 0, 0); + break; + case __NR_close: if(w.sr.args[0] == fd) ret = -EBADF; @@ -2346,7 +2443,9 @@ return_execve2: break; } - + + thread_data[cpu].remote_tid = -1; + //pthread_mutex_unlock(lock); } __dprint("timed out.\n"); diff --git a/kernel/debug.c b/kernel/debug.c index 6c9a8214..1061b6fc 100644 --- a/kernel/debug.c +++ b/kernel/debug.c @@ -110,6 +110,7 @@ int __kprintf(const char *format, ...) char buf[KPRINTF_LOCAL_BUF_LEN]; /* Copy into the local buf */ + len = sprintf(buf, "[%3d]: ", ihk_mc_get_processor_id()); va_start(va, format); len += vsnprintf(buf + len, KPRINTF_LOCAL_BUF_LEN - len - 2, format, va); va_end(va); diff --git a/kernel/devobj.c b/kernel/devobj.c index cbaf2a0a..737d42f7 100644 --- a/kernel/devobj.c +++ b/kernel/devobj.c @@ -99,7 +99,7 @@ int devobj_create(int fd, size_t len, off_t off, struct memobj **objp, int *maxp } memset(obj, 0, sizeof(*obj)); - obj->pfn_table = allocate_pages(pfn_npages, IHK_MC_AP_NOWAIT); + obj->pfn_table = ihk_mc_alloc_pages(pfn_npages, IHK_MC_AP_NOWAIT); if (!obj->pfn_table) { error = -ENOMEM; kprintf("%s: error: fd: %d, len: %lu, off: %lu allocating PFN failed.\n", @@ -141,7 +141,7 @@ int devobj_create(int fd, size_t len, off_t off, struct memobj **objp, int *maxp out: if (obj) { if (obj->pfn_table) { - free_pages(obj->pfn_table, pfn_npages); + ihk_mc_free_pages(obj->pfn_table, pfn_npages); } kfree(obj); } @@ -166,6 +166,8 @@ static void devobj_release(struct memobj *memobj) struct devobj *obj = to_devobj(memobj); struct devobj *free_obj = NULL; uintptr_t handle; + const size_t pfn_npages = + (obj->npages / (PAGE_SIZE / sizeof(uintptr_t))) + 1; dkprintf("devobj_release(%p %lx)\n", obj, obj->handle); @@ -194,7 +196,7 @@ static void devobj_release(struct memobj *memobj) } if (obj->pfn_table) { - free_pages(obj->pfn_table, 1); + ihk_mc_free_pages(obj->pfn_table, pfn_npages); } kfree(free_obj); } diff --git a/kernel/host.c b/kernel/host.c index 04965785..7ee64ead 100644 --- a/kernel/host.c +++ b/kernel/host.c @@ -376,10 +376,16 @@ static int process_msg_prepare_process(unsigned long rphys) } n = p->num_sections; + if (n > 16) { + kprintf("%s: ERROR: more ELF sections than 16??\n", + __FUNCTION__); + return -ENOMEM; + } dkprintf("# of sections: %d\n", n); - if((pn = ihk_mc_allocate(sizeof(struct program_load_desc) - + sizeof(struct program_image_section) * n, IHK_MC_AP_NOWAIT)) == NULL){ + if((pn = kmalloc(sizeof(struct program_load_desc) + + sizeof(struct program_image_section) * n, + IHK_MC_AP_NOWAIT)) == NULL){ ihk_mc_unmap_virtual(p, npages, 0); ihk_mc_unmap_memory(NULL, phys, sz); return -ENOMEM; @@ -388,7 +394,7 @@ static int process_msg_prepare_process(unsigned long rphys) + sizeof(struct program_image_section) * n); if((thread = create_thread(p->entry)) == NULL){ - ihk_mc_free(pn); + kfree(pn); ihk_mc_unmap_virtual(p, npages, 1); ihk_mc_unmap_memory(NULL, phys, sz); return -ENOMEM; @@ -438,7 +444,7 @@ static int process_msg_prepare_process(unsigned long rphys) dkprintf("new process : %p [%d] / table : %p\n", proc, proc->pid, vm->address_space->page_table); - ihk_mc_free(pn); + 
kfree(pn); ihk_mc_unmap_virtual(p, npages, 1); ihk_mc_unmap_memory(NULL, phys, sz); @@ -446,7 +452,7 @@ static int process_msg_prepare_process(unsigned long rphys) return 0; err: - ihk_mc_free(pn); + kfree(pn); ihk_mc_unmap_virtual(p, npages, 1); ihk_mc_unmap_memory(NULL, phys, sz); destroy_thread(thread); @@ -455,7 +461,7 @@ err: static void process_msg_init(struct ikc_scd_init_param *pcp, struct syscall_params *lparam) { - lparam->response_va = allocate_pages(RESPONSE_PAGE_COUNT, 0); + lparam->response_va = ihk_mc_alloc_pages(RESPONSE_PAGE_COUNT, 0); lparam->response_pa = virt_to_phys(lparam->response_va); pcp->request_page = 0; @@ -524,12 +530,7 @@ static void syscall_channel_send(struct ihk_ikc_channel_desc *c, } extern unsigned long do_kill(struct thread *, int, int, int, struct siginfo *, int ptracecont); -extern void settid(struct thread *proc, int mode, int newcpuid, int oldcpuid); - extern void process_procfs_request(unsigned long rarg); -extern int memcheckall(); -extern int freecheck(int runcount); -extern int runcount; extern void terminate_host(int pid); extern void debug_log(long); @@ -564,6 +565,7 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c, struct ikc_scd_packet *packet = __packet; struct ikc_scd_packet pckt; int rc; + struct mcs_rwlock_node_irqsave lock; struct thread *thread; struct process *proc; struct mcctrl_signal { @@ -575,22 +577,17 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c, } *sp, info; unsigned long pp; int cpuid; + int ret = 0; switch (packet->msg) { case SCD_MSG_INIT_CHANNEL_ACKED: dkprintf("SCD_MSG_INIT_CHANNEL_ACKED\n"); process_msg_init_acked(c, packet->arg); - return 0; + ret = 0; + break; case SCD_MSG_PREPARE_PROCESS: - if (find_command_line("memdebug")) { - memcheckall(); - if (runcount) - freecheck(runcount); - runcount++; - } - if((rc = process_msg_prepare_process(packet->arg)) == 0){ pckt.msg = SCD_MSG_PREPARE_PROCESS_ACKED; pckt.err = 0; @@ -603,19 +600,21 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c, pckt.arg = packet->arg; syscall_channel_send(c, &pckt); - return 0; + ret = 0; + break; case SCD_MSG_SCHEDULE_PROCESS: cpuid = obtain_clone_cpuid(); if(cpuid == -1){ kprintf("No CPU available\n"); - return -1; + ret = -1; + break; } dkprintf("SCD_MSG_SCHEDULE_PROCESS: %lx\n", packet->arg); thread = (struct thread *)packet->arg; proc = thread->proc; - settid(thread, 0, cpuid, -1); + settid(thread, 0, cpuid, -1, 0, NULL); proc->status = PS_RUNNING; thread->status = PS_RUNNING; chain_thread(thread); @@ -623,7 +622,29 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c, runq_add_thread(thread, cpuid); //cpu_local_var(next) = (struct thread *)packet->arg; - return 0; + ret = 0; + break; + + /* + * Used for syscall offload reply message to explicitly schedule in + * the waiting thread + */ + case SCD_MSG_WAKE_UP_SYSCALL_THREAD: + thread = find_thread(0, packet->ttid, &lock); + if (!thread) { + kprintf("%s: WARNING: no thread for SCD reply? 
TID: %d\n", + __FUNCTION__, packet->ttid); + ret = -EINVAL; + break; + } + thread_unlock(thread, &lock); + + dkprintf("%s: SCD_MSG_WAKE_UP_SYSCALL_THREAD: waking up tid %d\n", + __FUNCTION__, packet->ttid); + waitq_wakeup(&thread->scd_wq); + ret = 0; + break; + case SCD_MSG_SEND_SIGNAL: pp = ihk_mc_map_memory(NULL, packet->arg, sizeof(struct mcctrl_signal)); sp = (struct mcctrl_signal *)ihk_mc_map_virtual(pp, 1, PTATTR_WRITABLE | PTATTR_ACTIVE); @@ -638,18 +659,25 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c, rc = do_kill(NULL, info.pid, info.tid, info.sig, &info.info, 0); kprintf("SCD_MSG_SEND_SIGNAL: do_kill(pid=%d, tid=%d, sig=%d)=%d\n", info.pid, info.tid, info.sig, rc); - return 0; + ret = 0; + break; + case SCD_MSG_PROCFS_REQUEST: process_procfs_request(packet->arg); - return 0; + ret = 0; + break; + case SCD_MSG_CLEANUP_PROCESS: dkprintf("SCD_MSG_CLEANUP_PROCESS pid=%d\n", packet->pid); terminate_host(packet->pid); - return 0; + ret = 0; + break; + case SCD_MSG_DEBUG_LOG: dkprintf("SCD_MSG_DEBUG_LOG code=%lx\n", packet->arg); debug_log(packet->arg); - return 0; + ret = 0; + break; case SCD_MSG_SYSFS_REQ_SHOW: case SCD_MSG_SYSFS_REQ_STORE: @@ -657,7 +685,8 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c, sysfss_packet_handler(c, packet->msg, packet->err, packet->sysfs_arg1, packet->sysfs_arg2, packet->sysfs_arg3); - return 0; + ret = 0; + break; case SCD_MSG_GET_CPU_MAPPING: req_get_cpu_mapping(packet->arg); @@ -665,17 +694,21 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c, pckt.msg = SCD_MSG_REPLY_GET_CPU_MAPPING; pckt.arg = packet->arg; syscall_channel_send(c, &pckt); - return 0; + ret = 0; + break; default: kprintf("syscall_pakcet_handler:unknown message " "(%d.%d.%d.%d.%d.%#lx)\n", packet->msg, packet->ref, packet->osnum, packet->pid, packet->err, packet->arg); - return 0; + ret = 0; + break; } - return 0; + + ihk_ikc_release_packet((struct ihk_ikc_free_packet *)packet, c); + return ret; } void init_host_syscall_channel(void) diff --git a/kernel/include/cls.h b/kernel/include/cls.h index 58532c08..d7bad237 100644 --- a/kernel/include/cls.h +++ b/kernel/include/cls.h @@ -19,11 +19,13 @@ * CPU Local Storage (cls) */ -struct malloc_header { - unsigned int check; +struct kmalloc_header { + unsigned int front_magic; unsigned int cpu_id; - struct malloc_header *next; - unsigned long size; + struct list_head list; + int size; /* The size of this chunk without the header */ + unsigned int end_magic; + /* 32 bytes */ }; #include @@ -38,8 +40,9 @@ extern ihk_spinlock_t cpu_status_lock; struct cpu_local_var { /* malloc */ - struct malloc_header free_list; - struct malloc_header *remote_free_list; + struct list_head free_list; + struct list_head remote_free_list; + ihk_spinlock_t remote_free_list_lock; struct thread idle; struct process idle_proc; @@ -73,6 +76,7 @@ struct cpu_local_var { int in_interrupt; int no_preempt; int timer_enabled; + int kmalloc_initialized; } __attribute__((aligned(64))); diff --git a/kernel/include/kmalloc.h b/kernel/include/kmalloc.h index 6f523ec8..c91cd0cc 100644 --- a/kernel/include/kmalloc.h +++ b/kernel/include/kmalloc.h @@ -32,11 +32,10 @@ void *_kmalloc(int size, enum ihk_mc_ap_flag flag, char *file, int line); void _kfree(void *ptr, char *file, int line); void *__kmalloc(int size, enum ihk_mc_ap_flag flag); void __kfree(void *ptr); -void *___kmalloc(int size, enum ihk_mc_ap_flag flag); -void ___kfree(void *ptr); int _memcheck(void *ptr, char *msg, char *file, int line, int free); int 
memcheckall();
 int freecheck(int runcount);
+void kmalloc_consolidate_free_list(void);
 
 #endif
diff --git a/kernel/include/process.h b/kernel/include/process.h
index ab409aa4..30065606 100644
--- a/kernel/include/process.h
+++ b/kernel/include/process.h
@@ -161,7 +161,7 @@
 #endif
 
 #define USER_STACK_NR_PAGES 8192
-#define KERNEL_STACK_NR_PAGES 25
+#define KERNEL_STACK_NR_PAGES 32
 
 #define NOPHYS ((uintptr_t)-1)
 
@@ -349,6 +349,11 @@ struct sig_pending {
 
 typedef void pgio_func_t(void *arg);
 
+struct mcexec_tid {
+	int tid;
+	struct thread *thread;
+};
+
 /* Represents a node in the process fork tree, it may exist even after the
  * corresponding process exited due to references from the parent and/or
  * children and is used for implementing wait/waitpid without having a
@@ -363,6 +368,9 @@ struct process {
 	// threads and children
 	struct list_head threads_list;
 	mcs_rwlock_lock_t threads_lock; // lock for threads_list
+	/* TID set of proxy process */
+	struct mcexec_tid *tids;
+	int nr_tids;
 
 	/* The ptracing process behave as the parent of the ptraced process
 	   after using PTRACE_ATTACH except getppid. So we save it here. */
@@ -559,6 +567,9 @@ struct thread {
 	struct itimerval itimer_prof;
 	struct timespec itimer_virtual_value;
 	struct timespec itimer_prof_value;
+
+	/* Syscall offload wait queue head */
+	struct waitq scd_wq;
 };
 
 struct process_vm {
@@ -679,5 +690,7 @@ void chain_thread(struct thread *);
 void proc_init();
 void set_timer();
 struct sig_pending *hassigpending(struct thread *thread);
+void settid(struct thread *thread, int mode, int newcpuid, int oldcpuid,
+	int nr_tids, int *tids);
 
 #endif
diff --git a/kernel/include/syscall.h b/kernel/include/syscall.h
index 4aaf3244..ae4c0c7f 100644
--- a/kernel/include/syscall.h
+++ b/kernel/include/syscall.h
@@ -31,6 +31,7 @@
 #define SCD_MSG_PREPARE_PROCESS_ACKED	0x2
 #define SCD_MSG_PREPARE_PROCESS_NACKED	0x7
 #define SCD_MSG_SCHEDULE_PROCESS	0x3
+#define SCD_MSG_WAKE_UP_SYSCALL_THREAD	0x14
 
 #define SCD_MSG_INIT_CHANNEL		0x5
 #define SCD_MSG_INIT_CHANNEL_ACKED	0x6
@@ -117,28 +118,6 @@ struct user_desc {
 	unsigned int lm:1;
 };
 
-struct ikc_scd_packet {
-	int msg;
-	int err;
-	union {
-		/* for traditional SCD_MSG_* */
-		struct {
-			int ref;
-			int osnum;
-			int pid;
-			int padding;
-			unsigned long arg;
-		};
-
-		/* for SCD_MSG_SYSFS_* */
-		struct {
-			long sysfs_arg1;
-			long sysfs_arg2;
-			long sysfs_arg3;
-		};
-	};
-};
-
 struct program_image_section {
 	unsigned long vaddr;
 	unsigned long len;
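The following hunk adds requester/target TIDs to struct syscall_request and moves struct ikc_scd_packet here so it can carry the request inline together with the physical address of the response. A hypothetical sketch of the routing rule the ttid comment describes, where ttid == 0 lets any idle worker pick the request up while a non-zero ttid (for example a remote page fault reply) pins it to one specific thread (simplified stand-in types, not the actual structures):

	#include <stdio.h>

	struct request { int rtid; int ttid; };
	struct worker { int remote_tid; int busy; };

	/* ttid == 0: any idle thread from the pool may serve the request;
	 * non-zero ttid: only the designated thread may serve it */
	static int pick_worker(struct worker *pool, int n, struct request *req)
	{
		int i;

		for (i = 0; i < n; i++) {
			if (req->ttid == 0 ? !pool[i].busy :
					pool[i].remote_tid == req->ttid)
				return i;
		}
		return -1;
	}

	int main(void)
	{
		struct worker pool[2] = { { 101, 1 }, { 102, 0 } };
		struct request any = { 55, 0 }, directed = { 55, 101 };

		printf("any -> %d, directed -> %d\n",
		       pick_worker(pool, 2, &any),
		       pick_worker(pool, 2, &directed));
		return 0;
	}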
Remote page fault response needs to designate the + * thread that must serve the request, 0 indicates any thread from the pool + */ + int ttid; unsigned long valid; unsigned long number; unsigned long args[6]; }; +struct ikc_scd_packet { + int msg; + int err; + union { + /* for traditional SCD_MSG_* */ + struct { + int ref; + int osnum; + int pid; + unsigned long arg; + struct syscall_request req; + unsigned long resp_pa; + }; + + /* for SCD_MSG_SYSFS_* */ + struct { + long sysfs_arg1; + long sysfs_arg2; + long sysfs_arg3; + }; + + /* SCD_MSG_SCHEDULE_THREAD */ + struct { + int ttid; + }; + }; + char padding[12]; +}; + +#define IHK_SCD_REQ_THREAD_SPINNING 0 +#define IHK_SCD_REQ_THREAD_TO_BE_WOKEN 1 +#define IHK_SCD_REQ_THREAD_DESCHEDULED 2 + struct syscall_response { + /* TID of the thread that requested the service */ + int ttid; + /* TID of the mcexec thread that is serving the request */ + int stid; unsigned long status; + unsigned long req_thread_status; long ret; unsigned long fault_address; unsigned long fault_reason; diff --git a/kernel/init.c b/kernel/init.c index 9f82ca01..0f73ef17 100644 --- a/kernel/init.c +++ b/kernel/init.c @@ -371,7 +371,7 @@ int main(void) } kmsg_init(mode); - kputs("MCK started.\n"); + kputs("IHK/McKernel started.\n"); arch_init(); @@ -393,7 +393,7 @@ int main(void) futex_init(); - kputs("MCK/IHK booted.\n"); + kputs("IHK/McKernel booted.\n"); #ifdef DCFA_KMOD mc_cmd_client_init(); diff --git a/kernel/mem.c b/kernel/mem.c index 2df09209..c87d43ba 100644 --- a/kernel/mem.c +++ b/kernel/mem.c @@ -156,13 +156,17 @@ void sbox_write(int offset, unsigned int value); static void query_free_mem_interrupt_handler(void *priv) { -#ifdef ATTACHED_MIC - dkprintf("query free mem handler!\n"); - int pages = ihk_pagealloc_query_free(pa_allocator); - dkprintf("free pages: %d\n", pages); + kprintf("McKernel free pages: %d\n", pages); + if (find_command_line("memdebug")) { + extern void kmalloc_memcheck(void); + + kmalloc_memcheck(); + } + +#ifdef ATTACHED_MIC sbox_write(SBOX_SCRATCH0, pages); sbox_write(SBOX_SCRATCH1, 1); #endif @@ -265,6 +269,13 @@ void remote_flush_tlb_cpumask(struct process_vm *vm, unsigned long tsc; tsc = rdtsc() + 12884901888; /* 1.2GHz =>10 sec */ #endif + if (flush_entry->addr) { + flush_tlb_single(flush_entry->addr & PAGE_MASK); + } + /* Zero address denotes full TLB flush */ + else { + flush_tlb(); + } /* Wait for all cores */ while (ihk_atomic_read(&flush_entry->pending) != 0) { @@ -335,10 +346,9 @@ static void page_fault_handler(void *fault_addr, uint64_t reason, void *regs) // no return } - kprintf("[%d]page_fault_handler(%p,%lx,%p):" - "fault vm failed. 
%d, TID: %d\n", - ihk_mc_get_processor_id(), fault_addr, - reason, regs, error, thread->tid); + kprintf("%s fault VM failed for TID: %d, addr: 0x%lx, " + "reason: %d, error: %d\n", __FUNCTION__, + thread->tid, fault_addr, reason, error); unhandled_page_fault(thread, fault_addr, regs); preempt_enable(); memset(&info, '\0', sizeof info); @@ -425,8 +435,9 @@ static void page_allocator_init(void) ihk_mc_reserve_arch_pages(pa_start, pa_end, reserve_pages); - kprintf("Available pages: %ld pages\n", - ihk_pagealloc_count(pa_allocator)); + kprintf("Available memory: %ld bytes in %ld pages\n", + (ihk_pagealloc_count(pa_allocator) * PAGE_SIZE), + ihk_pagealloc_count(pa_allocator)); /* Notify the ihk to use my page allocator */ ihk_mc_set_page_allocator(&allocator); @@ -507,6 +518,9 @@ static void page_init(void) static char *memdebug = NULL; +static void *___kmalloc(int size, enum ihk_mc_ap_flag flag); +static void ___kfree(void *ptr); + void register_kmalloc(void) { if(memdebug){ @@ -636,60 +650,100 @@ void mem_init(void) } } -struct location { - struct location *next; - int line; - int cnt; - char file[0]; -}; +#define KMALLOC_TRACK_HASH_SHIFT (8) +#define KMALLOC_TRACK_HASH_SIZE (1 << KMALLOC_TRACK_HASH_SHIFT) +#define KMALLOC_TRACK_HASH_MASK (KMALLOC_TRACK_HASH_SIZE - 1) -struct alloc { - struct alloc *next; - struct malloc_header *p; - struct location *loc; - int size; +struct list_head kmalloc_track_hash[KMALLOC_TRACK_HASH_SIZE]; +ihk_spinlock_t kmalloc_track_hash_locks[KMALLOC_TRACK_HASH_SIZE]; + +struct list_head kmalloc_addr_hash[KMALLOC_TRACK_HASH_SIZE]; +ihk_spinlock_t kmalloc_addr_hash_locks[KMALLOC_TRACK_HASH_SIZE]; + +int kmalloc_track_initialized = 0; +int kmalloc_runcount = 0; + +struct kmalloc_track_addr_entry { + void *addr; int runcount; + struct list_head list; /* track_entry's list */ + struct kmalloc_track_entry *entry; + struct list_head hash; /* address hash */ }; -#define HASHNUM 129 +struct kmalloc_track_entry { + char *file; + int line; + int size; + ihk_atomic_t alloc_count; + struct list_head hash; + struct list_head addr_list; + ihk_spinlock_t addr_list_lock; +}; -static struct alloc *allochash[HASHNUM]; -static struct location *lochash[HASHNUM]; -static ihk_spinlock_t alloclock; -int runcount; -static unsigned char *page; -static int space; - -static void *dalloc(unsigned long size) +void kmalloc_init(void) { - void *r; - static int pos = 0; - unsigned long irqstate; + struct cpu_local_var *v = get_this_cpu_local_var(); - irqstate = ihk_mc_spinlock_lock(&alloclock); - size = (size + 7) & 0xfffffffffffffff8L; - if (pos + size > space) { - page = allocate_pages(1, IHK_MC_AP_NOWAIT); - space = 4096; - pos = 0; + register_kmalloc(); + + INIT_LIST_HEAD(&v->free_list); + INIT_LIST_HEAD(&v->remote_free_list); + ihk_mc_spinlock_init(&v->remote_free_list_lock); + + v->kmalloc_initialized = 1; + + if (!kmalloc_track_initialized) { + int i; + + memdebug = find_command_line("memdebug"); + + kmalloc_track_initialized = 1; + for (i = 0; i < KMALLOC_TRACK_HASH_SIZE; ++i) { + ihk_mc_spinlock_init(&kmalloc_track_hash_locks[i]); + INIT_LIST_HEAD(&kmalloc_track_hash[i]); + ihk_mc_spinlock_init(&kmalloc_addr_hash_locks[i]); + INIT_LIST_HEAD(&kmalloc_addr_hash[i]); + } } - r = page + pos; - pos += size; - ihk_mc_spinlock_unlock(&alloclock, irqstate); - - return r; } +/* NOTE: Hash lock must be held */ +struct kmalloc_track_entry *__kmalloc_track_find_entry( + int size, char *file, int line) +{ + struct kmalloc_track_entry *entry_iter, *entry = NULL; + int hash = (strlen(file) + line + 
size) & KMALLOC_TRACK_HASH_MASK;
+
+	list_for_each_entry(entry_iter, &kmalloc_track_hash[hash], hash) {
+		if (!strcmp(entry_iter->file, file) &&
+				entry_iter->size == size &&
+				entry_iter->line == line) {
+			entry = entry_iter;
+			break;
+		}
+	}
+
+	if (entry) {
+		dkprintf("%s found entry %s:%d size: %d\n", __FUNCTION__,
+			file, line, size);
+	}
+	else {
+		dkprintf("%s couldn't find entry %s:%d size: %d\n", __FUNCTION__,
+			file, line, size);
+	}
+
+	return entry;
+}
+
+/* Top level routines called from macro */
 void *_kmalloc(int size, enum ihk_mc_ap_flag flag, char *file, int line)
 {
-	char *r = ___kmalloc(size, flag);
-	struct malloc_header *h;
-	unsigned long hash;
-	char *t;
-	struct location *lp;
-	struct alloc *ap;
-	unsigned long alcsize;
-	unsigned long chksize;
+	unsigned long irqflags;
+	struct kmalloc_track_entry *entry;
+	struct kmalloc_track_addr_entry *addr_entry;
+	int hash, addr_hash;
+	void *r = ___kmalloc(size, flag);
 
 	if (!memdebug)
 		return r;
@@ -697,177 +751,178 @@ void *_kmalloc(int size, enum ihk_mc_ap_flag flag, char *file, int line)
 	if (!r)
 		return r;
 
-	h = ((struct malloc_header *)r) - 1;
-	alcsize = h->size * sizeof(struct malloc_header);
-	chksize = alcsize - size;
-	memset(r + size, '\x5a', chksize);
+	hash = (strlen(file) + line + size) & KMALLOC_TRACK_HASH_MASK;
+	irqflags = ihk_mc_spinlock_lock(&kmalloc_track_hash_locks[hash]);
 
-	for (hash = 0, t = file; *t; t++) {
-		hash <<= 1;
-		hash += *t;
+	entry = __kmalloc_track_find_entry(size, file, line);
+
+	if (!entry) {
+		entry = ___kmalloc(sizeof(*entry), IHK_MC_AP_NOWAIT);
+		if (!entry) {
+			kprintf("%s: ERROR: allocating tracking entry\n", __FUNCTION__);
+			ihk_mc_spinlock_unlock(&kmalloc_track_hash_locks[hash], irqflags);
+			goto out;
+		}
+
+		entry->line = line;
+		entry->size = size;
+		ihk_atomic_set(&entry->alloc_count, 0);
+		ihk_mc_spinlock_init(&entry->addr_list_lock);
+		INIT_LIST_HEAD(&entry->addr_list);
+
+		entry->file = ___kmalloc(strlen(file) + 1, IHK_MC_AP_NOWAIT);
+		if (!entry->file) {
+			kprintf("%s: ERROR: allocating file string\n", __FUNCTION__);
+			___kfree(entry);
+			ihk_mc_spinlock_unlock(&kmalloc_track_hash_locks[hash], irqflags);
+			goto out;
+		}
+
+		strcpy(entry->file, file);
+		entry->file[strlen(file)] = 0;
+		list_add(&entry->hash, &kmalloc_track_hash[hash]);
+		dkprintf("%s entry %s:%d size: %d added\n", __FUNCTION__,
+			file, line, size);
 	}
-	hash += line;
-	hash %= HASHNUM;
-	for (lp = lochash[hash]; lp; lp = lp->next)
-		if (lp->line == line &&
-		    !strcmp(lp->file, file))
-			break;
-	if (!lp) {
-		lp = dalloc(sizeof(struct location) + strlen(file) + 1);
-		memset(lp, '\0', sizeof(struct location));
-		lp->line = line;
-		strcpy(lp->file, file);
-		do {
-			lp->next = lochash[hash];
-		} while (!compare_and_swap(lochash + hash, (unsigned long)lp->next, (unsigned long)lp));
+	ihk_mc_spinlock_unlock(&kmalloc_track_hash_locks[hash], irqflags);
+
+	ihk_atomic_inc(&entry->alloc_count);
+
+	/* Add new addr entry for this allocation entry */
+	addr_entry = ___kmalloc(sizeof(*addr_entry), IHK_MC_AP_NOWAIT);
+	if (!addr_entry) {
+		kprintf("%s: ERROR: allocating addr entry\n", __FUNCTION__);
+		goto out;
 	}
 
-	hash = (unsigned long)h % HASHNUM;
-	do {
-		for (ap = allochash[hash]; ap; ap = ap->next)
-			if (!ap->p)
-				break;
-	} while (ap && !compare_and_swap(&ap->p, 0UL, (unsigned long)h));
-	if (!ap) {
-		ap = dalloc(sizeof(struct alloc));
-		memset(ap, '\0', sizeof(struct alloc));
-		ap->p = h;
-		do {
-			ap->next = allochash[hash];
-		} while (!compare_and_swap(allochash + hash, (unsigned long)ap->next, (unsigned long)ap));
-	}
+	addr_entry->addr = r;
+	addr_entry->runcount = kmalloc_runcount;
+	addr_entry->entry = entry;
 
-	ap->loc = lp;
-
ap->size = size; - ap->runcount = runcount; + irqflags = ihk_mc_spinlock_lock(&entry->addr_list_lock); + list_add(&addr_entry->list, &entry->addr_list); + ihk_mc_spinlock_unlock(&entry->addr_list_lock, irqflags); - return r; -} + /* Add addr entry to address hash */ + addr_hash = ((unsigned long)r >> 5) & KMALLOC_TRACK_HASH_MASK; + irqflags = ihk_mc_spinlock_lock(&kmalloc_addr_hash_locks[addr_hash]); + list_add(&addr_entry->hash, &kmalloc_addr_hash[addr_hash]); + ihk_mc_spinlock_unlock(&kmalloc_addr_hash_locks[addr_hash], irqflags); -int _memcheck(void *ptr, char *msg, char *file, int line, int flags) -{ - struct malloc_header *h = ((struct malloc_header *)ptr) - 1; - struct malloc_header *next; - unsigned long hash = (unsigned long)h % HASHNUM; - struct alloc *ap; - static unsigned long check = 0x5a5a5a5a5a5a5a5aUL; - unsigned long alcsize; - unsigned long chksize; - - - if (h->check != 0x5a5a5a5a) { - int i; - unsigned long max = 0; - unsigned long cur = (unsigned long)h; - struct alloc *maxap = NULL; - - for (i = 0; i < HASHNUM; i++) - for (ap = allochash[i]; ap; ap = ap->next) - if ((unsigned long)ap->p < cur && - (unsigned long)ap->p > max) { - max = (unsigned long)ap->p; - maxap = ap; - } - - kprintf("%s: detect buffer overrun, alc=%s:%d size=%ld h=%p, s=%ld\n", msg, maxap->loc->file, maxap->loc->line, maxap->size, maxap->p, maxap->p->size); - kprintf("broken header: h=%p next=%p size=%ld cpu_id=%d\n", h, h->next, h->size, h->cpu_id); - } - - for (ap = allochash[hash]; ap; ap = ap->next) - if (ap->p == h) - break; - if (!ap) { - if(file) - kprintf("%s: address not found, %s:%d p=%p\n", msg, file, line, ptr); - else - kprintf("%s: address not found p=%p\n", msg, ptr); - return 1; - } - - alcsize = h->size * sizeof(struct malloc_header); - chksize = alcsize - ap->size; - if (chksize > 8) - chksize = 8; - next = (struct malloc_header *)((char *)ptr + alcsize); - - if (next->check != 0x5a5a5a5a || - memcmp((char *)ptr + ap->size, &check, chksize)) { - unsigned long buf = 0x5a5a5a5a5a5a5a5aUL; - unsigned char *p; - unsigned char *q; - memcpy(&buf, (char *)ptr + ap->size, chksize); - p = (unsigned char *)&(next->check); - q = (unsigned char *)&buf; - - if (file) - kprintf("%s: broken, %s:%d alc=%s:%d %02x%02x%02x%02x%02x%02x%02x%02x %02x%02x%02x%02x size=%ld\n", msg, file, line, ap->loc->file, ap->loc->line, q[0], q[1], q[2], q[3], q[4], q[5], q[6], q[7], p[0], p[1], p[2], p[3], ap->size); - else - kprintf("%s: broken, alc=%s:%d %02x%02x%02x%02x%02x%02x%02x%02x %02x%02x%02x%02x size=%ld\n", msg, ap->loc->file, ap->loc->line, q[0], q[1], q[2], q[3], q[4], q[5], q[6], q[7], p[0], p[1], p[2], p[3], ap->size); - - - if (next->check != 0x5a5a5a5a) - kprintf("next->HEADER: next=%p size=%ld cpu_id=%d\n", next->next, next->size, next->cpu_id); - - return 1; - } - - if(flags & 1){ - ap->p = NULL; - ap->loc = NULL; - ap->size = 0; - } - return 0; -} - -int memcheckall() -{ - int i; - struct alloc *ap; - int r = 0; - - for(i = 0; i < HASHNUM; i++) - for(ap = allochash[i]; ap; ap = ap->next) - if(ap->p) - r |= _memcheck(ap->p + 1, "memcheck", NULL, 0, 2); - return r; -} - -int freecheck(int runcount) -{ - int i; - struct alloc *ap; - struct location *lp; - int r = 0; - - for (i = 0; i < HASHNUM; i++) - for (lp = lochash[i]; lp; lp = lp->next) - lp->cnt = 0; - - for (i = 0; i < HASHNUM; i++) - for (ap = allochash[i]; ap; ap = ap->next) - if (ap->p && ap->runcount == runcount) { - ap->loc->cnt++; - r++; - } - - if (r) { - kprintf("memory leak?\n"); - for (i = 0; i < HASHNUM; i++) - for (lp = lochash[i]; 
lp; lp = lp->next) - if (lp->cnt) - kprintf(" alc=%s:%d cnt=%d\n", lp->file, lp->line, lp->cnt); - } + dkprintf("%s addr_entry %p added\n", __FUNCTION__, r); +out: return r; } void _kfree(void *ptr, char *file, int line) { - if (memdebug) - _memcheck(ptr, "KFREE", file, line, 1); + unsigned long irqflags; + struct kmalloc_track_entry *entry; + struct kmalloc_track_addr_entry *addr_entry_iter, *addr_entry = NULL; + int hash; + + if (!memdebug) { + goto out; + } + + hash = ((unsigned long)ptr >> 5) & KMALLOC_TRACK_HASH_MASK; + irqflags = ihk_mc_spinlock_lock(&kmalloc_addr_hash_locks[hash]); + list_for_each_entry(addr_entry_iter, + &kmalloc_addr_hash[hash], hash) { + if (addr_entry_iter->addr == ptr) { + addr_entry = addr_entry_iter; + break; + } + } + + if (addr_entry) { + list_del(&addr_entry->hash); + } + ihk_mc_spinlock_unlock(&kmalloc_addr_hash_locks[hash], irqflags); + + if (!addr_entry) { + kprintf("%s: ERROR: kfree()ing invalid pointer\n", __FUNCTION__); + panic("panic"); + } + + entry = addr_entry->entry; + + irqflags = ihk_mc_spinlock_lock(&entry->addr_list_lock); + list_del(&addr_entry->list); + ihk_mc_spinlock_unlock(&entry->addr_list_lock, irqflags); + + dkprintf("%s addr_entry %p removed\n", __FUNCTION__, addr_entry->addr); + ___kfree(addr_entry); + + /* Do we need to remove tracking entry as well? */ + if (!ihk_atomic_dec_and_test(&entry->alloc_count)) { + goto out; + } + + hash = (strlen(entry->file) + entry->line + entry->size) & + KMALLOC_TRACK_HASH_MASK; + irqflags = ihk_mc_spinlock_lock(&kmalloc_track_hash_locks[hash]); + list_del(&entry->hash); + ihk_mc_spinlock_unlock(&kmalloc_track_hash_locks[hash], irqflags); + + dkprintf("%s entry %s:%d size: %d removed\n", __FUNCTION__, + entry->file, entry->line, entry->size); + ___kfree(entry->file); + ___kfree(entry); + +out: ___kfree(ptr); } +void kmalloc_memcheck(void) +{ + int i; + unsigned long irqflags; + struct kmalloc_track_entry *entry = NULL; + + for (i = 0; i < KMALLOC_TRACK_HASH_SIZE; ++i) { + irqflags = ihk_mc_spinlock_lock(&kmalloc_track_hash_locks[i]); + list_for_each_entry(entry, &kmalloc_track_hash[i], hash) { + struct kmalloc_track_addr_entry *addr_entry = NULL; + int cnt = 0; + + ihk_mc_spinlock_lock_noirq(&entry->addr_list_lock); + list_for_each_entry(addr_entry, &entry->addr_list, list) { + + dkprintf("%s memory leak: %p @ %s:%d size: %d runcount: %d\n", + __FUNCTION__, + addr_entry->addr, + entry->file, + entry->line, + entry->size, + addr_entry->runcount); + + if (kmalloc_runcount != addr_entry->runcount) + continue; + + cnt++; + } + ihk_mc_spinlock_unlock_noirq(&entry->addr_list_lock); + + if (!cnt) + continue; + + kprintf("%s memory leak: %s:%d size: %d cnt: %d, runcount: %d\n", + __FUNCTION__, + entry->file, + entry->line, + entry->size, + cnt, + kmalloc_runcount); + } + ihk_mc_spinlock_unlock(&kmalloc_track_hash_locks[i], irqflags); + } + + ++kmalloc_runcount; +} + +/* Redirection routines registered in alloc structure */ void *__kmalloc(int size, enum ihk_mc_ap_flag flag) { return kmalloc(size, flag); @@ -878,160 +932,199 @@ void __kfree(void *ptr) kfree(ptr); } -void kmalloc_init(void) + +static void ___kmalloc_insert_chunk(struct list_head *free_list, + struct kmalloc_header *chunk) { - struct cpu_local_var *v = get_this_cpu_local_var(); - struct malloc_header *h = &v->free_list; - int i; + struct kmalloc_header *chunk_iter, *next_chunk = NULL; - h->check = 0x5a5a5a5a; - h->next = &v->free_list; - h->size = 0; - - register_kmalloc(); - - memdebug = find_command_line("memdebug"); - for (i = 0; i < 
HASHNUM; i++) {
-		allochash[i] = NULL;
-		lochash[i] = NULL;
-	}
-	page = allocate_pages(16, IHK_MC_AP_NOWAIT);
-	space = 16 * 4096;
-	ihk_mc_spinlock_init(&alloclock);
-}
-
-void ____kfree(struct cpu_local_var *v, struct malloc_header *p)
-{
-	struct malloc_header *h = &v->free_list;
-	int combined = 0;
-
-	h = h->next;
-
-	while ((p < h || p > h->next) && h != &v->free_list) {
-		h = h->next;
-	}
-
-	if (h + h->size + 1 == p && h->size != 0) {
-		combined = 1;
-		h->size += p->size + 1;
-		h->check = 0x5a5a5a5a;
-	}
-	if (h->next == p + p->size + 1 && h->next->size != 0) {
-		if (combined) {
-			h->check = 0x5a5a5a5a;
-			h->size += h->next->size + 1;
-			h->next = h->next->next;
-		} else {
-			p->check = 0x5a5a5a5a;
-			p->size += h->next->size + 1;
-			p->next = h->next->next;
-			h->next = p;
+	/* Find out where to insert */
+	list_for_each_entry(chunk_iter, free_list, list) {
+		if ((void *)chunk < (void *)chunk_iter) {
+			next_chunk = chunk_iter;
+			break;
 		}
-	} else if (!combined) {
-		p->next = h->next;
-		h->next = p;
 	}
+
+	/* Add in front of next */
+	if (next_chunk) {
+		list_add_tail(&chunk->list, &next_chunk->list);
+	}
+	/* Add after the head */
+	else {
+		list_add(&chunk->list, free_list);
+	}
+
+	return;
 }
 
-void *___kmalloc(int size, enum ihk_mc_ap_flag flag)
+static void ___kmalloc_init_chunk(struct kmalloc_header *h, int size)
 {
-	struct cpu_local_var *v = get_this_cpu_local_var();
-	struct malloc_header *h = &v->free_list, *prev, *p;
-	int u, req_page;
+	h->size = size;
+	h->front_magic = 0x5c5c5c5c;
+	h->end_magic = 0x6d6d6d6d;
+	h->cpu_id = ihk_mc_get_processor_id();
+}
 
-	p = (struct malloc_header *)xchg8((unsigned long *)&v->remote_free_list, 0L);
-	while(p){
-		struct malloc_header *n = p->next;
-		____kfree(v, p);
-		p = n;
+static void ___kmalloc_consolidate_list(struct list_head *list)
+{
+	struct kmalloc_header *chunk_iter, *chunk, *next_chunk;
+
+reiterate:
+	chunk_iter = NULL;
+	chunk = NULL;
+
+	list_for_each_entry(next_chunk, list, list) {
+
+		if (chunk_iter && (((void *)chunk_iter + sizeof(struct kmalloc_header) +
+				chunk_iter->size) == (void *)next_chunk)) {
+			chunk = chunk_iter;
+			break;
+		}
+
+		chunk_iter = next_chunk;
 	}
 
-	if (size >= PAGE_SIZE * 4) {
+	if (!chunk) {
+		return;
+	}
+
+	chunk->size += (next_chunk->size + sizeof(struct kmalloc_header));
+	list_del(&next_chunk->list);
+	goto reiterate;
+}
+
+
+void kmalloc_consolidate_free_list(void)
+{
+	struct kmalloc_header *chunk, *tmp;
+	unsigned long irqflags =
+		ihk_mc_spinlock_lock(&cpu_local_var(remote_free_list_lock));
+
+	/* Clean up remotely deallocated chunks */
+	list_for_each_entry_safe(chunk, tmp,
+			&cpu_local_var(remote_free_list), list) {
+
+		list_del(&chunk->list);
+		___kmalloc_insert_chunk(&cpu_local_var(free_list), chunk);
+	}
+
+	/* Free list lock ensures IRQs are disabled */
+	___kmalloc_consolidate_list(&cpu_local_var(free_list));
+
+	ihk_mc_spinlock_unlock(&cpu_local_var(remote_free_list_lock), irqflags);
+}
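kmalloc_consolidate_free_list() above drains chunks freed remotely by other CPUs and then merges physically adjacent neighbors on the address-ordered per-CPU free list. A compact user-space model of the merge step (the struct layout is simplified and is not the kernel's kmalloc_header):

	#include <stdio.h>
	#include <stdlib.h>

	struct chunk {
		size_t size;		/* payload bytes, excluding the header */
		struct chunk *next;	/* list kept sorted by address */
	};

	/* Fuse neighbors when chunk + header + payload ends exactly where
	 * the next chunk begins, as ___kmalloc_consolidate_list() does */
	static void consolidate(struct chunk *head)
	{
		struct chunk *c = head;

		while (c && c->next) {
			if ((char *)c + sizeof(*c) + c->size == (char *)c->next) {
				c->size += sizeof(*c) + c->next->size;
				c->next = c->next->next;
				continue;	/* re-check: more merges possible */
			}
			c = c->next;
		}
	}

	int main(void)
	{
		char *buf = malloc(4096);
		struct chunk *a = (struct chunk *)buf;
		struct chunk *b = (struct chunk *)(buf + sizeof(*a) + 64);

		a->size = 64; a->next = b;	/* a ends exactly where b begins */
		b->size = 128; b->next = NULL;
		consolidate(a);
		printf("merged size: %zu\n", a->size);	/* 64 + header + 128 */
		free(buf);
		return 0;
	}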
+#define KMALLOC_MIN_SHIFT (5)
+#define KMALLOC_MIN_SIZE (1 << KMALLOC_MIN_SHIFT)
+#define KMALLOC_MIN_MASK (KMALLOC_MIN_SIZE - 1)
+
+/* Actual low-level allocation routines */
+static void *___kmalloc(int size, enum ihk_mc_ap_flag flag)
+{
+	struct kmalloc_header *chunk_iter;
+	struct kmalloc_header *chunk = NULL;
+	int npages;
+	unsigned long kmalloc_irq_flags = cpu_disable_interrupt_save();
+
+	/* KMALLOC_MIN_SIZE bytes aligned size. */
+	if (size & KMALLOC_MIN_MASK) {
+		size = ((size + KMALLOC_MIN_SIZE - 1) & ~(KMALLOC_MIN_MASK));
+	}
+
+	chunk = NULL;
+	/* Find a chunk that is big enough */
+	list_for_each_entry(chunk_iter, &cpu_local_var(free_list), list) {
+		if (chunk_iter->size >= size) {
+			chunk = chunk_iter;
+			break;
+		}
+	}
+
+split_and_return:
+	/* Did we find one? */
+	if (chunk) {
+		/* Do we need to split it? Only if there is enough space for
+		 * another header and some actual content */
+		if (chunk->size > (size + sizeof(struct kmalloc_header))) {
+			struct kmalloc_header *leftover;
+
+			leftover = (struct kmalloc_header *)
+				((void *)chunk + sizeof(struct kmalloc_header) + size);
+			___kmalloc_init_chunk(leftover,
+				(chunk->size - size - sizeof(struct kmalloc_header)));
+			list_add(&leftover->list, &chunk->list);
+			chunk->size = size;
+		}
+
+		list_del(&chunk->list);
+		cpu_restore_interrupt(kmalloc_irq_flags);
+		return ((void *)chunk + sizeof(struct kmalloc_header));
+	}
+
+
+	/* Allocate new memory and add it to free list */
+	npages = (size + sizeof(struct kmalloc_header) + (PAGE_SIZE - 1))
+		>> PAGE_SHIFT;
+	chunk = ihk_mc_alloc_pages(npages, flag);
+
+	if (!chunk) {
+		cpu_restore_interrupt(kmalloc_irq_flags);
 		return NULL;
 	}
 
-	u = (size + sizeof(*h) - 1) / sizeof(*h);
+	___kmalloc_init_chunk(chunk,
+		(npages * PAGE_SIZE - sizeof(struct kmalloc_header)));
+	___kmalloc_insert_chunk(&cpu_local_var(free_list), chunk);
 
-	prev = h;
-	h = h->next;
-
-	while (1) {
-		if (h == &v->free_list) {
-			req_page = ((u + 2) * sizeof(*h) + PAGE_SIZE - 1)
-				>> PAGE_SHIFT;
-
-			h = allocate_pages(req_page, flag);
-			if(h == NULL) {
-				kprintf("kmalloc(%#x,%#x): out of memory\n", size, flag);
-				return NULL;
-			}
-			h->check = 0x5a5a5a5a;
-			prev->next = h;
-			h->size = (req_page * PAGE_SIZE) / sizeof(*h) - 2;
-			/* Guard entry */
-			p = h + h->size + 1;
-			p->check = 0x5a5a5a5a;
-			p->next = &v->free_list;
-			p->size = 0;
-			h->next = p;
-		}
-
-		if (h->size >= u) {
-			if (h->size == u || h->size == u + 1) {
-				prev->next = h->next;
-				h->cpu_id = ihk_mc_get_processor_id();
-
-				return h + 1;
-			} else { /* Divide */
-				h->size -= u + 1;
-
-				p = h + h->size + 1;
-				p->check = 0x5a5a5a5a;
-				p->size = u;
-				p->cpu_id = ihk_mc_get_processor_id();
-
-				return p + 1;
-			}
-		}
-		prev = h;
-		h = h->next;
-	}
+	goto split_and_return;
 }
 
-void ___kfree(void *ptr)
+static void ___kfree(void *ptr)
 {
-	struct malloc_header *p = (struct malloc_header *)ptr;
-	struct cpu_local_var *v = get_cpu_local_var((--p)->cpu_id);
+	struct kmalloc_header *chunk =
+		(struct kmalloc_header*)(ptr - sizeof(struct kmalloc_header));
+	unsigned long kmalloc_irq_flags = cpu_disable_interrupt_save();
 
-	if(p->cpu_id == ihk_mc_get_processor_id()){
-		____kfree(v, p);
+	/* Sanity check */
+	if (chunk->front_magic != 0x5c5c5c5c || chunk->end_magic != 0x6d6d6d6d) {
+		kprintf("%s: memory corruption at address 0x%p\n", __FUNCTION__, ptr);
+		panic("panic");
 	}
+
+	/* Does this chunk belong to this CPU? 
*/ + if (chunk->cpu_id == ihk_mc_get_processor_id()) { + + ___kmalloc_insert_chunk(&cpu_local_var(free_list), chunk); + ___kmalloc_consolidate_list(&cpu_local_var(free_list)); } + else { + struct cpu_local_var *v = get_cpu_local_var(chunk->cpu_id); + unsigned long irqflags; + + irqflags = ihk_mc_spinlock_lock(&v->remote_free_list_lock); + list_add(&chunk->list, &v->remote_free_list); + ihk_mc_spinlock_unlock(&v->remote_free_list_lock, irqflags); + } + + cpu_restore_interrupt(kmalloc_irq_flags); } -void print_free_list(void) + +void ___kmalloc_print_free_list(struct list_head *list) { - struct cpu_local_var *v = get_this_cpu_local_var(); - struct malloc_header *h = &v->free_list; + struct kmalloc_header *chunk_iter; + unsigned long irqflags = kprintf_lock(); - h = h->next; - - kprintf("free_list : \n"); - while (h != &v->free_list) { - kprintf(" %p : %p, %d ->\n", h, h->next, h->size); - h = h->next; + __kprintf("%s: [ \n", __FUNCTION__); + list_for_each_entry(chunk_iter, &cpu_local_var(free_list), list) { + __kprintf("%s: 0x%lx:%d (VA PFN: %lu, off: %lu)\n", __FUNCTION__, + (unsigned long)chunk_iter, + chunk_iter->size, + (unsigned long)chunk_iter >> PAGE_SHIFT, + (unsigned long)chunk_iter % PAGE_SIZE); } - kprintf("\n"); + __kprintf("%s: ] \n", __FUNCTION__); + kprintf_unlock(irqflags); } + diff --git a/kernel/process.c b/kernel/process.c index a6f730f3..c8aad00b 100644 --- a/kernel/process.c +++ b/kernel/process.c @@ -53,7 +53,6 @@ static int copy_user_ranges(struct process_vm *vm, struct process_vm *orgvm); extern void release_fp_regs(struct thread *proc); extern void save_fp_regs(struct thread *proc); extern void restore_fp_regs(struct thread *proc); -void settid(struct thread *proc, int mode, int newcpuid, int oldcpuid); extern void __runq_add_proc(struct thread *proc, int cpu_id); extern void terminate_host(int pid); extern void lapic_timer_enable(unsigned int clocks); @@ -745,7 +744,7 @@ int join_process_memory_range(struct process_vm *vm, memobj_release(merging->memobj); } list_del(&merging->list); - ihk_mc_free(merging); + kfree(merging); error = 0; out: @@ -841,8 +840,9 @@ int free_process_memory_range(struct process_vm *vm, struct vm_range *range) if (range->memobj) { memobj_release(range->memobj); } + list_del(&range->list); - ihk_mc_free(range); + kfree(range); dkprintf("free_process_memory_range(%p,%lx-%lx): 0\n", vm, start0, end0); @@ -968,7 +968,6 @@ enum ihk_mc_pt_attribute common_vrflag_to_ptattr(unsigned long flag, uint64_t fa return attr; } -/* XXX: インデントを揃える必要がある */ int add_process_memory_range(struct process_vm *vm, unsigned long start, unsigned long end, unsigned long phys, unsigned long flag, @@ -1539,6 +1538,8 @@ retry: kprintf("page_fault_process_memory_range(%p,%lx-%lx %lx,%lx,%lx):cannot allocate new page. %d\n", vm, range->start, range->end, range->flag, fault_addr, reason, error); goto out; } + dkprintf("%s: clearing 0x%lx:%lu\n", + __FUNCTION__, pgaddr, pgsize); memset(virt, 0, pgsize); phys = virt_to_phys(virt); page_map(phys_to_page(phys)); @@ -1571,6 +1572,8 @@ retry: kprintf("page_fault_process_memory_range(%p,%lx-%lx %lx,%lx,%lx):cannot allocate copy page. %d\n", vm, range->start, range->end, range->flag, fault_addr, reason, error); goto out; } + dkprintf("%s: copying 0x%lx:%lu\n", + __FUNCTION__, pgaddr, pgsize); memcpy(virt, phys_to_virt(phys), pgsize); phys = virt_to_phys(virt); @@ -1651,7 +1654,7 @@ static int do_page_fault_process_vm(struct process_vm *vm, void *fault_addr0, ui "access denied. 
%d\n", ihk_mc_get_processor_id(), vm, fault_addr0, reason, error); - kprintf("%s: reason: %s%s%s%s%s%s%s%s\n", __FUNCTION__, + kprintf("%s: reason: %s%s%s%s%s%s%s\n", __FUNCTION__, (reason & PF_PROT) ? "PF_PROT " : "", (reason & PF_WRITE) ? "PF_WRITE " : "", (reason & PF_USER) ? "PF_USER " : "", @@ -1890,14 +1893,14 @@ unsigned long extend_process_region(struct process_vm *vm, aligned_end = (aligned_end + (LARGE_PAGE_SIZE - 1)) & LARGE_PAGE_MASK; /* Fill in the gap between old_aligned_end and aligned_end * with regular pages */ - if((p = allocate_pages((aligned_end - old_aligned_end) >> PAGE_SHIFT, + if((p = ihk_mc_alloc_pages((aligned_end - old_aligned_end) >> PAGE_SHIFT, IHK_MC_AP_NOWAIT)) == NULL){ return end; } if((rc = add_process_memory_range(vm, old_aligned_end, aligned_end, virt_to_phys(p), flag, LARGE_PAGE_SHIFT)) != 0){ - free_pages(p, (aligned_end - old_aligned_end) >> PAGE_SHIFT); + ihk_mc_free_pages(p, (aligned_end - old_aligned_end) >> PAGE_SHIFT); return end; } @@ -1910,7 +1913,7 @@ unsigned long extend_process_region(struct process_vm *vm, (LARGE_PAGE_SIZE - 1)) & LARGE_PAGE_MASK; address = aligned_new_end; - if((p = allocate_pages((aligned_new_end - aligned_end + LARGE_PAGE_SIZE) >> PAGE_SHIFT, + if((p = ihk_mc_alloc_pages((aligned_new_end - aligned_end + LARGE_PAGE_SIZE) >> PAGE_SHIFT, IHK_MC_AP_NOWAIT)) == NULL){ return end; } @@ -1918,16 +1921,16 @@ unsigned long extend_process_region(struct process_vm *vm, p_aligned = ((unsigned long)p + (LARGE_PAGE_SIZE - 1)) & LARGE_PAGE_MASK; if (p_aligned > (unsigned long)p) { - free_pages(p, (p_aligned - (unsigned long)p) >> PAGE_SHIFT); + ihk_mc_free_pages(p, (p_aligned - (unsigned long)p) >> PAGE_SHIFT); } - free_pages( + ihk_mc_free_pages( (void *)(p_aligned + aligned_new_end - aligned_end), (LARGE_PAGE_SIZE - (p_aligned - (unsigned long)p)) >> PAGE_SHIFT); if((rc = add_process_memory_range(vm, aligned_end, aligned_new_end, virt_to_phys((void *)p_aligned), flag, LARGE_PAGE_SHIFT)) != 0){ - free_pages(p, (aligned_new_end - aligned_end + LARGE_PAGE_SIZE) >> PAGE_SHIFT); + ihk_mc_free_pages(p, (aligned_new_end - aligned_end + LARGE_PAGE_SIZE) >> PAGE_SHIFT); return end; } @@ -1945,7 +1948,7 @@ unsigned long extend_process_region(struct process_vm *vm, p=0; }else{ - p = allocate_pages((aligned_new_end - aligned_end) >> PAGE_SHIFT, IHK_MC_AP_NOWAIT); + p = ihk_mc_alloc_pages((aligned_new_end - aligned_end) >> PAGE_SHIFT, IHK_MC_AP_NOWAIT); if (!p) { return end; @@ -1954,7 +1957,7 @@ unsigned long extend_process_region(struct process_vm *vm, if((rc = add_process_memory_range(vm, aligned_end, aligned_new_end, (p==0?0:virt_to_phys(p)), flag, NULL, 0, PAGE_SHIFT)) != 0){ - free_pages(p, (aligned_new_end - aligned_end) >> PAGE_SHIFT); + ihk_mc_free_pages(p, (aligned_new_end - aligned_end) >> PAGE_SHIFT); return end; } @@ -2067,6 +2070,7 @@ release_process(struct process *proc) mcs_rwlock_writer_unlock(&parent->children_lock, &lock); } + if (proc->tids) kfree(proc->tids); kfree(proc); } @@ -2172,6 +2176,23 @@ release_sigcommon(struct sig_common *sigcommon) kfree(sigcommon); } +/* + * Release the TID from the process' TID set corresponding to this thread. + * NOTE: threads_lock must be held. 
+ */
+void __release_tid(struct process *proc, struct thread *thread) {
+	int i;
+
+	for (i = 0; i < proc->nr_tids; ++i) {
+		if (proc->tids[i].thread != thread) continue;
+
+		proc->tids[i].thread = NULL;
+		dkprintf("%s: tid %d has been released by %p\n",
+			__FUNCTION__, thread->tid, thread);
+		break;
+	}
+}
+
 void destroy_thread(struct thread *thread)
 {
 	struct sig_pending *pending;
@@ -2188,6 +2209,7 @@ void destroy_thread(struct thread *thread)
 
 	mcs_rwlock_writer_lock(&proc->threads_lock, &lock);
 	list_del(&thread->siblings_list);
+	__release_tid(proc, thread);
 	mcs_rwlock_writer_unlock(&proc->threads_lock, &lock);
 
 	cpu_clear(thread->cpu_id, &thread->vm->address_space->cpu_set,
@@ -2325,6 +2347,8 @@ static void idle(void)
 		}
 
 		if (v->status == CPU_STATUS_IDLE || v->status == CPU_STATUS_RESERVED) {
+			/* No work to do? Consolidate the kmalloc free list */
+			kmalloc_consolidate_free_list();
 			cpu_safe_halt();
 		}
 		else {
@@ -2527,7 +2551,7 @@ static void do_migrate(void)
 		v->flags |= CPU_FLAG_NEED_RESCHED;
 		ihk_mc_interrupt_cpu(get_x86_cpu_local_variable(cpu_id)->apic_id, 0xd1);
 		double_rq_unlock(cur_v, v, irqstate);
-		settid(req->thread, 2, cpu_id, old_cpu_id);
+		//settid(req->thread, 2, cpu_id, old_cpu_id, 0, NULL);
ack:
 		waitq_wakeup(&req->wq);
@@ -2563,13 +2587,8 @@ void schedule(void)
 	struct thread *last;
 
 	if (cpu_local_var(no_preempt)) {
-		dkprintf("no schedule() while no preemption! \n");
-		return;
-	}
-
-	if (cpu_local_var(current)
-			&& cpu_local_var(current)->in_syscall_offload) {
-		dkprintf("no schedule() while syscall offload!\n");
+		kprintf("%s: WARNING: can't schedule() while no preemption, cnt: %d\n",
+			__FUNCTION__, cpu_local_var(no_preempt));
 		return;
 	}
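The kernel/syscall.c changes below replace the per-CPU response page with a response structure on the requester's stack, whose physical address travels in the packet, and let the waiting thread deschedule itself after spinning: a compare-and-swap on req_thread_status decides whether the Linux-side reply still catches the thread spinning or must send SCD_MSG_WAKE_UP_SYSCALL_THREAD. A stand-alone sketch of that lost-wakeup-free handshake, using pthreads in place of McKernel's waitq (names and the spin budget are illustrative):

	#include <pthread.h>
	#include <stdio.h>

	#define REQ_SPINNING	0
	#define REQ_TO_BE_WOKEN	1
	#define REQ_DESCHEDULED	2

	static int req_state = REQ_SPINNING;
	static int status;		/* 0: in progress, 1: completed */
	static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
	static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;

	static void *replier(void *arg)
	{
		pthread_mutex_lock(&mtx);
		status = 1;
		/* If the waiter already descheduled itself, wake it up
		 * explicitly; otherwise it is still spinning and will
		 * notice status on its own */
		if (!__sync_bool_compare_and_swap(&req_state, REQ_SPINNING,
						  REQ_TO_BE_WOKEN))
			pthread_cond_signal(&cv);
		pthread_mutex_unlock(&mtx);
		return NULL;
	}

	int main(void)
	{
		pthread_t th;
		int spins = 1000;

		pthread_create(&th, NULL, replier, NULL);
		while (!__sync_fetch_and_add(&status, 0) && spins--)
			;		/* spin first, like do_syscall() */
		pthread_mutex_lock(&mtx);
		if (__sync_bool_compare_and_swap(&req_state, REQ_SPINNING,
						 REQ_DESCHEDULED))
			while (!status)	/* went to sleep: wait for wakeup */
				pthread_cond_wait(&cv, &mtx);
		pthread_mutex_unlock(&mtx);
		printf("reply received, status: %d\n", status);
		pthread_join(th, NULL);
		return 0;
	}

The CAS makes the "went to sleep" transition visible to the replier, so the wakeup is sent only when it is actually needed and can never be lost.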
diff --git a/kernel/syscall.c b/kernel/syscall.c
index 9ee92dec..46c09a6b 100644
--- a/kernel/syscall.c
+++ b/kernel/syscall.c
@@ -127,11 +127,9 @@ int prepare_process_ranges_args_envs(struct thread *thread,
 static void do_mod_exit(int status);
 #endif
 
-static void send_syscall(struct syscall_request *req, int cpu, int pid)
+static void send_syscall(struct syscall_request *req, int cpu, int pid, struct syscall_response *res)
 {
-	struct ikc_scd_packet packet;
-	struct syscall_response *res;
-	struct syscall_params *scp;
+	struct ikc_scd_packet packet IHK_DMA_ALIGN;
 	struct ihk_ikc_channel_desc *syscall_channel;
 	int ret;
 
@@ -140,7 +138,6 @@ static void send_syscall(struct syscall_request *req, int cpu, int pid)
 		req->number == __NR_kill){ // interrupt syscall
 		extern int num_processors;
 
-		scp = &get_cpu_local_var(0)->scp2;
 		syscall_channel = get_cpu_local_var(0)->syscall_channel2;
 
 		/* XXX: is this really going to work if multiple processes
 		pid = req->args[1];
 	}
 	else{
-		scp = &get_cpu_local_var(cpu)->scp;
 		syscall_channel = get_cpu_local_var(cpu)->syscall_channel;
 	}
 
-	res = scp->response_va;
 	res->status = 0;
 	req->valid = 0;
 
-#ifdef USE_DMA
-	memcpy_async(scp->request_pa,
-		virt_to_phys(req), sizeof(*req), 0, &fin);
-
-	memcpy_async_wait(&scp->post_fin);
-	scp->post_va->v[0] = scp->post_idx;
-	memcpy_async_wait(&fin);
-#else
-	memcpy(scp->request_va, req, sizeof(*req));
-#endif
+	memcpy(&packet.req, req, sizeof(*req));
 	barrier();
-	scp->request_va->valid = 1;
-	*(unsigned int *)scp->doorbell_va = cpu + 1;
+	packet.req.valid = 1;
 
 #ifdef SYSCALL_BY_IKC
 	packet.msg = SCD_MSG_SYSCALL_ONESIDE;
 	packet.ref = cpu;
 	packet.pid = pid ? pid : cpu_local_var(current)->proc->pid;
-	packet.arg = scp->request_rpa;
+	packet.resp_pa = virt_to_phys(res);
 	dkprintf("send syscall, nr: %d, pid: %d\n", req->number, packet.pid);
 
 	ret = ihk_ikc_send(syscall_channel, &packet, 0);
@@ -193,9 +178,8 @@ ihk_spinlock_t syscall_lock;
 
 long do_syscall(struct syscall_request *req, int cpu, int pid)
 {
-	struct syscall_response *res;
+	struct syscall_response res;
 	struct syscall_request req2 IHK_DMA_ALIGN;
-	struct syscall_params *scp;
 	int error;
 	long rc;
 	int islock = 0;
@@ -206,6 +190,9 @@ long do_syscall(struct syscall_request *req, int cpu, int pid)
 
 	dkprintf("SC(%d)[%3d] sending syscall\n",
 		ihk_mc_get_processor_id(), req->number);
+
+	irqstate = 0;	/* for avoidance of warning */
+	barrier();
 
 	if(req->number != __NR_exit_group){
 		if(proc->nohost && // host is down
@@ -215,20 +202,18 @@ long do_syscall(struct syscall_request *req, int cpu, int pid)
 		++thread->in_syscall_offload;
 	}
 
-	irqstate = 0;	/* for avoidance of warning */
 	if(req->number == __NR_exit_group ||
 	   req->number == __NR_gettid ||
 	   req->number == __NR_kill){ // interrupt syscall
-		scp = &get_cpu_local_var(0)->scp2;
 		islock = 1;
 		irqstate = ihk_mc_spinlock_lock(&syscall_lock);
 	}
-	else{
-		scp = &get_cpu_local_var(cpu)->scp;
-	}
-	res = scp->response_va;
-
-	send_syscall(req, cpu, pid);
+	/* The current thread is the requester and any thread from
+	 * the pool may serve the request */
+	req->rtid = cpu_local_var(current)->tid;
+	req->ttid = 0;
+	res.req_thread_status = IHK_SCD_REQ_THREAD_SPINNING;
+	send_syscall(req, cpu, pid, &res);
 
 	dkprintf("%s: syscall num: %d waiting for Linux.. \n",
 		__FUNCTION__, req->number);
@@ -236,60 +221,83 @@ long do_syscall(struct syscall_request *req, int cpu, int pid)
 #define	STATUS_IN_PROGRESS	0
 #define	STATUS_COMPLETED	1
 #define	STATUS_PAGE_FAULT	3
-	while (res->status != STATUS_COMPLETED) {
-		while (res->status == STATUS_IN_PROGRESS) {
+	while (res.status != STATUS_COMPLETED) {
+		while (res.status == STATUS_IN_PROGRESS) {
 			struct cpu_local_var *v;
-			int call_schedule = 0;
+			int do_schedule = 0;
 			long runq_irqstate;
+			unsigned long flags;
+			DECLARE_WAITQ_ENTRY(scd_wq_entry, cpu_local_var(current));
 
 			cpu_pause();
 
-			/* XXX: Intel MPI + Intel OpenMP situation:
-			 * While the MPI helper thread waits in a poll() call the OpenMP master
-			 * thread is iterating through the CPU cores using setaffinity().
-			 * Unless we give a chance to it on this core the two threads seem to
-			 * hang in deadlock. If the new thread would make a system call on this
-			 * core we would be in trouble. For now, allow it, but in the future
@@ -193,9 +178,8 @@ ihk_spinlock_t syscall_lock;
 
 long do_syscall(struct syscall_request *req, int cpu, int pid)
 {
-	struct syscall_response *res;
+	struct syscall_response res;
 	struct syscall_request req2 IHK_DMA_ALIGN;
-	struct syscall_params *scp;
 	int error;
 	long rc;
 	int islock = 0;
@@ -206,6 +190,9 @@ long do_syscall(struct syscall_request *req, int cpu, int pid)
 	dkprintf("SC(%d)[%3d] sending syscall\n",
 		ihk_mc_get_processor_id(),
 		req->number);
+
+	irqstate = 0;	/* for avoidance of warning */
+	barrier();
 
 	if(req->number != __NR_exit_group){
 		if(proc->nohost && // host is down
@@ -215,20 +202,18 @@ long do_syscall(struct syscall_request *req, int cpu, int pid)
 		++thread->in_syscall_offload;
 	}
 
-	irqstate = 0;	/* for avoidance of warning */
 	if(req->number == __NR_exit_group ||
 	   req->number == __NR_gettid ||
 	   req->number == __NR_kill){ // interrupt syscall
-		scp = &get_cpu_local_var(0)->scp2;
 		islock = 1;
 		irqstate = ihk_mc_spinlock_lock(&syscall_lock);
 	}
-	else{
-		scp = &get_cpu_local_var(cpu)->scp;
-	}
-	res = scp->response_va;
-
-	send_syscall(req, cpu, pid);
+	/* The current thread is the requester and any thread from
+	 * the pool may serve the request */
+	req->rtid = cpu_local_var(current)->tid;
+	req->ttid = 0;
+	res.req_thread_status = IHK_SCD_REQ_THREAD_SPINNING;
+	send_syscall(req, cpu, pid, &res);
 
 	dkprintf("%s: syscall num: %d waiting for Linux.. \n",
 		__FUNCTION__, req->number);
 
@@ -236,60 +221,83 @@ long do_syscall(struct syscall_request *req, int cpu, int pid)
 #define	STATUS_IN_PROGRESS	0
 #define	STATUS_COMPLETED	1
 #define	STATUS_PAGE_FAULT	3
-	while (res->status != STATUS_COMPLETED) {
-		while (res->status == STATUS_IN_PROGRESS) {
+	while (res.status != STATUS_COMPLETED) {
+		while (res.status == STATUS_IN_PROGRESS) {
 			struct cpu_local_var *v;
-			int call_schedule = 0;
+			int do_schedule = 0;
 			long runq_irqstate;
+			unsigned long flags;
+			DECLARE_WAITQ_ENTRY(scd_wq_entry, cpu_local_var(current));
 
 			cpu_pause();
 
-			/* XXX: Intel MPI + Intel OpenMP situation:
-			 * While the MPI helper thread waits in a poll() call the OpenMP master
-			 * thread is iterating through the CPU cores using setaffinity().
-			 * Unless we give a chance to it on this core the two threads seem to
-			 * hang in deadlock. If the new thread would make a system call on this
-			 * core we would be in trouble. For now, allow it, but in the future
-			 * we should have syscall channels for each thread instead of per core,
-			 * or we should multiplex syscall threads in mcexec */
+			/* Spin if not preemptable */
+			if (cpu_local_var(no_preempt) || !thread->tid) {
+				continue;
+			}
+
+			/* Spin by default, but if re-schedule is requested let
+			 * the other thread run */
 			runq_irqstate = ihk_mc_spinlock_lock(&(get_this_cpu_local_var()->runq_lock));
 			v = get_this_cpu_local_var();
 
 			if (v->flags & CPU_FLAG_NEED_RESCHED) {
-				call_schedule = 1;
-				--thread->in_syscall_offload;
+				do_schedule = 1;
 			}
 
 			ihk_mc_spinlock_unlock(&v->runq_lock, runq_irqstate);
 
-			if (call_schedule) {
-				schedule();
-				++thread->in_syscall_offload;
+			if (!do_schedule) {
+				continue;
 			}
+
+			flags = cpu_disable_interrupt_save();
+
+			/* Try to sleep until notified */
+			if (__sync_bool_compare_and_swap(&res.req_thread_status,
+						IHK_SCD_REQ_THREAD_SPINNING,
+						IHK_SCD_REQ_THREAD_DESCHEDULED)) {
+
+				dkprintf("%s: tid %d waiting for syscall reply...\n",
+					__FUNCTION__, thread->tid);
+				waitq_init(&thread->scd_wq);
+				waitq_prepare_to_wait(&thread->scd_wq, &scd_wq_entry,
+						PS_INTERRUPTIBLE);
+				cpu_restore_interrupt(flags);
+				schedule();
+				waitq_finish_wait(&thread->scd_wq, &scd_wq_entry);
+			}
+
+			cpu_restore_interrupt(flags);
 		}
-
-		if (res->status == STATUS_PAGE_FAULT) {
+
+		if (res.status == STATUS_PAGE_FAULT) {
 			dkprintf("STATUS_PAGE_FAULT in syscall, pid: %d\n",
 				cpu_local_var(current)->proc->pid);
 			error = page_fault_process_vm(thread->vm,
-					(void *)res->fault_address,
-					res->fault_reason|PF_POPULATE);
+					(void *)res.fault_address,
+					res.fault_reason|PF_POPULATE);
 
 			/* send result */
 			req2.number = __NR_mmap;
#define PAGER_RESUME_PAGE_FAULT	0x0101
 			req2.args[0] = PAGER_RESUME_PAGE_FAULT;
 			req2.args[1] = error;
+			/* The current thread is the requester and only the waiting thread
+			 * may serve the request */
+			req2.rtid = cpu_local_var(current)->tid;
+			req2.ttid = res.stid;
 
-			send_syscall(&req2, cpu, pid);
+			res.req_thread_status = IHK_SCD_REQ_THREAD_SPINNING;
+			send_syscall(&req2, cpu, pid, &res);
 		}
 	}
 
 	dkprintf("%s: syscall num: %d got host reply: %d \n",
-		__FUNCTION__, req->number, res->ret);
+		__FUNCTION__, req->number, res.ret);
 
-	rc = res->ret;
+	rc = res.ret;
 	if(islock){
 		ihk_mc_spinlock_unlock(&syscall_lock, irqstate);
 	}
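Note: the wait loop above is a spin-then-block rendezvous. The requester spins on res.status and only parks itself if it can atomically flip req_thread_status from SPINNING to DESCHEDULED; the reply path then needs to issue a waitq wakeup only when it observes DESCHEDULED, so requesters that keep spinning never pay for a wakeup. A condensed, illustrative version of the two sides; the helpers are stand-ins, and the reply side is an assumption since it is not part of this patch:

/* Requester: mirrors the loop above, minus the runqueue details. */
static void wait_for_reply(struct syscall_response *res)
{
	while (res->status != STATUS_COMPLETED) {
		cpu_pause();
		if (!reschedule_requested())	/* CPU_FLAG_NEED_RESCHED check */
			continue;
		if (__sync_bool_compare_and_swap(&res->req_thread_status,
				IHK_SCD_REQ_THREAD_SPINNING,
				IHK_SCD_REQ_THREAD_DESCHEDULED)) {
			/* The real code registers on thread->scd_wq via
			 * waitq_prepare_to_wait() before schedule(), which is
			 * what keeps a wakeup arriving in between from being
			 * lost. */
			sleep_on_scd_waitq();
		}
	}
}

/* Reply side (assumed): publish the reply first, then wake the
 * requester only if it actually went to sleep. */
static void post_reply(struct syscall_response *res, long ret)
{
	res->ret = ret;
	barrier();			/* reply must be visible first */
	res->status = STATUS_COMPLETED;
	if (__sync_lock_test_and_set(&res->req_thread_status,
			IHK_SCD_REQ_THREAD_SPINNING) ==
			IHK_SCD_REQ_THREAD_DESCHEDULED)
		wake_scd_waitq();	/* requester had parked itself */
}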
@@ -820,7 +828,8 @@ terminate(int rc, int sig)
 	release_thread(mythread);
 	release_process_vm(vm);
 	schedule();
-	// no return
+	kprintf("%s: ERROR: returned from terminate() -> schedule()\n", __FUNCTION__);
+	panic("panic");
 }
 
 void
@@ -838,14 +847,15 @@ terminate_host(int pid)
 }
 
 void
-interrupt_syscall(int pid, int cpuid)
+interrupt_syscall(int pid, int tid)
 {
-	dkprintf("interrupt_syscall,target pid=%d,target cpuid=%d\n", pid, cpuid);
+	dkprintf("interrupt_syscall,target pid=%d,target tid=%d\n", pid, tid);
 	ihk_mc_user_context_t ctx;
 	long lerror;
 
+kprintf("interrupt_syscall pid=%d tid=%d\n", pid, tid);
 	ihk_mc_syscall_arg0(&ctx) = pid;
-	ihk_mc_syscall_arg1(&ctx) = cpuid;
+	ihk_mc_syscall_arg1(&ctx) = tid;
 
 	lerror = syscall_generic_forwarding(__NR_kill, &ctx);
 	if (lerror) {
@@ -908,8 +918,6 @@ static int do_munmap(void *addr, size_t len)
 	begin_free_pages_pending();
 	error = remove_process_memory_range(cpu_local_var(current)->vm,
 			(intptr_t)addr, (intptr_t)addr+len, &ro_freed);
-	// XXX: TLB flush
-	flush_tlb();
 	if (error || !ro_freed) {
 		clear_host_pte((uintptr_t)addr, len);
 	}
@@ -921,6 +929,8 @@ static int do_munmap(void *addr, size_t len)
 		}
 	}
 	finish_free_pages_pending();
+	dkprintf("%s: 0x%lx:%lu, error: %ld\n",
+		__FUNCTION__, addr, len, error);
 	return error;
 }
 
@@ -1068,25 +1078,18 @@ do_mmap(const intptr_t addr0, const size_t len0, const int prot,
 	vrflags |= PROT_TO_VR_FLAG(prot);
 	vrflags |= (flags & MAP_PRIVATE)? VR_PRIVATE: 0;
 	vrflags |= (flags & MAP_LOCKED)? VR_LOCKED: 0;
+	vrflags |= VR_DEMAND_PAGING;
 	if (flags & MAP_ANONYMOUS) {
-		if (0) {
-			/* dummy */
+		if (!anon_on_demand) {
+			populated_mapping = 1;
 		}
#ifdef USE_NOCACHE_MMAP
#define X_MAP_NOCACHE MAP_32BIT
 		else if (flags & X_MAP_NOCACHE) {
+			vrflags &= ~VR_DEMAND_PAGING;
 			vrflags |= VR_IO_NOCACHE;
 		}
#endif
-		else {
-			vrflags |= VR_DEMAND_PAGING;
-			if (!anon_on_demand) {
-				populated_mapping = 1;
-			}
-		}
-	}
-	else {
-		vrflags |= VR_DEMAND_PAGING;
 	}
 
 	if (flags & (MAP_POPULATE | MAP_LOCKED)) {
@@ -1162,6 +1165,8 @@ do_mmap(const intptr_t addr0, const size_t len0, const int prot,
 			error = -ENOMEM;
 			goto out;
 		}
+		dkprintf("%s: 0x%lx:%lu allocated %d pages, p2align: %lx\n",
+			__FUNCTION__, addr, len, npages, p2align);
 		phys = virt_to_phys(p);
 	}
 	else if (flags & MAP_SHARED) {
@@ -1197,10 +1202,10 @@ do_mmap(const intptr_t addr0, const size_t len0, const int prot,
 	error = add_process_memory_range(thread->vm, addr, addr+len,
 			phys, vrflags, memobj, off, pgshift);
 	if (error) {
-		ekprintf("do_mmap:add_process_memory_range"
-				"(%p,%lx,%lx,%lx,%lx,%d) failed %d\n",
-				thread->vm, addr, addr+len,
-				virt_to_phys(p), vrflags, pgshift, error);
+		kprintf("%s: add_process_memory_range failed for 0x%lx:%lu"
+				" flags: %lx, vrflags: %lx, pgshift: %d, error: %d\n",
+				__FUNCTION__, addr, addr+len,
+				flags, vrflags, pgshift, error);
 		goto out;
 	}
 
@@ -1246,8 +1251,12 @@ out:
 	if (memobj) {
 		memobj_release(memobj);
 	}
-	dkprintf("do_mmap(%lx,%lx,%x,%x,%d,%lx): %ld %lx\n",
-		addr0, len0, prot, flags, fd, off0, error, addr);
+	dkprintf("%s: 0x%lx:%8lu, (req: 0x%lx:%lu), prot: %x, flags: %x, "
+			"fd: %d, off: %lu, error: %ld, addr: 0x%lx\n",
+			__FUNCTION__,
+			addr, len, addr0, len0, prot, flags,
+			fd, off0, error, addr);
+
 	return (!error)? addr: error;
 }
 
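Note: the do_mmap() rework above makes VR_DEMAND_PAGING the default for every mapping and expresses the old special cases as exceptions: anonymous mappings are populated eagerly unless anon_on_demand is set, and nocache anonymous mappings opt out of demand paging entirely. The same policy restated as a self-contained helper; the flag names come from the patch, the helper itself is only illustrative:

/* Illustrative restatement of the new mapping policy. */
static unsigned long mmap_policy(int flags, int anon_on_demand,
		int *populated_mapping)
{
	unsigned long vrflags = VR_DEMAND_PAGING;	/* new default */

	if (flags & MAP_ANONYMOUS) {
		if (!anon_on_demand)
			*populated_mapping = 1;	/* fault everything in now */
#ifdef USE_NOCACHE_MMAP
		else if (flags & X_MAP_NOCACHE) {
			vrflags &= ~VR_DEMAND_PAGING;	/* must be backed now */
			vrflags |= VR_IO_NOCACHE;
		}
#endif
	}
	return vrflags;
}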
@@ -1478,8 +1487,8 @@ SYSCALL_DECLARE(getppid)
 	return thread->proc->ppid_parent->pid;
 }
 
-void
-settid(struct thread *thread, int mode, int newcpuid, int oldcpuid)
+void settid(struct thread *thread, int mode, int newcpuid, int oldcpuid,
+		int nr_tids, int *tids)
 {
 	struct syscall_request request IHK_DMA_ALIGN;
 	unsigned long rc;
@@ -1489,6 +1498,12 @@ settid(struct thread *thread, int mode, int newcpuid, int oldcpuid)
 	request.args[1] = thread->proc->pid;
 	request.args[2] = newcpuid;
 	request.args[3] = oldcpuid;
+	/*
+	 * If nr_tids is non-zero, tids should point to an array of ints
+	 * where the thread ids of the mcexec process are expected.
+	 */
+	request.args[4] = nr_tids;
+	request.args[5] = virt_to_phys(tids);
 	rc = do_syscall(&request, ihk_mc_get_processor_id(), thread->proc->pid);
 	if (mode != 2) {
 		thread->tid = rc;
@@ -1893,7 +1908,61 @@ unsigned long do_fork(int clone_flags, unsigned long newsp,
 			  &new->vm->address_space->cpu_set_lock);
 
 	if (clone_flags & CLONE_VM) {
-		settid(new, 1, cpuid, -1);
+		int *tids = NULL;
+		int i;
+		struct mcs_rwlock_node_irqsave lock;
+
+		mcs_rwlock_writer_lock(&newproc->threads_lock, &lock);
+
+		/* Obtain mcexec TIDs if not known yet */
+		if (!newproc->nr_tids) {
+			tids = kmalloc(sizeof(int) * num_processors, IHK_MC_AP_NOWAIT);
+			if (!tids) {
+				mcs_rwlock_writer_unlock(&newproc->threads_lock, &lock);
+				release_cpuid(cpuid);
+				return -ENOMEM;
+			}
+
+			newproc->tids = kmalloc(sizeof(struct mcexec_tid) * num_processors, IHK_MC_AP_NOWAIT);
+			if (!newproc->tids) {
+				mcs_rwlock_writer_unlock(&newproc->threads_lock, &lock);
+				kfree(tids);
+				release_cpuid(cpuid);
+				return -ENOMEM;
+			}
+
+			settid(new, 1, cpuid, -1, num_processors, tids);
+
+			for (i = 0; (i < num_processors) && tids[i]; ++i) {
+				dkprintf("%s: tid[%d]: %d\n", __FUNCTION__, i, tids[i]);
+				newproc->tids[i].tid = tids[i];
+				newproc->tids[i].thread = NULL;
+				++newproc->nr_tids;
+			}
+
+			kfree(tids);
+		}
+
+		/* Find an unused TID */
+retry_tid:
+		for (i = 0; i < newproc->nr_tids; ++i) {
+			if (!newproc->tids[i].thread) {
+				if (!__sync_bool_compare_and_swap(
+						&newproc->tids[i].thread, NULL, new)) {
+					goto retry_tid;
+				}
+				new->tid = newproc->tids[i].tid;
+				dkprintf("%s: tid %d assigned to %p\n", __FUNCTION__, new->tid, new);
+				break;
+			}
+		}
+
+		/* TODO: spawn more mcexec threads */
+		if (!new->tid) {
+			kprintf("%s: no more TIDs available\n", __FUNCTION__);
+			panic("");
+		}
+
+		mcs_rwlock_writer_unlock(&newproc->threads_lock, &lock);
 	}
 	/* fork() a new process on the host */
 	else {
@@ -1913,7 +1982,7 @@ unsigned long do_fork(int clone_flags, unsigned long newsp,
 	}
 
 	/* In a single threaded process TID equals to PID */
-	settid(new, 0, cpuid, -1);
+	new->tid = newproc->pid;
 	new->vm->address_space->pids[0] = new->proc->pid;
 
 	dkprintf("fork(): new pid: %d\n", new->proc->pid);
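Note: putting the do_fork() piece together with settid() and __release_tid() above: the first CLONE_VM fork asks the host once for the mcexec worker TIDs (args[4]/args[5] of the settid request carry the count and the physical address of the array for the host to fill), caches them in proc->tids[], and every later thread creation just claims a free slot with the CAS shown earlier. A hypothetical condensation, with claim_tid() as in the sketch near __release_tid() and fetch_host_tids() standing in for the settid() call:

/* Illustrative only; the real code also handles locking and ENOMEM. */
static int assign_mcexec_tid(struct process *proc, struct thread *new)
{
	if (!proc->nr_tids)
		fetch_host_tids(proc);	/* settid(..., num_processors, tids) */

	new->tid = claim_tid(proc->tids, proc->nr_tids, new);
	return (new->tid > 0) ? 0 : -EAGAIN;	/* the real code panics */
}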
dkprintf("sysfs_mkdirf(%p,%s,...)\n", dirhp, fmt); - param = allocate_pages(1, IHK_MC_AP_NOWAIT); + param = ihk_mc_alloc_pages(1, IHK_MC_AP_NOWAIT); if (!param) { error = -ENOMEM; ekprintf("sysfs_mkdirf:allocate_pages failed. %d\n", error); @@ -208,7 +208,7 @@ sysfs_mkdirf(sysfs_handle_t *dirhp, const char *fmt, ...) out: if (param) { - free_pages(param, 1); + ihk_mc_free_pages(param, 1); } if (error) { ekprintf("sysfs_mkdirf(%p,%s,...): %d\n", dirhp, fmt, error); @@ -229,7 +229,7 @@ sysfs_symlinkf(sysfs_handle_t targeth, const char *fmt, ...) dkprintf("sysfs_symlinkf(%#lx,%s,...)\n", targeth.handle, fmt); - param = allocate_pages(1, IHK_MC_AP_NOWAIT); + param = ihk_mc_alloc_pages(1, IHK_MC_AP_NOWAIT); if (!param) { error = -ENOMEM; ekprintf("sysfs_symlinkf:allocate_pages failed. %d\n", error); @@ -279,7 +279,7 @@ sysfs_symlinkf(sysfs_handle_t targeth, const char *fmt, ...) error = 0; out: if (param) { - free_pages(param, 1); + ihk_mc_free_pages(param, 1); } if (error) { ekprintf("sysfs_symlinkf(%#lx,%s,...): %d\n", @@ -301,7 +301,7 @@ sysfs_lookupf(sysfs_handle_t *objhp, const char *fmt, ...) dkprintf("sysfs_lookupf(%p,%s,...)\n", objhp, fmt); - param = allocate_pages(1, IHK_MC_AP_NOWAIT); + param = ihk_mc_alloc_pages(1, IHK_MC_AP_NOWAIT); if (!param) { error = -ENOMEM; ekprintf("sysfs_lookupf:allocate_pages failed. %d\n", error); @@ -353,7 +353,7 @@ sysfs_lookupf(sysfs_handle_t *objhp, const char *fmt, ...) out: if (param) { - free_pages(param, 1); + ihk_mc_free_pages(param, 1); } if (error) { ekprintf("sysfs_lookupf(%p,%s,...): %d\n", objhp, fmt, error); @@ -374,7 +374,7 @@ sysfs_unlinkf(int flags, const char *fmt, ...) dkprintf("sysfs_unlinkf(%#x,%s,...)\n", flags, fmt); - param = allocate_pages(1, IHK_MC_AP_NOWAIT); + param = ihk_mc_alloc_pages(1, IHK_MC_AP_NOWAIT); if (!param) { error = -ENOMEM; ekprintf("sysfs_unlinkf:allocate_pages failed. %d\n", error); @@ -423,7 +423,7 @@ sysfs_unlinkf(int flags, const char *fmt, ...) error = 0; out: if (param) { - free_pages(param, 1); + ihk_mc_free_pages(param, 1); } if (error) { ekprintf("sysfs_unlinkf(%#x,%s,...): %d\n", flags, fmt, error); @@ -601,14 +601,14 @@ sysfs_init(void) } sysfs_data_bufsize = PAGE_SIZE; - sysfs_data_buf = allocate_pages(1, IHK_MC_AP_NOWAIT); + sysfs_data_buf = ihk_mc_alloc_pages(1, IHK_MC_AP_NOWAIT); if (!sysfs_data_buf) { error = -ENOMEM; ekprintf("sysfs_init:allocate_pages(buf) failed. %d\n", error); goto out; } - param = allocate_pages(1, IHK_MC_AP_NOWAIT); + param = ihk_mc_alloc_pages(1, IHK_MC_AP_NOWAIT); if (!param) { error = -ENOMEM; ekprintf("sysfs_init:allocate_pages(param) failed. 
%d\n", @@ -644,7 +644,7 @@ sysfs_init(void) error = 0; out: if (param) { - free_pages(param, 1); + ihk_mc_free_pages(param, 1); } if (error) { ekprintf("sysfs_init(): %d\n", error); diff --git a/kernel/zeroobj.c b/kernel/zeroobj.c index c8fc5c2a..a70a89f2 100644 --- a/kernel/zeroobj.c +++ b/kernel/zeroobj.c @@ -172,6 +172,10 @@ static int zeroobj_get_page(struct memobj *memobj, off_t off, int p2align, struct zeroobj *obj = to_zeroobj(memobj); struct page *page; + /* Don't bother about zero page, page fault handler will + * allocate and clear pages */ + return 0; + dkprintf("zeroobj_get_page(%p,%#lx,%d,%p)\n", memobj, off, p2align, physp); if (off & ~PAGE_MASK) { diff --git a/lib/include/ihk/mm.h b/lib/include/ihk/mm.h index cf2957a0..52a3c554 100644 --- a/lib/include/ihk/mm.h +++ b/lib/include/ihk/mm.h @@ -103,7 +103,7 @@ void ihk_mc_clean_micpa(void); void *ihk_mc_alloc_aligned_pages(int npages, int p2align, enum ihk_mc_ap_flag flag); void *ihk_mc_alloc_pages(int npages, enum ihk_mc_ap_flag flag); void ihk_mc_free_pages(void *p, int npages); -void *ihk_mc_allocate(int size, enum ihk_mc_ap_flag flag); +void *ihk_mc_allocate(int size, int flag); void ihk_mc_free(void *p); void *arch_alloc_page(enum ihk_mc_ap_flag flag);