From abe57218c4889a8387c2bc6607080b06c551a727 Mon Sep 17 00:00:00 2001
From: NAKAMURA Gou
Date: Wed, 17 Jul 2013 16:19:39 +0900
Subject: [PATCH] trial implementation of private file mapping

for review only. will soon be reverted.
---
 arch/x86/kernel/cpu.c               |   7 +-
 arch/x86/kernel/include/registers.h |   2 +
 arch/x86/kernel/interrupt.S         |   4 +-
 arch/x86/kernel/memory.c            | 381 ++++++++++++++++++++++++-
 executer/include/uprotocol.h        |   2 +
 executer/kernel/control.c           |  60 +++-
 executer/kernel/ikc.c               |   5 +
 executer/kernel/mcctrl.h            |   1 +
 executer/kernel/syscall.c           | 420 +++++++++++++++++++++++++++-
 executer/user/mcexec.c              |  33 +--
 kernel/Makefile.build               |   2 +-
 kernel/Makefile.build.dcfa          |   2 +-
 kernel/host.c                       |  14 +-
 kernel/include/memobj.h             |  22 ++
 kernel/include/page.h               |  19 +-
 kernel/include/pager.h              |  26 ++
 kernel/include/process.h            |  17 +-
 kernel/include/syscall.h            |   3 +
 kernel/mem.c                        | 181 +++++++-----
 kernel/memobj.c                     | 221 +++++++++++++++
 kernel/process.c                    | 386 ++++++++++++++++++++++++-
 kernel/syscall.c                    | 111 ++++++--
 lib/include/ihk/mm.h                |  10 +-
 23 files changed, 1756 insertions(+), 173 deletions(-)
 create mode 100644 kernel/include/memobj.h
 create mode 100644 kernel/include/pager.h
 create mode 100644 kernel/memobj.c

diff --git a/arch/x86/kernel/cpu.c b/arch/x86/kernel/cpu.c
index a649f950..cd36a7ba 100644
--- a/arch/x86/kernel/cpu.c
+++ b/arch/x86/kernel/cpu.c
@@ -289,6 +289,11 @@ void init_syscall(void)
 
 void init_cpu(void)
 {
+	asm volatile (
+		"mov %%cr0,%%rax;"
+		"or $0x10000,%%rax;"
+		"mov %%rax,%%cr0"
+		::: "%rax");
 	init_fpu();
 	init_lapic();
 	init_syscall();
@@ -494,7 +499,7 @@ int ihk_mc_unregister_interrupt_handler(int vector,
 
 extern unsigned long __page_fault_handler_address;
 
-void ihk_mc_set_page_fault_handler(void (*h)(unsigned long, void *))
+void ihk_mc_set_page_fault_handler(void (*h)(unsigned long, unsigned long, void *))
 {
 	__page_fault_handler_address = (unsigned long)h;
 }
diff --git a/arch/x86/kernel/include/registers.h b/arch/x86/kernel/include/registers.h
index 80716dcb..d06e4bed 100644
--- a/arch/x86/kernel/include/registers.h
+++ b/arch/x86/kernel/include/registers.h
@@ -128,6 +128,8 @@ struct x86_regs {
 	unsigned long error, rip, cs, rflags, rsp, ss;
 };
 
+#define REGS_GET_STACK_POINTER(regs) (((struct x86_regs *)regs)->rsp)
+
 /*
  * Page fault error code bits:
  *
diff --git a/arch/x86/kernel/interrupt.S b/arch/x86/kernel/interrupt.S
index 88b2d3b4..7745af1d 100644
--- a/arch/x86/kernel/interrupt.S
+++ b/arch/x86/kernel/interrupt.S
@@ -66,8 +66,8 @@ page_fault:
 	cld
 	PUSH_ALL_REGS
 	movq %cr2, %rdi
-	movq %rsp, %rsi
-	movq %rbp, %rdx
+	movq 80(%rsp),%rsi
+	movq %rsp, %rdx
 	movq __page_fault_handler_address(%rip), %rax
 	andq %rax, %rax
 	jz 1f
diff --git a/arch/x86/kernel/memory.c b/arch/x86/kernel/memory.c
index 0c8cfc4e..3aa0045f 100644
--- a/arch/x86/kernel/memory.c
+++ b/arch/x86/kernel/memory.c
@@ -7,6 +7,7 @@
 #include
 #include
 #include
+#include
 
 #define ekprintf(...)	kprintf(__VA_ARGS__)
 
@@ -796,15 +797,20 @@ static int clear_range_l1(void *args0, pte_t *ptep, uint64_t base, uint64_t star
 {
 	struct clear_range_args *args = args0;
 	uint64_t phys;
+	struct page *page;
 
 	if (*ptep == PTE_NULL) {
 		return -ENOENT;
 	}
 
 	phys = *ptep & PT_PHYSMASK;
-	*ptep = 0;
+	*ptep = PTE_NULL;
 
 	if (args->free_physical) {
+		page = phys_to_page(phys);
+		if (page && (page->mode == PM_MAPPED) && !page_unmap(page)) {
+			return 0;
+		}
 		ihk_mc_free_pages(phys_to_virt(phys), 1);
 	}
 
@@ -817,6 +823,7 @@ static int clear_range_l2(void *args0, pte_t *ptep, uint64_t base, uint64_t star
 	uint64_t phys;
 	struct page_table *pt;
 	int error;
+	struct page *page;
 
 	if (*ptep == PTE_NULL) {
 		return -ENOENT;
@@ -841,6 +848,15 @@ static int clear_range_l2(void *args0, pte_t *ptep, uint64_t base, uint64_t star
 	*ptep = PTE_NULL;
 
 	if (args->free_physical) {
+		page = phys_to_page(phys);
+		if (page && (page->mode == PM_MAPPED)) {
+			if (--page->count > 0) {
+				/* other mapping exists */
+				return 0;
+			}
+			list_del(&page->list);
+			page->mode = PM_NONE;
+		}
 		ihk_mc_free_pages(phys_to_virt(phys), LARGE_PAGE_SIZE/PAGE_SIZE);
 	}
 
@@ -1133,6 +1149,369 @@ int ihk_mc_pt_alloc_range(page_table_t pt, void *start, void *end,
 			&alloc_range_l4, &attr);
 }
 
+static int lookup_pte(struct page_table *pt, uintptr_t virt, pte_t **ptepp,
+		uintptr_t *pgbasep, size_t *pgsizep)
+{
+	int l4idx, l3idx, l2idx, l1idx;
+
+	GET_VIRT_INDICES(virt, l4idx, l3idx, l2idx, l1idx);
+
+	if (!(pt->entry[l4idx] & PFL4_PRESENT)) {
+		return -ENOENT;
+	}
+
+	pt = phys_to_virt(pt->entry[l4idx] & PT_PHYSMASK);
+	if (!(pt->entry[l3idx] & PFL3_PRESENT)) {
+		return -ENOENT;
+	}
+
+	pt = phys_to_virt(pt->entry[l3idx] & PT_PHYSMASK);
+	if ((pt->entry[l2idx] == PTE_NULL)
+			|| (pt->entry[l2idx] & PFL2_SIZE)) {
+		*ptepp = &pt->entry[l2idx];
+		*pgbasep = GET_INDICES_VIRT(l4idx, l3idx, l2idx, 0);
+		*pgsizep = PTL2_SIZE;
+		return 0;
+	}
+
+	pt = phys_to_virt(pt->entry[l2idx] & PT_PHYSMASK);
+	*ptepp = &pt->entry[l1idx];
+	*pgbasep = GET_INDICES_VIRT(l4idx, l3idx, l2idx, l1idx);
+	*pgsizep = PTL1_SIZE;
+
+	return 0;
+}
+
+int ihk_mc_pt_lookup_pte(page_table_t pt, void *virt, pte_t **ptepp, void **pgbasep, size_t *pgsizep)
+{
+	int error;
+	pte_t *ptep = NULL;
+	uintptr_t pgbase = 0;
+	size_t pgsize = 0;
+
+	kprintf("ihk_mc_pt_lookup_pte(%p,%p)\n", pt, virt);
+	error = lookup_pte(pt, (uintptr_t)virt, &ptep, &pgbase, &pgsize);
+	if (error) {
+		kprintf("ihk_mc_pt_lookup_pte(%p,%p):lookup failed. "
%d\n", pt, virt, error); + goto out; + } + + error = 0; + *ptepp = ptep; + *pgbasep = (void *)pgbase; + *pgsizep = pgsize; + +out: + kprintf("ihk_mc_pt_lookup_pte(%p,%p): %d %p %lx %lx\n", pt, virt, error, ptep, pgbase, pgsize); + return error; +} + +static int page_p2align_list[] = { + LARGE_PAGE_P2ALIGN, + PAGE_P2ALIGN, + -1, +}; + +int ihk_mc_pt_choose_pagesize(page_table_t pt, void *start0, void *end0, + void *fault_addr0, size_t maxpgsize, void **pgaddrp, + size_t *pgsizep, int *p2alignp) +{ + const uintptr_t start = (uintptr_t)start0; + const uintptr_t end = (uintptr_t)end0; + const uintptr_t fault_addr = (uintptr_t)fault_addr0; + int ix; + int p2align; + size_t pgsize; + uintptr_t pgbase; + pte_t *ptep; + int error; + uintptr_t pga; + size_t pgs; + + kprintf("ihk_mc_pt_choose_pagesize(%p,%p,%p,%p,%lx,%p,%p,%p)\n", + pt, start0, end0, fault_addr0, maxpgsize, pgaddrp, + pgsizep, p2alignp); + + if ((fault_addr < start) || (end <= fault_addr)) { + kprintf("ihk_mc_pt_choose_pagesize(%p,%p,%p,%p,%lx,%p,%p,%p):" + "out of range\n", + pt, start0, end0, fault_addr0, maxpgsize, + pgaddrp, pgsizep, p2alignp); + panic("ihk_mc_pt_choose_pagesize:out of range"); + } + + pgs = 0; + for (ix = 0; page_p2align_list[ix] >= 0; ++ix) { + p2align = page_p2align_list[ix]; + pgsize = PAGE_SIZE << p2align; + pgbase = fault_addr & ~(pgsize - 1); + if ((maxpgsize != 0) && (pgsize > maxpgsize)) { + continue; + } + if ((pgbase < start) || (end < (pgbase + pgsize))) { + continue; + } + if (pgs == 0) { + error = lookup_pte(pt, fault_addr, &ptep, &pga, &pgs); + if (error == -ENOENT) { + error = 0; + pgs = LARGE_PAGE_SIZE; + pga = fault_addr & LARGE_PAGE_MASK; + } + else if (error) { + kprintf("ihk_mc_pt_choose_pagesize(" + "%p,%p,%p,%p,%lx,%p,%p,%p):" + "lookup pte failed. 
%d\n", + pt, start0, end0, fault_addr0, + maxpgsize, pgaddrp, pgsizep, + p2alignp, error); + goto out; + } + } + if (pgs < pgsize) { + continue; + } + + error = 0; + *pgaddrp = (void *)pgbase; + *pgsizep = pgsize; + *p2alignp = p2align; + goto out; + } + + kprintf("ihk_mc_pt_choose_pagesize(%p,%p,%p,%p,%lx,%p,%p,%p):" + "not reached\n", + pt, start0, end0, fault_addr0, maxpgsize, pgaddrp, + pgsizep, p2alignp); + panic("ihk_mc_pt_choose_pagesize:not reached"); + +out: + kprintf("ihk_mc_pt_choose_pagesize(%p,%p,%p,%p,%lx,%p,%p,%p):" + " %d %p %lx %d\n", + pt, start0, end0, fault_addr0, maxpgsize, pgaddrp, + pgsizep, p2alignp, error, *pgaddrp, *pgsizep, *p2alignp); + return error; +} + +struct set_range_args { + uintptr_t phys; + enum ihk_mc_pt_attribute attr; + int padding; + uintptr_t diff; +}; + +int set_range_l1(void *args0, pte_t *ptep, uint64_t base, uint64_t start, + uint64_t end) +{ + struct set_range_args *args = args0; + int error; + uintptr_t phys; + + kprintf("set_range_l1(%p,%p,%lx,%lx,%lx)\n", + args0, ptep, base, start, end); + + if (*ptep != PTE_NULL) { + kprintf("set_range_l1(%p,%p,%lx,%lx,%lx):page exists\n", + args0, ptep, base, start, end); + error = -EBUSY; + goto out; + } + + phys = args->phys + (base - start); + *ptep = phys | attr_to_l1attr(args->attr); + + error = 0; +out: + kprintf("set_range_l1(%p,%p,%lx,%lx,%lx): %d\n", + args0, ptep, base, start, end, error); + return error; +} + +int set_range_l2(void *args0, pte_t *ptep, uint64_t base, uint64_t start, + uint64_t end) +{ + struct set_range_args *args = args0; + uintptr_t phys; + int error; + struct page_table *pt; + + kprintf("set_range_l2(%p,%p,%lx,%lx,%lx)\n", + args0, ptep, base, start, end); + + if (*ptep == PTE_NULL) { + if ((start <= base) && ((base + PTL2_SIZE) <= end) + && ((args->diff & (PTL2_SIZE - 1)) == 0)) { + phys = args->phys + (base - start); + *ptep = phys | attr_to_l2attr(args->attr|PTATTR_LARGEPAGE); + kprintf("set_range_l2(%p,%p,%lx,%lx,%lx):" + "large page\n", + args0, ptep, base, start, end); + error = 0; + goto out; + } + + pt = __alloc_new_pt(IHK_MC_AP_NOWAIT); + if (pt == NULL) { + kprintf("set_range_l2(%p,%p,%lx,%lx,%lx):" + "__alloc_new_pt failed\n", + args0, ptep, base, start, end); + error = -ENOMEM; + goto out; + } + + *ptep = virt_to_phys(pt) | PFL2_PDIR_ATTR; + } + else if (*ptep & PFL2_SIZE) { + kprintf("set_range_l2(%p,%p,%lx,%lx,%lx):" + "page exists\n", + args0, ptep, base, start, end); + error = -EBUSY; + goto out; + } + else { + pt = phys_to_virt(*ptep & PT_PHYSMASK); + } + + error = walk_pte_l1(pt, base, start, end, &set_range_l1, args0); + if (error) { + kprintf("set_range_l2(%p,%p,%lx,%lx,%lx):" + "walk_pte_l1 failed. 
%d\n", + args0, ptep, base, start, end, error); + goto out; + } + + error = 0; +out: + kprintf("set_range_l2(%p,%p,%lx,%lx,%lx): %d\n", + args0, ptep, base, start, end, error); + return error; +} + +int set_range_l3(void *args0, pte_t *ptep, uint64_t base, uint64_t start, + uint64_t end) +{ + struct page_table *pt; + int error; + + kprintf("set_range_l3(%p,%p,%lx,%lx,%lx)\n", + args0, ptep, base, start, end); + + if (*ptep == PTE_NULL) { + pt = __alloc_new_pt(IHK_MC_AP_NOWAIT); + if (pt == NULL) { + kprintf("set_range_l3(%p,%p,%lx,%lx,%lx):" + "__alloc_new_pt failed\n", + args0, ptep, base, start, end); + return -ENOMEM; + } + *ptep = virt_to_phys(pt) | PFL3_PDIR_ATTR; + } + else { + pt = phys_to_virt(*ptep & PT_PHYSMASK); + } + + error = walk_pte_l2(pt, base, start, end, &set_range_l2, args0); + if (error) { + kprintf("set_range_l3(%p,%p,%lx,%lx,%lx):" + "walk_pte_l2 failed. %d\n", + args0, ptep, base, start, end, error); + goto out; + } + + error = 0; +out: + kprintf("set_range_l3(%p,%p,%lx,%lx,%lx): %d\n", + args0, ptep, base, start, end, error); + return error; +} + +int set_range_l4(void *args0, pte_t *ptep, uint64_t base, uint64_t start, + uint64_t end) +{ + struct page_table *pt; + int error; + + kprintf("set_range_l4(%p,%p,%lx,%lx,%lx)\n", + args0, ptep, base, start, end); + + if (*ptep == PTE_NULL) { + pt = __alloc_new_pt(IHK_MC_AP_NOWAIT); + if (pt == NULL) { + kprintf("set_range_l4(%p,%p,%lx,%lx,%lx):" + "__alloc_new_pt failed\n", + args0, ptep, base, start, end); + return -ENOMEM; + } + *ptep = virt_to_phys(pt) | PFL4_PDIR_ATTR; + } + else { + pt = phys_to_virt(*ptep & PT_PHYSMASK); + } + + error = walk_pte_l3(pt, base, start, end, &set_range_l3, args0); + if (error) { + kprintf("set_range_l4(%p,%p,%lx,%lx,%lx):" + "walk_pte_l3 failed. %d\n", + args0, ptep, base, start, end, error); + goto out; + } + + error = 0; +out: + kprintf("set_range_l4(%p,%p,%lx,%lx,%lx): %d\n", + args0, ptep, base, start, end, error); + return error; +} + +int ihk_mc_pt_set_range(page_table_t pt, void *start, void *end, + uintptr_t phys, enum ihk_mc_pt_attribute attr) +{ + int error; + struct set_range_args args; + + kprintf("ihk_mc_pt_set_range(%p,%p,%p,%lx,%x)\n", + pt, start, end, phys, attr); + + args.phys = phys; + args.attr = attr; + args.diff = (uintptr_t)start ^ phys; + + error = walk_pte_l4(pt, 0, (uintptr_t)start, (uintptr_t)end, + &set_range_l4, &args); + if (error) { + kprintf("ihk_mc_pt_set_range(%p,%p,%p,%lx,%x):" + "walk_pte_l4 failed. 
%d\n", + pt, start, end, phys, attr, error); + goto out; + } + + error = 0; +out: + kprintf("ihk_mc_pt_set_range(%p,%p,%p,%lx,%x): %d\n", + pt, start, end, phys, attr, error); + return error; +} + +int ihk_mc_pt_set_pte(page_table_t pt, pte_t *ptep, uintptr_t phys, size_t pgsize, enum ihk_mc_pt_attribute attr) +{ + kprintf("ihk_mc_pt_set_pte(%p,%p,%lx,%lx,%x):\n", + pt, ptep, phys, pgsize, attr); + switch (pgsize) { + case PTL1_SIZE: + *ptep = phys | attr_to_l1attr(attr); + break; + case PTL2_SIZE: + *ptep = phys | attr_to_l2attr(attr | PTATTR_LARGEPAGE); + break; + default: + kprintf("ihk_mc_pt_set_pte(%p,%p,%lx,%lx,%x):\n", + pt, ptep, phys, pgsize, attr); + panic("ihk_mc_pt_set_pte"); + break; + } + return 0; +} + void load_page_table(struct page_table *pt) { unsigned long pt_addr; diff --git a/executer/include/uprotocol.h b/executer/include/uprotocol.h index 67c2fff1..e96a96e6 100644 --- a/executer/include/uprotocol.h +++ b/executer/include/uprotocol.h @@ -71,6 +71,8 @@ struct syscall_load_desc { struct syscall_response { unsigned long status; long ret; + unsigned long fault_address; + unsigned long fault_reason; }; struct syscall_ret_desc { diff --git a/executer/kernel/control.c b/executer/kernel/control.c index 60df3cda..0889ecec 100644 --- a/executer/kernel/control.c +++ b/executer/kernel/control.c @@ -217,9 +217,9 @@ int mcexec_syscall(struct mcctrl_channel *c, unsigned long arg) return 0; } -#ifndef DO_USER_MODE int __do_in_kernel_syscall(ihk_os_t os, struct mcctrl_channel *c, struct syscall_request *sc); +#ifndef DO_USER_MODE // static int remaining_job, base_cpu, job_pos; #endif @@ -243,10 +243,24 @@ int mcexec_wait_syscall(ihk_os_t os, struct syscall_wait_desc *__user req) if(swd.cpu >= usrdata->num_channels)return -EINVAL; c = usrdata->channels + swd.cpu; + if ((usrdata->channelowners[swd.cpu] != NULL) + && (usrdata->channelowners[swd.cpu] != current)) { + printk("mcexec_wait_syscall:double wait %p %p\n", + usrdata->channelowners[swd.cpu], + current); + return -EBUSY; + } #ifdef DO_USER_MODE - wait_event_interruptible(c->wq_syscall, c->req); +retry: + if (wait_event_interruptible(c->wq_syscall, c->req)) { + return -EINTR; + } c->req = 0; + if (!c->param.request_va->valid) { +printk("mcexec_wait_syscall:stray wakeup\n"); + goto retry; + } #else while (1) { c = usrdata->channels + swd.cpu; @@ -285,22 +299,28 @@ if(swd.cpu >= usrdata->num_channels)return -EINVAL; } if (c->param.request_va && c->param.request_va->valid) { +#endif c->param.request_va->valid = 0; /* ack */ dprintk("SC #%lx, %lx\n", c->param.request_va->number, c->param.request_va->args[0]); - if (__do_in_kernel_syscall(os, c, c->param.request_va)) { + usrdata->channelowners[swd.cpu] = current; + if (__do_in_kernel_syscall(os, c, c->param.request_va)) { + if (copy_to_user(&req->sr, c->param.request_va, + sizeof(struct syscall_request))) { + usrdata->channelowners[swd.cpu] = NULL; + return -EFAULT; + } + return 0; + } + usrdata->channelowners[swd.cpu] = NULL; +#ifdef DO_USER_MODE + goto retry; #endif - if (copy_to_user(&req->sr, c->param.request_va, - sizeof(struct syscall_request))) { - return -EFAULT; - } #ifndef DO_USER_MODE - return 0; - } - if (usrdata->mcctrl_dma_abort) { - return -2; - } + if (usrdata->mcctrl_dma_abort) { + return -2; + } } } usrdata->remaining_job = 0; @@ -438,6 +458,13 @@ long mcexec_ret_syscall(ihk_os_t os, struct syscall_ret_desc *__user arg) if (copy_from_user(&ret, arg, sizeof(struct syscall_ret_desc))) { return -EFAULT; } + if (usrdata->channelowners[ret.cpu] != current) { + 
printk("mcexec_ret_syscall:owner mismatch: %p %p\n", + usrdata->channelowners[ret.cpu], + current); + return -EBUSY; + } + usrdata->channelowners[ret.cpu] = NULL; mc = usrdata->channels + ret.cpu; if (!mc) { return -EINVAL; @@ -488,6 +515,15 @@ long mcexec_ret_syscall(ihk_os_t os, struct syscall_ret_desc *__user arg) } else { mc->param.response_va->status = 1; } +#if 1 + { + extern struct vm_area_struct *rus_vma; + + if (zap_vma_ptes(rus_vma, rus_vma->vm_start, rus_vma->vm_end - rus_vma->vm_start)) { + printk("zap_vma_ptes failed\n"); + } + } +#endif return 0; } diff --git a/executer/kernel/ikc.c b/executer/kernel/ikc.c index 4f1dbed1..5597eb5f 100644 --- a/executer/kernel/ikc.c +++ b/executer/kernel/ikc.c @@ -235,6 +235,11 @@ int prepare_ikc_channels(ihk_os_t os) printk("Error: cannot allocate channels.\n"); return -ENOMEM; } + usrdata->channelowners = kzalloc(sizeof(void *) * usrdata->num_channels, GFP_KERNEL); + if (usrdata->channelowners == NULL) { + printk("Error: cannot allocate channelowners.\n"); + return -ENOMEM; + } usrdata->os = os; init_waitqueue_head(&usrdata->wq_prepare); diff --git a/executer/kernel/mcctrl.h b/executer/kernel/mcctrl.h index 88d7a3ff..64fb7cad 100644 --- a/executer/kernel/mcctrl.h +++ b/executer/kernel/mcctrl.h @@ -80,6 +80,7 @@ struct mcctrl_usrdata { unsigned long last_thread_exec; wait_queue_head_t wq_prepare; unsigned long rpgtable; /* per process, not per OS */ + void **channelowners; }; int mcctrl_ikc_send(ihk_os_t os, int cpu, struct ikc_scd_packet *pisp); diff --git a/executer/kernel/syscall.c b/executer/kernel/syscall.c index 5be7a396..5313146e 100644 --- a/executer/kernel/syscall.c +++ b/executer/kernel/syscall.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -37,7 +38,7 @@ static void print_dma_lastreq(void) #endif #if 1 /* x86 depend, host OS side */ -unsigned long translate_rva_to_rpa(ihk_os_t os, unsigned long rpt, unsigned long rva) +unsigned long translate_rva_to_rpa(ihk_os_t os, unsigned long rpt, unsigned long rva, unsigned fflags) { unsigned long rpa; int offsh; @@ -63,6 +64,13 @@ unsigned long translate_rva_to_rpa(ihk_os_t os, unsigned long rpt, unsigned long return -EFAULT; } +#define PTE_RW 0x002 + if ((fflags & FAULT_FLAG_WRITE) && !(pt[ix] & PTE_RW)) { + ihk_device_unmap_virtual(ihk_os_to_dev(os), pt, PAGE_SIZE); + ihk_device_unmap_memory(ihk_os_to_dev(os), phys, PAGE_SIZE); + return -EFAULT; + } + #define PTE_PS 0x080 if (pt[ix] & PTE_PS) { rpa = pt[ix] & ((1UL << 52) - 1) & ~((1UL << offsh) - 1); @@ -84,6 +92,64 @@ out: } #endif +static int pager_call(ihk_os_t os, struct syscall_request *req); +static int remote_page_fault(struct mcctrl_usrdata *usrdata, struct vm_fault *vmf) +{ + int cpu; + struct mcctrl_channel *channel; + volatile struct syscall_request *req; + volatile struct syscall_response *resp; + + printk("remote_page_fault(%p,%p %x)\n", usrdata, vmf->virtual_address, vmf->flags); + /* get peer cpu */ + for (cpu = 0; cpu < usrdata->num_channels; ++cpu) { + if (usrdata->channelowners[cpu] == current) { + break; + } + } + if (cpu >= usrdata->num_channels) { + printk("cpu not found\n"); + return -ENOENT; + } + + channel = &usrdata->channels[cpu]; + req = channel->param.request_va; + resp = channel->param.response_va; + + /* request page fault */ + resp->ret = -EFAULT; + resp->fault_address = (unsigned long)vmf->virtual_address; + resp->fault_reason = (vmf->flags & FAULT_FLAG_WRITE)? 
1: 0; + + req->valid = 0; + resp->status = 3; + +retry: + /* wait for response */ + while (req->valid == 0) { + schedule(); + } + req->valid = 0; + + /* check result */ + if (req->number != __NR_mmap) { + printk("remote_page_fault:invalid response. %lx %lx\n", + req->number, req->args[0]); + return -EIO; + } + else if (req->args[0] != 0x0101) { + resp->ret = pager_call(usrdata->os, (void *)req); + resp->status = 1; + goto retry; + } + else if (req->args[1] != 0) { + printk("remote_page_fault:response %d\n", (int)req->args[1]); + return (int)req->args[1]; + } + printk("remote_page_fault(%p,%p %x): 0\n", usrdata, vmf->virtual_address, vmf->flags); + return 0; +} + static int rus_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct mcctrl_usrdata * usrdata = vma->vm_file->private_data; @@ -91,12 +157,26 @@ static int rus_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) unsigned long rpa; unsigned long phys; int error; + int try; dprintk("mcctrl:page fault:flags %#x pgoff %#lx va %p page %p\n", vmf->flags, vmf->pgoff, vmf->virtual_address, vmf->page); - rpa = translate_rva_to_rpa(usrdata->os, usrdata->rpgtable, - (unsigned long)vmf->virtual_address); + for (try = 1; ; ++try) { + rpa = translate_rva_to_rpa(usrdata->os, usrdata->rpgtable, + (unsigned long)vmf->virtual_address, + vmf->flags); +#define NTRIES 2 + if (((long)rpa >= 0) || (try >= NTRIES)) { + break; + } + + error = remote_page_fault(usrdata, vmf); + if (error) { + printk("forward_page_fault failed. %d\n", error); + break; + } + } if ((long)rpa < 0) { printk("mcctrl:page fault:flags %#x pgoff %#lx va %p page %p\n", vmf->flags, vmf->pgoff, vmf->virtual_address, vmf->page); @@ -130,6 +210,7 @@ static struct file_operations rus_fops = { .mmap = &rus_mmap, }; +struct vm_area_struct *rus_vma = NULL; int reserve_user_space(struct mcctrl_usrdata *usrdata, unsigned long *startp, unsigned long *endp) { struct file *file; @@ -152,6 +233,7 @@ int reserve_user_space(struct mcctrl_usrdata *usrdata, unsigned long *startp, un } start = do_mmap_pgoff(file, 0, end, PROT_READ|PROT_WRITE, MAP_FIXED|MAP_SHARED, 0); + vma = find_vma(current->mm, 0); up_write(¤t->mm->mmap_sem); fput(file); if (IS_ERR_VALUE(start)) { @@ -159,6 +241,7 @@ int reserve_user_space(struct mcctrl_usrdata *usrdata, unsigned long *startp, un return start; } + rus_vma = vma; *startp = start; *endp = end; return 0; @@ -250,12 +333,6 @@ static void clear_wait(unsigned char *p, int size) p[size] = 0; } -static void __return_syscall(struct mcctrl_channel *c, int ret) -{ - c->param.response_va->ret = ret; - c->param.response_va->status = 1; -} - static unsigned long translate_remote_va(struct mcctrl_channel *c, unsigned long rva) { @@ -282,6 +359,7 @@ static unsigned long translate_remote_va(struct mcctrl_channel *c, //extern struct mcctrl_channel *channels; +#if 0 int __do_in_kernel_syscall(ihk_os_t os, struct mcctrl_channel *c, struct syscall_request *sc) { @@ -397,4 +475,328 @@ int __do_in_kernel_syscall(ihk_os_t os, struct mcctrl_channel *c, } } } +#endif #endif /* !DO_USER_MODE */ + +static void __return_syscall(struct mcctrl_channel *c, long ret) +{ + c->param.response_va->ret = ret; + c->param.response_va->status = 1; +} + +struct pager { + struct list_head list; + struct inode * inode; + void * handle; +}; + +/* + * for linux v2.6.35 or prior + */ +#ifndef DEFINE_SEMAPHORE +#define DEFINE_SEMAPHORE(...) 
DECLARE_MUTEX(__VA_ARGS__) +#endif + +static DEFINE_SEMAPHORE(pager_sem); +static struct list_head pager_list = LIST_HEAD_INIT(pager_list); + +struct pager_create_result { + uintptr_t handle; + int maxprot; +}; + +static int pager_req_create(ihk_os_t os, int fd, int flags, int prot, uintptr_t result_pa) +{ + const int ignore_flags = MAP_FIXED | MAP_DENYWRITE; + const int ok_flags = MAP_PRIVATE; + ihk_device_t dev = ihk_os_to_dev(os); + int error; + void *handle = NULL; + struct pager_create_result *resp; + int maxprot = -1; + struct file *file = NULL; + struct inode *inode; + struct pager *pager; + uintptr_t phys; + + printk("pager_req_create(%d,%x,%x,%lx)\n", fd, flags, prot, (long)result_pa); + + if (flags & ~(ignore_flags | ok_flags)) { + printk("pager_req_create(%d,%x,%x,%lx):not supported flags %x\n", + fd, flags, prot, (long)result_pa, + flags & ~(ignore_flags | ok_flags)); + error = -EINVAL; + goto out; + } + + file = fget(fd); + if (file == NULL) { + error = -EBADF; + printk("pager_req_create(%d,%x,%x,%lx):file not found. %d\n", fd, flags, prot, (long)result_pa, error); + goto out; + } + + inode = file->f_path.dentry->d_inode; + if (inode == NULL) { + error = -EBADF; + printk("pager_req_create(%d,%x,%x,%lx):inode not found. %d\n", fd, flags, prot, (long)result_pa, error); + goto out; + } + + if (!(file->f_mode & (FMODE_READ | FMODE_WRITE))) { + maxprot = PROT_NONE; + } + else { + maxprot = 0; + if (file->f_mode & FMODE_READ) { + maxprot |= PROT_READ; + maxprot |= PROT_EXEC; + } + if (file->f_mode & FMODE_WRITE) { + maxprot |= PROT_WRITE; + } + } + + error = down_interruptible(&pager_sem); + if (error) { + error = -EINTR; + printk("pager_req_create(%d,%x,%x,%lx):signaled. %d\n", fd, flags, prot, (long)result_pa, error); + goto out; + } + + list_for_each_entry(pager, &pager_list, list) { + if (pager->inode == inode) { + handle = pager->handle; + error = -EALREADY; + up(&pager_sem); + goto found; + } + } + + pager = kzalloc(sizeof(*pager), GFP_KERNEL); + if (pager == NULL) { + error = -ENOMEM; + printk("pager_req_create(%d,%x,%x,%lx):kzalloc failed. %d\n", fd, flags, prot, (long)result_pa, error); + up(&pager_sem); + goto out; + } + + down_write(¤t->mm->mmap_sem); + handle = (void *)do_mmap_pgoff(file, 0, PAGE_SIZE, prot, (flags & ok_flags), 0); + up_write(¤t->mm->mmap_sem); + if (IS_ERR(handle)) { + error = PTR_ERR(handle); + printk("pager_req_create(%d,%x,%x,%lx):mmap failed. %d\n", + fd, flags, prot, (long)result_pa, error); + kfree(pager); + up(&pager_sem); + goto out; + } + + pager->inode = inode; + pager->handle = handle; + list_add(&pager->list, &pager_list); + up(&pager_sem); + + error = 0; +found: + phys = ihk_device_map_memory(dev, result_pa, sizeof(*resp)); + resp = ihk_device_map_virtual(dev, phys, sizeof(*resp), NULL, 0); + resp->handle = (uintptr_t)handle; + resp->maxprot = maxprot; + ihk_device_unmap_virtual(dev, resp, sizeof(*resp)); + ihk_device_unmap_memory(dev, phys, sizeof(*resp)); + +out: + if (file != NULL) { + fput(file); + } + printk("pager_req_create(%d,%x,%x,%lx): %d %p %x\n", + fd, flags, prot, (long)result_pa, error, handle, maxprot); + return error; +} + +static int pager_req_release(ihk_os_t os, uintptr_t handle) +{ + struct vm_area_struct *vma; + int error; + struct pager *pager; + struct pager *next; + + printk("pager_req_relase(%p,%lx)\n", os, handle); + + error = down_interruptible(&pager_sem); + if (error) { + printk("pager_req_relase(%p,%lx):signaled. 
%d\n", os, handle, error); + down_write(¤t->mm->mmap_sem); + goto out; + } + + list_for_each_entry_safe(pager, next, &pager_list, list) { + if ((uintptr_t)pager->handle == handle) { + list_del(&pager->list); + up(&pager_sem); + kfree(pager); + goto found; + } + } + up(&pager_sem); + + error = -EBADF; + printk("pager_req_relase(%p,%lx):pager not found. %d\n", os, handle, error); + down_write(¤t->mm->mmap_sem); + goto out; + +found: + down_write(¤t->mm->mmap_sem); + vma = find_vma(current->mm, handle); + if (vma == 0) { + error = -EBADF; + printk("pager_req_relase(%p,%lx):vma not found. %d\n", os, handle, error); + goto out; + } + if ((vma->vm_start != handle) || (vma->vm_end != (handle + PAGE_SIZE))) { + error = -EBADF; + printk("pager_req_relase(%p,%lx):invalid vma. %d\n", os, handle, error); + goto out; + } + if (vma->vm_file == NULL) { + error = -EBADF; + printk("pager_req_relase(%p,%lx):file not found. %d\n", os, handle, error); + goto out; + } + + error = do_munmap(current->mm, handle, PAGE_SIZE); + if (error) { + printk("pager_req_relase(%p,%lx):do_munmap failed. %d\n", os, handle, error); + goto out; + } + + error = 0; +out: + up_write(¤t->mm->mmap_sem); + printk("pager_req_relase(%p,%lx): %d\n", os, handle, error); + return error; +} + +static int pager_req_read(ihk_os_t os, uintptr_t handle, off_t off, size_t size, uintptr_t rpa) +{ + ihk_device_t dev = ihk_os_to_dev(os); + struct vm_area_struct *vma; + int error; + struct file *file; + uintptr_t phys; + void *buf; + mm_segment_t fs; + loff_t pos; + ssize_t ss; + + printk("pager_req_read(%lx,%lx,%lx,%lx)\n", handle, off, size, rpa); + + down_read(¤t->mm->mmap_sem); + vma = find_vma(current->mm, handle); + if (vma == 0) { + error = -EBADF; + printk("pager_req_read(%lx,%lx,%lx,%lx):vma not found. %d\n", handle, off, size, rpa, error); + up_read(¤t->mm->mmap_sem); + goto out; + } + if ((vma->vm_start != handle) || (vma->vm_end != (handle + PAGE_SIZE))) { + error = -EBADF; + printk("pager_req_read(%lx,%lx,%lx,%lx):invalid vma. %d\n", handle, off, size, rpa, error); + up_read(¤t->mm->mmap_sem); + goto out; + } + file = vma->vm_file; + if (file == NULL) { + error = -EBADF; + printk("pager_req_read(%lx,%lx,%lx,%lx):file not found. %d\n", handle, off, size, rpa, error); + up_read(¤t->mm->mmap_sem); + goto out; + } + get_file(file); + up_read(¤t->mm->mmap_sem); + + phys = ihk_device_map_memory(dev, rpa, size); + buf = ihk_device_map_virtual(dev, phys, size, NULL, 0); + fs = get_fs(); + set_fs(KERNEL_DS); + pos = off; + ss = vfs_read(file, buf, size, &pos); + if ((ss >= 0) && (ss != size)) { + if (clear_user(buf+ss, size-ss) == 0) { + ss = size; + } + else { + ss = -EIO; + } + } + set_fs(fs); + ihk_device_unmap_virtual(dev, buf, size); + ihk_device_unmap_memory(dev, phys, size); + fput(file); + if (ss < 0) { + error = ss; + printk("pager_req_read(%lx,%lx,%lx,%lx):pread failed. 
%d\n", handle, off, size, rpa, error); + goto out; + } + error = 0; +out: + printk("pager_req_read(%lx,%lx,%lx,%lx): %d\n", handle, off, size, rpa, error); + return error; +} + +static int pager_call(ihk_os_t os, struct syscall_request *req) +{ + int error; + + printk("pager_call(%p %#lx)\n", req, req->args[0]); + switch (req->args[0]) { +#define PAGER_REQ_CREATE 0x0001 +#define PAGER_REQ_RELEASE 0x0002 +#define PAGER_REQ_READ 0x0003 + case PAGER_REQ_CREATE: + error = pager_req_create(os, req->args[1], req->args[2], req->args[3], req->args[4]); + break; + + case PAGER_REQ_RELEASE: + error = pager_req_release(os, req->args[1]); + break; + + case PAGER_REQ_READ: + error = pager_req_read(os, req->args[1], req->args[2], req->args[3], req->args[4]); + break; + + default: + error = -ENOSYS; + break; + } + + printk("pager_call(%p %#lx): %d\n", req, req->args[0], error); + return error; +} + +int __do_in_kernel_syscall(ihk_os_t os, struct mcctrl_channel *c, struct syscall_request *sc) +{ + int error; + long ret; + + printk("__do_in_kernel_syscall(%p,%p,%p %ld)\n", os, c, sc, sc->number); + switch (sc->number) { + case __NR_mmap: + ret = pager_call(os, sc); + break; + + default: + error = -ENOSYS; + goto out; + break; + } + + __return_syscall(c, ret); + + error = 0; +out: + printk("__do_in_kernel_syscall(%p,%p,%p %ld): %d\n", os, c, sc, sc->number, error); + return error; +} diff --git a/executer/user/mcexec.c b/executer/user/mcexec.c index c0ff8cc9..4ed7edf5 100644 --- a/executer/user/mcexec.c +++ b/executer/user/mcexec.c @@ -635,33 +635,12 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock) pthread_mutex_unlock(lock); return w.sr.args[0]; - case __NR_mmap: { - // w.sr.args[0] is converted to MIC physical address - __dprintf("mcexec.c,mmap,MIC-paddr=%lx,len=%lx,prot=%lx,flags=%lx,fd=%lx,offset=%lx\n", - w.sr.args[0], w.sr.args[1], w.sr.args[2], w.sr.args[3], w.sr.args[4], w.sr.args[5]); - off_t old_off = lseek(w.sr.args[4], 0, SEEK_CUR); - if(old_off == -1) { __dprint("mcexec.c,mmap,lseek failed\n"); ret = -errno; goto mmap_out; } - off_t rlseek = lseek(w.sr.args[4], w.sr.args[5], SEEK_SET); - if(rlseek == -1) { __dprint("mcexec.c,mmap,lseek failed\n"); ret = -errno; goto mmap_out; } - ssize_t toread = w.sr.args[1]; - ret = 0; - while(toread > 0) { - __dprintf("mcexec.c,mmap,read,addr=%lx,len=%lx\n", (long int)((void *)dma_buf + w.sr.args[1] - toread), toread); - ssize_t rread = read(w.sr.args[4], (void *)dma_buf + w.sr.args[1] - toread, toread); - if(rread == 0) { - __dprint("mcexec.c,mmap,read==0\n"); - goto mmap_zero_out; - } else if(rread < 0) { - __dprint("mcexec.c,mmap,read failed\n"); ret = -errno; break; - } - toread -= rread; - } - mmap_zero_out: - rlseek = lseek(w.sr.args[4], old_off, SEEK_SET); - if(rlseek == -1) { __dprint("mcexec.c,mmap,lseek failed\n"); ret = -errno; } - mmap_out: - do_syscall_return(fd, cpu, ret, 1, (unsigned long)dma_buf, w.sr.args[0], w.sr.args[1]); - break; } + case __NR_mmap: + case __NR_munmap: + case __NR_mprotect: + /* reserved for internal use */ + do_syscall_return(fd, cpu, -ENOSYS, 0, 0, 0, 0); + break; #ifdef USE_SYSCALL_MOD_CALL case 303:{ diff --git a/kernel/Makefile.build b/kernel/Makefile.build index 36bf10b2..8c0629cf 100644 --- a/kernel/Makefile.build +++ b/kernel/Makefile.build @@ -1,6 +1,6 @@ IHKDIR=$(IHKBASE)/$(TARGETDIR) OBJS = init.o mem.o debug.o mikc.o listeners.o ap.o syscall.o cls.o host.o -OBJS += process.o copy.o waitq.o futex.o timer.o plist.o +OBJS += process.o copy.o waitq.o futex.o timer.o plist.o memobj.o DEPSRCS=$(wildcard 
$(SRC)/*.c) CFLAGS += -I$(SRC)/include -mcmodel=kernel -D__KERNEL__ diff --git a/kernel/Makefile.build.dcfa b/kernel/Makefile.build.dcfa index 580eeb89..db8f89a7 100644 --- a/kernel/Makefile.build.dcfa +++ b/kernel/Makefile.build.dcfa @@ -1,6 +1,6 @@ IHKDIR=$(IHKBASE)/$(TARGETDIR) OBJS = init.o mem.o debug.o mikc.o listeners.o ap.o syscall.o cls.o host.o -OBJS += process.o copy.o waitq.o futex.o timer.o plist.o +OBJS += process.o copy.o waitq.o futex.o timer.o plist.o memobj.o DEPSRCS=$(wildcard $(SRC)/*.c) CFLAGS += -I$(SRC)/include -mcmodel=kernel -D__KERNEL__ diff --git a/kernel/host.c b/kernel/host.c index 08ea5809..0327ac36 100644 --- a/kernel/host.c +++ b/kernel/host.c @@ -92,12 +92,13 @@ static int process_msg_prepare_process(unsigned long rphys) range_npages = (e - s) >> PAGE_SHIFT; flags = VR_NONE; flags |= PROT_TO_VR_FLAG(pn->sections[i].prot); + flags |= VRFLAG_PROT_TO_MAXPROT(flags); if((up_v = ihk_mc_alloc_pages(range_npages, IHK_MC_AP_NOWAIT)) == NULL){ goto err; } up = virt_to_phys(up_v); - if(add_process_memory_range(proc, s, e, up, flags) != 0){ + if(add_process_memory_range(proc, s, e, up, flags, NULL, 0) != 0){ ihk_mc_free_pages(up_v, range_npages); goto err; } @@ -168,29 +169,32 @@ static int process_msg_prepare_process(unsigned long rphys) /* Map system call stuffs */ flags = VR_RESERVED | VR_PROT_READ | VR_PROT_WRITE; + flags |= VRFLAG_PROT_TO_MAXPROT(flags); addr = proc->vm->region.map_start - PAGE_SIZE * SCD_RESERVED_COUNT; e = addr + PAGE_SIZE * DOORBELL_PAGE_COUNT; if(add_process_memory_range(proc, addr, e, cpu_local_var(scp).doorbell_pa, - VR_REMOTE | flags) != 0){ + VR_REMOTE | flags, NULL, 0) != 0){ goto err; } addr = e; e = addr + PAGE_SIZE * REQUEST_PAGE_COUNT; if(add_process_memory_range(proc, addr, e, cpu_local_var(scp).request_pa, - VR_REMOTE | flags) != 0){ + VR_REMOTE | flags, NULL, 0) != 0){ goto err; } addr = e; e = addr + PAGE_SIZE * RESPONSE_PAGE_COUNT; if(add_process_memory_range(proc, addr, e, cpu_local_var(scp).response_pa, - flags) != 0){ + flags, NULL, 0) != 0){ goto err; } /* Map, copy and update args and envs */ + flags = VR_PROT_READ | VR_PROT_WRITE; + flags |= VRFLAG_PROT_TO_MAXPROT(flags); addr = e; e = addr + PAGE_SIZE * ARGENV_PAGE_COUNT; @@ -200,7 +204,7 @@ static int process_msg_prepare_process(unsigned long rphys) args_envs_p = virt_to_phys(args_envs); if(add_process_memory_range(proc, addr, e, args_envs_p, - VR_PROT_READ|VR_PROT_WRITE) != 0){ + flags, NULL, 0) != 0){ ihk_mc_free_pages(args_envs, ARGENV_PAGE_COUNT); goto err; } diff --git a/kernel/include/memobj.h b/kernel/include/memobj.h new file mode 100644 index 00000000..f4e15b4f --- /dev/null +++ b/kernel/include/memobj.h @@ -0,0 +1,22 @@ +#ifndef HEADER_MEMOBJ_H +#define HEADER_MEMOBJ_H + +#include +#include +#include +#include + +struct memobj { + struct list_head list; + ihk_atomic_t ref; + uintptr_t handle; + struct list_head page_list; + ihk_spinlock_t page_list_lock; +}; + +int memobj_create(int fd, int flags, int prot, struct memobj **objp, int *maxprotp); +void memobj_ref(struct memobj *obj); +void memobj_release(struct memobj *obj); +int memobj_get_page(struct memobj *obj, off_t off, size_t pgsize, uintptr_t *physp); + +#endif /* HEADER_MEMOBJ_H */ diff --git a/kernel/include/page.h b/kernel/include/page.h index 6d5aaaaa..d06d4d48 100644 --- a/kernel/include/page.h +++ b/kernel/include/page.h @@ -2,16 +2,25 @@ #define __HEADER_PAGE_H struct page { - struct list_head list; - uint64_t flags; - int64_t count; + struct list_head list; + uint8_t mode; + uint8_t padding[3]; + 
int32_t count; + off_t offset; }; -/* flags */ -#define PAGE_IN_LIST 0x0001UL +/* mode */ +enum page_mode { + PM_NONE = 0x00, + PM_PENDING_FREE = 0x01, + PM_PAGEIO = 0x02, + PM_MAPPED = 0x03, + PM_ANON_COW = 0x04, +}; struct page *phys_to_page(uintptr_t phys); uintptr_t page_to_phys(struct page *page); +int page_unmap(struct page *page); void *allocate_pages(int npages, enum ihk_mc_ap_flag flag); void free_pages(void *va, int npages); diff --git a/kernel/include/pager.h b/kernel/include/pager.h new file mode 100644 index 00000000..840edcda --- /dev/null +++ b/kernel/include/pager.h @@ -0,0 +1,26 @@ +#ifndef HEADER_PAGER_H +#define HEADER_PAGER_H + +#include + +enum pager_op { + PAGER_REQ_CREATE = 0x0001, + PAGER_REQ_RELEASE = 0x0002, + PAGER_REQ_READ = 0x0003, +}; + +/* + * int pager_req_create(int fd, int flags, int prot, uintptr_t result_rpa); + */ +struct pager_create_result { + uintptr_t handle; + int maxprot; +}; + +/* + * int pager_req_release(uintptr_t handle); + */ +/* + * int pager_req_read(uintptr_t handle, off_t off, size_t size, uintptr_t buf_rpa); + */ +#endif /* HEADER_PAGER_H */ diff --git a/kernel/include/process.h b/kernel/include/process.h index 5788f840..ad0f43bc 100644 --- a/kernel/include/process.h +++ b/kernel/include/process.h @@ -7,6 +7,7 @@ #include #include #include +#include #define VR_NONE 0x0 #define VR_STACK 0x1 @@ -14,13 +15,21 @@ #define VR_IO_NOCACHE 0x100 #define VR_REMOTE 0x200 #define VR_DEMAND_PAGING 0x1000 +#define VR_PRIVATE 0x2000 #define VR_PROT_NONE 0x00000000 #define VR_PROT_READ 0x00010000 #define VR_PROT_WRITE 0x00020000 #define VR_PROT_EXEC 0x00040000 #define VR_PROT_MASK 0x00070000 +#define VR_MAXPROT_NONE 0x00000000 +#define VR_MAXPROT_READ 0x00100000 +#define VR_MAXPROT_WRITE 0x00200000 +#define VR_MAXPROT_EXEC 0x00400000 +#define VR_MAXPROT_MASK 0x00700000 #define PROT_TO_VR_FLAG(prot) (((unsigned long)(prot) << 16) & VR_PROT_MASK) +#define VRFLAG_PROT_TO_MAXPROT(vrflag) (((vrflag) & VR_PROT_MASK) << 4) +#define VRFLAG_MAXPROT_TO_PROT(vrflag) (((vrflag) & VR_MAXPROT_MASK) >> 4) #define PS_RUNNING 0x1 #define PS_INTERRUPTIBLE 0x2 @@ -41,6 +50,8 @@ struct vm_range { struct list_head list; unsigned long start, end; unsigned long flag; + struct memobj *memobj; + off_t objoff; }; struct vm_regions { @@ -106,7 +117,6 @@ struct process_vm { // is protected by its own lock (see ihk/manycore/generic/page_alloc.c) }; - struct process *create_process(unsigned long user_pc); struct process *clone_process(struct process *org, unsigned long pc, unsigned long sp); @@ -114,10 +124,12 @@ void destroy_process(struct process *proc); void hold_process(struct process *proc); void free_process(struct process *proc); void free_process_memory(struct process *proc); +void flush_process_memory(struct process *proc); int add_process_memory_range(struct process *process, unsigned long start, unsigned long end, - unsigned long phys, unsigned long flag); + unsigned long phys, unsigned long flag, + struct memobj *memobj, off_t objoff); int remove_process_memory_range( struct process *process, unsigned long start, unsigned long end); int split_process_memory_range(struct process *process, @@ -133,6 +145,7 @@ struct vm_range *next_process_memory_range( struct process_vm *vm, struct vm_range *range); struct vm_range *previous_process_memory_range( struct process_vm *vm, struct vm_range *range); +int page_fault_process_memory_range(struct process *proc, struct vm_range *range, uintptr_t fault_addr, uint64_t reason); int remove_process_region(struct process *proc, unsigned 
long start, unsigned long end); struct program_load_desc; diff --git a/kernel/include/syscall.h b/kernel/include/syscall.h index 2701489c..7a3b76a5 100644 --- a/kernel/include/syscall.h +++ b/kernel/include/syscall.h @@ -126,6 +126,8 @@ struct syscall_request { struct syscall_response { unsigned long status; long ret; + unsigned long fault_address; + unsigned long fault_reason; }; struct syscall_post { @@ -190,6 +192,7 @@ struct syscall_params { extern int do_syscall(struct syscall_request *req, ihk_mc_user_context_t *ctx); extern int obtain_clone_cpuid(); +extern long syscall_generic_forwarding(int n, ihk_mc_user_context_t *ctx); #define DECLARATOR(number,name) __NR_##name = number, #define SYSCALL_HANDLED(number,name) DECLARATOR(number,name) diff --git a/kernel/mem.c b/kernel/mem.c index 401d38ee..37be9be1 100644 --- a/kernel/mem.c +++ b/kernel/mem.c @@ -1,3 +1,4 @@ +#include #include #include #include @@ -68,12 +69,12 @@ void free_pages(void *va, int npages) struct list_head *pendings = &cpu_local_var(pending_free_pages); struct page *page; + page = phys_to_page(virt_to_phys(va)); + if (page->mode != PM_NONE) { + panic("free_pages:not PM_NONE"); + } if (pendings->next != NULL) { - page = phys_to_page(virt_to_phys(va)); - if (page->flags & PAGE_IN_LIST) { - panic("free_pages"); - } - page->flags |= PAGE_IN_LIST; + page->mode = PM_PENDING_FREE; page->count = npages; list_add_tail(&page->list, pendings); return; @@ -103,10 +104,10 @@ void finish_free_pages_pending(void) } list_for_each_entry_safe(page, next, pendings, list) { - if (!(page->flags & PAGE_IN_LIST)) { - panic("free_pending_pages"); + if (page->mode != PM_PENDING_FREE) { + panic("free_pending_pages:not PM_PENDING_FREE"); } - page->flags &= ~PAGE_IN_LIST; + page->mode = PM_NONE; list_del(&page->list); ihk_pagealloc_free(pa_allocator, page_to_phys(page), page->count); } @@ -143,72 +144,39 @@ static struct ihk_mc_interrupt_handler query_free_mem_handler = { void sigsegv(void *); -static void page_fault_handler(unsigned long address, void *regs, - unsigned long rbp) +static void unhandled_page_fault(struct process *proc, unsigned long address, void *regs) { - struct vm_range *range, *next; - char found = 0; + struct process_vm *vm = proc->vm; + struct vm_range *range; + char found; int irqflags; unsigned long error = ((struct x86_regs *)regs)->error; irqflags = kprintf_lock(); - __kprintf("[%d] Page fault for 0x%lX, (rbp: 0x%lX)\n", - ihk_mc_get_processor_id(), address, rbp); + __kprintf("[%d] Page fault for 0x%lX\n", + ihk_mc_get_processor_id(), address); + __kprintf("%s for %s access in %s mode (reserved bit %s set), " + "it %s an instruction fetch\n", + (error & PF_PROT ? "protection fault" : "no page found"), + (error & PF_WRITE ? "write" : "read"), + (error & PF_USER ? "user" : "kernel"), + (error & PF_RSVD ? "was" : "wasn't"), + (error & PF_INSTR ? "was" : "wasn't")); - __kprintf("%s for %s access in %s mode (reserved bit %s set), it %s an instruction fetch\n", - (error & PF_PROT ? "protection fault" : "no page found"), - (error & PF_WRITE ? "write" : "read"), - (error & PF_USER ? "user" : "kernel"), - (error & PF_RSVD ? "was" : "wasn't"), - (error & PF_INSTR ? "was" : "wasn't")); - - list_for_each_entry_safe(range, next, - &cpu_local_var(current)->vm->vm_range_list, - list) { - + found = 0; + list_for_each_entry(range, &vm->vm_range_list, list) { if (range->start <= address && range->end > address) { - __kprintf("address is in range, flag: 0x%X! 
\n", range->flag); - if(range->flag & VR_DEMAND_PAGING){ - //allocate page for demand paging - __kprintf("demand paging\n"); - void* pa = allocate_pages(1, IHK_MC_AP_CRITICAL); - if(!pa){ - kprintf_unlock(irqflags); - panic("allocate_pages failed"); - } - __kprintf("physical memory area obtained %lx\n", virt_to_phys(pa)); - - { - enum ihk_mc_pt_attribute flag = 0; - struct process *process = cpu_local_var(current); - unsigned long flags = ihk_mc_spinlock_lock(&process->vm->page_table_lock); - const enum ihk_mc_pt_attribute attr = flag | PTATTR_WRITABLE | PTATTR_USER | PTATTR_FOR_USER; - - int rc = ihk_mc_pt_set_page(process->vm->page_table, (void*)(address & PAGE_MASK), virt_to_phys(pa), attr); - if(rc != 0) { - ihk_mc_spinlock_unlock(&process->vm->page_table_lock, flags); - __kprintf("ihk_mc_pt_set_page failed,rc=%d,%p,%lx,%08x\n", rc, (void*)(address & PAGE_MASK), virt_to_phys(pa), attr); - ihk_mc_pt_print_pte(process->vm->page_table, (void*)address); - goto fn_fail; - } - ihk_mc_spinlock_unlock(&process->vm->page_table_lock, flags); - __kprintf("update_process_page_table success\n"); - } - kprintf_unlock(irqflags); - memset(pa, 0, PAGE_SIZE); - return; - } found = 1; - ihk_mc_pt_print_pte(cpu_local_var(current)->vm->page_table, - (void*)address); + __kprintf("address is in range, flag: 0x%X! \n", + range->flag); + ihk_mc_pt_print_pte(vm->page_table, (void*)address); break; } } - - if (!found) + if (!found) { __kprintf("address is out of range! \n"); + } - fn_fail: kprintf_unlock(irqflags); /* TODO */ @@ -216,19 +184,72 @@ static void page_fault_handler(unsigned long address, void *regs, #ifdef DEBUG_PRINT_MEM { - const struct x86_regs *_regs = regs; - dkprintf("*rsp:%lx,*rsp+8:%lx,*rsp+16:%lx,*rsp+24:%lx,\n", - *((unsigned long*)_regs->rsp), - *((unsigned long*)_regs->rsp+8), - *((unsigned long*)_regs->rsp+16), - *((unsigned long*)_regs->rsp+24) - ); + uint64_t *sp = (void *)REGS_GET_STACK_POINTER(regs); + + kprintf("*rsp:%lx,*rsp+8:%lx,*rsp+16:%lx,*rsp+24:%lx,\n", + sp[0], sp[1], sp[2], sp[3]); } #endif +#if 0 + panic("mem fault"); +#endif sigsegv(regs); + return; +} - //panic("mem fault"); +static void page_fault_handler(unsigned long address, unsigned long reason, void *regs) +{ + struct process *proc = cpu_local_var(current); + struct process_vm *vm = proc->vm; + struct vm_range *range; + unsigned long vrflag; + unsigned long denied; + int error; + + kprintf("[%d]page_fault_handler(%lx,%lx,%p)\n", + ihk_mc_get_processor_id(), address, reason, regs); + + ihk_mc_spinlock_lock_noirq(&vm->memory_range_lock); + range = lookup_process_memory_range(vm, address, address+1); + if (range == NULL) { + kprintf("page_fault_handler(%lx,%lx,%p):out of range\n", + address, reason, regs); + unhandled_page_fault(proc, address, regs); + goto out; + } + + if (reason & PF_WRITE) { + vrflag = VR_PROT_WRITE; + } + else if (reason & PF_INSTR) { + vrflag = VR_PROT_EXEC; + } + else { + vrflag = VR_PROT_READ; + } + + denied = vrflag & ~range->flag; + if (denied) { + kprintf("page_fault_handler(%lx,%lx,%p):access denied. %lx\n", + address, reason, regs, denied); + unhandled_page_fault(proc, address, regs); + goto out; + } + + error = page_fault_process_memory_range(proc, range, address, reason); + if (error) { + kprintf("page_fault_handler(%lx,%lx,%p):fault range failed. 
%d\n", + address, reason, regs, error); + unhandled_page_fault(proc, address, regs); + goto out; + } + +out: + ihk_mc_spinlock_unlock_noirq(&vm->memory_range_lock); + kprintf("[%d]page_fault_handler(%lx,%lx,%p):\n", + ihk_mc_get_processor_id(), address, reason, regs); + return; } static void page_allocator_init(void) @@ -289,6 +310,7 @@ static void page_allocator_init(void) &query_free_mem_handler); } +#if 1 struct page *phys_to_page(uintptr_t phys) { int64_t ix; @@ -316,6 +338,26 @@ uintptr_t page_to_phys(struct page *page) return phys; } +int page_unmap(struct page *page) +{ + kprintf("page_unmap(%p %x %d)\n", page, page->mode, page->count); + if (page->mode != PM_MAPPED) { + panic("page_unmap:not PM_MAPPED"); + } + + if (--page->count > 0) { + /* other mapping exist */ + kprintf("page_unmap(%p %x %d): 0\n", page, page->mode, page->count); + return 0; + } + + /* no mapping exist */ + list_del(&page->list); + page->mode = PM_NONE; + kprintf("page_unmap(%p %x %d): 1\n", page, page->mode, page->count); + return 1; +} + static void page_init(void) { size_t npages; @@ -330,6 +372,7 @@ static void page_init(void) memset(pa_pages, 0, allocsize); return; } +#endif void register_kmalloc(void) { diff --git a/kernel/memobj.c b/kernel/memobj.c new file mode 100644 index 00000000..824059cb --- /dev/null +++ b/kernel/memobj.c @@ -0,0 +1,221 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define dkprintf(...) kprintf(__VA_ARGS__) +#define ekprintf(...) kprintf(__VA_ARGS__) + +static ihk_spinlock_t memobj_list_lock = SPIN_LOCK_UNLOCKED; +static LIST_HEAD(memobj_list); + +int memobj_create(int fd, int flags, int prot, struct memobj **objpp, int *maxprotp) +{ + ihk_mc_user_context_t ctx; + struct pager_create_result result; + int error; + struct memobj *memobj = NULL; + struct memobj *obj; + + kprintf("memobj_create(%d,%x,%x)\n", fd, flags, prot); + memobj = kmalloc(sizeof(*memobj), IHK_MC_AP_NOWAIT); + if (memobj == NULL) { + error = -ENOMEM; + kprintf("memobj_create(%d,%x,%x):kmalloc failed. %d\n", fd, flags, prot, error); + goto out; + } + +retry: + ihk_mc_syscall_arg0(&ctx) = PAGER_REQ_CREATE; + ihk_mc_syscall_arg1(&ctx) = fd; + ihk_mc_syscall_arg2(&ctx) = flags; + ihk_mc_syscall_arg3(&ctx) = prot; + ihk_mc_syscall_arg4(&ctx) = virt_to_phys(&result); + + error = syscall_generic_forwarding(__NR_mmap, &ctx); + if (error == -EALREADY) { + kprintf("memobj_create(%d,%x,%x,%p):create failed. %d\n", + fd, flags, prot, objpp, error); + ihk_mc_spinlock_lock_noirq(&memobj_list_lock); + list_for_each_entry(obj, &memobj_list, list) { + if (obj->handle == result.handle) { + memobj_ref(obj); + ihk_mc_spinlock_unlock_noirq(&memobj_list_lock); + kfree(memobj); + memobj = obj; + goto found; + } + } + ihk_mc_spinlock_unlock_noirq(&memobj_list_lock); + goto retry; + } + else if (error) { + kprintf("memobj_create(%d,%x,%x,%p):create failed. 
%d\n", + fd, flags, prot, objpp, error); + goto out; + } + + memset(memobj, 0, sizeof(*memobj)); + ihk_atomic_set(&memobj->ref, 1); + memobj->handle = result.handle; + INIT_LIST_HEAD(&memobj->page_list); + ihk_mc_spinlock_init(&memobj->page_list_lock); + + ihk_mc_spinlock_lock_noirq(&memobj_list_lock); + list_add(&memobj->list, &memobj_list); + ihk_mc_spinlock_unlock_noirq(&memobj_list_lock); + +found: + error = 0; + *objpp = memobj; + *maxprotp = result.maxprot; + memobj = NULL; + +out: + kprintf("memobj_create(%d,%x,%x):%d %p %x\n", fd, flags, prot, error, *objpp, *maxprotp); + return error; +} + +void memobj_ref(struct memobj *obj) +{ + kprintf("memobj_ref(%p):\n", obj); + ihk_atomic_inc(&obj->ref); + return; +} + +void memobj_release(struct memobj *obj) +{ + ihk_mc_user_context_t ctx; + int error; + + kprintf("memobj_release(%p)\n", obj); + ihk_mc_spinlock_lock_noirq(&memobj_list_lock); + if (!ihk_atomic_dec_and_test(&obj->ref)) { + ihk_mc_spinlock_unlock_noirq(&memobj_list_lock); + kprintf("memobj_release(%p):keep\n", obj); + return; + } + list_del(&obj->list); + ihk_mc_spinlock_unlock_noirq(&memobj_list_lock); + + ihk_mc_syscall_arg0(&ctx) = PAGER_REQ_RELEASE; + ihk_mc_syscall_arg1(&ctx) = obj->handle; + + error = syscall_generic_forwarding(__NR_mmap, &ctx); + if (error) { + kprintf("memobj_release(%p):release failed. %d\n", obj, error); + /* through */ + } + + kfree(obj); + kprintf("memobj_release(%p):free\n", obj); + return; +} + +int memobj_get_page(struct memobj *obj, off_t off, size_t pgsize, uintptr_t *physp) +{ + int error; + void *virt = NULL; + uintptr_t phys = -1; + ihk_mc_user_context_t ctx; + struct page *page; + + kprintf("memobj_get_page(%p,%lx,%lx,%p)\n", obj, off, pgsize, physp); + if (pgsize != PAGE_SIZE) { + error = -ENOMEM; + goto out; + } + +retry: + for (;;) { + ihk_mc_spinlock_lock_noirq(&obj->page_list_lock); + list_for_each_entry(page, &obj->page_list, list) { + if ((page->mode != PM_PAGEIO) && (page->mode != PM_MAPPED)) { + panic("memobj_get_page:invalid obj page"); + } + if (page->offset == off) { + if (page->mode == PM_PAGEIO) { + ihk_mc_spinlock_unlock_noirq(&obj->page_list_lock); + goto retry; + } + ++page->count; + phys = page_to_phys(page); + ihk_mc_spinlock_unlock_noirq(&obj->page_list_lock); + goto found; + } + } + + if (virt != NULL) { + page = phys_to_page(phys); + break; + } + ihk_mc_spinlock_unlock_noirq(&obj->page_list_lock); + + virt = ihk_mc_alloc_pages(1, IHK_MC_AP_NOWAIT); + if (virt == NULL) { + error = -ENOMEM; + goto out; + } + phys = virt_to_phys(virt); + } + + if (page->mode != PM_NONE) { + panic("memobj_get_page:invalid new page"); + } + page->mode = PM_PAGEIO; + page->offset = off; + list_add(&page->list, &obj->page_list); + ihk_mc_spinlock_unlock_noirq(&obj->page_list_lock); + + ihk_mc_syscall_arg0(&ctx) = PAGER_REQ_READ; + ihk_mc_syscall_arg1(&ctx) = obj->handle; + ihk_mc_syscall_arg2(&ctx) = off; + ihk_mc_syscall_arg3(&ctx) = pgsize; + ihk_mc_syscall_arg4(&ctx) = phys; + + error = syscall_generic_forwarding(__NR_mmap, &ctx); + if (error) { + kprintf("memobj_get_page(%p,%lx,%lx,%p):read failed. 
%d\n", + obj, off, pgsize, physp, error); + ihk_mc_spinlock_lock_noirq(&obj->page_list_lock); + if (page->mode != PM_PAGEIO) { + panic("memobj_get_page:invalid io page"); + } + list_del(&page->list); + ihk_mc_spinlock_unlock_noirq(&obj->page_list_lock); + page->mode = PM_NONE; + goto out; + } + + ihk_mc_spinlock_lock_noirq(&obj->page_list_lock); + if (page->mode != PM_PAGEIO) { + panic("memobj_get_page:invalid io page"); + } + page->mode = PM_MAPPED; + page->count = 1; + ihk_mc_spinlock_unlock_noirq(&obj->page_list_lock); + virt = NULL; + +found: + error = 0; + *physp = phys; + +out: + if (virt != NULL) { + ihk_mc_free_pages(virt, 1); + } + kprintf("memobj_get_page(%p,%lx,%lx,%p): %d %lx\n", + obj, off, pgsize, physp, error, phys); + return error; +} diff --git a/kernel/process.c b/kernel/process.c index e5ca9486..5ba11ade 100644 --- a/kernel/process.c +++ b/kernel/process.c @@ -211,6 +211,16 @@ int split_process_memory_range(struct process *proc, struct vm_range *range, newrange->end = range->end; newrange->flag = range->flag; + if (range->memobj != NULL) { + memobj_ref(range->memobj); + newrange->memobj = range->memobj; + newrange->objoff = range->objoff + (addr - range->start); + } + else { + newrange->memobj = NULL; + newrange->objoff = 0; + } + range->end = addr; list_add(&newrange->list, &range->list); @@ -238,13 +248,27 @@ int join_process_memory_range(struct process *proc, merging->start, merging->end); if ((surviving->end != merging->start) - || (surviving->flag != merging->flag)) { + || (surviving->flag != merging->flag) + || (surviving->memobj != merging->memobj)) { error = -EINVAL; goto out; } + if (surviving->memobj != NULL) { + size_t len; + off_t endoff; + + len = surviving->end - surviving->start; + endoff = surviving->objoff + len; + if (endoff != merging->objoff) { + return -EINVAL; + } + } surviving->end = merging->end; + if (merging->memobj != NULL) { + memobj_release(merging->memobj); + } list_del(&merging->list); ihk_mc_free(merging); @@ -268,7 +292,7 @@ int free_process_memory_range(struct process_vm *vm, struct vm_range *range) intptr_t lpend; #endif /* USE_LARGE_PAGES */ - dkprintf("free_process_memory_range(%p,%lx-%lx)\n", + kprintf("free_process_memory_range(%p,%lx-%lx)\n", vm, start0, end0); start = range->start; @@ -294,10 +318,17 @@ int free_process_memory_range(struct process_vm *vm, struct vm_range *range) } #endif /* USE_LARGE_PAGES */ + if (range->memobj != NULL) { + ihk_mc_spinlock_lock_noirq(&range->memobj->page_list_lock); + } + ihk_mc_spinlock_lock_noirq(&vm->page_table_lock); error = ihk_mc_pt_free_range(vm->page_table, (void *)start, (void *)end); ihk_mc_spinlock_unlock_noirq(&vm->page_table_lock); + if (range->memobj != NULL) { + ihk_mc_spinlock_unlock_noirq(&range->memobj->page_list_lock); + } if (error && (error != -ENOENT)) { ekprintf("free_process_memory_range(%p,%lx-%lx):" "ihk_mc_pt_free_range(%lx-%lx) failed. 
%d\n", @@ -318,10 +349,13 @@ int free_process_memory_range(struct process_vm *vm, struct vm_range *range) } } + if (range->memobj != NULL) { + memobj_release(range->memobj); + } list_del(&range->list); ihk_mc_free(range); - dkprintf("free_process_memory_range(%p,%lx-%lx): 0\n", + kprintf("free_process_memory_range(%p,%lx-%lx): 0\n", vm, start0, end0); return 0; } @@ -433,7 +467,8 @@ enum ihk_mc_pt_attribute vrflag_to_ptattr(unsigned long flag) int add_process_memory_range(struct process *process, unsigned long start, unsigned long end, - unsigned long phys, unsigned long flag) + unsigned long phys, unsigned long flag, + struct memobj *memobj, off_t offset) { struct vm_range *range; int rc; @@ -458,6 +493,8 @@ int add_process_memory_range(struct process *process, range->start = start; range->end = end; range->flag = flag; + range->memobj = memobj; + range->objoff = offset; if(range->flag & VR_DEMAND_PAGING) { dkprintf("range: 0x%lX - 0x%lX => physicall memory area is allocated on demand (%ld) [%lx]\n", @@ -636,6 +673,321 @@ out: return error; } +static int pf_anon_page_not_present(struct process *proc, struct vm_range *range, uintptr_t fault_addr) +{ + int error; + int npages; + void *virt = NULL; + void *ptepgaddr; + size_t ptepgsize; + void *pgaddr; + size_t pgsize; + int p2align; + uintptr_t phys; + enum ihk_mc_pt_attribute attr; + size_t maxpgsize; + pte_t *ptep; + + kprintf("pf_anon_page_not_present(%p,%lx-%lx %lx,%lx)\n", proc, range->start, range->end, range->flag, fault_addr); + + ihk_mc_spinlock_lock_noirq(&proc->vm->page_table_lock); + error = ihk_mc_pt_lookup_pte(proc->vm->page_table, (void *)fault_addr, &ptep, &ptepgaddr, &ptepgsize); + if (error && (error != -ENOENT)) { + kprintf("pf_anon_page_not_present(%p,%lx-%lx %lx,%lx):lookup pte failed. %d\n", proc, range->start, range->end, range->flag, fault_addr, error); + goto out; + } + if (!error && (*ptep != PTE_NULL)) { + if (!(*ptep & PF_PRESENT)) { + error = -EFAULT; + kprintf("pf_anon_page_not_present(%p,%lx-%lx %lx,%lx):disabled page. %d\n", proc, range->start, range->end, range->flag, fault_addr, error); + goto out; + } + + error = 0; + kprintf("pf_anon_page_not_present(%p,%lx-%lx %lx,%lx):already mapped. %d\n", proc, range->start, range->end, range->flag, fault_addr, error); + flush_tlb(); + goto out; + } + + if (error) { + error = 0; + ptepgsize = LARGE_PAGE_SIZE; + ptepgaddr = (void *)-1; + } + maxpgsize = ptepgsize; +#ifndef USE_LARGE_PAGES + if (maxpgsize > PAGE_SIZE) { + maxpgsize = PAGE_SIZE; + } +#endif + for (;;) { + error = ihk_mc_pt_choose_pagesize(proc->vm->page_table, (void *)range->start, (void *)range->end, (void *)fault_addr, maxpgsize, &pgaddr, &pgsize, &p2align); + if (error) { + kprintf("pf_anon_page_not_present(%p,%lx-%lx %lx,%lx):choose pagesize failed. 
%d\n", proc, range->start, range->end, range->flag, fault_addr, error); + goto out; + } + + npages = pgsize / PAGE_SIZE; + virt = ihk_mc_alloc_aligned_pages(npages, p2align, IHK_MC_AP_NOWAIT); + if (virt) { + phys = virt_to_phys(virt); + memset(virt, 0, pgsize); + break; + } + + if (pgsize <= PAGE_SIZE) { + kprintf("pf_anon_page_not_present(%p,%lx-%lx %lx,%lx):alloc pages failed\n", proc, range->start, range->end, range->flag, fault_addr); + error = -ENOMEM; + goto out; + } + + maxpgsize = pgsize - 1; + } + + attr = vrflag_to_ptattr(range->flag); + if ((ptepgaddr == pgaddr) && (ptepgsize == pgsize)) { +kprintf("HIT\n"); + error = ihk_mc_pt_set_pte(proc->vm->page_table, ptep, phys, pgsize, attr); + if (error) { + kprintf("pf_anon_page_not_present(%p,%lx-%lx %lx,%lx):set pte failed. %d\n", proc, range->start, range->end, range->flag, fault_addr, error); + goto out; + } + } + else { + error = ihk_mc_pt_set_range(proc->vm->page_table, pgaddr, pgaddr+pgsize, phys, attr); + if (error) { + kprintf("pf_anon_page_not_present(%p,%lx-%lx %lx,%lx):set range failed. %d\n", proc, range->start, range->end, range->flag, fault_addr, error); + goto out; + } + } + virt = NULL; + + error = 0; +out: + ihk_mc_spinlock_unlock_noirq(&proc->vm->page_table_lock); + if (virt != NULL) { + ihk_mc_free_pages(virt, npages); + } + kprintf("pf_anon_page_not_present(%p,%lx-%lx %lx,%lx): %d\n", proc, range->start, range->end, range->flag, fault_addr, error); + return error; +} + +static int pf_obj_page_not_present(struct process *proc, struct vm_range *range, uintptr_t fault_addr) +{ + int error; + int npages; + struct page *page = NULL; + void *pgaddr; + size_t pgsize; + int p2align; + uintptr_t phys; + enum ihk_mc_pt_attribute attr; + size_t maxpgsize; + off_t off; + pte_t *ptep; + + kprintf("pf_obj_page_not_present(%p,%lx-%lx %lx,%lx)\n", proc, range->start, range->end, range->flag, fault_addr); + + ihk_mc_spinlock_lock_noirq(&proc->vm->page_table_lock); + error = ihk_mc_pt_lookup_pte(proc->vm->page_table, (void *)fault_addr, &ptep, &pgaddr, &pgsize); + if (error == -ENOENT) { + maxpgsize = LARGE_PAGE_SIZE; + } + else if (error) { + kprintf("pf_obj_page_not_present(%p,%lx-%lx %lx,%lx):lookup pte failed. %d\n", proc, range->start, range->end, range->flag, fault_addr, error); + goto out; + } + else if (*ptep != PTE_NULL) { + if (!*ptep & PF_PRESENT) { + error = -EFAULT; + kprintf("pf_obj_page_not_present(%p,%lx-%lx %lx,%lx):disabled page. %d\n", proc, range->start, range->end, range->flag, fault_addr, error); + goto out; + } + + kprintf("pf_obj_page_not_present(%p,%lx-%lx %lx,%lx):already mapped. %d\n", proc, range->start, range->end, range->flag, fault_addr, error); + flush_tlb(); + error = 0; + goto out; + } + else { + maxpgsize = pgsize; + } + +#ifndef USE_LARGE_PAGES + maxpgsize = PAGE_SIZE; +#else + /* temporary? restriction */ + maxpgsize = PAGE_SIZE; +#endif + do { + error = ihk_mc_pt_choose_pagesize(proc->vm->page_table, (void *)range->start, (void *)range->end, (void *)fault_addr, maxpgsize, &pgaddr, &pgsize, &p2align); + if (error) { + kprintf("pf_obj_page_not_present(%p,%lx-%lx %lx,%lx):choose pagesize failed. %d\n", proc, range->start, range->end, range->flag, fault_addr, error); + goto out; + } + + off = range->objoff + ((uintptr_t)pgaddr - range->start); + error = memobj_get_page(range->memobj, off, pgsize, &phys); + if (error) { + kprintf("pf_obj_page_not_present(%p,%lx-%lx %lx,%lx):get page failed. 
%d\n", proc, range->start, range->end, range->flag, fault_addr, error); + goto out; + + } + npages = pgsize / PAGE_SIZE; + page = phys_to_page(phys); + } while (0); + + attr = vrflag_to_ptattr(range->flag); + if ((range->flag & VR_PRIVATE) && (range->flag & VR_PROT_WRITE)) { + /* for copy-on-write */ + attr &= ~PTATTR_WRITABLE; + } + + error = ihk_mc_pt_set_range(proc->vm->page_table, pgaddr, pgaddr+pgsize, phys, attr); + if (error) { + kprintf("pf_obj_page_not_present(%p,%lx-%lx %lx,%lx):set range failed. %d\n", proc, range->start, range->end, range->flag, fault_addr, error); + goto out; + } + + error = 0; + page = NULL; /* avoid page_unmap() */ + +out: + if ((page != NULL) && page_unmap(page)) { + ihk_mc_free_pages(phys_to_virt(page_to_phys(page)), npages); + } + ihk_mc_spinlock_unlock_noirq(&proc->vm->page_table_lock); + kprintf("pf_obj_page_not_present(%p,%lx-%lx %lx,%lx): %d\n", proc, range->start, range->end, range->flag, fault_addr, error); + return error; +} + +static int pf_obj_cow_page(struct process *proc, struct vm_range *range, uintptr_t fault_addr) +{ + int error; + pte_t *ptep; + void *pgaddr; + size_t pgsize; + uintptr_t oldpa; + void *oldva; + void *newva; + uintptr_t newpa; + struct page *oldpage; + enum ihk_mc_pt_attribute attr; + + kprintf("pf_obj_cow_page(%p,%lx-%lx %lx,%lx)\n", proc, range->start, range->end, range->flag, fault_addr); + + ihk_mc_spinlock_lock_noirq(&proc->vm->page_table_lock); + error = ihk_mc_pt_lookup_pte(proc->vm->page_table, (void *)fault_addr, &ptep, &pgaddr, &pgsize); + if (error) { + kprintf("pf_obj_cow_page(%p,%lx-%lx %lx,%lx):pte not found. %d\n", proc, range->start, range->end, range->flag, fault_addr, error); + flush_tlb(); + error = 0; + goto out; + } + if (pgsize != PAGE_SIZE) { + panic("pf_obj_cow_page:NYI:cow large page"); + } + + oldpa = *ptep & PT_PHYSMASK; + oldva = phys_to_virt(oldpa); + oldpage = phys_to_page(oldpa); + + if (oldpage) { + newva = NULL; + ihk_mc_spinlock_lock_noirq(&range->memobj->page_list_lock); + for (;;) { + if (oldpage->mode != PM_MAPPED) { + kprintf("pf_obj_cow_page(%p,%lx-%lx %lx,%lx):invalid cow page. %p %x\n", proc, range->start, range->end, range->flag, fault_addr, range->memobj, oldpage->mode); + panic("page_fault_process_meory_range:invalid cow page"); + } + if (oldpage->count == 1) { + if (newva) { + ihk_mc_free_pages(newva, 1); + } + list_del(&oldpage->list); + oldpage->mode = PM_NONE; + newpa = oldpa; + newva = oldva; + break; + } + if (oldpage->count <= 0) { + panic("pf_obj_cow_page:oldpage count corrupted"); + } + if (newva) { + memcpy(newva, oldva, pgsize); + --oldpage->count; + break; + } + ihk_mc_spinlock_unlock_noirq(&range->memobj->page_list_lock); + newva = ihk_mc_alloc_pages(1, IHK_MC_AP_NOWAIT); + if (!newva) { + error = -ENOMEM; + kprintf("pf_obj_cow_page(%p,%lx-%lx %lx,%lx):alloc page failed. %d\n", proc, range->start, range->end, range->flag, fault_addr, error); + goto out; + } + ihk_mc_spinlock_lock_noirq(&range->memobj->page_list_lock); + } + ihk_mc_spinlock_unlock_noirq(&range->memobj->page_list_lock); + } + else { + newva = ihk_mc_alloc_pages(1, IHK_MC_AP_NOWAIT); + if (newva == NULL) { + error = -ENOMEM; + kprintf("pf_obj_cow_page(%p,%lx-%lx %lx,%lx):alloc page failed. 
%d\n", proc, range->start, range->end, range->flag, fault_addr, error); + goto out; + } + + memcpy(newva, oldva, pgsize); + } + + newpa = virt_to_phys(newva); + attr = vrflag_to_ptattr(range->flag); + error = ihk_mc_pt_set_pte(proc->vm->page_table, ptep, newpa, pgsize, attr); + if (error) { + kprintf("pf_obj_cow_page(%p,%lx-%lx %lx,%lx):set pte failed. %d\n", proc, range->start, range->end, range->flag, fault_addr, error); + ihk_mc_free_pages(newva, 1); + goto out; + } + + error = 0; +out: + ihk_mc_spinlock_unlock_noirq(&proc->vm->page_table_lock); + kprintf("pf_obj_cow_page(%p,%lx-%lx %lx,%lx): %d\n", proc, range->start, range->end, range->flag, fault_addr, error); + return error; +} + +int page_fault_process_memory_range(struct process *proc, + struct vm_range *range, uintptr_t fault_addr, uint64_t reason) +{ + int error; + + kprintf("page_fault_process_memory_range(%p,%lx-%lx %lx,%lx,%lx)\n", + proc, range->start, range->end, range->flag, + fault_addr, reason); + + if (!(reason & PF_PROT) && !range->memobj) { + error = pf_anon_page_not_present(proc, range, fault_addr); + } + else if (!(reason & PF_PROT) && range->memobj) { + error = pf_obj_page_not_present(proc, range, fault_addr); + } + else if ((reason & PF_PROT) && (reason & PF_WRITE) && (range->flag & VR_PROT_WRITE) && range->memobj) { + error = pf_obj_cow_page(proc, range, fault_addr); + } + else { + error = -EFAULT; + kprintf("page_fault_process_memory_range(%p,%lx-%lx %lx,%lx,%lx):" + "unknown fault. %d\n", + proc, range->start, range->end, range->flag, + fault_addr, reason, error); + } + + kprintf("page_fault_process_memory_range(%p,%lx-%lx %lx,%lx,%lx): %d\n", + proc, range->start, range->end, range->flag, + fault_addr, reason, error); + return error; +} + int init_process_stack(struct process *process, struct program_load_desc *pn, int argc, char **argv, int envc, char **env) @@ -648,14 +1000,18 @@ int init_process_stack(struct process *process, struct program_load_desc *pn, unsigned long end = process->vm->region.user_end; unsigned long start = end - size; int rc; + unsigned long vrflag; if(stack == NULL) return -ENOMEM; memset(stack, 0, size); + vrflag = VR_STACK; + vrflag |= VR_PROT_READ | VR_PROT_WRITE | VR_PROT_EXEC; + vrflag |= VRFLAG_PROT_TO_MAXPROT(vrflag); if ((rc = add_process_memory_range(process, start, end, virt_to_phys(stack), - VR_STACK|VR_PROT_READ|VR_PROT_WRITE)) != 0) { + vrflag, NULL, 0)) != 0) { ihk_mc_free_pages(stack, USER_STACK_NR_PAGES); return rc; } @@ -783,7 +1139,7 @@ unsigned long extend_process_region(struct process *proc, } } if((rc = add_process_memory_range(proc, aligned_end, aligned_new_end, - (p==0?0:virt_to_phys(p)), flag)) != 0){ + (p==0?0:virt_to_phys(p)), flag, NULL, 0)) != 0){ free_pages(p, (aligned_new_end - aligned_end) >> PAGE_SHIFT); return end; } @@ -808,6 +1164,24 @@ int remove_process_region(struct process *proc, return 0; } +void flush_process_memory(struct process *proc) +{ + struct process_vm *vm = proc->vm; + struct vm_range *range; + + kprintf("flush_process_memory(%p)\n", proc); + ihk_mc_spinlock_lock_noirq(&vm->memory_range_lock); + list_for_each_entry(range, &vm->vm_range_list, list) { + if (range->memobj != NULL) { + memobj_release(range->memobj); + range->memobj = NULL; + } + } + ihk_mc_spinlock_unlock_noirq(&vm->memory_range_lock); + kprintf("flush_process_memory(%p):\n", proc); + return; +} + void free_process_memory(struct process *proc) { struct vm_range *range, *next; diff --git a/kernel/syscall.c b/kernel/syscall.c index 68b2d5cb..f6c31864 100644 --- 
a/kernel/syscall.c +++ b/kernel/syscall.c @@ -119,10 +119,36 @@ int do_syscall(struct syscall_request *req, ihk_mc_user_context_t *ctx) ihk_mc_get_processor_id(), req->number); - while (!res->status) { - cpu_pause(); - } +#define STATUS_IN_PROGRESS 0 +#define STATUS_COMPLETED 1 +#define STATUS_PAGE_FAULT 3 + while (res->status != STATUS_COMPLETED) { + while (res->status == STATUS_IN_PROGRESS) { + cpu_pause(); + } + if (res->status == STATUS_PAGE_FAULT) { + volatile struct syscall_request *req = cpu_local_var(scp).request_va; + int error; + uint8_t u8; + + /* do page fault */ + u8 = *(volatile uint8_t *)res->fault_address; // XXX: + if (res->fault_reason) { + *(uint8_t *)res->fault_address = u8; // XXX: + } + error = 0; + + /* send result */ + req->number = __NR_mmap; + req->args[0] = 0x101; + req->args[1] = error; + + res->status = STATUS_IN_PROGRESS; + req->valid = 1; + } + } + dkprintf("SC(%d)[%3d] got host reply: %d \n", ihk_mc_get_processor_id(), req->number, res->ret); @@ -162,6 +188,7 @@ terminate(int rc, int sig, ihk_mc_user_context_t *ctx) /* XXX: send SIGKILL to all threads in this process */ + flush_process_memory(proc); /* temporary hack */ do_syscall(&request, ctx); #define IS_DETACHED_PROCESS(proc) (1) /* should be implemented in the future */ @@ -309,6 +336,9 @@ SYSCALL_DECLARE(mmap) void *p; int vrflags; intptr_t phys; + struct memobj *memobj; + int maxprot; + int denied; dkprintf("[%d]sys_mmap(%lx,%lx,%x,%x,%d,%lx)\n", ihk_mc_get_processor_id(), @@ -388,6 +418,7 @@ SYSCALL_DECLARE(mmap) /* do the map */ vrflags = VR_NONE; vrflags |= PROT_TO_VR_FLAG(prot); + vrflags |= (flags & MAP_PRIVATE)? VR_PRIVATE: 0; if (flags & MAP_ANONYMOUS) { if (0) { /* dummy */ @@ -401,11 +432,28 @@ SYSCALL_DECLARE(mmap) else if ((len == 64*1024*1024) || (len == 128*1024*1024)) { vrflags |= VR_DEMAND_PAGING; } +#if 1 + vrflags |= VR_DEMAND_PAGING; +#endif + } + else { + /* mapped file */ + vrflags |= VR_DEMAND_PAGING; } p = NULL; phys = 0; - if (!(vrflags & VR_DEMAND_PAGING) + memobj = NULL; + maxprot = PROT_READ | PROT_WRITE | PROT_EXEC; + if (!(flags & MAP_ANONYMOUS)) { + error = memobj_create(fd, flags, prot, &memobj, &maxprot); + if (error) { + ekprintf("sys_mmap:memobj_create failed. %d\n", error); + ihk_mc_spinlock_unlock_noirq(&proc->vm->memory_range_lock); + goto out; + } + } + else if (!(vrflags & VR_DEMAND_PAGING) && ((vrflags & VR_PROT_MASK) != VR_PROT_NONE)) { npages = len >> PAGE_SHIFT; p2align = PAGE_P2ALIGN; @@ -426,7 +474,22 @@ SYSCALL_DECLARE(mmap) phys = virt_to_phys(p); } - error = add_process_memory_range(proc, addr, addr+len, phys, vrflags); + if ((flags & MAP_PRIVATE) && (maxprot & PROT_READ)) { + maxprot = PROT_READ | PROT_WRITE | PROT_EXEC; + } + denied = prot & ~maxprot; + if (denied) { + ekprintf("sys_mmap:denied %x. 
%x %x\n", denied, prot, maxprot); + ihk_mc_spinlock_unlock_noirq(&proc->vm->memory_range_lock); + if (p != NULL) { + ihk_mc_free_pages(p, npages); + } + error = -EACCES; + goto out; + } + vrflags |= VRFLAG_PROT_TO_MAXPROT(PROT_TO_VR_FLAG(maxprot)); + + error = add_process_memory_range(proc, addr, addr+len, phys, vrflags, memobj, off); if (error) { ekprintf("sys_mmap:add_process_memory_range" "(%p,%lx,%lx,%lx,%lx) failed %d\n", @@ -440,32 +503,6 @@ SYSCALL_DECLARE(mmap) } ihk_mc_spinlock_unlock_noirq(&proc->vm->memory_range_lock); - - /* read page with pread64() */ - if (!(flags & MAP_ANONYMOUS)) { - ihk_mc_user_context_t ctx2; - ssize_t ss; - - ihk_mc_syscall_arg0(&ctx2) = fd; - ihk_mc_syscall_arg1(&ctx2) = addr; - ihk_mc_syscall_arg2(&ctx2) = len; - ihk_mc_syscall_arg3(&ctx2) = off; - - ss = syscall_generic_forwarding(__NR_pread64, &ctx2); - if (ss < 0) { - ekprintf("sys_mmap:pread(%d,%lx,%lx,%lx) failed %ld\n", - fd, addr, len, off, (long)ss); - error = do_munmap((void *)addr, len); - if (error) { - ekprintf("sys_mmap:do_munmap(%lx,%lx) failed. %d\n", - addr, len, error); - /* through */ - } - error = ss; - goto out; - } - } - error = 0; out: dkprintf("[%d]sys_mmap(%lx,%lx,%x,%x,%d,%lx): %ld %lx\n", @@ -507,6 +544,7 @@ SYSCALL_DECLARE(mprotect) int error; struct vm_range *changed; const unsigned long protflags = PROT_TO_VR_FLAG(prot); + unsigned long denied; dkprintf("[%d]sys_mprotect(%lx,%lx,%x)\n", ihk_mc_get_processor_id(), start, len0, prot); @@ -558,6 +596,14 @@ SYSCALL_DECLARE(mprotect) error = -EINVAL; goto out; } + + denied = protflags & ~VRFLAG_MAXPROT_TO_PROT(range->flag); + if (denied) { + ekprintf("sys_mprotect(%lx,%lx,%x):denied %lx. %lx %lx\n", + start, len0, prot, denied, protflags, range->flag); + error = -EACCES; + goto out; + } } /* do the mprotect */ @@ -629,6 +675,7 @@ SYSCALL_DECLARE(brk) unsigned long address = ihk_mc_syscall_arg0(ctx); struct vm_regions *region = &cpu_local_var(current)->vm->region; unsigned long r; + unsigned long vrflag; dkprintf("SC(%d)[sys_brk] brk_start=%lx,end=%lx\n", ihk_mc_get_processor_id(), region->brk_start, region->brk_end); @@ -646,6 +693,8 @@ SYSCALL_DECLARE(brk) } /* try to extend memory region */ + vrflag = VR_PROT_READ | VR_PROT_WRITE; + vrflag |= VRFLAG_PROT_TO_MAXPROT(vrflag); ihk_mc_spinlock_lock_noirq(&cpu_local_var(current)->vm->memory_range_lock); region->brk_end = extend_process_region(cpu_local_var(current), region->brk_start, region->brk_end, address, diff --git a/lib/include/ihk/mm.h b/lib/include/ihk/mm.h index fee5f442..3c6e140e 100644 --- a/lib/include/ihk/mm.h +++ b/lib/include/ihk/mm.h @@ -1,6 +1,7 @@ #ifndef __HEADER_GENERIC_IHK_MM_H #define __HEADER_GENERIC_IHK_MM_H +#include #include enum ihk_mc_gma_type { @@ -58,7 +59,7 @@ struct ihk_mc_pa_ops { }; void ihk_mc_set_page_allocator(struct ihk_mc_pa_ops *); -void ihk_mc_set_page_fault_handler(void (*h)(unsigned long, void *, unsigned long)); +void ihk_mc_set_page_fault_handler(void (*h)(unsigned long, unsigned long, void *)); unsigned long ihk_mc_map_memory(void *os, unsigned long phys, unsigned long size); @@ -100,6 +101,13 @@ int ihk_mc_pt_change_attr_range(page_table_t pt, void *start, void *end, enum ihk_mc_pt_attribute setattr); int ihk_mc_pt_alloc_range(page_table_t pt, void *start, void *end, enum ihk_mc_pt_attribute attr); +int ihk_mc_pt_lookup_pte(page_table_t pt, void *virt, pte_t **ptepp, void **pgbasep, size_t *pgsizep); +int ihk_mc_pt_choose_pagesize(page_table_t pt, void *start, void *end, + void *fault_addr, size_t maxpgsize, void **pgbasep, + size_t 
*pgsizep, int *p2alignp);
+int ihk_mc_pt_set_range(page_table_t pt, void *start, void *end,
+		uintptr_t phys, enum ihk_mc_pt_attribute attr);
+int ihk_mc_pt_set_pte(page_table_t pt, pte_t *ptep, uintptr_t phys, size_t pgsize, enum ihk_mc_pt_attribute attr);
 int ihk_mc_pt_prepare_map(page_table_t pt, void *virt, unsigned long size,
 			enum ihk_mc_pt_prepare_flag);
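
Reviewer note (not part of the diff): with the pread64() prefill removed from sys_mmap, file-backed pages are now populated on demand through memobj_get_page(), and writable MAP_PRIVATE ranges are first mapped read-only so that the first write is redirected to a private copy in pf_obj_cow_page(). Below is a minimal user-space sketch of the semantics this is expected to provide. It is illustrative only: the scratch file path is made up, and it is assumed the binary is launched through mcexec on the target.

/*
 * cow_map_test.c - hypothetical user-space check, not part of this patch.
 *
 * Exercises the MAP_PRIVATE behaviour the new fault paths are meant to
 * provide: the first read of a page must fault in the file contents,
 * the first write must go to a private copy, and the backing file must
 * remain unchanged.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/tmp/cow_map_test.dat";	/* assumed scratch file */
	const size_t len = 4096;			/* one small page */
	char buf[4096];
	char *map;
	int fd;
	int ok;

	/* create a one-page file filled with 'A' */
	memset(buf, 'A', sizeof(buf));
	fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0600);
	if (fd < 0 || write(fd, buf, len) != (ssize_t)len) {
		perror("prepare file");
		return 1;
	}

	/* private file mapping: pages should be filled on demand, not up front */
	map = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (map == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* read fault: the page must show the file contents */
	if (map[0] != 'A') {
		fprintf(stderr, "read fault returned wrong data\n");
		return 1;
	}

	/* write fault: should hit the copy-on-write path, not the file */
	map[0] = 'B';

	/* re-read the file through the descriptor to confirm it is unchanged */
	if (lseek(fd, 0, SEEK_SET) != 0 || read(fd, buf, len) != (ssize_t)len) {
		perror("re-read file");
		return 1;
	}
	printf("mapping sees '%c', file still holds '%c'\n", map[0], buf[0]);

	ok = (map[0] == 'B') && (buf[0] == 'A');
	munmap(map, len);
	close(fd);
	unlink(path);
	return ok ? 0 : 1;
}

If the copy-on-write path behaves as intended, the program reports that the mapping sees 'B' while the file still holds 'A' and exits 0; the initial read check exercises the demand-paging path.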