Compare commits

...

78 Commits
1.1.0 ... 1.1.2

Author SHA1 Message Date
8d21846562 mcoverlayfs: supported Linux kernel 4.0 or rhel kernel 3.10.0-327
add mcoverlayfs(linux-3.10.0-327.36.1.el7 base)
2016-09-30 14:55:36 +09:00
3e1367caa1 mcoverlayfs: move mcoverlayfs(linux-4.0.9 base) to executer/kernel/mcoverlayfs/linux-4.0.9 2016-09-30 13:48:55 +09:00
02536b7724 Merge remote-tracking branch 'remotes/origin/ikc2'
Conflicts:
	executer/kernel/mcctrl/syscall.c
It is resolved.
2016-09-27 11:48:12 +09:00
e28725884f fix debug print 2016-09-19 17:29:41 +09:00
c2b3fb7236 Modify interrupt load balancing policy on reboot/stop
* Fix the timing of stopping irqbalance when booting McKernel
2016-09-16 19:07:07 +09:00
2f95f7cda8 Modify interrupt load balancing policy on reboot/stop
When rebooting:
1. Stop irqbalance
2. Modify /proc/irq/*/smp_affinity so that McKernel cores are not
   included
3. Start irqbalance with McKernel cores and IHK IRQ banned from
   load balancing

When stopping:
1. Stop irqbalance
2. Restore /proc/irq/*/smp_affinity
3. Restart irqbalance with the system default settings

refs #760
2016-09-16 13:04:24 +09:00
e551aa17ed execve: do not search command PATH 2016-09-14 22:22:18 +09:00
e6d4c160cd mcexec: fix how to look for command
refs #754
2016-09-13 15:56:58 +09:00
9390fe5d2c signal: send signal to thread using thread-id. not cpu-id 2016-09-12 15:43:29 +09:00
419f5e495b set*[ug]id: propagate credentials to thread pool 2016-09-12 15:40:33 +09:00
673deadf37 fix syscall return type 2016-09-12 15:40:06 +09:00
20ea65b38c fix some vDSO bugs.
- vDSO sometimes becomes invalid.
- vDSO is not inherited by child processes.
- vDSO becomes invalid when execve.
refs #744
2016-09-04 23:13:00 +09:00
84665ff699 do_page_fault_process_vm(): fix error msg format that could cause another PF 2016-09-04 10:59:50 +09:00
bfbc94dfb0 mcctrl+mcexec: fix per-proc data allocation for fork() 2016-09-02 15:08:00 +09:00
f74dcfc2a1 Modify mcreboot.sh for job scheduler
1. Don't complain when logname command doesn't exist
2016-09-01 19:27:18 +09:00
7c562d0539 support madvise(MADV_DONTFORK) 2016-09-01 11:22:53 +09:00
b5e4459a34 support AVX-512 registers 2016-08-30 18:39:33 +09:00
782122b681 mcctrl: fix to rus_vm_fault() call by kworker process 2016-08-22 13:00:28 +09:00
d550bced78 kmalloc(): use macros to define size alignment 2016-08-19 12:51:28 +09:00
a7ee3f531b sched_setaffinity(): error handling for invalid input 2016-08-19 11:52:44 +09:00
b9439947a7 kmalloc(): re-implementation of memory leak tracking 2016-08-19 11:52:00 +09:00
3b60a95f13 kmalloc()/kfree() re-implementation 2016-08-18 21:51:36 +09:00
82ae6d7458 query_free_mem_interrupt_handler(): report number of free pages as kmsg 2016-08-18 14:52:05 +09:00
7ebc34ddcc do_fork(): fix tids memory leak; additional sanity checks 2016-08-18 14:31:52 +09:00
bd6a2c2311 sys_mmap(): correct initial address check 2016-08-18 07:32:31 +09:00
5fd68eae54 PF handler: fix up various error msgs 2016-08-18 07:31:25 +09:00
f5857cfc9e MM: use ihk_mc_{alloc/free}_pages() everywhere and fix free_pages() on kmalloc()ed object bug 2016-08-17 18:02:05 +09:00
1ce1b17a85 Specify facility used by mcklogd via option
1. You can specify facility through -f option of mcreboot.sh.
   Example:
   mcreboot.sh -k 1 -f LOG_LOCAL6
   Note that you need to specify "-k 1" or "-k 2" to start mcklogd.
2. Kill mcklogd if needed in mcreboot.sh and mcstop+release.sh.
2016-08-17 17:52:44 +09:00
a2456c3ed2 Modify mcstop+release.sh for job scheduler
1. Remove ihk.ko
2. Output message to stderr and return one on error
2016-08-17 17:32:06 +09:00
01d2ea1605 do_munmap(): do TLB flush per address in remote_tlb_flush_cpu_mask() 2016-08-17 15:08:30 +09:00
15783f09a0 Modify mcreboot.sh for job scheduler
1. Add an option to specify owner of device files
2. Output message to stderr and return one on error
2016-08-17 15:07:13 +09:00
9efd568e07 do_mmap(): simplify demand paging flags; avoid zeroobj and allocate pages directly 2016-08-17 14:00:05 +09:00
1a207e19c2 clean up a couple of debug messages 2016-08-17 13:55:36 +09:00
73cf93727b clone(): use CAS for TID allocation 2016-08-16 14:18:58 +09:00
4410e702d9 devobj: fix memory leak for device file mapping 2016-08-16 14:17:59 +09:00
f584e2ec25 increase kernel stack size and eliminate unused waitq declaration in do_syscall() 2016-08-16 09:20:55 +09:00
3aa06444f4 do_syscall(): allow descheduling threads in offloaded syscalls if CPU core oversubscribed 2016-08-16 08:58:22 +09:00
c897a56c34 __notify_syscall_requester(): use CAS or IKC to notify syscall completion 2016-08-16 08:56:05 +09:00
5e9957da0f syscall_response: introduction of req_thread_status field 2016-08-16 08:53:41 +09:00
6ff2d4abe7 mcctrl: store per-process data in hash table 2016-08-15 13:47:57 +09:00
e4239f1885 mcexec: use 16 threads initially in offload handler pool 2016-08-14 14:29:10 +09:00
fbbaaf5b54 mcctrl: use GFP_ATOMIC in atomic context 2016-08-14 14:28:21 +09:00
3fa3920bb3 fix a couple of debug msgs 2016-08-14 11:30:17 +09:00
45e51fcc07 mcctrl: fix padding for 128bytes SCD message 2016-08-14 11:29:02 +09:00
0884e3d543 IHK-IKC: map queue in McKernel as cacheable 2016-08-14 11:16:40 +09:00
e3c7c9b890 mcctrl: separate waiting threads and pending requests 2016-08-12 21:52:13 +09:00
f4155cc9e8 mcstop+release-smp-x86.sh: fix OS instance discovery bug 2016-08-12 12:27:04 +09:00
a01ae91051 mcctrl: use IKC packet pools 2016-08-12 12:26:14 +09:00
daca522d25 mcctrl: move kmalloc/kfree of wait queue head out of fast path 2016-08-12 10:18:58 +09:00
ec521feb15 do_syscall(): remove invalid reference 2016-08-09 17:16:47 +09:00
d7bc947a02 mcctrl: redesign mcctrl_channels for IKC packet based syscall offloading 2016-08-09 16:49:42 +09:00
fb84d4ef11 mcctrl: thread pool based system call offload handling 2016-08-08 19:43:05 +09:00
5fbeee953a mcctrl: clean up syscall offload wait code 2016-08-07 20:55:36 +09:00
4cefb4333f mcctrl: use atomic malloc in IRQ context 2016-08-06 08:54:55 +09:00
689da07ac6 ihk_mc_ikc_init_first_local(): hold ref to master channel 2016-08-06 08:52:14 +09:00
76981bcc18 mcctrl: move procfs TID processing into dedicated work queue 2016-08-04 15:22:40 +09:00
6aae35cb3d process: transfer TIDs in bulk and reuse them locally 2016-08-02 16:59:04 +09:00
dac6f2883e mcctrl procfs: use semaphores instead of spinlocks to avoid sleeping in GFP_KERNEL kmalloc() in atomic context 2016-08-01 20:33:51 +09:00
c484f766fa schedule(): schedule a sleeping process if it has pending signals 2016-07-28 11:42:00 +09:00
57690479bd read/patch_process_vm(): map non-LWK physical addresses properly 2016-07-22 20:48:54 +09:00
d0539a9cac eclair: make idle threads visible 2016-07-22 18:06:11 +09:00
4c8f583c0c split_large_page(): avoid panic when splitting "non-mapped" large pages 2016-07-14 17:11:52 +09:00
6118faffa9 pager_req_pfn(): use FAULT_FLAG_USER only if defined 2016-07-13 18:05:31 +09:00
dad6470c60 clone_thread: fork(2) copy sigstack infos from parent 2016-07-13 16:15:01 +09:00
46c37fc8f3 setfsgid: fix fsgid not being changed 2016-07-13 15:54:52 +09:00
f6908f21a8 do_kill: wake PS_INTERRUPTIBLE process when sending SIGKILL
sched_wakeup_thread: don't change process status if process status is PS_EXITED
2016-07-13 14:06:32 +09:00
01d9d9a5ba devobj: allow arbitrary size device file mappings 2016-07-12 17:02:19 +09:00
c43d993a4d mcstop+release-smp-x86.sh.in: unload mcctrl after OS shutdown 2016-07-11 16:40:06 +09:00
7d9bbecd7a mcctrl: use IHK OS notifiers to establish/tear down syscall channels
This patch eliminates the need for rmmod/insmod the mcctrl module
every time an OS instance is rebooted.
2016-07-11 16:22:50 +09:00
d135731398 do_syscall(): allow schedule for another thread (Intel MPI+OpenMP issue) 2016-07-05 18:54:51 +09:00
5c190beb04 save fpregs when calling sighandler
refs #50
2016-07-05 15:26:00 +09:00
fc66556f9f mcexec: error handling and propagation 2016-06-24 15:35:38 -07:00
648bacc90f device file mappings: communicate map flags and fault missing translations 2016-06-24 12:44:59 -07:00
dd37443fc7 PAPI support: performance counter's overflow.
and support mckfd fcntl.
2016-06-24 13:50:12 +09:00
e34322702a x86_init_perfctr: discover perf counters dynamically from MSRs 2016-06-22 10:47:57 -07:00
e12997e6a9 mcreboot: support for CPU cores (-c) and memory (-m) arguments 2016-06-21 09:10:06 -07:00
fabaa806d3 Revert "Make executor code include executer/config.h": breaks out-of-tree compile
This reverts commit d90900b6e6.
2016-06-21 08:51:45 +09:00
a83ad620c8 devobj: allow read only device file mappings (OFED 3.3 support) 2016-06-21 06:57:59 +09:00
60 changed files with 6693 additions and 1612 deletions

View File

@ -1,5 +1,6 @@
TARGET = @TARGET@
SBINDIR = @SBINDIR@
ETCDIR = @ETCDIR@
MANDIR = @MANDIR@
all::
@ -48,6 +49,9 @@ install::
mkdir -p -m 755 $(SBINDIR); \
install -m 755 arch/x86/tools/mcreboot-smp-x86.sh $(SBINDIR)/mcreboot.sh; \
install -m 755 arch/x86/tools/mcstop+release-smp-x86.sh $(SBINDIR)/mcstop+release.sh; \
mkdir -p -m 755 $(ETCDIR); \
install -m 644 arch/x86/tools/irqbalance_mck.service $(ETCDIR)/irqbalance_mck.service; \
install -m 644 arch/x86/tools/irqbalance_mck.in $(ETCDIR)/irqbalance_mck.in; \
mkdir -p -m 755 $(MANDIR)/man1; \
install -m 644 arch/x86/tools/mcreboot.1 $(MANDIR)/man1/mcreboot.1; \
;; \

View File

@ -181,6 +181,8 @@ static void init_idt(void)
}
static int xsave_available = 0;
static int xsave_size = 0;
static uint64_t xsave_mask = 0x0;
void init_fpu(void)
{
@ -224,6 +226,26 @@ void init_fpu(void)
xsetbv(0, reg);
dkprintf("init_fpu(): AVX init: XCR0 = 0x%016lX\n", reg);
}
if(xsave_available){
unsigned long eax;
unsigned long ebx;
unsigned long ecx;
unsigned long edx;
asm volatile("cpuid" : "=a"(eax),"=b"(ebx),"=c"(ecx),"=d"(edx) : "a" (0x0d), "c" (0x00));
xsave_size = ecx;
dkprintf("init_fpu(): xsave_size = %d\n", xsave_size);
if ((eax & (1 << 5)) && (eax & (1 << 6)) && (eax & (1 << 7))) {
/* Set xcr0[7:5] to enable avx-512 ops */
reg = xgetbv(0);
reg |= 0xe6;
xsetbv(0, reg);
dkprintf("init_fpu(): AVX-512 init: XCR0 = 0x%016lX\n", reg);
}
}
xsave_mask = xgetbv(0);
dkprintf("init_fpu(): xsave_mask = 0x%016lX\n", xsave_mask);
/* TODO: set MSR_IA32_XSS to enable xsaves/xrstors */
@ -234,6 +256,17 @@ void init_fpu(void)
asm volatile("finit");
}
/*
 * get_xsave_size - size in bytes of the XSAVE area reported by the CPU.
 *
 * Returns the file-scope xsave_size, which init_fpu() fills in from
 * CPUID leaf 0x0d (ECX) when XSAVE is available. The value stays 0 when
 * XSAVE is not available, and callers use a zero result to skip the
 * extended-state save/restore paths.
 *
 * Declared with an explicit (void) parameter list so the definition is a
 * real prototype rather than an old-style, unchecked declarator.
 */
int
get_xsave_size(void)
{
	return xsave_size;
}
/*
 * get_xsave_mask - feature bitmap to pass to XSAVE/XRSTOR in EDX:EAX.
 *
 * Returns the file-scope xsave_mask, which init_fpu() sets to the value
 * of XCR0 (xgetbv(0)) after the AVX / AVX-512 enable steps. It is 0 until
 * init_fpu() has run.
 *
 * Declared with an explicit (void) parameter list so the definition is a
 * real prototype rather than an old-style, unchecked declarator.
 */
uint64_t get_xsave_mask(void)
{
	return xsave_mask;
}
void reload_gdt(struct x86_desc_ptr *gdt_ptr)
{
asm volatile("pushq %1\n"
@ -883,13 +916,36 @@ void handle_interrupt(int vector, struct x86_user_context *regs)
dkprintf("timer[%lu]: CPU_FLAG_NEED_RESCHED \n", rdtsc());
}
else if (vector == LOCAL_PERF_VECTOR) {
struct siginfo info;
unsigned long value;
struct thread *thread = cpu_local_var(current);
struct process *proc = thread->proc;
long irqstate;
struct mckfd *fdp;
lapic_write(LAPIC_LVTPC, LOCAL_PERF_VECTOR);
value = rdmsr(MSR_PERF_GLOBAL_STATUS);
wrmsr(MSR_PERF_GLOBAL_OVF_CTRL, value);
wrmsr(MSR_PERF_GLOBAL_OVF_CTRL, 0);
//TODO: counter overflow signal
//set_signal(0x1d, regs, NULL); // SIGIO
irqstate = ihk_mc_spinlock_lock(&proc->mckfd_lock);
for(fdp = proc->mckfd; fdp; fdp = fdp->next) {
if(fdp->sig_no > 0)
break;
}
ihk_mc_spinlock_unlock(&proc->mckfd_lock, irqstate);
if(fdp) {
memset(&info, '\0', sizeof info);
info.si_signo = fdp->sig_no;
info._sifields._sigfault.si_addr = (void *)regs->gpr.rip;
info._sifields._sigpoll.si_fd = fdp->fd;
set_signal(fdp->sig_no, regs, &info);
}
else {
set_signal(SIGIO, regs, NULL);
}
}
else if (vector >= IHK_TLB_FLUSH_IRQ_VECTOR_START &&
vector < IHK_TLB_FLUSH_IRQ_VECTOR_END) {
@ -998,9 +1054,8 @@ unhandled_page_fault(struct thread *thread, void *fault_addr, void *regs)
unsigned long error = ((struct x86_user_context *)regs)->gpr.error;
irqflags = kprintf_lock();
dkprintf("[%d] Page fault for 0x%lX\n",
ihk_mc_get_processor_id(), address);
dkprintf("%s for %s access in %s mode (reserved bit %s set), "
__kprintf("Page fault for 0x%lx\n", address);
__kprintf("%s for %s access in %s mode (reserved bit %s set), "
"it %s an instruction fetch\n",
(error & PF_PROT ? "protection fault" : "no page found"),
(error & PF_WRITE ? "write" : "read"),
@ -1012,14 +1067,14 @@ unhandled_page_fault(struct thread *thread, void *fault_addr, void *regs)
list_for_each_entry(range, &vm->vm_range_list, list) {
if (range->start <= address && range->end > address) {
found = 1;
dkprintf("address is in range, flag: 0x%X! \n",
__kprintf("address is in range, flag: 0x%lx\n",
range->flag);
ihk_mc_pt_print_pte(vm->address_space->page_table, (void*)address);
break;
}
}
if (!found) {
dkprintf("address is out of range! \n");
__kprintf("address is out of range! \n");
}
kprintf_unlock(irqflags);
@ -1494,7 +1549,8 @@ release_fp_regs(struct thread *thread)
if (thread && !thread->fp_regs)
return;
pages = (sizeof(fp_regs_struct) + 4095) >> 12;
pages = (xsave_size + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
dkprintf("release_fp_regs: pages=%d\n", pages);
ihk_mc_free_pages(thread->fp_regs, pages);
thread->fp_regs = NULL;
}
@ -1508,7 +1564,8 @@ save_fp_regs(struct thread *thread)
int pages;
if (!thread->fp_regs) {
pages = (sizeof(fp_regs_struct) + 4095) >> 12;
pages = (xsave_size + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
dkprintf("save_fp_regs: pages=%d\n", pages);
thread->fp_regs = ihk_mc_alloc_pages(pages, IHK_MC_AP_NOWAIT);
if (!thread->fp_regs) {
@ -1517,14 +1574,15 @@ save_fp_regs(struct thread *thread)
}
memset(thread->fp_regs, 0, sizeof(fp_regs_struct));
memset(thread->fp_regs, 0, pages * PAGE_SIZE);
}
if (xsave_available) {
unsigned int low, high;
/* Request full save of x87, SSE and AVX states */
low = 0x7;
high = 0;
/* Request full save of x87, SSE, AVX and AVX-512 states */
low = (unsigned int)xsave_mask;
high = (unsigned int)(xsave_mask >> 32);
asm volatile("xsave %0" : : "m" (*thread->fp_regs), "a" (low), "d" (high)
: "memory");
@ -1546,9 +1604,9 @@ restore_fp_regs(struct thread *thread)
if (xsave_available) {
unsigned int low, high;
/* Request full restore of x87, SSE and AVX states */
low = 0x7;
high = 0;
/* Request full restore of x87, SSE, AVX and AVX-512 states */
low = (unsigned int)xsave_mask;
high = (unsigned int)(xsave_mask >> 32);
asm volatile("xrstor %0" : : "m" (*thread->fp_regs),
"a" (low), "d" (high));

View File

@ -318,5 +318,5 @@ extern unsigned long ap_trampoline;
#define AP_TRAMPOLINE_SIZE 0x2000
/* Local is cachable */
#define IHK_IKC_QUEUE_PT_ATTR (PTATTR_NO_EXECUTE | PTATTR_WRITABLE | PTATTR_UNCACHABLE)
#define IHK_IKC_QUEUE_PT_ATTR (PTATTR_NO_EXECUTE | PTATTR_WRITABLE)
#endif

View File

@ -66,7 +66,7 @@ SYSCALL_DELEGATED(65, semop)
SYSCALL_HANDLED(67, shmdt)
SYSCALL_DELEGATED(69, msgsnd)
SYSCALL_DELEGATED(70, msgrcv)
SYSCALL_DELEGATED(72, fcntl)
SYSCALL_HANDLED(72, fcntl)
SYSCALL_DELEGATED(79, getcwd)
SYSCALL_DELEGATED(89, readlink)
SYSCALL_HANDLED(96, gettimeofday)

View File

@ -23,6 +23,7 @@
#include <process.h>
#include <page.h>
#include <cls.h>
#include <kmalloc.h>
#define dkprintf(...) do { if (0) kprintf(__VA_ARGS__); } while (0)
#define ekprintf(...) kprintf(__VA_ARGS__)
@ -84,20 +85,22 @@ void ihk_mc_free_pages(void *p, int npages)
pa_ops->free_page(p, npages);
}
void *ihk_mc_allocate(int size, enum ihk_mc_ap_flag flag)
void *ihk_mc_allocate(int size, int flag)
{
if (pa_ops && pa_ops->alloc)
return pa_ops->alloc(size, flag);
else
return ihk_mc_alloc_pages(1, flag);
if (!cpu_local_var(kmalloc_initialized)) {
kprintf("%s: error, kmalloc not yet initialized\n", __FUNCTION__);
return NULL;
}
return kmalloc(size, IHK_MC_AP_NOWAIT);
}
void ihk_mc_free(void *p)
{
if (pa_ops && pa_ops->free)
return pa_ops->free(p);
else
return ihk_mc_free_pages(p, 1);
if (!cpu_local_var(kmalloc_initialized)) {
kprintf("%s: error, kmalloc not yet initialized\n", __FUNCTION__);
return;
}
kfree(p);
}
void *get_last_early_heap(void)
@ -910,11 +913,17 @@ static int split_large_page(pte_t *ptep, size_t pgsize)
*ptep = (virt_to_phys(pt) & PT_PHYSMASK) | PFL2_PDIR_ATTR;
if (phys_base != NOPHYS) {
page = phys_to_page(phys_base);
if (page && page_unmap(page)) {
kprintf("split_large_page:page_unmap:%p\n", page);
panic("split_large_page:page_unmap\n");
/* Do not do this check for large pages as they don't come from the zeroobj
* and are not actually mapped.
* TODO: clean up zeroobj as we don't really need it, anonymous mappings
* should be allocated for real */
if (pgsize != PTL2_SIZE) {
if (phys_base != NOPHYS) {
page = phys_to_page(phys_base);
if (pgsize != PTL2_SIZE && page && page_unmap(page)) {
kprintf("split_large_page:page_unmap:%p\n", page);
panic("split_large_page:page_unmap\n");
}
}
}
return 0;
@ -1105,6 +1114,7 @@ static int clear_range_l1(void *args0, pte_t *ptep, uint64_t base,
if (!(old & PFL1_FILEOFF) && args->free_physical) {
if (page && page_unmap(page)) {
ihk_mc_free_pages(phys_to_virt(phys), 1);
dkprintf("%s: freeing regular page at 0x%lx\n", __FUNCTION__, base);
}
args->vm->currss -= PTL1_SIZE;
}
@ -1153,6 +1163,7 @@ static int clear_range_l2(void *args0, pte_t *ptep, uint64_t base,
if (!(old & PFL2_FILEOFF) && args->free_physical) {
if (page && page_unmap(page)) {
ihk_mc_free_pages(phys_to_virt(phys), PTL2_SIZE/PTL1_SIZE);
dkprintf("%s: freeing large page at 0x%lx\n", __FUNCTION__, base);
}
args->vm->currss -= PTL2_SIZE;
}
@ -2261,13 +2272,18 @@ int read_process_vm(struct process_vm *vm, void *kdst, const void *usrc, size_t
if ((ustart < vm->region.user_start)
|| (vm->region.user_end <= ustart)
|| ((vm->region.user_end - ustart) < siz)) {
kprintf("%s: error: out of user range\n", __FUNCTION__);
return -EFAULT;
}
reason = PF_USER; /* page not present */
for (addr = ustart & PAGE_MASK; addr < uend; addr += PAGE_SIZE) {
if (!addr)
return -EINVAL;
error = page_fault_process_vm(vm, (void *)addr, reason);
if (error) {
kprintf("%s: error: PF for %p failed\n", __FUNCTION__, addr);
return error;
}
}
@ -2283,11 +2299,22 @@ int read_process_vm(struct process_vm *vm, void *kdst, const void *usrc, size_t
error = ihk_mc_pt_virt_to_phys(vm->address_space->page_table, from, &pa);
if (error) {
kprintf("%s: error: resolving physical address or %p\n", __FUNCTION__, from);
return error;
}
va = phys_to_virt(pa);
memcpy(to, va, cpsize);
if (pa < ihk_mc_get_memory_address(IHK_MC_GMA_MAP_START, 0) ||
pa >= ihk_mc_get_memory_address(IHK_MC_GMA_MAP_END, 0)) {
dkprintf("%s: pa is outside of LWK memory, to: %p, pa: %p,"
"cpsize: %d\n", __FUNCTION__, to, pa, cpsize);
va = ihk_mc_map_virtual(pa, 1, PTATTR_ACTIVE);
memcpy(to, va, cpsize);
ihk_mc_unmap_virtual(va, 1, 1);
}
else {
va = phys_to_virt(pa);
memcpy(to, va, cpsize);
}
from += cpsize;
to += cpsize;
@ -2413,8 +2440,18 @@ int patch_process_vm(struct process_vm *vm, void *udst, const void *ksrc, size_t
return error;
}
va = phys_to_virt(pa);
memcpy(va, from, cpsize);
if (pa < ihk_mc_get_memory_address(IHK_MC_GMA_MAP_START, 0) ||
pa >= ihk_mc_get_memory_address(IHK_MC_GMA_MAP_END, 0)) {
dkprintf("%s: pa is outside of LWK memory, from: %p,"
"pa: %p, cpsize: %d\n", __FUNCTION__, from, pa, cpsize);
va = ihk_mc_map_virtual(pa, 1, PTATTR_ACTIVE);
memcpy(va, from, cpsize);
ihk_mc_unmap_virtual(va, 1, 1);
}
else {
va = phys_to_virt(pa);
memcpy(va, from, cpsize);
}
from += cpsize;
to += cpsize;

View File

@ -38,7 +38,7 @@ int ihk_mc_ikc_init_first_local(struct ihk_ikc_channel_desc *channel,
arch_master_channel_packet_handler = packet_handler;
ihk_ikc_init_desc(channel, IKC_OS_HOST, 0, rq, wq,
ihk_ikc_master_channel_packet_handler);
ihk_ikc_master_channel_packet_handler, channel);
ihk_ikc_enable_channel(channel);
/* Set boot parameter */

View File

@ -12,16 +12,29 @@
#include <errno.h>
#include <ihk/debug.h>
#include <registers.h>
#include <mc_perf_event.h>
extern unsigned int *x86_march_perfmap;
extern int running_on_kvm(void);
#define X86_CR4_PCE 0x00000100
int perf_counters_discovered = 0;
int X86_IA32_NUM_PERF_COUNTERS = 0;
unsigned long X86_IA32_PERF_COUNTERS_MASK = 0;
int X86_IA32_NUM_FIXED_PERF_COUNTERS = 0;
unsigned long X86_IA32_FIXED_PERF_COUNTERS_MASK = 0;
void x86_init_perfctr(void)
{
int i = 0;
unsigned long reg;
unsigned long value = 0;
uint64_t op;
uint64_t eax;
uint64_t ebx;
uint64_t ecx;
uint64_t edx;
/* Do not do it on KVM */
if (running_on_kvm()) return;
@ -30,12 +43,41 @@ void x86_init_perfctr(void)
asm volatile("movq %%cr4, %0" : "=r"(reg));
reg |= X86_CR4_PCE;
asm volatile("movq %0, %%cr4" : : "r"(reg));
/* Detect number of supported performance counters */
if (!perf_counters_discovered) {
/* See Table 35.2 - Architectural MSRs in Vol 3C */
op = 0x0a;
asm volatile("cpuid" : "=a"(eax),"=b"(ebx),"=c"(ecx),"=d"(edx):"a"(op));
X86_IA32_NUM_PERF_COUNTERS = ((eax & 0xFF00) >> 8);
X86_IA32_PERF_COUNTERS_MASK = (1 << X86_IA32_NUM_PERF_COUNTERS) - 1;
X86_IA32_NUM_FIXED_PERF_COUNTERS = (edx & 0x0F);
X86_IA32_FIXED_PERF_COUNTERS_MASK =
((1UL << X86_IA32_NUM_FIXED_PERF_COUNTERS) - 1) <<
X86_IA32_BASE_FIXED_PERF_COUNTERS;
perf_counters_discovered = 1;
kprintf("X86_IA32_NUM_PERF_COUNTERS: %d, X86_IA32_NUM_FIXED_PERF_COUNTERS: %d\n",
X86_IA32_NUM_PERF_COUNTERS, X86_IA32_NUM_FIXED_PERF_COUNTERS);
}
/* Clear Fixed Counter Control */
value = rdmsr(MSR_PERF_FIXED_CTRL);
value &= 0xfffffffffffff000L;
wrmsr(MSR_PERF_FIXED_CTRL, value);
/* Clear Generic Counter Control */
for(i = 0; i < X86_IA32_NUM_PERF_COUNTERS; i++) {
wrmsr(MSR_IA32_PERFEVTSEL0 + i, 0);
}
/* Enable PMC Control */
value = rdmsr(MSR_PERF_GLOBAL_CTRL);
value |= X86_IA32_PERF_COUNTERS_MASK;
value |= X86_IA32_FIXED_PERF_COUNTERS_MASK;
wrmsr(MSR_PERF_GLOBAL_CTRL, value);
value = rdmsr(MSR_PERF_GLOBAL_CTRL);
value |= X86_IA32_PERF_COUNTERS_MASK;
value |= X86_IA32_FIXED_PERF_COUNTERS_MASK;
wrmsr(MSR_PERF_GLOBAL_CTRL, value);
}
static int set_perfctr_x86_direct(int counter, int mode, unsigned int value)
@ -63,12 +105,12 @@ static int set_perfctr_x86_direct(int counter, int mode, unsigned int value)
wrmsr(MSR_IA32_PERFEVTSEL0 + counter, value);
//kprintf("wrmsr: %d <= %x\n", MSR_PERF_GLOBAL_CTRL, 0);
kprintf("wrmsr: %d <= %x\n", MSR_IA32_PERFEVTSEL0 + counter, value);
//kprintf("wrmsr: %d <= %x\n", MSR_IA32_PERFEVTSEL0 + counter, value);
return 0;
}
static int set_pmc_x86_direct(int counter, unsigned long val)
static int set_pmc_x86_direct(int counter, long val)
{
unsigned long cnt_bit = 0;
@ -76,6 +118,8 @@ static int set_pmc_x86_direct(int counter, unsigned long val)
return -EINVAL;
}
val &= 0x000000ffffffffff; // 40bit Mask
cnt_bit = 1UL << counter;
if ( cnt_bit & X86_IA32_PERF_COUNTERS_MASK ) {
// set generic pmc
@ -102,7 +146,7 @@ static int set_perfctr_x86(int counter, int event, int mask, int inv, int count,
static int set_fixed_counter(int counter, int mode)
{
unsigned long value = 0;
unsigned int ctr_mask = 0x7;
unsigned int ctr_mask = 0xf;
int counter_idx = counter - X86_IA32_BASE_FIXED_PERF_COUNTERS ;
unsigned int set_val = 0;
@ -183,6 +227,24 @@ int ihk_mc_perfctr_stop(unsigned long counter_mask)
value &= ~counter_mask;
wrmsr(MSR_PERF_GLOBAL_CTRL, value);
if(counter_mask >> 32 & 0x1) {
value = rdmsr(MSR_PERF_FIXED_CTRL);
value &= ~(0xf);
wrmsr(MSR_PERF_FIXED_CTRL, value);
}
if(counter_mask >> 32 & 0x2) {
value = rdmsr(MSR_PERF_FIXED_CTRL);
value &= ~(0xf << 4);
wrmsr(MSR_PERF_FIXED_CTRL, value);
}
if(counter_mask >> 32 & 0x4) {
value = rdmsr(MSR_PERF_FIXED_CTRL);
value &= ~(0xf << 8);
wrmsr(MSR_PERF_FIXED_CTRL, value);
}
return 0;
}
@ -190,7 +252,7 @@ int ihk_mc_perfctr_stop(unsigned long counter_mask)
int ihk_mc_perfctr_fixed_init(int counter, int mode)
{
unsigned long value = 0;
unsigned int ctr_mask = 0x7;
unsigned int ctr_mask = 0xf;
int counter_idx = counter - X86_IA32_BASE_FIXED_PERF_COUNTERS ;
unsigned int set_val = 0;
@ -210,6 +272,9 @@ int ihk_mc_perfctr_fixed_init(int counter, int mode)
set_val |= 1;
}
// enable PMI on overflow
set_val |= 1 << 3;
set_val <<= counter_idx * 4;
value |= set_val;
@ -223,7 +288,7 @@ int ihk_mc_perfctr_reset(int counter)
return set_pmc_x86_direct(counter, 0);
}
int ihk_mc_perfctr_set(int counter, unsigned long val)
int ihk_mc_perfctr_set(int counter, long val)
{
return set_pmc_x86_direct(counter, val);
}
@ -297,23 +362,33 @@ unsigned long ihk_mc_perfctr_read_msr(int counter)
return retval;
}
int ihk_mc_perfctr_alloc_counter(unsigned long pmc_status)
int ihk_mc_perfctr_alloc_counter(unsigned int *type, unsigned long *config, unsigned long pmc_status)
{
int ret = -1;
int i = 0;
int ret = -1;
// find avail generic counter
for(i = 0; i < X86_IA32_NUM_PERF_COUNTERS; i++) {
if(*type == PERF_TYPE_HARDWARE) {
switch(*config){
case PERF_COUNT_HW_INSTRUCTIONS :
*type = PERF_TYPE_RAW;
*config = 0x5300c0;
break;
default :
// Unexpected config
return -1;
}
}
else if(*type != PERF_TYPE_RAW) {
return -1;
}
// find avail generic counter
for(i = 0; i < X86_IA32_NUM_PERF_COUNTERS; i++) {
if(!(pmc_status & (1 << i))) {
ret = i;
pmc_status |= (1 << i);
break;
}
}
if(ret < 0){
return ret;
}
return ret;
return ret;
}

View File

@ -38,6 +38,8 @@ void set_signal(int sig, void *regs0, siginfo_t *info);
void check_signal(unsigned long rc, void *regs0, int num);
extern unsigned long do_fork(int, unsigned long, unsigned long, unsigned long,
unsigned long, unsigned long, unsigned long);
extern int get_xsave_size();
extern uint64_t get_xsave_mask();
//#define DEBUG_PRINT_SC
@ -54,6 +56,7 @@ uintptr_t debug_constants[] = {
offsetof(struct cpu_local_var, current),
offsetof(struct cpu_local_var, runq),
offsetof(struct cpu_local_var, status),
offsetof(struct cpu_local_var, idle),
offsetof(struct thread, ctx),
offsetof(struct thread, sched_list),
offsetof(struct thread, proc),
@ -219,6 +222,7 @@ SYSCALL_DECLARE(rt_sigreturn)
struct x86_user_context *regs;
struct sigsp ksigsp;
struct sigsp *sigsp;
int xsavesize = get_xsave_size();
asm ("movq %%gs:(%1),%0"
: "=r"(regs)
@ -265,12 +269,31 @@ SYSCALL_DECLARE(rt_sigreturn)
check_signal(0, regs, 0);
check_need_resched();
}
/*
 * Reload the extended FPU state from the user-space copy referenced by
 * ksigsp.fpregs (presumably the area do_signal() wrote before invoking
 * the handler -- confirm against the part of this function that fills
 * ksigsp). Skipped when no state was recorded or XSAVE is unavailable
 * (xsavesize == 0).
 */
if (ksigsp.fpregs && xsavesize) {
	/* 64 spare bytes so the working pointer can be rounded up to the
	 * 64-byte alignment XRSTOR requires for its memory operand. */
	void *fpregs = kmalloc(xsavesize + 64, IHK_MC_AP_NOWAIT);

	/* On allocation failure the restore is silently skipped, exactly
	 * as before. */
	if (fpregs) {
		uint64_t xsave_mask = get_xsave_mask();
		unsigned int low = (unsigned int)xsave_mask;
		unsigned int high = (unsigned int)(xsave_mask >> 32);
		struct xsave_struct *kfpregs;

		kfpregs = (void *)((((unsigned long)fpregs) + 63) & ~63);

		if (copy_from_user(kfpregs, ksigsp.fpregs, xsavesize)) {
			/* Free the bounce buffer on the error path as well;
			 * returning without this leaked it on every fault. */
			kfree(fpregs);
			return -EFAULT;
		}

		asm volatile("xrstor %0" : : "m"(*kfpregs), "a"(low), "d"(high) : "memory");
		kfree(fpregs);
	}
}
return sigsp->sigrc;
}
extern struct cpu_local_var *clv;
extern unsigned long do_kill(struct thread *thread, int pid, int tid, int sig, struct siginfo *info, int ptracecont);
extern void interrupt_syscall(int all, int pid);
extern void interrupt_syscall(int pid, int tid);
extern int num_processors;
#define RFLAGS_MASK (RFLAGS_CF | RFLAGS_PF | RFLAGS_AF | RFLAGS_ZF | \
@ -707,6 +730,8 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
unsigned long *usp; /* user stack */
struct sigsp ksigsp;
struct sigsp *sigsp;
int xsavesize = get_xsave_size();
unsigned long fpregs;
if((k->sa.sa_flags & SA_ONSTACK) &&
!(thread->sigstack.ss_flags & SS_DISABLE) &&
@ -719,7 +744,8 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
else{
usp = (unsigned long *)regs->gpr.rsp;
}
sigsp = ((struct sigsp *)usp) - 1;
fpregs = (unsigned long)usp - xsavesize;
sigsp = ((struct sigsp *)fpregs) - 1;
sigsp = (struct sigsp *)((unsigned long)sigsp & 0xfffffffffffffff0UL);
memset(&ksigsp, '\0', sizeof ksigsp);
@ -751,6 +777,33 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
ksigsp.restart = isrestart(num, rc, sig, k->sa.sa_flags & SA_RESTART);
if(num != 0 && rc == -EINTR && sig == SIGCHLD)
ksigsp.restart = 1;
if(xsavesize){
uint64_t xsave_mask = get_xsave_mask();
unsigned int low = (unsigned int)xsave_mask;
unsigned int high = (unsigned int)(xsave_mask >> 32);
void *_kfpregs = kmalloc(xsavesize + 64, IHK_MC_AP_NOWAIT);
struct xsave_struct *kfpregs;
if(!_kfpregs){
kfree(pending);
kfree(_kfpregs);
kprintf("do_signal,no space available\n");
terminate(0, sig);
return;
}
kfpregs = (void *)((((unsigned long)_kfpregs) + 63) & ~63);
memset(kfpregs, '\0', xsavesize);
asm volatile("xsave %0" : : "m"(*kfpregs), "a"(low), "d"(high) : "memory");
if(copy_to_user((void *)fpregs, kfpregs, xsavesize)){
kfree(pending);
kfree(_kfpregs);
kprintf("do_signal,write_process_vm failed\n");
terminate(0, sig);
return;
}
ksigsp.fpregs = (void *)fpregs;
kfree(_kfpregs);
}
memcpy(&ksigsp.info, &pending->info, sizeof(siginfo_t));
if(copy_to_user(sigsp, &ksigsp, sizeof ksigsp)){
@ -761,9 +814,6 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
return;
}
usp = (unsigned long *)sigsp;
usp--;
*usp = (unsigned long)k->sa.sa_restorer;
@ -1240,7 +1290,7 @@ done:
cpu_restore_interrupt(irqstate);
if (doint && !(mask & tthread->sigmask.__val[0])) {
int cpuid = tthread->cpu_id;
int tid = tthread->tid;
int pid = tproc->pid;
int status = tthread->status;
@ -1251,12 +1301,12 @@ done:
}
if(!tthread->proc->nohost)
interrupt_syscall(pid, cpuid);
interrupt_syscall(pid, tid);
if (status != PS_RUNNING) {
if(sig == SIGKILL){
/* Wake up the target only when stopped by ptrace-reporting */
sched_wakeup_thread(tthread, PS_TRACED | PS_STOPPED);
sched_wakeup_thread(tthread, PS_TRACED | PS_STOPPED | PS_INTERRUPTIBLE);
}
else if(sig == SIGCONT || ptracecont == 1){
/* Wake up the target only when stopped by SIGSTOP */
@ -1387,9 +1437,8 @@ SYSCALL_DECLARE(mmap)
goto out;
}
if ((addr < region->user_start)
|| (region->user_end <= addr)
|| ((region->user_end - addr) < len)) {
if ((flags & MAP_FIXED) && ((addr < region->user_start)
|| (region->user_end <= addr))) {
ekprintf("sys_mmap(%lx,%lx,%x,%x,%x,%lx):ENOMEM\n",
addr0, len0, prot, flags0, fd, off0);
error = -ENOMEM;
@ -1513,6 +1562,7 @@ static int vdso_get_vdso_info(void)
struct ihk_ikc_channel_desc *ch = cpu_local_var(syscall_channel);
dkprintf("vdso_get_vdso_info()\n");
memset(&vdso, '\0', sizeof vdso);
vdso.busy = 1;
vdso.vdso_npages = 0;

View File

@ -0,0 +1,28 @@
# irqbalance is a daemon process that distributes interrupts across
# CPUs on SMP systems. The default is to rebalance once every 10
# seconds. This is the environment file that is specified to systemd via the
# EnvironmentFile key in the service unit file (or via whatever method the
# init system you're using provides).
#
# ONESHOT=yes
# after starting, wait for a minute, then look at the interrupt
# load and balance it once; after balancing exit and do not change
# it again.
#IRQBALANCE_ONESHOT=
#
# IRQBALANCE_BANNED_CPUS
# 64 bit bitmask which allows you to indicate which CPUs should
# be skipped when rebalancing IRQs. CPU numbers which have their
# corresponding bits set to one in this mask will not have any
# IRQs assigned to them on rebalance
#
IRQBALANCE_BANNED_CPUS=%mask%
#
# IRQBALANCE_ARGS
# append any args here to the irqbalance daemon as documented in the man page
#
IRQBALANCE_ARGS=--banirq=%banirq%

View File

@ -0,0 +1,10 @@
[Unit]
Description=irqbalance daemon
After=syslog.target
[Service]
EnvironmentFile=@ETCDIR@/irqbalance_mck
ExecStart=/usr/sbin/irqbalance --foreground $IRQBALANCE_ARGS
[Install]
WantedBy=multi-user.target

View File

@ -13,27 +13,44 @@
# Note that the script does not output anything unless an error occurs.
prefix="@prefix@"
BINDIR="@BINDIR@"
SBINDIR="@SBINDIR@"
KMODDIR="@KMODDIR@"
KERNDIR="@KERNDIR@"
BINDIR="${prefix}/bin"
SBINDIR="${prefix}/sbin"
ETCDIR=@ETCDIR@
KMODDIR="${prefix}/kmod"
KERNDIR="${prefix}/@TARGET@/kernel"
ENABLE_MCOVERLAYFS="@ENABLE_MCOVERLAYFS@"
mem="512M@0"
cpus=""
INTERVAL=1
LOGMODE=0
while getopts :i:k: OPT
facility="LOG_LOCAL6"
chown_option=`logname 2> /dev/null`
if [ "`systemctl status irqbalance_mck.service 2> /dev/null |grep -E 'Active: active'`" != "" -o "`systemctl status irqbalance.service 2> /dev/null |grep -E 'Active: active'`" != "" ]; then
irqbalance_used="yes"
else
irqbalance_used="no"
fi
while getopts :i:k:c:m:o:f: OPT
do
case ${OPT} in
f) facility=${OPTARG}
;;
o) chown_option=${OPTARG}
;;
i) INTERVAL=${OPTARG}
expr "${INTERVAL}" + 1 > /dev/null 2>&1
if [ $? -ge 2 ]
then
echo "invalid -i value"
echo "invalid -i value" >&2
exit 1
fi
if [ ${INTERVAL} -le 0 ]
then
echo "invalid -i value"
echo "invalid -i value" >&2
exit 1
fi
;;
@ -41,22 +58,24 @@ do
expr "${LOGMODE}" + 1 > /dev/null 2>&1
if [ $? -ge 2 ]
then
echo "invalid -k value"
echo "invalid -k value" >&2
exit 1
fi
if [ ${LOGMODE} -lt 0 -o ${LOGMODE} -gt 2 ]
then
echo "invalid -k value"
echo "invalid -k value" >&2
exit 1
fi
;;
*) echo "invalid option -${OPT}"
c) cpus=${OPTARG}
;;
m) mem=${OPTARG}
;;
*) echo "invalid option -${OPT}" >&2
exit 1
esac
done
mem="512M@0"
cpus=""
ihk_ikc_irq_core=0
release=`uname -r`
@ -66,10 +85,17 @@ patch=`echo ${release} | sed -e 's/^[0-9]*.[0-9]*.\([0-9]*\).*/\1/'`
linux_version_code=`expr \( ${major} \* 65536 \) + \( ${minor} \* 256 \) + ${patch}`
rhel_release=`echo ${release} | sed -e 's/^[0-9]*.[0-9]*.[0-9]*-\([0-9]*\).*/\1/'`
if [ "${release}" == "${rhel_release}" ]; then rhel_release=""; fi
enable_mcoverlay="no"
if [ "${ENABLE_MCOVERLAYFS}" == "yes" ]; then
enable_mcoverlay=`if ( [ ${linux_version_code} -ge 262144 ] && [ ${linux_version_code} -lt 262400 ] ); then echo "yes"; else echo "no"; fi`
else
enable_mcoverlay=no
if [ "${rhel_release}" == "" ]; then
if [ ${linux_version_code} -ge 262144 -a ${linux_version_code} -lt 262400 ]; then
enable_mcoverlay="yes"
fi
else
if [ ${linux_version_code} -eq 199168 -a ${rhel_release} -ge 327 ]; then
enable_mcoverlay="yes"
fi
fi
fi
if [ "$cpus" == "" ]; then
@ -79,12 +105,7 @@ if [ "$cpus" == "" ]; then
# Use the second half of the cores
let nr_cpus="$nr_cpus / 2"
cpus=`lscpu --parse | awk -F"," '{if ($4 == 0) print $1}' | tail -n $nr_cpus | xargs echo -n | sed 's/ /,/g'`
if [ "$cpus" == "" ]; then echo "error: no available CPUs on NUMA node 0?"; exit; fi
fi
# Remove delegator if loaded
if [ "`lsmod | grep mcctrl`" != "" ]; then
if ! rmmod mcctrl; then echo "error: removing mcctrl"; exit; fi
if [ "$cpus" == "" ]; then echo "error: no available CPUs on NUMA node 0?" >&2; exit 1; fi
fi
# Remove mcoverlay if loaded
@ -95,13 +116,19 @@ if [ "$enable_mcoverlay" == "yes" ]; then
if [ "`cat /proc/mounts | grep /tmp/mcos/linux_proc`" != "" ]; then umount -l /tmp/mcos/linux_proc; fi
if [ "`cat /proc/mounts | grep /tmp/mcos`" != "" ]; then umount -l /tmp/mcos; fi
if [ -e /tmp/mcos ]; then rm -rf /tmp/mcos; fi
if ! rmmod mcoverlay; then echo "error: removing mcoverlay"; exit; fi
if ! rmmod mcoverlay; then echo "error: removing mcoverlay" >&2; exit 1; fi
fi
fi
# Stop irqbalance
if [ "${irqbalance_used}" == "yes" ]; then
systemctl stop irqbalance_mck.service 2>/dev/null
if ! systemctl stop irqbalance.service 2>/dev/null ; then echo "error: stopping irqbalance" >&2; exit 1; fi;
fi
# Load IHK if not loaded
if [ "`lsmod | grep ihk`" == "" ]; then
if ! insmod ${KMODDIR}/ihk.ko; then echo "error: loading ihk"; exit; fi;
if ! insmod ${KMODDIR}/ihk.ko; then echo "error: loading ihk" >&2; exit 1; fi;
fi
# Load IHK-SMP if not loaded and reserve CPUs and memory
@ -113,57 +140,61 @@ if [ "`lsmod | grep ihk_smp_x86`" == "" ]; then
break
fi
done
if [ "$ihk_irq" == "" ]; then echo "error: no IRQ available"; exit; fi
if ! insmod ${KMODDIR}/ihk-smp-x86.ko ihk_start_irq=$ihk_irq ihk_ikc_irq_core=$ihk_ikc_irq_core; then echo "error: loading ihk-smp-x86"; exit; fi;
if ! ${SBINDIR}/ihkconfig 0 reserve cpu ${cpus}; then echo "error: reserving CPUs"; exit; fi
if ! ${SBINDIR}/ihkconfig 0 reserve mem ${mem}; then echo "error: reserving memory"; exit; fi
if [ "$ihk_irq" == "" ]; then echo "error: no IRQ available" >&2; exit 1; fi
if ! insmod ${KMODDIR}/ihk-smp-x86.ko ihk_start_irq=$ihk_irq ihk_ikc_irq_core=$ihk_ikc_irq_core; then echo "error: loading ihk-smp-x86" >&2; exit 1; fi;
if ! ${SBINDIR}/ihkconfig 0 reserve cpu ${cpus}; then echo "error: reserving CPUs" >&2; exit 1; fi
if ! ${SBINDIR}/ihkconfig 0 reserve mem ${mem}; then echo "error: reserving memory" >&2; exit 1; fi
# If loaded, but no resources allocated, get CPUs and memory
else
if ! ${SBINDIR}/ihkconfig 0 query cpu > /dev/null; then echo "error: querying cpus"; exit; fi
if ! ${SBINDIR}/ihkconfig 0 query cpu > /dev/null; then echo "error: querying cpus" >&2; exit 1; fi
cpus_allocated=`${SBINDIR}/ihkosctl 0 query cpu`
if [ "$cpus_allocated" == "" ]; then
if ! ${SBINDIR}/ihkconfig 0 reserve cpu ${cpus}; then echo "error: reserving CPUs"; exit; fi
if ! ${SBINDIR}/ihkconfig 0 reserve cpu ${cpus}; then echo "error: reserving CPUs" >&2; exit 1; fi
fi
if ! ${SBINDIR}/ihkosctl 0 query mem > /dev/null; then echo "error: querying memory"; exit; fi
if ! ${SBINDIR}/ihkosctl 0 query mem > /dev/null; then echo "error: querying memory" >&2; exit 1; fi
mem_allocated=`${SBINDIR}/ihkosctl 0 query mem`
if [ "$mem_allocated" == "" ]; then
if ! ${SBINDIR}/ihkconfig 0 reserve mem ${mem}; then echo "error: reserving memory"; exit; fi
if ! ${SBINDIR}/ihkconfig 0 reserve mem ${mem}; then echo "error: reserving memory" >&2; exit 1; fi
fi
fi
# Load mcctrl if not loaded
if [ "`lsmod | grep mcctrl`" == "" ]; then
if ! insmod ${KMODDIR}/mcctrl.ko; then echo "error: inserting mcctrl.ko" >&2; exit 1; fi
fi
# Check for existing OS instance and destroy
if [ -c /dev/mcos0 ]; then
# Query CPU cores and memory of OS instance so that the same values are used as previously
if ! ${SBINDIR}/ihkosctl 0 query cpu > /dev/null; then echo "error: querying cpus"; exit; fi
if ! ${SBINDIR}/ihkosctl 0 query cpu > /dev/null; then echo "error: querying cpus" >&2; exit 1; fi
cpus=`${SBINDIR}/ihkosctl 0 query cpu`
if ! ${SBINDIR}/ihkosctl 0 query mem > /dev/null; then echo "error: querying memory"; exit; fi
if ! ${SBINDIR}/ihkosctl 0 query mem > /dev/null; then echo "error: querying memory" >&2; exit 1; fi
mem=`${SBINDIR}/ihkosctl 0 query mem`
if ! ${SBINDIR}/ihkconfig 0 destroy 0; then echo "warning: destroy failed"; fi
if ! ${SBINDIR}/ihkconfig 0 destroy 0; then echo "warning: destroy failed" >&2; fi
else
# Otherwise query IHK-SMP for resources
if ! ${SBINDIR}/ihkconfig 0 query cpu > /dev/null; then echo "error: querying cpus"; exit; fi
if ! ${SBINDIR}/ihkconfig 0 query cpu > /dev/null; then echo "error: querying cpus" >&2; exit 1; fi
cpus=`${SBINDIR}/ihkconfig 0 query cpu`
if ! ${SBINDIR}/ihkconfig 0 query mem > /dev/null; then echo "error: querying memory"; exit; fi
if ! ${SBINDIR}/ihkconfig 0 query mem > /dev/null; then echo "error: querying memory" >&2; exit 1; fi
mem=`${SBINDIR}/ihkconfig 0 query mem`
fi
if ! ${SBINDIR}/ihkconfig 0 create; then echo "error: create"; exit; fi
if ! ${SBINDIR}/ihkosctl 0 assign cpu ${cpus}; then echo "error: assign CPUs"; exit; fi
if ! ${SBINDIR}/ihkosctl 0 assign mem ${mem}; then echo "error: assign memory"; exit; fi
if ! ${SBINDIR}/ihkosctl 0 load ${KERNDIR}/mckernel.img; then echo "error: loading kernel image"; exit; fi
if ! ${SBINDIR}/ihkosctl 0 kargs "hidos ksyslogd=${LOGMODE}"; then echo "error: setting kernel arguments"; exit; fi
if ! ${SBINDIR}/ihkosctl 0 boot; then echo "error: booting"; exit; fi
if ! insmod ${KMODDIR}/mcctrl.ko; then echo "error: inserting mcctrl.ko"; exit; fi
if ! chown `logname` /dev/mcd* /dev/mcos*; then echo "error: chowning device files"; exit; fi
# Create OS instance 0, assign CPUs and memory to it, load the McKernel image,
# set its kernel arguments, boot it, and finally chown the device files to
# ${chown_option}. Each failure is reported on stderr and terminates the script
# with status 1. (A bare "exit" would return the status of the preceding echo,
# i.e. 0, and hide the failure from the caller.)
if ! ${SBINDIR}/ihkconfig 0 create; then echo "error: create" >&2; exit 1; fi
if ! ${SBINDIR}/ihkosctl 0 assign cpu ${cpus}; then echo "error: assign CPUs" >&2; exit 1; fi
if ! ${SBINDIR}/ihkosctl 0 assign mem ${mem}; then echo "error: assign memory" >&2; exit 1; fi
if ! ${SBINDIR}/ihkosctl 0 load ${KERNDIR}/mckernel.img; then echo "error: loading kernel image" >&2; exit 1; fi
if ! ${SBINDIR}/ihkosctl 0 kargs "hidos ksyslogd=${LOGMODE}"; then echo "error: setting kernel arguments" >&2; exit 1; fi
if ! ${SBINDIR}/ihkosctl 0 boot; then echo "error: booting" >&2; exit 1; fi
if ! chown ${chown_option} /dev/mcd* /dev/mcos*; then echo "error: chowning device files" >&2; exit 1; fi
if [ "$enable_mcoverlay" == "yes" ]; then
if [ ! -e /tmp/mcos ]; then mkdir -p /tmp/mcos; fi
if ! mount -t tmpfs tmpfs /tmp/mcos; then echo "error: mount /tmp/mcos"; exit; fi
if ! mount -t tmpfs tmpfs /tmp/mcos; then echo "error: mount /tmp/mcos" >&2; exit 1; fi
if [ ! -e /tmp/mcos/linux_proc ]; then mkdir -p /tmp/mcos/linux_proc; fi
if ! mount --bind /proc /tmp/mcos/linux_proc; then echo "error: mount /tmp/mcos/linux_proc"; exit; fi
if ! insmod ${KMODDIR}/mcoverlay.ko; then echo "error: inserting mcoverlay.ko"; exit; fi
if ! mount --bind /proc /tmp/mcos/linux_proc; then echo "error: mount /tmp/mcos/linux_proc" >&2; exit 1; fi
if ! insmod ${KMODDIR}/mcoverlay.ko; then echo "error: inserting mcoverlay.ko" >&2; exit 1; fi
while [ ! -e /proc/mcos0 ]
do
sleep 1
@ -171,7 +202,7 @@ if [ "$enable_mcoverlay" == "yes" ]; then
if [ ! -e /tmp/mcos/mcos0_proc ]; then mkdir -p /tmp/mcos/mcos0_proc; fi
if [ ! -e /tmp/mcos/mcos0_proc_upper ]; then mkdir -p /tmp/mcos/mcos0_proc_upper; fi
if [ ! -e /tmp/mcos/mcos0_proc_work ]; then mkdir -p /tmp/mcos/mcos0_proc_work; fi
if ! mount -t mcoverlay mcoverlay -o lowerdir=/proc/mcos0:/proc,upperdir=/tmp/mcos/mcos0_proc_upper,workdir=/tmp/mcos/mcos0_proc_work,nocopyupw,nofscheck /tmp/mcos/mcos0_proc; then echo "error: mount /tmp/mcos/mcos0_proc"; exit; fi
if ! mount -t mcoverlay mcoverlay -o lowerdir=/proc/mcos0:/proc,upperdir=/tmp/mcos/mcos0_proc_upper,workdir=/tmp/mcos/mcos0_proc_work,nocopyupw,nofscheck /tmp/mcos/mcos0_proc; then echo "error: mount /tmp/mcos/mcos0_proc" >&2; exit 1; fi
mount --make-rprivate /proc
while [ ! -e /sys/devices/virtual/mcos/mcos0/sys ]
do
@ -180,7 +211,7 @@ if [ "$enable_mcoverlay" == "yes" ]; then
if [ ! -e /tmp/mcos/mcos0_sys ]; then mkdir -p /tmp/mcos/mcos0_sys; fi
if [ ! -e /tmp/mcos/mcos0_sys_upper ]; then mkdir -p /tmp/mcos/mcos0_sys_upper; fi
if [ ! -e /tmp/mcos/mcos0_sys_work ]; then mkdir -p /tmp/mcos/mcos0_sys_work; fi
if ! mount -t mcoverlay mcoverlay -o lowerdir=/sys/devices/virtual/mcos/mcos0/sys:/sys,upperdir=/tmp/mcos/mcos0_sys_upper,workdir=/tmp/mcos/mcos0_sys_work,nocopyupw,nofscheck /tmp/mcos/mcos0_sys; then echo "error: mount /tmp/mcos/mcos0_sys"; exit; fi
if ! mount -t mcoverlay mcoverlay -o lowerdir=/sys/devices/virtual/mcos/mcos0/sys:/sys,upperdir=/tmp/mcos/mcos0_sys_upper,workdir=/tmp/mcos/mcos0_sys_work,nocopyupw,nofscheck /tmp/mcos/mcos0_sys; then echo "error: mount /tmp/mcos/mcos0_sys" >&2; exit 1; fi
mount --make-rprivate /sys
for cpuid in `find /sys/devices/system/cpu/* -maxdepth 0 -name "cpu[0123456789]*" -printf "%f "`; do
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/devices/system/cpu/$cpuid" ]; then
@ -195,5 +226,25 @@ if [ "$enable_mcoverlay" == "yes" ]; then
fi
if [ ${LOGMODE} -ne 0 ]
then
SBINDIR=${SBINDIR} ${SBINDIR}/mcklogd -i ${INTERVAL}
# mcklogd survives when McKernel isn't shut down by mcstop+release.sh
pkill mcklogd
SBINDIR=${SBINDIR} ${SBINDIR}/mcklogd -i ${INTERVAL} -f ${facility}
fi
# Start irqbalance with CPUs and IRQ for McKernel banned.
# Only done when an irqbalance service was active when the script started
# (irqbalance_used is set near the top of the script).
if [ "${irqbalance_used}" == "yes" ]; then
	# Save every /proc/irq/*/smp_affinity under @ETCDIR@/proc/irq/... so that
	# the stop script can restore the original values later.
	if ! etcdir=@ETCDIR@ perl -e 'use File::Copy qw(copy); $etcdir=$ENV{'etcdir'}; @files = grep { -f } glob "/proc/irq/*/smp_affinity"; foreach $file (@files) { $rel = substr($file, 1); $dir=substr($rel, 0, length($rel)-length("/smp_affinity")); if(0) { print "cp $file $etcdir/$rel\n";} if(system("mkdir -p $etcdir/$dir")){ exit 1;} if(!copy($file,"$etcdir/$rel")){ exit 1;} }' ; then echo "error: saving /proc/irq/*/smp_affinity" >&2; exit 1; fi;

	# Number of CPUs as reported by lscpu.
	ncpus=`lscpu | grep -E '^CPU\(s\):' | awk '{print $2}'`

	# Turn the CPU list in $cpus (comma separated, ranges allowed) into a hex
	# bitmask made of comma separated 32-bit words, most significant word first.
	# The most significant word is printed with as many hex digits as are needed
	# for the remaining ncpus % 32 CPUs (rounded up to whole digits), or with all
	# 8 digits when ncpus is a multiple of 32.
	smp_affinity_mask=`echo $cpus | ncpus=$ncpus perl -e 'while(<>){@tokens = split /,/;foreach $token (@tokens) {@nums = split /-/,$token; for($num = $nums[0]; $num <= $nums[$#nums]; $num++) {$ndx=int($num/32); $mask[$ndx] |= (1<<($num % 32))}}} $nint32s = int(($ENV{'ncpus'}+31)/32); $rem = $ENV{'ncpus'} % 32; for($j = $nint32s - 1; $j >= 0; $j--) { if($j != $nint32s - 1){print ",";} $nblks = ($j == $nint32s - 1 && $rem != 0) ? int(($rem + 3)/4) : 8; for($i = $nblks - 1;$i >= 0;$i--){ printf("%01x",($mask[$j] >> ($i*4)) & 0xf);}}'`

	# For every IRQ whose current affinity intersects the McKernel mask, write
	# the inverted mask to its smp_affinity. The most significant word of the
	# inverted mask is truncated to the same number of hex digits as above
	# (all 8 when ncpus is a multiple of 32). Write errors are discarded.
	if ! ncpus=$ncpus smp_affinity_mask=$smp_affinity_mask perl -e '@dirs = grep { -d } glob "/proc/irq/*"; foreach $dir (@dirs) { $hit = 0; $affinity_str = `cat $dir/smp_affinity`; chomp $affinity_str; @int32strs = split /,/, $affinity_str; @int32strs_mask=split /,/, $ENV{'smp_affinity_mask'}; for($i=0;$i <= $#int32strs_mask; $i++) { $int32strs_inv[$i] = sprintf("%08x",hex($int32strs_mask[$i])^0xffffffff); if($i == 0) { $rem = $ENV{'ncpus'}%32; $len = $rem ? int(($rem+3)/4) : 8; $int32strs_inv[$i] = substr($int32strs_inv[$i], -$len, $len); } } $inv = join(",", @int32strs_inv); $nint32s = int(($ENV{'ncpus'}+31)/32); for($j = $nint32s - 1; $j >= 0; $j--) { if(hex($int32strs[$nint32s - 1 - $j]) & hex($int32strs_mask[$nint32s - 1 - $j])) { $hit = 1; }} if($hit == 1) { $cmd = "echo $inv > $dir/smp_affinity 2>/dev/null"; system $cmd;}}'; then echo "error: modifying /proc/irq/*/smp_affinity" >&2; exit 1; fi;

	# IRQ number of the /proc/interrupts line that ends in "IHK-SMP".
	banirq=`cat /proc/interrupts| perl -e 'while(<>) { if(/^\s*(\d+).*IHK\-SMP\s*$/) {print $1;}}'`

	# Fill in the %mask% and %banirq% placeholders of the environment template,
	# then link and start the McKernel-specific irqbalance unit.
	sed "s/%mask%/$smp_affinity_mask/g" $ETCDIR/irqbalance_mck.in | sed "s/%banirq%/$banirq/g" > $ETCDIR/irqbalance_mck
	if ! systemctl link $ETCDIR/irqbalance_mck.service >/dev/null 2>/dev/null; then echo "error: linking irqbalance_mck" >&2; exit 1; fi;
	if ! systemctl start irqbalance_mck.service 2>/dev/null ; then echo "error: starting irqbalance_mck" >&2; exit 1; fi;
	# echo cpus=$cpus mask=$smp_affinity_mask banirq=$banirq
fi

View File

@ -10,6 +10,7 @@
prefix="@prefix@"
BINDIR="@BINDIR@"
SBINDIR="@SBINDIR@"
ETCDIR=@ETCDIR@
KMODDIR="@KMODDIR@"
KERNDIR="@KERNDIR@"
@ -17,31 +18,47 @@ mem=""
cpus=""
# No SMP module? Exit.
if [ "`lsmod | grep ihk_smp_x86`" == "" ]; then exit; fi
if [ "`lsmod | grep ihk_smp_x86`" == "" ]; then exit 0; fi
# Destroy all LWK instances
if ls /dev/mcos* 1>/dev/null 2>&1; then
for i in /dev/mcos*; do
ind=`echo $i|cut -c10-`;
if ! ${SBINDIR}/ihkconfig 0 destroy $ind; then echo "error: destroying LWK instance $ind failed" >&2; exit 1; fi
done
fi
# Query IHK-SMP resources and release them
if ! ${SBINDIR}/ihkconfig 0 query cpu > /dev/null; then echo "error: querying cpus" >&2; exit 1; fi
cpus=`${SBINDIR}/ihkconfig 0 query cpu`
if ! ${SBINDIR}/ihkconfig 0 release cpu $cpus > /dev/null; then echo "error: releasing CPUs" >&2; exit 1; fi
if ! ${SBINDIR}/ihkconfig 0 query mem > /dev/null; then echo "error: querying memory" >&2; exit 1; fi
mem=`${SBINDIR}/ihkconfig 0 query mem`
if ! ${SBINDIR}/ihkconfig 0 release mem $mem > /dev/null; then echo "error: releasing memory" >&2; exit 1; fi
# Remove delegator if loaded
if [ "`lsmod | grep mcctrl`" != "" ]; then
if ! rmmod mcctrl; then echo "error: removing mcctrl"; exit; fi
if ! rmmod mcctrl; then echo "error: removing mcctrl" >&2; exit 1; fi
fi
# Destroy all LWK instances
for i in /dev/mcos*; do
ind=`echo $i|cut -c10-`;
if ! ${SBINDIR}/ihkconfig 0 destroy $ind; then echo "error: destroying LWK instance $ind failed"; exit; fi
done
# Query IHK-SMP resources and release them
if ! ${SBINDIR}/ihkconfig 0 query cpu > /dev/null; then echo "error: querying cpus"; exit; fi
cpus=`${SBINDIR}/ihkconfig 0 query cpu`
if ! ${SBINDIR}/ihkconfig 0 release cpu $cpus > /dev/null; then echo "error: releasing CPUs"; exit; fi
if ! ${SBINDIR}/ihkconfig 0 query mem > /dev/null; then echo "error: querying memory"; exit; fi
mem=`${SBINDIR}/ihkconfig 0 query mem`
if ! ${SBINDIR}/ihkconfig 0 release mem $mem > /dev/null; then echo "error: releasing memory"; exit; fi
# Remove SMP module
if [ "`lsmod | grep ihk_smp_x86`" != "" ]; then
if ! rmmod ihk_smp_x86; then echo "error: removing ihk_smp_x86"; exit; fi
if ! rmmod ihk_smp_x86; then echo "error: removing ihk_smp_x86" >&2; exit 1; fi
fi
# Remove core module
if [ "`lsmod | grep -E 'ihk\s' | awk '{print $1}'`" != "" ]; then
if ! rmmod ihk; then echo "error: removing ihk" >&2; exit 1; fi
fi
# Stop mcklogd
pkill mcklogd
# Start irqbalance with the original settings
# Only done when the McKernel-specific irqbalance_mck.service is currently active.
if [ "`systemctl status irqbalance_mck.service 2> /dev/null |grep -E 'Active: active'`" != "" ]; then
# Stop and disable the McKernel-specific unit.
if ! systemctl stop irqbalance_mck.service 2>/dev/null ; then echo "error: stopping irqbalance_mck" >&2; exit 1; fi;
if ! systemctl disable irqbalance_mck.service >/dev/null 2>/dev/null; then echo "error: disabling irqbalance_mck" >&2; exit 1; fi;
# Copy each saved @ETCDIR@/proc/irq/*/smp_affinity back to the same path with
# the @ETCDIR@ prefix removed. Failures of the individual cp commands are
# ignored (stderr is discarded and the result of system() is not checked).
if ! etcdir=@ETCDIR@ perl -e '$etcdir=$ENV{'etcdir'}; @files = grep { -f } glob "$etcdir/proc/irq/*/smp_affinity"; foreach $file (@files) { $dest = substr($file, length($etcdir)); if(0) {print "cp $file $dest\n";} system("cp $file $dest 2>/dev/null"); }' ; then echo "error: restoring /proc/irq/*/smp_affinity" >&2; exit 1; fi;
# Bring the regular irqbalance service back up.
if ! systemctl start irqbalance.service; then echo "error: starting irqbalance" >&2; exit 1; fi;
fi

11
configure vendored
View File

@ -632,6 +632,7 @@ ENABLE_MCOVERLAYFS
MANDIR
KERNDIR
KMODDIR
ETCDIR
SBINDIR
BINDIR
TARGET
@ -3031,6 +3032,9 @@ case $WITH_TARGET in
if test "X$SBINDIR" = X; then
SBINDIR="$prefix/sbin"
fi
if test "X$ETCDIR" = X; then
ETCDIR="$prefix/etc"
fi
if test "X$KMODDIR" = X; then
KMODDIR="$prefix/kmod"
fi
@ -3882,11 +3886,12 @@ fi
ac_config_headers="$ac_config_headers executer/config.h"
ac_config_files="$ac_config_files Makefile executer/user/Makefile executer/kernel/mcctrl/Makefile executer/kernel/mcctrl/arch/x86_64/Makefile executer/kernel/mcoverlayfs/Makefile kernel/Makefile kernel/Makefile.build arch/x86/tools/mcreboot-attached-mic.sh arch/x86/tools/mcshutdown-attached-mic.sh arch/x86/tools/mcreboot-builtin-x86.sh arch/x86/tools/mcreboot-smp-x86.sh arch/x86/tools/mcstop+release-smp-x86.sh arch/x86/tools/mcshutdown-builtin-x86.sh arch/x86/tools/mcreboot.1:arch/x86/tools/mcreboot.1in"
ac_config_files="$ac_config_files Makefile executer/user/Makefile executer/kernel/mcctrl/Makefile executer/kernel/mcctrl/arch/x86_64/Makefile executer/kernel/mcoverlayfs/Makefile executer/kernel/mcoverlayfs/linux-3.10.0-327.36.1.el7/Makefile executer/kernel/mcoverlayfs/linux-4.0.9/Makefile kernel/Makefile kernel/Makefile.build arch/x86/tools/mcreboot-attached-mic.sh arch/x86/tools/mcshutdown-attached-mic.sh arch/x86/tools/mcreboot-builtin-x86.sh arch/x86/tools/mcreboot-smp-x86.sh arch/x86/tools/mcstop+release-smp-x86.sh arch/x86/tools/mcshutdown-builtin-x86.sh arch/x86/tools/mcreboot.1:arch/x86/tools/mcreboot.1in arch/x86/tools/irqbalance_mck.service arch/x86/tools/irqbalance_mck.in"
if test "x$enable_dcfa" = xyes; then :
@ -4590,6 +4595,8 @@ do
"executer/kernel/mcctrl/Makefile") CONFIG_FILES="$CONFIG_FILES executer/kernel/mcctrl/Makefile" ;;
"executer/kernel/mcctrl/arch/x86_64/Makefile") CONFIG_FILES="$CONFIG_FILES executer/kernel/mcctrl/arch/x86_64/Makefile" ;;
"executer/kernel/mcoverlayfs/Makefile") CONFIG_FILES="$CONFIG_FILES executer/kernel/mcoverlayfs/Makefile" ;;
"executer/kernel/mcoverlayfs/linux-3.10.0-327.36.1.el7/Makefile") CONFIG_FILES="$CONFIG_FILES executer/kernel/mcoverlayfs/linux-3.10.0-327.36.1.el7/Makefile" ;;
"executer/kernel/mcoverlayfs/linux-4.0.9/Makefile") CONFIG_FILES="$CONFIG_FILES executer/kernel/mcoverlayfs/linux-4.0.9/Makefile" ;;
"kernel/Makefile") CONFIG_FILES="$CONFIG_FILES kernel/Makefile" ;;
"kernel/Makefile.build") CONFIG_FILES="$CONFIG_FILES kernel/Makefile.build" ;;
"arch/x86/tools/mcreboot-attached-mic.sh") CONFIG_FILES="$CONFIG_FILES arch/x86/tools/mcreboot-attached-mic.sh" ;;
@ -4599,6 +4606,8 @@ do
"arch/x86/tools/mcstop+release-smp-x86.sh") CONFIG_FILES="$CONFIG_FILES arch/x86/tools/mcstop+release-smp-x86.sh" ;;
"arch/x86/tools/mcshutdown-builtin-x86.sh") CONFIG_FILES="$CONFIG_FILES arch/x86/tools/mcshutdown-builtin-x86.sh" ;;
"arch/x86/tools/mcreboot.1") CONFIG_FILES="$CONFIG_FILES arch/x86/tools/mcreboot.1:arch/x86/tools/mcreboot.1in" ;;
"arch/x86/tools/irqbalance_mck.service") CONFIG_FILES="$CONFIG_FILES arch/x86/tools/irqbalance_mck.service" ;;
"arch/x86/tools/irqbalance_mck.in") CONFIG_FILES="$CONFIG_FILES arch/x86/tools/irqbalance_mck.in" ;;
"kernel/Makefile.dcfa") CONFIG_FILES="$CONFIG_FILES kernel/Makefile.dcfa" ;;
*) as_fn_error $? "invalid argument: \`$ac_config_target'" "$LINENO" 5;;

View File

@ -146,6 +146,9 @@ case $WITH_TARGET in
if test "X$SBINDIR" = X; then
SBINDIR="$prefix/sbin"
fi
if test "X$ETCDIR" = X; then
ETCDIR="$prefix/etc"
fi
if test "X$KMODDIR" = X; then
KMODDIR="$prefix/kmod"
fi
@ -278,6 +281,7 @@ AC_SUBST(KDIR)
AC_SUBST(TARGET)
AC_SUBST(BINDIR)
AC_SUBST(SBINDIR)
AC_SUBST(ETCDIR)
AC_SUBST(KMODDIR)
AC_SUBST(KERNDIR)
AC_SUBST(MANDIR)
@ -298,6 +302,8 @@ AC_CONFIG_FILES([
executer/kernel/mcctrl/Makefile
executer/kernel/mcctrl/arch/x86_64/Makefile
executer/kernel/mcoverlayfs/Makefile
executer/kernel/mcoverlayfs/linux-3.10.0-327.36.1.el7/Makefile
executer/kernel/mcoverlayfs/linux-4.0.9/Makefile
kernel/Makefile
kernel/Makefile.build
arch/x86/tools/mcreboot-attached-mic.sh
@ -307,6 +313,8 @@ AC_CONFIG_FILES([
arch/x86/tools/mcstop+release-smp-x86.sh
arch/x86/tools/mcshutdown-builtin-x86.sh
arch/x86/tools/mcreboot.1:arch/x86/tools/mcreboot.1in
arch/x86/tools/irqbalance_mck.service
arch/x86/tools/irqbalance_mck.in
])
AS_IF([test "x$enable_dcfa" = xyes], [

View File

@ -110,6 +110,13 @@ struct program_load_desc {
};
struct syscall_request {
/* TID of requesting thread */
int rtid;
/*
* TID of target thread. Remote page fault response needs to designate the
* thread that must serve the request, 0 indicates any thread from the pool
*/
int ttid;
unsigned long valid;
unsigned long number;
unsigned long args[6];
@ -128,8 +135,17 @@ struct syscall_load_desc {
unsigned long size;
};
#define IHK_SCD_REQ_THREAD_SPINNING 0
#define IHK_SCD_REQ_THREAD_TO_BE_WOKEN 1
#define IHK_SCD_REQ_THREAD_DESCHEDULED 2
struct syscall_response {
/* TID of the thread that requested the service */
int ttid;
/* TID of the mcexec thread that is serving or has served the request */
int stid;
unsigned long status;
unsigned long req_thread_status;
long ret;
unsigned long fault_address;
unsigned long fault_reason;

View File

@ -1,5 +1,5 @@
#include <linux/version.h>
#include "../../../../config.h"
#include "../../config.h"
#include "../../mcctrl.h"
#ifdef MCCTRL_KSYM_vdso_image_64
@ -100,8 +100,6 @@ void get_vdso_info(ihk_os_t os, long vdso_rpa)
vdso_pa = ihk_device_map_memory(dev, vdso_rpa, sizeof(*vdso));
vdso = ihk_device_map_virtual(dev, vdso_pa, sizeof(*vdso), NULL, 0);
memset(vdso, 0, sizeof(*vdso));
/* VDSO pages */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,16,0)
size = vdso_image->size;

View File

@ -255,7 +255,7 @@ void __init binfmt_mcexec_init(void)
insert_binfmt(&mcexec_format);
}
void __exit binfmt_mcexec_exit(void)
void binfmt_mcexec_exit(void)
{
unregister_binfmt(&mcexec_format);
}

View File

@ -32,6 +32,8 @@
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/version.h>
#include <linux/semaphore.h>
#include <linux/interrupt.h>
#include <asm/uaccess.h>
#include <asm/delay.h>
#include <asm/io.h>
@ -80,7 +82,6 @@ static long mcexec_prepare_image(ihk_os_t os,
void *args, *envs;
long ret = 0;
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
unsigned long flags;
struct mcctrl_per_proc_data *ppd = NULL;
if (copy_from_user(&desc, udesc,
@ -123,52 +124,48 @@ static long mcexec_prepare_image(ihk_os_t os,
}
pdesc->args = (void*)virt_to_phys(args);
printk("args: 0x%lX\n", (unsigned long)pdesc->args);
printk("argc: %ld\n", *(long *)args);
dprintk("args: 0x%lX\n", (unsigned long)pdesc->args);
dprintk("argc: %ld\n", *(long *)args);
pdesc->envs = (void*)virt_to_phys(envs);
printk("envs: 0x%lX\n", (unsigned long)pdesc->envs);
printk("envc: %ld\n", *(long *)envs);
dprintk("envs: 0x%lX\n", (unsigned long)pdesc->envs);
dprintk("envc: %ld\n", *(long *)envs);
isp.msg = SCD_MSG_PREPARE_PROCESS;
isp.ref = pdesc->cpu;
isp.arg = virt_to_phys(pdesc);
printk("# of sections: %d\n", pdesc->num_sections);
printk("%p (%lx)\n", pdesc, isp.arg);
dprintk("# of sections: %d\n", pdesc->num_sections);
dprintk("%p (%lx)\n", pdesc, isp.arg);
pdesc->status = 0;
mcctrl_ikc_send(os, pdesc->cpu, &isp);
wait_event_interruptible(usrdata->wq_prepare, pdesc->status);
while (wait_event_interruptible(usrdata->wq_prepare, pdesc->status) != 0);
if(pdesc->err < 0){
ret = pdesc->err;
goto free_out;
}
ppd = kmalloc(sizeof(*ppd), GFP_ATOMIC);
ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(current));
if (!ppd) {
printk("ERROR: allocating per process data\n");
ret = -ENOMEM;
printk("ERROR: no per process data for PID %d\n", task_tgid_vnr(current));
ret = -EINVAL;
goto free_out;
}
ppd->pid = pdesc->pid;
/* Update rpgtable */
ppd->rpgtable = pdesc->rpgtable;
flags = ihk_ikc_spinlock_lock(&usrdata->per_proc_list_lock);
list_add_tail(&ppd->list, &usrdata->per_proc_list);
ihk_ikc_spinlock_unlock(&usrdata->per_proc_list_lock, flags);
dprintk("pid %d, rpgtable: 0x%lx added\n",
ppd->pid, ppd->rpgtable);
if (copy_to_user(udesc, pdesc, sizeof(struct program_load_desc) +
sizeof(struct program_image_section) * desc.num_sections)) {
ret = -EFAULT;
goto free_out;
}
dprintk("%s: pid %d, rpgtable: 0x%lx added\n",
__FUNCTION__, ppd->pid, ppd->rpgtable);
ret = 0;
free_out:
@ -416,19 +413,200 @@ static long mcexec_get_cpu(ihk_os_t os)
return info->n_cpus;
}
int mcexec_syscall(struct mcctrl_channel *c, int pid, unsigned long arg)
int mcctrl_add_per_proc_data(struct mcctrl_usrdata *ud, int pid,
struct mcctrl_per_proc_data *ppd)
{
struct mcctrl_per_proc_data *ppd_iter;
int hash = (pid & MCCTRL_PER_PROC_DATA_HASH_MASK);
int ret = 0;
unsigned long flags;
/* Check if data for this thread exists and add if not */
write_lock_irqsave(&ud->per_proc_data_hash_lock[hash], flags);
list_for_each_entry(ppd_iter, &ud->per_proc_data_hash[hash], hash) {
if (ppd_iter->pid == pid) {
ret = -EBUSY;
goto out;
}
}
list_add_tail(&ppd->hash, &ud->per_proc_data_hash[hash]);
out:
write_unlock_irqrestore(&ud->per_proc_data_hash_lock[hash], flags);
return ret;
}
int mcctrl_delete_per_proc_data(struct mcctrl_usrdata *ud, int pid)
{
struct mcctrl_per_proc_data *ppd_iter, *ppd = NULL;
int hash = (pid & MCCTRL_PER_PROC_DATA_HASH_MASK);
int ret = 0;
unsigned long flags;
write_lock_irqsave(&ud->per_proc_data_hash_lock[hash], flags);
list_for_each_entry(ppd_iter, &ud->per_proc_data_hash[hash], hash) {
if (ppd_iter->pid == pid) {
ppd = ppd_iter;
break;
}
}
if (!ppd) {
ret = -EINVAL;
goto out;
}
list_del(&ppd->hash);
out:
write_unlock_irqrestore(&ud->per_proc_data_hash_lock[hash], flags);
return ret;
}
inline struct mcctrl_per_proc_data *mcctrl_get_per_proc_data(
struct mcctrl_usrdata *ud, int pid)
{
struct mcctrl_per_proc_data *ppd_iter, *ppd = NULL;
int hash = (pid & MCCTRL_PER_PROC_DATA_HASH_MASK);
unsigned long flags;
/* Check if data for this process exists and return it */
read_lock_irqsave(&ud->per_proc_data_hash_lock[hash], flags);
list_for_each_entry(ppd_iter, &ud->per_proc_data_hash[hash], hash) {
if (ppd_iter->pid == pid) {
ppd = ppd_iter;
break;
}
}
read_unlock_irqrestore(&ud->per_proc_data_hash_lock[hash], flags);
return ppd;
}
/*
* Called indirectly from the IKC message handler.
*/
int mcexec_syscall(struct mcctrl_usrdata *ud, struct ikc_scd_packet *packet)
{
struct wait_queue_head_list_node *wqhln = NULL;
struct wait_queue_head_list_node *wqhln_iter;
struct wait_queue_head_list_node *wqhln_alloc = NULL;
int pid = packet->pid;
unsigned long flags;
struct mcctrl_per_proc_data *ppd;
/* Look up per-process structure */
ppd = mcctrl_get_per_proc_data(ud, pid);
if (unlikely(!ppd)) {
kprintf("%s: ERROR: no per-process structure for PID %d??\n",
__FUNCTION__, task_tgid_vnr(current));
return 0;
}
dprintk("%s: (packet_handler) rtid: %d, ttid: %d, sys nr: %d\n",
__FUNCTION__,
packet->req.rtid,
packet->req.ttid,
packet->req.number);
/*
* Three scenarios are possible:
* - Find the designated thread if req->ttid is specified.
* - Find any available thread if req->ttid is zero.
* - Add a request element if no threads are available.
*/
flags = ihk_ikc_spinlock_lock(&ppd->wq_list_lock);
/* Is this a request for a specific thread? See if it's waiting */
if (unlikely(packet->req.ttid)) {
list_for_each_entry(wqhln_iter, &ppd->wq_list_exact, list) {
if (packet->req.ttid != task_pid_vnr(wqhln_iter->task))
continue;
/* Look up per-process wait queue head with pid */
flags = ihk_ikc_spinlock_lock(&c->wq_list_lock);
list_for_each_entry(wqhln_iter, &c->wq_list, list) {
if (wqhln_iter->pid == pid) {
wqhln = wqhln_iter;
break;
}
if (!wqhln) {
printk("%s: WARNING: no target thread found for exact request??\n",
__FUNCTION__);
}
}
/* Is there any thread available? */
else {
list_for_each_entry(wqhln_iter, &ppd->wq_list, list) {
if (wqhln_iter->task && !wqhln_iter->req) {
wqhln = wqhln_iter;
break;
}
}
}
/* If no match found, add request to pending request list */
if (unlikely(!wqhln)) {
retry_alloc:
wqhln_alloc = kmalloc(sizeof(*wqhln), GFP_ATOMIC);
if (!wqhln_alloc) {
printk("WARNING: coudln't alloc wait queue head, retrying..\n");
goto retry_alloc;
}
wqhln = wqhln_alloc;
wqhln->req = 0;
wqhln->task = NULL;
init_waitqueue_head(&wqhln->wq_syscall);
list_add_tail(&wqhln->list, &ppd->wq_req_list);
}
wqhln->packet = packet;
wqhln->req = 1;
wake_up(&wqhln->wq_syscall);
ihk_ikc_spinlock_unlock(&ppd->wq_list_lock, flags);
return 0;
}
/*
* Called from an mcexec thread via ioctl().
*/
int mcexec_wait_syscall(ihk_os_t os, struct syscall_wait_desc *__user req)
{
struct ikc_scd_packet *packet;
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
struct wait_queue_head_list_node *wqhln = NULL;
struct wait_queue_head_list_node *wqhln_iter;
int ret = 0;
unsigned long irqflags;
struct mcctrl_per_proc_data *ppd;
/* Look up per-process structure */
ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(current));
if (unlikely(!ppd)) {
kprintf("%s: ERROR: no per-process structure for PID %d??\n",
__FUNCTION__, task_tgid_vnr(current));
return -EINVAL;
}
packet = (struct ikc_scd_packet *)mcctrl_get_per_thread_data(ppd, current);
if (packet) {
printk("%s: ERROR: packet %p is already registered for thread %d\n",
__FUNCTION__, packet, task_pid_vnr(current));
return -EBUSY;
}
retry:
/* Prepare per-thread wait queue head or find a valid request */
irqflags = ihk_ikc_spinlock_lock(&ppd->wq_list_lock);
/* First see if there is a valid request already that is not yet taken */
list_for_each_entry(wqhln_iter, &ppd->wq_req_list, list) {
if (wqhln_iter->task == NULL && wqhln_iter->req) {
wqhln = wqhln_iter;
wqhln->task = current;
list_del(&wqhln->list);
break;
}
}
if (!wqhln) {
@ -439,180 +617,86 @@ retry_alloc:
goto retry_alloc;
}
wqhln->pid = pid;
wqhln->task = current;
wqhln->req = 0;
init_waitqueue_head(&wqhln->wq_syscall);
list_add_tail(&wqhln->list, &c->wq_list);
/* Wait for a request.. */
list_add(&wqhln->list, &ppd->wq_list);
ihk_ikc_spinlock_unlock(&ppd->wq_list_lock, irqflags);
ret = wait_event_interruptible(wqhln->wq_syscall, wqhln->req);
/* Remove per-thread wait queue head */
irqflags = ihk_ikc_spinlock_lock(&ppd->wq_list_lock);
list_del(&wqhln->list);
}
ihk_ikc_spinlock_unlock(&ppd->wq_list_lock, irqflags);
wqhln->req = 1;
wake_up(&wqhln->wq_syscall);
ihk_ikc_spinlock_unlock(&c->wq_list_lock, flags);
return 0;
}
#ifndef DO_USER_MODE
// static int remaining_job, base_cpu, job_pos;
#endif
// extern int num_channels;
// extern int mcctrl_dma_abort;
int mcexec_wait_syscall(ihk_os_t os, struct syscall_wait_desc *__user req)
{
struct syscall_wait_desc swd;
struct mcctrl_channel *c;
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
struct wait_queue_head_list_node *wqhln;
struct wait_queue_head_list_node *wqhln_iter;
int ret = 0;
unsigned long irqflags;
#ifndef DO_USER_MODE
unsigned long s, w, d;
#endif
//printk("mcexec_wait_syscall swd=%p req=%p size=%d\n", &swd, req, sizeof(swd.cpu));
if (copy_from_user(&swd, req, sizeof(swd))) {
return -EFAULT;
}
if (swd.cpu >= usrdata->num_channels)
return -EINVAL;
c = get_peer_channel(usrdata, current);
if (c) {
printk("mcexec_wait_syscall:already registered. task %p ch %p\n",
current, c);
return -EBUSY;
}
c = usrdata->channels + swd.cpu;
#ifdef DO_USER_MODE
retry:
/* Prepare per-process wait queue head */
retry_alloc:
wqhln = kmalloc(sizeof(*wqhln), GFP_KERNEL);
if (!wqhln) {
printk("WARNING: coudln't alloc wait queue head, retrying..\n");
goto retry_alloc;
}
wqhln->pid = swd.pid;
wqhln->req = 0;
init_waitqueue_head(&wqhln->wq_syscall);
irqflags = ihk_ikc_spinlock_lock(&c->wq_list_lock);
/* First see if there is one wait queue already */
list_for_each_entry(wqhln_iter, &c->wq_list, list) {
if (wqhln_iter->pid == task_tgid_vnr(current)) {
kfree(wqhln);
wqhln = wqhln_iter;
list_del(&wqhln->list);
break;
}
}
list_add_tail(&wqhln->list, &c->wq_list);
ihk_ikc_spinlock_unlock(&c->wq_list_lock, irqflags);
ret = wait_event_interruptible(wqhln->wq_syscall, wqhln->req);
/* Remove per-process wait queue head */
irqflags = ihk_ikc_spinlock_lock(&c->wq_list_lock);
list_del(&wqhln->list);
ihk_ikc_spinlock_unlock(&c->wq_list_lock, irqflags);
if (ret && !wqhln->req) {
kfree(wqhln);
wqhln = NULL;
return -EINTR;
}
packet = wqhln->packet;
kfree(wqhln);
wqhln = NULL;
if (c->param.request_va->number == 61 &&
c->param.request_va->args[0] == swd.pid) {
dprintk("%s: tid: %d request from CPU %d\n",
__FUNCTION__, task_pid_vnr(current), packet->ref);
dprintk("pid: %d, tid: %d: SC %d, swd.cpu: %d, WARNING: wait4() for self?\n",
task_tgid_vnr(current),
task_pid_vnr(current);
c->param.request_va->number,
swd.cpu);
return -EINTR;
}
#if 1
mb();
if (!c->param.request_va->valid) {
printk("mcexec_wait_syscall:stray wakeup\n");
if (!packet->req.valid) {
printk("%s: ERROR: stray wakeup pid: %d, tid: %d: SC %lu\n",
__FUNCTION__,
task_tgid_vnr(current),
task_pid_vnr(current),
packet->req.number);
ihk_ikc_release_packet((struct ihk_ikc_free_packet *)packet,
(usrdata->channels + packet->ref)->c);
goto retry;
}
#endif
#else
while (1) {
c = usrdata->channels + swd.cpu;
ihk_get_tsc(s);
if (!usrdata->remaining_job) {
while (!(*c->param.doorbell_va)) {
mb();
cpu_relax();
ihk_get_tsc(w);
if (w > s + 1024UL * 1024 * 1024 * 10) {
return -EINTR;
}
}
d = (*c->param.doorbell_va) - 1;
*c->param.doorbell_va = 0;
if (d < 0 || d >= usrdata->num_channels) {
d = 0;
}
usrdata->base_cpu = d;
usrdata->job_pos = 0;
usrdata->remaining_job = 1;
} else {
usrdata->job_pos++;
}
for (; usrdata->job_pos < usrdata->num_channels; usrdata->job_pos++) {
if (base_cpu + job_pos >= num_channels) {
c = usrdata->channels +
(usrdata->base_cpu + usrdata->job_pos - usrdata->num_channels);
} else {
c = usrdata->channels + usrdata->base_cpu + usrdata->job_pos;
}
if (!c) {
continue;
}
if (c->param.request_va &&
c->param.request_va->valid) {
#endif
c->param.request_va->valid = 0; /* ack */
dprintk("SC #%lx, %lx\n",
c->param.request_va->number,
c->param.request_va->args[0]);
register_peer_channel(usrdata, current, c);
if (__do_in_kernel_syscall(os, c, c->param.request_va)) {
if (copy_to_user(&req->sr, c->param.request_va,
sizeof(struct syscall_request))) {
deregister_peer_channel(usrdata, current, c);
return -EFAULT;
}
return 0;
}
deregister_peer_channel(usrdata, current, c);
#ifdef DO_USER_MODE
goto retry;
#endif
#ifndef DO_USER_MODE
if (usrdata->mcctrl_dma_abort) {
return -2;
}
}
}
usrdata->remaining_job = 0;
packet->req.valid = 0; /* ack */
dprintk("%s: system call: %d, args[0]: %lu, args[1]: %lu, args[2]: %lu, "
"args[3]: %lu, args[4]: %lu, args[5]: %lu\n",
__FUNCTION__,
packet->req.number,
packet->req.args[0],
packet->req.args[1],
packet->req.args[2],
packet->req.args[3],
packet->req.args[4],
packet->req.args[5]);
if (mcctrl_add_per_thread_data(ppd, current, packet) < 0) {
kprintf("%s: error adding per-thread data\n", __FUNCTION__);
return -EINVAL;
}
#endif
return 0;
if (__do_in_kernel_syscall(os, packet)) {
if (copy_to_user(&req->sr, &packet->req,
sizeof(struct syscall_request))) {
if (mcctrl_delete_per_thread_data(ppd, current) < 0) {
kprintf("%s: error deleting per-thread data\n", __FUNCTION__);
return -EINVAL;
}
return -EFAULT;
}
return 0;
}
ihk_ikc_release_packet((struct ihk_ikc_free_packet *)packet,
(usrdata->channels + packet->ref)->c);
if (mcctrl_delete_per_thread_data(ppd, current) < 0) {
kprintf("%s: error deleting per-thread data\n", __FUNCTION__);
return -EINVAL;
}
goto retry;
}
long mcexec_pin_region(ihk_os_t os, unsigned long *__user arg)
@ -695,33 +779,6 @@ long mcexec_load_syscall(ihk_os_t os, struct syscall_load_desc *__user arg)
#endif
ihk_device_unmap_memory(ihk_os_to_dev(os), phys, desc.size);
/*
ihk_dma_channel_t channel;
struct ihk_dma_request request;
unsigned long dma_status = 0;
channel = ihk_device_get_dma_channel(ihk_os_to_dev(os), 0);
if (!channel) {
return -EINVAL;
}
memset(&request, 0, sizeof(request));
request.src_os = os;
request.src_phys = desc.src;
request.dest_os = NULL;
request.dest_phys = desc.dest;
request.size = desc.size;
request.notify = (void *)virt_to_phys(&dma_status);
request.priv = (void *)1;
ihk_dma_request(channel, &request);
while (!dma_status) {
mb();
udelay(1);
}
*/
return 0;
}
@ -729,80 +786,66 @@ long mcexec_load_syscall(ihk_os_t os, struct syscall_load_desc *__user arg)
long mcexec_ret_syscall(ihk_os_t os, struct syscall_ret_desc *__user arg)
{
struct syscall_ret_desc ret;
struct mcctrl_channel *mc;
struct ikc_scd_packet *packet;
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
#if 0
ihk_dma_channel_t channel;
struct ihk_dma_request request;
channel = ihk_device_get_dma_channel(ihk_os_to_dev(os), 0);
if (!channel) {
return -EINVAL;
}
#endif
struct mcctrl_per_proc_data *ppd;
if (copy_from_user(&ret, arg, sizeof(struct syscall_ret_desc))) {
return -EFAULT;
}
mc = usrdata->channels + ret.cpu;
if (!mc) {
/* Look up per-process structure */
ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(current));
if (!ppd) {
kprintf("%s: ERROR: no per-process structure for PID %d??\n",
__FUNCTION__, task_tgid_vnr(current));
return -EINVAL;
}
deregister_peer_channel(usrdata, current, mc);
mc->param.response_va->ret = ret.ret;
packet = (struct ikc_scd_packet *)mcctrl_get_per_thread_data(ppd, current);
if (!packet) {
kprintf("%s: ERROR: no packet registered for TID %d\n",
__FUNCTION__, task_pid_vnr(current));
return -EINVAL;
}
mcctrl_delete_per_thread_data(ppd, current);
if (ret.size > 0) {
/* Host => Accel. Write is fast. */
unsigned long phys;
void *rpm;
phys = ihk_device_map_memory(ihk_os_to_dev(os), ret.dest,
ret.size);
phys = ihk_device_map_memory(ihk_os_to_dev(os), ret.dest, ret.size);
#ifdef CONFIG_MIC
rpm = ioremap_wc(phys, ret.size);
#else
rpm = ihk_device_map_virtual(ihk_os_to_dev(os), phys,
ret.size, NULL, 0);
#endif
if (copy_from_user(rpm, (void *__user)ret.src, ret.size)) {
return -EFAULT;
}
mb();
mc->param.response_va->status = 1;
#ifdef CONFIG_MIC
iounmap(rpm);
#else
ihk_device_unmap_virtual(ihk_os_to_dev(os), rpm, ret.size);
#endif
ihk_device_unmap_memory(ihk_os_to_dev(os), phys, ret.size);
}
/*
memset(&request, 0, sizeof(request));
request.src_os = NULL;
request.src_phys = ret.src;
request.dest_os = os;
request.dest_phys = ret.dest;
request.size = ret.size;
request.notify_os = os;
request.notify = (void *)mc->param.response_rpa;
request.priv = (void *)1;
ihk_dma_request(channel, &request);
*/
} else {
mb();
mc->param.response_va->status = 1;
}
__return_syscall(os, packet, ret.ret, task_pid_vnr(current));
/* Free packet */
ihk_ikc_release_packet((struct ihk_ikc_free_packet *)packet,
(usrdata->channels + packet->ref)->c);
return 0;
}
LIST_HEAD(mckernel_exec_files);
DEFINE_SPINLOCK(mckernel_exec_file_lock);
DEFINE_SEMAPHORE(mckernel_exec_file_lock);
struct mckernel_exec_file {
@ -861,14 +904,53 @@ int mcexec_open_exec(ihk_os_t os, char * __user filename)
int retval;
int os_ind = ihk_host_os_get_index(os);
char *pathbuf, *fullpath;
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
struct mcctrl_per_proc_data *ppd = NULL;
int i;
if (os_ind < 0) {
return EINVAL;
}
ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(current));
if (!ppd) {
ppd = kmalloc(sizeof(*ppd), GFP_KERNEL);
if (!ppd) {
printk("ERROR: allocating per process data\n");
return -ENOMEM;
}
ppd->pid = task_tgid_vnr(current);
/*
* XXX: rpgtable will be updated in __do_in_kernel_syscall()
* under case __NR_munmap
*/
INIT_LIST_HEAD(&ppd->wq_list);
INIT_LIST_HEAD(&ppd->wq_req_list);
INIT_LIST_HEAD(&ppd->wq_list_exact);
spin_lock_init(&ppd->wq_list_lock);
for (i = 0; i < MCCTRL_PER_THREAD_DATA_HASH_SIZE; ++i) {
INIT_LIST_HEAD(&ppd->per_thread_data_hash[i]);
rwlock_init(&ppd->per_thread_data_hash_lock[i]);
}
if (mcctrl_add_per_proc_data(usrdata, ppd->pid, ppd) < 0) {
printk("%s: error adding per process data\n", __FUNCTION__);
retval = EINVAL;
goto out_free_ppd;
}
}
else {
/* Only deallocate in case of an error if we added it above */
ppd = NULL;
}
pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY);
if (!pathbuf) {
return ENOMEM;
retval = ENOMEM;
goto out_error_drop_ppd;
}
file = open_exec(filename);
@ -889,7 +971,7 @@ int mcexec_open_exec(ihk_os_t os, char * __user filename)
goto out_put_file;
}
spin_lock_irq(&mckernel_exec_file_lock);
down(&mckernel_exec_file_lock);
/* Find previous file (if exists) and drop it */
list_for_each_entry(mcef_iter, &mckernel_exec_files, list) {
if (mcef_iter->os == os && mcef_iter->pid == task_tgid_vnr(current)) {
@ -900,7 +982,7 @@ int mcexec_open_exec(ihk_os_t os, char * __user filename)
break;
}
}
/* Add new exec file to the list */
mcef->os = os;
mcef->pid = task_tgid_vnr(current);
@ -910,19 +992,22 @@ int mcexec_open_exec(ihk_os_t os, char * __user filename)
/* Create /proc/self/exe entry */
add_pid_entry(os_ind, task_tgid_vnr(current));
proc_exe_link(os_ind, task_tgid_vnr(current), fullpath);
spin_unlock(&mckernel_exec_file_lock);
up(&mckernel_exec_file_lock);
dprintk("%d open_exec and holding file: %s\n", (int)task_tgid_vnr(current), filename);
kfree(pathbuf);
return 0;
out_put_file:
fput(file);
out_error_free:
kfree(pathbuf);
out_error_drop_ppd:
if (ppd) mcctrl_delete_per_proc_data(usrdata, ppd->pid);
out_free_ppd:
if (ppd) kfree(ppd);
return -retval;
}
@ -932,12 +1017,29 @@ int mcexec_close_exec(ihk_os_t os)
struct mckernel_exec_file *mcef = NULL;
int found = 0;
int os_ind = ihk_host_os_get_index(os);
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
struct mcctrl_per_proc_data *ppd = NULL;
ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(current));
if (ppd) {
mcctrl_delete_per_proc_data(usrdata, ppd->pid);
dprintk("pid: %d, tid: %d: rpgtable for %d (0x%lx) removed\n",
task_tgid_vnr(current), current->pid, ppd->pid, ppd->rpgtable);
kfree(ppd);
}
else {
printk("WARNING: no per process data for pid %d ?\n",
task_tgid_vnr(current));
}
if (os_ind < 0) {
return EINVAL;
}
spin_lock_irq(&mckernel_exec_file_lock);
down(&mckernel_exec_file_lock);
list_for_each_entry(mcef, &mckernel_exec_files, list) {
if (mcef->os == os && mcef->pid == task_tgid_vnr(current)) {
allow_write_access(mcef->fp);
@ -950,7 +1052,7 @@ int mcexec_close_exec(ihk_os_t os)
}
}
spin_unlock(&mckernel_exec_file_lock);
up(&mckernel_exec_file_lock);
return (found ? 0 : EINVAL);
}

View File

@ -82,79 +82,109 @@ static struct ihk_os_user_call mcctrl_uc[OS_MAX_MINOR];
static ihk_os_t os[OS_MAX_MINOR];
ihk_os_t
osnum_to_os(int n)
/*
 * Translate an OS index into the OS handle recorded in os[].
 *
 * Returns the handle stored for index n, or NULL when n lies outside the
 * os[] table (an unchecked index would read past the array).  A slot that
 * was never filled, or that the boot notifier reset after a failure, also
 * reads back as NULL.
 */
ihk_os_t osnum_to_os(int n)
{
	if (n < 0 || (size_t)n >= sizeof(os) / sizeof(os[0]))
		return NULL;

	return os[n];
}
static int __init mcctrl_init(void)
/* OS event notifier implementation */
int mcctrl_os_boot_notifier(int os_index)
{
int i;
int rc;
rc = -ENOENT;
for(i = 0; i < OS_MAX_MINOR; i++){
os[i] = ihk_host_find_os(i, NULL);
if (os[i]) {
printk("OS #%d found.\n", i);
rc = 0;
}
}
if(rc){
printk("OS not found.\n");
return rc;
os[os_index] = ihk_host_find_os(os_index, NULL);
if (!os[os_index]) {
printk("mcctrl: error: OS ID %d couldn't be found\n", os_index);
return -EINVAL;
}
for(i = 0; i < OS_MAX_MINOR; i++){
if (os[i]) {
if (prepare_ikc_channels(os[i]) != 0) {
printk("Preparing syscall channels failed.\n");
os[i] = NULL;
}
}
if (prepare_ikc_channels(os[os_index]) != 0) {
printk("mcctrl: error: preparing IKC channels for OS %d\n", os_index);
os[os_index] = NULL;
return -EFAULT;
}
memcpy(mcctrl_uc + os_index, &mcctrl_uc_proto, sizeof mcctrl_uc_proto);
rc = ihk_os_register_user_call_handlers(os[os_index], mcctrl_uc + os_index);
if (rc < 0) {
destroy_ikc_channels(os[os_index]);
printk("mcctrl: error: registering callbacks for OS %d\n", os_index);
goto error_cleanup_channels;
}
procfs_init(os_index);
printk("mcctrl: OS ID %d boot event handled\n", os_index);
return 0;
error_cleanup_channels:
destroy_ikc_channels(os[os_index]);
os[os_index] = NULL;
return rc;
}
/*
 * OS shutdown event handler for the OS registered under os_index.
 *
 * Calls sysfsm_cleanup() and free_topology_info() on the OS handle,
 * unregisters the user call handlers stored in mcctrl_uc[os_index],
 * destroys the IKC channels and finally runs procfs_exit() for that index.
 * Always returns 0.
 */
int mcctrl_os_shutdown_notifier(int os_index)
{
	/* Handler table slot that the boot notifier registered for this OS. */
	struct ihk_os_user_call *handlers = &mcctrl_uc[os_index];

	sysfsm_cleanup(os[os_index]);
	free_topology_info(os[os_index]);
	ihk_os_unregister_user_call_handlers(os[os_index], handlers);
	destroy_ikc_channels(os[os_index]);
	procfs_exit(os_index);

	printk("mcctrl: OS ID %d shutdown event handled\n", os_index);
	return 0;
}
/*
 * Callback table for OS events: .boot and .shutdown point at the two
 * notifier functions defined in this file.
 */
static struct ihk_os_notifier_ops mcctrl_os_notifier_ops = {
.boot = mcctrl_os_boot_notifier,
.shutdown = mcctrl_os_shutdown_notifier,
};
/*
 * Notifier object wrapping the callback table above.  mcctrl_init() passes
 * it to ihk_host_register_os_notifier() and mcctrl_exit() hands it to
 * ihk_host_deregister_os_notifier().
 */
static struct ihk_os_notifier mcctrl_os_notifier = {
.ops = &mcctrl_os_notifier_ops,
};
static int __init mcctrl_init(void)
{
int ret = 0;
#ifndef DO_USER_MODE
mcctrl_syscall_init();
#endif
rus_page_hash_init();
for(i = 0; i < OS_MAX_MINOR; i++){
if (os[i]) {
memcpy(mcctrl_uc + i, &mcctrl_uc_proto, sizeof mcctrl_uc_proto);
rc = ihk_os_register_user_call_handlers(os[i], mcctrl_uc + i);
if(rc < 0){
destroy_ikc_channels(os[i]);
os[i] = NULL;
}
procfs_init(i);
}
}
binfmt_mcexec_init();
return 0;
if ((ret = ihk_host_register_os_notifier(&mcctrl_os_notifier)) != 0) {
printk("mcctrl: error: registering OS notifier\n");
goto error;
}
printk("mcctrl: initialized successfully.\n");
return ret;
error:
binfmt_mcexec_exit();
rus_page_hash_put_pages();
return ret;
}
static void __exit mcctrl_exit(void)
{
int i;
binfmt_mcexec_exit();
printk("mcctrl: unregistered.\n");
for(i = 0; i < OS_MAX_MINOR; i++){
if(os[i]){
sysfsm_cleanup(os[i]);
free_topology_info(os[i]);
ihk_os_unregister_user_call_handlers(os[i], mcctrl_uc + i);
destroy_ikc_channels(os[i]);
procfs_exit(i);
}
if (ihk_host_deregister_os_notifier(&mcctrl_os_notifier) != 0) {
printk("mcctrl: warning: failed to deregister OS notifier??\n");
}
binfmt_mcexec_exit();
rus_page_hash_put_pages();
printk("mcctrl: unregistered.\n");
}
MODULE_LICENSE("GPL v2");

View File

@ -27,6 +27,7 @@
#include <linux/miscdevice.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/interrupt.h>
#include "mcctrl.h"
#ifdef ATTACHED_MIC
#include <sysdeps/mic/mic/micconst.h>
@ -40,16 +41,18 @@
void mcexec_prepare_ack(ihk_os_t os, unsigned long arg, int err);
static void mcctrl_ikc_init(ihk_os_t os, int cpu, unsigned long rphys, struct ihk_ikc_channel_desc *c);
int mcexec_syscall(struct mcctrl_channel *c, int pid, unsigned long arg);
int mcexec_syscall(struct mcctrl_usrdata *ud, struct ikc_scd_packet *packet);
void sig_done(unsigned long arg, int err);
/* XXX: this runs in atomic context! */
static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
void *__packet, void *__os)
{
struct ikc_scd_packet *pisp = __packet;
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(__os);
int msg = pisp->msg;
switch (pisp->msg) {
switch (msg) {
case SCD_MSG_INIT_CHANNEL:
mcctrl_ikc_init(__os, pisp->ref, pisp->arg, c);
break;
@ -63,7 +66,7 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
break;
case SCD_MSG_SYSCALL_ONESIDE:
mcexec_syscall(usrdata->channels + pisp->ref, pisp->pid, pisp->arg);
mcexec_syscall(usrdata, pisp);
break;
case SCD_MSG_PROCFS_ANSWER:
@ -88,11 +91,8 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
break;
case SCD_MSG_PROCFS_TID_CREATE:
add_tid_entry(ihk_host_os_get_index(__os), pisp->pid, pisp->arg);
break;
case SCD_MSG_PROCFS_TID_DELETE:
delete_tid_entry(ihk_host_os_get_index(__os), pisp->pid, pisp->arg);
procfsm_packet_handler(__os, pisp->msg, pisp->pid, pisp->arg);
break;
case SCD_MSG_GET_VDSO_INFO:
@ -110,6 +110,14 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
pisp->err, pisp->arg);
break;
}
/*
* SCD_MSG_SYSCALL_ONESIDE holds the packet and frees it in
* mcexec_ret_syscall(); for the rest, free it here.
*/
if (msg != SCD_MSG_SYSCALL_ONESIDE) {
ihk_ikc_release_packet((struct ihk_ikc_free_packet *)__packet, c);
}
return 0;
}
@ -146,8 +154,6 @@ int mcctrl_ikc_set_recv_cpu(ihk_os_t os, int cpu)
ihk_ikc_channel_set_cpu(usrdata->channels[cpu].c,
ihk_ikc_get_processor_id());
kprintf("Setting the target to %d\n",
ihk_ikc_get_processor_id());
return 0;
}
@ -193,12 +199,13 @@ static void mcctrl_ikc_init(ihk_os_t os, int cpu, unsigned long rphys, struct ih
#endif
pmc->param.request_va =
(void *)__get_free_pages(GFP_KERNEL,
(void *)__get_free_pages(in_interrupt() ? GFP_ATOMIC : GFP_KERNEL,
REQUEST_SHIFT - PAGE_SHIFT);
pmc->param.request_pa = virt_to_phys(pmc->param.request_va);
pmc->param.doorbell_va = usrdata->mcctrl_doorbell_va;
pmc->param.doorbell_pa = usrdata->mcctrl_doorbell_pa;
pmc->param.post_va = (void *)__get_free_page(GFP_KERNEL);
pmc->param.post_va = (void *)__get_free_page(in_interrupt() ?
GFP_ATOMIC : GFP_KERNEL);
pmc->param.post_pa = virt_to_phys(pmc->param.post_va);
memset(pmc->param.doorbell_va, 0, PAGE_SIZE);
memset(pmc->param.request_va, 0, PAGE_SIZE);
@ -218,8 +225,9 @@ static void mcctrl_ikc_init(ihk_os_t os, int cpu, unsigned long rphys, struct ih
PAGE_SIZE, NULL, 0);
#endif
pmc->dma_buf = (void *)__get_free_pages(GFP_KERNEL,
DMA_PIN_SHIFT - PAGE_SHIFT);
pmc->dma_buf = (void *)__get_free_pages(in_interrupt() ?
GFP_ATOMIC : GFP_KERNEL,
DMA_PIN_SHIFT - PAGE_SHIFT);
rpm->request_page = pmc->param.request_pa;
rpm->doorbell_page = pmc->param.doorbell_pa;
@ -265,9 +273,6 @@ static int connect_handler(struct ihk_ikc_channel_info *param)
}
param->packet_handler = syscall_packet_handler;
INIT_LIST_HEAD(&usrdata->channels[cpu].wq_list);
spin_lock_init(&usrdata->channels[cpu].wq_list_lock);
usrdata->channels[cpu].c = c;
kprintf("syscall: MC CPU %d connected. c=%p\n", cpu, c);
@ -286,9 +291,6 @@ static int connect_handler2(struct ihk_ikc_channel_info *param)
param->packet_handler = syscall_packet_handler;
INIT_LIST_HEAD(&usrdata->channels[cpu].wq_list);
spin_lock_init(&usrdata->channels[cpu].wq_list_lock);
usrdata->channels[cpu].c = c;
kprintf("syscall: MC CPU %d connected. c=%p\n", cpu, c);
@ -315,7 +317,7 @@ int prepare_ikc_channels(ihk_os_t os)
{
struct ihk_cpu_info *info;
struct mcctrl_usrdata *usrdata;
int error;
int i;
usrdata = kzalloc(sizeof(struct mcctrl_usrdata), GFP_KERNEL);
usrdata->mcctrl_doorbell_va = (void *)__get_free_page(GFP_KERNEL);
@ -347,17 +349,14 @@ int prepare_ikc_channels(ihk_os_t os)
memcpy(&usrdata->listen_param2, &listen_param2, sizeof listen_param2);
ihk_ikc_listen_port(os, &usrdata->listen_param2);
INIT_LIST_HEAD(&usrdata->per_proc_list);
spin_lock_init(&usrdata->per_proc_list_lock);
for (i = 0; i < MCCTRL_PER_PROC_DATA_HASH_SIZE; ++i) {
INIT_LIST_HEAD(&usrdata->per_proc_data_hash[i]);
rwlock_init(&usrdata->per_proc_data_hash_lock[i]);
}
INIT_LIST_HEAD(&usrdata->cpu_topology_list);
INIT_LIST_HEAD(&usrdata->node_topology_list);
error = init_peer_channel_registry(usrdata);
if (error) {
return error;
}
return 0;
}
@ -396,7 +395,6 @@ void destroy_ikc_channels(ihk_os_t os)
}
free_page((unsigned long)usrdata->mcctrl_doorbell_va);
destroy_peer_channel_registry(usrdata);
kfree(usrdata->channels);
kfree(usrdata);
}

View File

@ -41,6 +41,7 @@
#include <ikc/master.h>
#include <ihk/msr.h>
#include <linux/semaphore.h>
#include <linux/rwlock.h>
#include <linux/threads.h>
#include "sysfs.h"
@ -48,6 +49,7 @@
#define SCD_MSG_PREPARE_PROCESS_ACKED 0x2
#define SCD_MSG_PREPARE_PROCESS_NACKED 0x7
#define SCD_MSG_SCHEDULE_PROCESS 0x3
#define SCD_MSG_WAKE_UP_SYSCALL_THREAD 0x14
#define SCD_MSG_INIT_CHANNEL 0x5
#define SCD_MSG_INIT_CHANNEL_ACKED 0x6
@ -110,8 +112,9 @@ struct ikc_scd_packet {
int ref;
int osnum;
int pid;
int padding;
unsigned long arg;
struct syscall_request req;
unsigned long resp_pa;
};
/* for SCD_MSG_SYSFS_* */
@ -120,7 +123,13 @@ struct ikc_scd_packet {
long sysfs_arg2;
long sysfs_arg3;
};
/* SCD_MSG_SCHEDULE_THREAD */
struct {
int ttid;
};
};
char padding[12];
};
struct mcctrl_priv {
@ -154,8 +163,11 @@ struct syscall_params {
struct wait_queue_head_list_node {
struct list_head list;
wait_queue_head_t wq_syscall;
int pid;
struct task_struct *task;
/* Denotes an exclusive wait for requester TID rtid */
int rtid;
int req;
struct ikc_scd_packet *packet;
};
struct mcctrl_channel {
@ -163,15 +175,30 @@ struct mcctrl_channel {
struct syscall_params param;
struct ikc_scd_init_param init;
void *dma_buf;
struct list_head wq_list;
ihk_spinlock_t wq_list_lock;
};
/*
 * One per-thread record kept inside a struct mcctrl_per_proc_data.
 * mcctrl_add_per_thread_data() takes the owning task and an opaque data
 * pointer; mcexec_wait_syscall() passes the received syscall packet there.
 */
struct mcctrl_per_thread_data {
/* List linkage; presumably chains the record into a per_thread_data_hash[] bucket -- TODO confirm */
struct list_head hash;
/* Task this record belongs to */
struct task_struct *task;
/* Opaque payload supplied by the caller */
void *data;
};
/*
 * Geometry of the per-thread data hash table: 2^SHIFT (256) buckets, with
 * MASK selecting a bucket index in the range [0, SIZE).
 */
#define MCCTRL_PER_THREAD_DATA_HASH_SHIFT 8
#define MCCTRL_PER_THREAD_DATA_HASH_SIZE (1 << MCCTRL_PER_THREAD_DATA_HASH_SHIFT)
#define MCCTRL_PER_THREAD_DATA_HASH_MASK (MCCTRL_PER_THREAD_DATA_HASH_SIZE - 1)
struct mcctrl_per_proc_data {
struct list_head list;
struct list_head hash;
int pid;
unsigned long rpgtable; /* per process, not per OS */
struct list_head wq_list;
struct list_head wq_req_list;
struct list_head wq_list_exact;
ihk_spinlock_t wq_list_lock;
struct list_head per_thread_data_hash[MCCTRL_PER_THREAD_DATA_HASH_SIZE];
rwlock_t per_thread_data_hash_lock[MCCTRL_PER_THREAD_DATA_HASH_SIZE];
};
struct sysfsm_req {
@ -230,6 +257,10 @@ struct node_topology {
#define CPU_LONGS (((NR_CPUS) + (BITS_PER_LONG) - 1) / (BITS_PER_LONG))
/*
 * Geometry of the per-process data hash table: 2^SHIFT (128) buckets, with
 * MASK selecting a bucket index in the range [0, SIZE).
 */
#define MCCTRL_PER_PROC_DATA_HASH_SHIFT 7
#define MCCTRL_PER_PROC_DATA_HASH_SIZE (1 << MCCTRL_PER_PROC_DATA_HASH_SHIFT)
#define MCCTRL_PER_PROC_DATA_HASH_MASK (MCCTRL_PER_PROC_DATA_HASH_SIZE - 1)
struct mcctrl_usrdata {
struct ihk_ikc_listen_param listen_param;
struct ihk_ikc_listen_param listen_param2;
@ -245,8 +276,9 @@ struct mcctrl_usrdata {
unsigned long last_thread_exec;
wait_queue_head_t wq_prepare;
struct list_head per_proc_list;
ihk_spinlock_t per_proc_list_lock;
struct list_head per_proc_data_hash[MCCTRL_PER_PROC_DATA_HASH_SIZE];
rwlock_t per_proc_data_hash_lock[MCCTRL_PER_PROC_DATA_HASH_SIZE];
void **keys;
struct sysfsm_data sysfsm_data;
unsigned long cpu_online[CPU_LONGS];
@ -273,12 +305,22 @@ int mcctrl_ikc_is_valid_thread(ihk_os_t os, int cpu);
ihk_os_t osnum_to_os(int n);
/* syscall.c */
int init_peer_channel_registry(struct mcctrl_usrdata *ud);
void destroy_peer_channel_registry(struct mcctrl_usrdata *ud);
int register_peer_channel(struct mcctrl_usrdata *ud, void *key, struct mcctrl_channel *ch);
int deregister_peer_channel(struct mcctrl_usrdata *ud, void *key, struct mcctrl_channel *ch);
struct mcctrl_channel *get_peer_channel(struct mcctrl_usrdata *ud, void *key);
int __do_in_kernel_syscall(ihk_os_t os, struct mcctrl_channel *c, struct syscall_request *sc);
int __do_in_kernel_syscall(ihk_os_t os, struct ikc_scd_packet *packet);
/*
 * Per-process bookkeeping, keyed by PID within one mcctrl_usrdata.
 * Callers treat a negative return from the add function as failure and a
 * NULL return from the get function as "no entry for this PID".
 */
int mcctrl_add_per_proc_data(struct mcctrl_usrdata *ud, int pid,
struct mcctrl_per_proc_data *ppd);
int mcctrl_delete_per_proc_data(struct mcctrl_usrdata *ud, int pid);
/* NOTE(review): declared inline with no definition in this header -- confirm the definition in the .c file is visible to every caller */
inline struct mcctrl_per_proc_data *mcctrl_get_per_proc_data(
struct mcctrl_usrdata *ud, int pid);
/*
 * Per-thread bookkeeping inside one per-process structure, keyed by task.
 * Callers treat a negative return from add/delete as failure.
 */
int mcctrl_add_per_thread_data(struct mcctrl_per_proc_data* ppd,
struct task_struct *task, void *data);
int mcctrl_delete_per_thread_data(struct mcctrl_per_proc_data* ppd,
struct task_struct *task);
/* NOTE(review): mcexec_ret_syscall() casts this result to struct ikc_scd_packet *, which does not match the declared return type -- confirm what the definition really returns */
inline struct mcctrl_per_thread_data *mcctrl_get_per_thread_data(
struct mcctrl_per_proc_data *ppd, struct task_struct *task);
/* Reports a system call result for the given packet; mcexec_ret_syscall() passes the caller's TID as stid */
void __return_syscall(ihk_os_t os, struct ikc_scd_packet *packet,
long ret, int stid);
#define PROCFS_NAME_MAX 1000
@ -301,6 +343,7 @@ struct procfs_file {
};
void procfs_answer(unsigned int arg, int err);
int procfsm_packet_handler(void *os, int msg, int pid, unsigned long arg);
void add_tid_entry(int osnum, int pid, int tid);
void add_pid_entry(int osnum, int pid);
void delete_tid_entry(int osnum, int pid, int tid);

View File

@ -17,8 +17,10 @@
#include <linux/uaccess.h>
#include <linux/fs.h>
#include <linux/resource.h>
#include <linux/interrupt.h>
#include "mcctrl.h"
#include <linux/version.h>
#include <linux/semaphore.h>
//#define PROCFS_DEBUG
@ -81,7 +83,7 @@ struct procfs_list_entry {
* file.
*/
LIST_HEAD(procfs_file_list);
static ihk_spinlock_t procfs_file_list_lock;
DEFINE_SEMAPHORE(procfs_file_list_lock);
static char *
getpath(struct procfs_list_entry *e, char *buf, int bufsize)
@ -375,67 +377,62 @@ _add_tid_entry(int osnum, int pid, int tid, const struct cred *cred)
void
add_tid_entry(int osnum, int pid, int tid)
{
unsigned long irqflag;
const struct cred *cred = get_pid_cred(pid);
if(!cred)
return;
irqflag = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
down(&procfs_file_list_lock);
_add_tid_entry(osnum, pid, tid, cred);
ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflag);
up(&procfs_file_list_lock);
}
void
add_pid_entry(int osnum, int pid)
{
struct procfs_list_entry *parent;
unsigned long irqflag;
const struct cred *cred = get_pid_cred(pid);
if(!cred)
return;
irqflag = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
down(&procfs_file_list_lock);
parent = get_pid_entry(osnum, pid);
add_procfs_entries(parent, pid_entry_stuff, cred->uid, cred->gid);
_add_tid_entry(osnum, pid, pid, cred);
ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflag);
up(&procfs_file_list_lock);
}
void
delete_tid_entry(int osnum, int pid, int tid)
{
unsigned long irqflag;
struct procfs_list_entry *e;
irqflag = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
down(&procfs_file_list_lock);
e = find_tid_entry(osnum, pid, tid);
if(e)
delete_procfs_entries(e);
ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflag);
up(&procfs_file_list_lock);
}
void
delete_pid_entry(int osnum, int pid)
{
unsigned long irqflag;
struct procfs_list_entry *e;
irqflag = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
down(&procfs_file_list_lock);
e = find_pid_entry(osnum, pid);
if(e)
delete_procfs_entries(e);
ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflag);
up(&procfs_file_list_lock);
}
void
proc_exe_link(int osnum, int pid, const char *path)
{
struct procfs_list_entry *parent;
unsigned long irqflag;
kuid_t uid = KUIDT_INIT(0);
kgid_t gid = KGIDT_INIT(0);
irqflag = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
down(&procfs_file_list_lock);
parent = find_pid_entry(osnum, pid);
if(parent){
struct procfs_list_entry *task;
@ -451,7 +448,7 @@ proc_exe_link(int osnum, int pid, const char *path)
uid, gid, path);
}
}
ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflag);
up(&procfs_file_list_lock);
}
/**
@ -463,14 +460,13 @@ void
procfs_init(int osnum)
{
struct procfs_list_entry *parent;
unsigned long irqflag;
kuid_t uid = KUIDT_INIT(0);
kgid_t gid = KGIDT_INIT(0);
irqflag = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
down(&procfs_file_list_lock);
parent = get_base_entry(osnum);
add_procfs_entries(parent, base_entry_stuff, uid, gid);
ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflag);
up(&procfs_file_list_lock);
}
/**
@ -481,14 +477,13 @@ procfs_init(int osnum)
void
procfs_exit(int osnum)
{
unsigned long irqflag;
struct procfs_list_entry *e;
irqflag = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
down(&procfs_file_list_lock);
e = find_base_entry(osnum);
if(e)
delete_procfs_entries(e);
ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflag);
up(&procfs_file_list_lock);
}
/**
@ -719,6 +714,57 @@ mckernel_procfs_lseek(struct file *file, loff_t offset, int orig)
return file->f_pos;
}
/*
 * Deferred procfs request: a copy of the arguments given to
 * procfsm_packet_handler() plus the work item that carries them to
 * procfsm_work_main().
 */
struct procfs_work {
/* OS handle; converted to an index with ihk_host_os_get_index() */
void *os;
/* SCD_MSG_PROCFS_TID_CREATE or SCD_MSG_PROCFS_TID_DELETE */
int msg;
/* PID argument forwarded to add_tid_entry()/delete_tid_entry() */
int pid;
/* Forwarded as the tid argument of add_tid_entry()/delete_tid_entry() */
unsigned long arg;
/* Embedded work item; container_of() recovers this struct from it */
struct work_struct work;
};
/*
 * Work function for requests queued by procfsm_packet_handler().
 *
 * Recovers the enclosing struct procfs_work from the work item, creates or
 * deletes the TID entry according to the recorded message type (logging any
 * other type), and frees the request before returning.
 */
static void procfsm_work_main(struct work_struct *work0)
{
	struct procfs_work *job = container_of(work0, struct procfs_work, work);

	if (job->msg == SCD_MSG_PROCFS_TID_CREATE) {
		add_tid_entry(ihk_host_os_get_index(job->os), job->pid, job->arg);
	} else if (job->msg == SCD_MSG_PROCFS_TID_DELETE) {
		delete_tid_entry(ihk_host_os_get_index(job->os), job->pid, job->arg);
	} else {
		printk("%s: unknown work: msg: %d, pid: %d, arg: %lu)\n",
		       __FUNCTION__, job->msg, job->pid, job->arg);
	}

	/* The request was allocated by the submitter; this is its only owner now. */
	kfree(job);
}
/*
 * Queue a procfs TID create/delete request for later processing.
 *
 * The arguments are copied into a freshly allocated struct procfs_work and
 * handed to schedule_work(); procfsm_work_main() performs the request and
 * frees the allocation.  GFP_ATOMIC is used for the allocation, as the
 * packet handler that calls this is commented as running in atomic context.
 *
 * Returns 0 once the work is queued, -1 if the allocation fails.
 */
int procfsm_packet_handler(void *os, int msg, int pid, unsigned long arg)
{
	struct procfs_work *job;

	job = kzalloc(sizeof(*job), GFP_ATOMIC);
	if (job == NULL) {
		printk("%s: kzalloc failed\n", __FUNCTION__);
		return -1;
	}

	job->os = os;
	job->msg = msg;
	job->pid = pid;
	job->arg = arg;

	INIT_WORK(&job->work, &procfsm_work_main);
	schedule_work(&job->work);

	return 0;
}
static const struct file_operations mckernel_forward_ro = {
.llseek = mckernel_procfs_lseek,
.read = mckernel_procfs_read,

File diff suppressed because it is too large Load Diff

View File

@ -14,6 +14,7 @@
#include <linux/slab.h>
#include <linux/device.h>
#include <linux/version.h>
#include <linux/interrupt.h>
#include "mcctrl.h"
#include "sysfs_msg.h"

View File

@ -1,7 +1,3 @@
KDIR ?= @KDIR@
ARCH ?= @ARCH@
KMODDIR=@KMODDIR@
src = @abs_srcdir@
ENABLE_MCOVERLAYFS=@ENABLE_MCOVERLAYFS@
RELEASE=$(shell uname -r)
@ -9,31 +5,36 @@ MAJOR=$(shell echo ${RELEASE} | sed -e 's/^\([0-9]*\).*/\1/')
MINOR=$(shell echo ${RELEASE} | sed -e 's/^[0-9]*.\([0-9]*\).*/\1/')
PATCH=$(shell echo ${RELEASE} | sed -e 's/^[0-9]*.[0-9]*.\([0-9]*\).*/\1/')
LINUX_VERSION_CODE=$(shell expr \( ${MAJOR} \* 65536 \) + \( ${MINOR} \* 256 \) + ${PATCH})
RHEL_RELEASE=$(shell echo ${RELEASE} | sed -e 's/^[0-9]*.[0-9]*.[0-9]*-\([0-9]*\).*/\1/')
RHEL_RELEASE=$(shell if [ "${RELEASE}" == "${RHEL_RELEASE}" ]; then echo ""; else echo ${RHEL_RELEASE}; fi)
RHEL_RELEASE_TMP=$(shell echo ${RELEASE} | sed -e 's/^[0-9]*.[0-9]*.[0-9]*-\([0-9]*\).*/\1/')
RHEL_RELEASE=$(shell if [ "${RELEASE}" == "${RHEL_RELEASE_TMP}" ]; then echo ""; else echo ${RHEL_RELEASE_TMP}; fi)
BUILD_MODULE_TMP=$(shell if [ "${RHEL_RELEASE}" == "" ]; then echo "org"; else echo "rhel"; fi)
BUILD_MODULE=none
ifeq ($(ENABLE_MCOVERLAYFS),yes)
ENABLE_BUILD=$(shell if ( [ ${LINUX_VERSION_CODE} -ge 262144 ] && [ ${LINUX_VERSION_CODE} -lt 262400 ] ); then echo "yes"; else echo "no"; fi)
else
ENABLE_BUILD=no
ifeq ($(BUILD_MODULE_TMP),org)
ifeq ($(BUILD_MODULE),none)
BUILD_MODULE=$(shell if [ ${LINUX_VERSION_CODE} -ge 262144 -a ${LINUX_VERSION_CODE} -lt 262400 ]; then echo "linux-4.0.9"; else echo "none"; fi)
endif
endif
ifeq ($(BUILD_MODULE_TMP),rhel)
ifeq ($(BUILD_MODULE),none)
BUILD_MODULE=$(shell if [ ${LINUX_VERSION_CODE} -eq 199168 -a ${RHEL_RELEASE} -eq 327 ]; then echo "linux-3.10.0-327.36.1.el7"; else echo "none"; fi)
endif
endif
endif
obj-m += mcoverlay.o
mcoverlay-y := copy_up.o dir.o inode.o readdir.o super.o
.PHONY: clean install modules
modules:
ifeq ($(ENABLE_BUILD),yes)
$(MAKE) -C $(KDIR) M=$(PWD) SUBDIRS=$(PWD) ARCH=$(ARCH) modules
ifneq ($(BUILD_MODULE),none)
@(cd $(BUILD_MODULE); make modules)
endif
clean:
$(RM) .*.cmd *.mod.c *.o *.ko* Module.symvers modules.order -r .tmp*
@(cd linux-3.10.0-327.36.1.el7; make clean)
@(cd linux-4.0.9; make clean)
install:
ifeq ($(ENABLE_BUILD),yes)
mkdir -p -m 755 $(KMODDIR)
install -m 644 mcoverlay.ko $(KMODDIR)
ifneq ($(BUILD_MODULE),none)
@(cd $(BUILD_MODULE); make install)
endif

View File

@ -0,0 +1,21 @@
# Kernel build tree, architecture, module install directory and source
# directory.  The @...@ values look like configure-time substitutions.
KDIR ?= @KDIR@
ARCH ?= @ARCH@
KMODDIR = @KMODDIR@
src = @abs_srcdir@
# Build mcoverlay.ko from the listed objects.
obj-m += mcoverlay.o
mcoverlay-y := copy_up.o dir.o inode.o readdir.o super.o
.PHONY: clean install modules
# Build the module by invoking make in $(KDIR) with M= set to this directory.
modules:
$(MAKE) -C $(KDIR) M=$(PWD) SUBDIRS=$(PWD) ARCH=$(ARCH) modules
# Remove build products.
clean:
$(RM) .*.cmd *.mod.c *.o *.ko* Module.symvers modules.order -r .tmp*
# Copy the built module into $(KMODDIR), creating the directory if needed.
install:
mkdir -p -m 755 $(KMODDIR)
install -m 644 mcoverlay.ko $(KMODDIR)

View File

@ -0,0 +1,461 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/splice.h>
#include <linux/xattr.h>
#include <linux/security.h>
#include <linux/uaccess.h>
#include <linux/sched.h>
#include <linux/namei.h>
#include <linux/fdtable.h>
#include <linux/ratelimit.h>
#include "overlayfs.h"
/* 1 MiB; presumably the chunk size used when copying file data up -- TODO confirm */
#define OVL_COPY_UP_CHUNK_SIZE (1 << 20)

/*
 * Non-zero enables the read-only-fd check in ovl_do_check_copy_up().
 * Exposed as module parameter "check_copy_up".  The description must be
 * attached to that exported name, not to the backing variable's name,
 * otherwise it is filed under a parameter that does not exist.
 */
static unsigned ovl_check_copy_up = 1;
module_param_named(check_copy_up, ovl_check_copy_up, uint,
		   S_IWUSR | S_IRUGO);
MODULE_PARM_DESC(check_copy_up,
		 "Warn on copy-up when causing process also has a R/O fd open");
/*
 * iterate_fd() callback: warn when open file @f refers to the dentry passed
 * in @data.  Always returns 0.
 */
static int ovl_check_fd(const void *data, struct file *f, unsigned fd)
{
	const struct dentry *target = data;

	if (f->f_path.dentry != target)
		return 0;

	pr_warn_ratelimited("overlayfs: Warning: Copying up %pD, but open R/O on fd %u which will cease to be coherent [pid=%d %s]\n",
			    f, fd, current->pid, current->comm);
	return 0;
}
/*
 * Walk the current process's open fds and warn about any that refer to
 * @dentry, i.e. when a sequence such as the following is about to lose
 * coherency:
 *
 *	fd1 = open("foo", O_RDONLY);
 *	fd2 = open("foo", O_RDWR);
 *
 * Does nothing when the check_copy_up switch is off.
 */
static void ovl_do_check_copy_up(struct dentry *dentry)
{
	if (!ovl_check_copy_up)
		return;

	iterate_fd(current->files, 0, ovl_check_fd, dentry);
}
/*
 * Copy every extended attribute listed on @old onto @new.
 *
 * Returns 0 when either inode has no getxattr operation, when the listing is
 * empty or reports -EOPNOTSUPP, or when every attribute was copied.
 * Otherwise returns the negative error from listing, reading or setting an
 * attribute, or -ENOMEM when a buffer cannot be allocated.
 */
int ovl_copy_xattr(struct dentry *old, struct dentry *new)
{
ssize_t list_size, size, value_size = 0;
char *buf, *name, *value = NULL;
int uninitialized_var(error);
if (!old->d_inode->i_op->getxattr ||
!new->d_inode->i_op->getxattr)
return 0;
/* First call only asks how large the name list is. */
list_size = vfs_listxattr(old, NULL, 0);
if (list_size <= 0) {
if (list_size == -EOPNOTSUPP)
return 0;
return list_size;
}
buf = kzalloc(list_size, GFP_KERNEL);
if (!buf)
return -ENOMEM;
list_size = vfs_listxattr(old, buf, list_size);
if (list_size <= 0) {
error = list_size;
goto out;
}
/* buf holds consecutive NUL-terminated names; step over one per pass. */
for (name = buf; name < (buf + list_size); name += strlen(name) + 1) {
retry:
size = vfs_getxattr(old, name, value, value_size);
/* Buffer too small: ask for the required size instead. */
if (size == -ERANGE)
size = vfs_getxattr(old, name, NULL, 0);
if (size < 0) {
error = size;
break;
}
if (size > value_size) {
/* This local shadows the @new parameter inside this block only. */
void *new;
new = krealloc(value, size, GFP_KERNEL);
if (!new) {
error = -ENOMEM;
break;
}
/* The value buffer is reused (and only ever grown) across names. */
value = new;
value_size = size;
goto retry;
}
error = vfs_setxattr(new, name, value, size, 0);
if (error)
break;
}
kfree(value);
out:
kfree(buf);
return error;
}
/*
 * Copy @len bytes of file content from the file at @old to the file at @new,
 * at most OVL_COPY_UP_CHUNK_SIZE bytes per splice call.
 *
 * Returns 0 when @len is 0 or the loop finished without error, -EINTR when a
 * fatal signal is pending, or the error from opening either file or from the
 * splice itself.
 */
static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
{
	struct file *src;
	struct file *dst;
	loff_t src_pos = 0;
	loff_t dst_pos = 0;
	int error = 0;

	if (len == 0)
		return 0;

	src = ovl_path_open(old, O_RDONLY);
	if (IS_ERR(src))
		return PTR_ERR(src);

	dst = ovl_path_open(new, O_WRONLY);
	if (IS_ERR(dst)) {
		error = PTR_ERR(dst);
		goto out_put_src;
	}

	/* FIXME: copy up sparse files efficiently */
	while (len) {
		size_t chunk = OVL_COPY_UP_CHUNK_SIZE;
		long copied;

		if (len < chunk)
			chunk = len;

		/* Allow a killed task to abandon a long copy. */
		if (signal_pending_state(TASK_KILLABLE, current)) {
			error = -EINTR;
			break;
		}

		copied = do_splice_direct(src, &src_pos,
					  dst, &dst_pos,
					  chunk, SPLICE_F_MOVE);
		if (copied <= 0) {
			error = copied;
			break;
		}
		WARN_ON(src_pos != dst_pos);

		len -= copied;
	}

	fput(dst);
out_put_src:
	fput(src);
	return error;
}
/*
 * Read the target of the symlink @realdentry into a freshly allocated page.
 *
 * Returns the NUL-terminated target (release it with free_page()), or
 * ERR_PTR(-EINVAL) when the inode has no readlink operation,
 * ERR_PTR(-ENOMEM) when no page is available, or the readlink error.
 */
static char *ovl_read_symlink(struct dentry *realdentry)
{
	struct inode *inode = realdentry->d_inode;
	mm_segment_t saved_fs;
	char *page;
	int len;

	if (!inode->i_op->readlink)
		return ERR_PTR(-EINVAL);

	page = (char *) __get_free_page(GFP_KERNEL);
	if (!page)
		return ERR_PTR(-ENOMEM);

	saved_fs = get_fs();
	set_fs(get_ds());
	/* The cast to a user pointer is valid due to the set_fs() */
	len = inode->i_op->readlink(realdentry,
				    (char __user *)page, PAGE_SIZE - 1);
	set_fs(saved_fs);

	if (len < 0) {
		free_page((unsigned long) page);
		return ERR_PTR(len);
	}

	page[len] = '\0';
	return page;
}
static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
{
struct iattr attr = {
.ia_valid =
ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET,
.ia_atime = stat->atime,
.ia_mtime = stat->mtime,
};
return notify_change(upperdentry, &attr, NULL);
}
int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
{
int err = 0;
if (!S_ISLNK(stat->mode)) {
struct iattr attr = {
.ia_valid = ATTR_MODE,
.ia_mode = stat->mode,
};
err = notify_change(upperdentry, &attr, NULL);
}
if (!err) {
struct iattr attr = {
.ia_valid = ATTR_UID | ATTR_GID,
.ia_uid = stat->uid,
.ia_gid = stat->gid,
};
err = notify_change(upperdentry, &attr, NULL);
}
if (!err)
ovl_set_timestamps(upperdentry, stat);
return err;
}
/*
 * Do the actual copy-up of @dentry.
 *
 * A new object is created under a temporary name in @workdir, filled with
 * the lower file's data (regular files only), extended attributes and
 * attributes, then renamed into @upperdir under @dentry's name.
 *
 * @lowerpath: path of the lower object being copied
 * @stat:      attributes of the lower object; stat->mode is masked
 *             temporarily during creation and restored afterwards
 * @attr:      optional extra attribute change applied to the new object
 * @link:      symlink target, passed on to ovl_create_real()
 *
 * The caller in this file holds lock_rename(workdir, upperdir).
 * Returns 0 on success or a negative errno.
 */
static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
			      struct dentry *dentry, struct path *lowerpath,
			      struct kstat *stat, struct iattr *attr,
			      const char *link)
{
	struct inode *wdir = workdir->d_inode;
	struct inode *udir = upperdir->d_inode;
	struct dentry *newdentry = NULL;
	struct dentry *upper = NULL;
	umode_t mode = stat->mode;
	int err;

	newdentry = ovl_lookup_temp(workdir, dentry);
	err = PTR_ERR(newdentry);
	if (IS_ERR(newdentry))
		goto out;

	upper = lookup_one_len(dentry->d_name.name, upperdir,
			       dentry->d_name.len);
	err = PTR_ERR(upper);
	if (IS_ERR(upper))
		goto out1;

	/* Can't properly set mode on creation because of the umask */
	stat->mode &= S_IFMT;
	err = ovl_create_real(wdir, newdentry, stat, link, NULL, true);
	stat->mode = mode;
	if (err)
		goto out2;

	if (S_ISREG(stat->mode)) {
		struct path upperpath;

		ovl_path_upper(dentry, &upperpath);
		BUG_ON(upperpath.dentry != NULL);
		upperpath.dentry = newdentry;

		err = ovl_copy_up_data(lowerpath, &upperpath, stat->size);
		if (err)
			goto out_cleanup;
	}

	err = ovl_copy_xattr(lowerpath->dentry, newdentry);
	if (err)
		goto out_cleanup;

	mutex_lock(&newdentry->d_inode->i_mutex);
	err = ovl_set_attr(newdentry, stat);
	if (!err && attr)
		err = notify_change(newdentry, attr, NULL);
	mutex_unlock(&newdentry->d_inode->i_mutex);
	if (err)
		goto out_cleanup;

	err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
	if (err)
		goto out_cleanup;

	/* The overlay dentry takes over the reference to newdentry. */
	ovl_dentry_update(dentry, newdentry);
	newdentry = NULL;

	/*
	 * Non-directories become opaque when copied up.
	 */
	if (!S_ISDIR(stat->mode))
		ovl_dentry_set_opaque(dentry, true);
out2:
	dput(upper);
out1:
	dput(newdentry);
out:
	return err;

out_cleanup:
	ovl_cleanup(wdir, newdentry);
	/*
	 * ovl_cleanup() balances only its own dget(); the references taken
	 * above on newdentry and upper still have to be dropped.
	 */
	goto out2;
}
/*
 * Copy up a single dentry
 *
 * Directory renames only allowed on "pure upper" (already created on
 * upper filesystem, never copied up). Directories which are on lower or
 * are merged may not be renamed. For these -EXDEV is returned and
 * userspace has to deal with it. This means, when copying up a
 * directory we can rely on it and ancestors being stable.
 *
 * Non-directory renames start with copy up of source if necessary. The
 * actual rename will only proceed once the copy up was successful. Copy
 * up uses upper parent i_mutex for exclusion. Since rename can change
 * d_parent it is possible that the copy up will lock the old parent. At
 * that point the file will have already been copied up anyway.
 *
 * @parent:    overlay dentry whose upper dentry is the destination directory
 * @dentry:    overlay dentry to copy up
 * @lowerpath: lower path of @dentry
 * @stat:      attributes of the lower object
 * @attr:      optional attribute change to apply to the upper object
 *
 * Returns 0 on success, -EROFS when there is no workdir, -ENOMEM when
 * credentials cannot be prepared, -EIO when workdir and upperdir cannot be
 * locked together, or the error of a failing step.
 */
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
struct path *lowerpath, struct kstat *stat,
struct iattr *attr)
{
struct dentry *workdir = ovl_workdir(dentry);
int err;
struct kstat pstat;
struct path parentpath;
struct dentry *upperdir;
struct dentry *upperdentry;
const struct cred *old_cred;
struct cred *override_cred;
char *link = NULL;
if (WARN_ON(!workdir))
return -EROFS;
ovl_do_check_copy_up(lowerpath->dentry);
ovl_path_upper(parent, &parentpath);
upperdir = parentpath.dentry;
/* Remember the parent's attributes so its timestamps can be restored. */
err = vfs_getattr(&parentpath, &pstat);
if (err)
return err;
if (S_ISLNK(stat->mode)) {
link = ovl_read_symlink(lowerpath->dentry);
if (IS_ERR(link))
return PTR_ERR(link);
}
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_free_link;
/* Create the upper object as the owner of the lower one. */
override_cred->fsuid = stat->uid;
override_cred->fsgid = stat->gid;
/*
 * CAP_SYS_ADMIN for copying up extended attributes
 * CAP_DAC_OVERRIDE for create
 * CAP_FOWNER for chmod, timestamp update
 * CAP_FSETID for chmod
 * CAP_CHOWN for chown
 * CAP_MKNOD for mknod
 */
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
cap_raise(override_cred->cap_effective, CAP_FSETID);
cap_raise(override_cred->cap_effective, CAP_CHOWN);
cap_raise(override_cred->cap_effective, CAP_MKNOD);
old_cred = override_creds(override_cred);
err = -EIO;
/* A non-NULL result is treated as failure; the unlock still runs. */
if (lock_rename(workdir, upperdir) != NULL) {
pr_err("overlayfs: failed to lock workdir+upperdir\n");
goto out_unlock;
}
upperdentry = ovl_dentry_upper(dentry);
if (upperdentry) {
unlock_rename(workdir, upperdir);
err = 0;
/* Raced with another copy-up? Do the setattr here */
if (attr) {
mutex_lock(&upperdentry->d_inode->i_mutex);
err = notify_change(upperdentry, attr, NULL);
mutex_unlock(&upperdentry->d_inode->i_mutex);
}
goto out_put_cred;
}
err = ovl_copy_up_locked(workdir, upperdir, dentry, lowerpath,
stat, attr, link);
if (!err) {
/* Restore timestamps on parent (best effort) */
ovl_set_timestamps(upperdir, &pstat);
}
out_unlock:
unlock_rename(workdir, upperdir);
out_put_cred:
revert_creds(old_cred);
put_cred(override_cred);
out_free_link:
/* link is a page obtained from ovl_read_symlink(). */
if (link)
free_page((unsigned long) link);
return err;
}
/*
 * Copy up @dentry together with every ancestor that is not on the upper
 * layer yet.  Each pass copies up the topmost such ancestor, so parents are
 * always handled before their children.
 *
 * Returns 0 once @dentry has an upper dentry, or the first error.
 */
int ovl_copy_up(struct dentry *dentry)
{
	for (;;) {
		struct dentry *cur;
		struct dentry *above;
		struct path lowerpath;
		struct kstat stat;
		enum ovl_path_type type = ovl_path_type(dentry);
		int err;

		if (OVL_TYPE_UPPER(type))
			return 0;

		cur = dget(dentry);
		/* find the topmost dentry not yet copied up */
		for (;;) {
			above = dget_parent(cur);
			type = ovl_path_type(above);
			if (OVL_TYPE_UPPER(type))
				break;

			dput(cur);
			cur = above;
		}

		ovl_path_lower(cur, &lowerpath);
		err = vfs_getattr(&lowerpath, &stat);
		if (!err)
			err = ovl_copy_up_one(above, cur, &lowerpath, &stat, NULL);

		dput(above);
		dput(cur);

		if (err)
			return err;
	}
}

View File

@ -0,0 +1,972 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/xattr.h>
#include <linux/security.h>
#include <linux/cred.h>
#include "overlayfs.h"
/*
 * Remove @wdentry from directory @wdir, using rmdir for directories and
 * unlink for everything else.  Failure is only logged.
 */
void ovl_cleanup(struct inode *wdir, struct dentry *wdentry)
{
	int err;

	/* Hold our own reference for the duration of the removal. */
	dget(wdentry);
	err = S_ISDIR(wdentry->d_inode->i_mode) ?
		ovl_do_rmdir(wdir, wdentry) :
		ovl_do_unlink(wdir, wdentry);
	dput(wdentry);

	if (!err)
		return;

	pr_err("overlayfs: cleanup of '%pd2' failed (%i)\n",
	       wdentry, err);
}
/*
 * Look up a temporary name in @workdir derived from the address of @dentry
 * ("#<hex>").
 *
 * Returns the looked-up dentry, the lookup error, or ERR_PTR(-EIO) when an
 * object with that name already exists.
 */
struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry)
{
	char name[20];
	struct dentry *found;

	snprintf(name, sizeof(name), "#%lx", (unsigned long) dentry);

	found = lookup_one_len(name, workdir, strlen(name));
	if (IS_ERR(found) || !found->d_inode)
		return found;

	pr_err("overlayfs: workdir/%s already exists\n", name);
	dput(found);
	return ERR_PTR(-EIO);
}
/*
 * Create a whiteout under a temporary name in @workdir.
 * caller holds i_mutex on workdir
 *
 * Returns the whiteout dentry or an ERR_PTR.
 */
static struct dentry *ovl_whiteout(struct dentry *workdir,
				   struct dentry *dentry)
{
	struct inode *wdir = workdir->d_inode;
	struct dentry *tmp;
	int err;

	tmp = ovl_lookup_temp(workdir, dentry);
	if (IS_ERR(tmp))
		return tmp;

	err = ovl_do_whiteout(wdir, tmp);
	if (!err)
		return tmp;

	dput(tmp);
	return ERR_PTR(err);
}
/*
 * Create the object described by @stat (or a hard link to @hardlink when it
 * is non-NULL) at the negative dentry @newdentry in directory @dir.
 *
 * Returns 0 on success, -ESTALE when @newdentry is already positive, -EPERM
 * for an unknown file type, -ENOENT when creation reported success but left
 * the dentry negative, or the error of the underlying operation.
 */
int ovl_create_real(struct inode *dir, struct dentry *newdentry,
		    struct kstat *stat, const char *link,
		    struct dentry *hardlink, bool debug)
{
	int err;

	if (newdentry->d_inode)
		return -ESTALE;

	if (hardlink)
		err = ovl_do_link(hardlink, dir, newdentry, debug);
	else if (S_ISREG(stat->mode))
		err = ovl_do_create(dir, newdentry, stat->mode, debug);
	else if (S_ISDIR(stat->mode))
		err = ovl_do_mkdir(dir, newdentry, stat->mode, debug);
	else if (S_ISCHR(stat->mode) || S_ISBLK(stat->mode) ||
		 S_ISFIFO(stat->mode) || S_ISSOCK(stat->mode))
		err = ovl_do_mknod(dir, newdentry,
				   stat->mode, stat->rdev, debug);
	else if (S_ISLNK(stat->mode))
		err = ovl_do_symlink(dir, newdentry, link, debug);
	else
		err = -EPERM;

	if (err)
		return err;

	/*
	 * Not quite sure if non-instantiated dentry is legal or not.
	 * VFS doesn't seem to care so check and warn here.
	 */
	if (WARN_ON(!newdentry->d_inode))
		return -ENOENT;

	return 0;
}
/* Set the opaque xattr on @upperdentry to "y"; returns the setxattr result. */
static int ovl_set_opaque(struct dentry *upperdentry)
{
	int err;

	err = ovl_do_setxattr(upperdentry, OVL_XATTR_OPAQUE, "y", 1, 0);
	return err;
}
/* Remove the opaque xattr from @upperdentry; failure is only logged. */
static void ovl_remove_opaque(struct dentry *upperdentry)
{
	int err = ovl_do_removexattr(upperdentry, OVL_XATTR_OPAQUE);

	if (!err)
		return;

	pr_warn("overlayfs: failed to remove opaque from '%s' (%i)\n",
		upperdentry->d_name.name, err);
}
static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
int err;
enum ovl_path_type type;
struct path realpath;
type = ovl_path_real(dentry, &realpath);
err = vfs_getattr(&realpath, stat);
if (err)
return err;
stat->dev = dentry->d_sb->s_dev;
stat->ino = dentry->d_inode->i_ino;
/*
* It's probably not worth it to count subdirs to get the
* correct link count. nlink=1 seems to pacify 'find' and
* other utilities.
*/
if (OVL_TYPE_MERGE(type))
stat->nlink = 1;
return 0;
}
/*
 * Create the new object directly in the parent's upper directory and attach
 * @inode to the overlay @dentry.  Returns 0 or a negative errno.
 */
static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
			    struct kstat *stat, const char *link,
			    struct dentry *hardlink)
{
	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
	struct inode *udir = upperdir->d_inode;
	struct dentry *created;
	int err;

	mutex_lock_nested(&udir->i_mutex, I_MUTEX_PARENT);

	created = lookup_one_len(dentry->d_name.name, upperdir,
				 dentry->d_name.len);
	if (IS_ERR(created)) {
		err = PTR_ERR(created);
		goto out_unlock;
	}

	err = ovl_create_real(udir, created, stat, link, hardlink, false);
	if (err) {
		dput(created);
		goto out_unlock;
	}

	/* On success the reference to the new dentry is kept by the overlay. */
	ovl_dentry_version_inc(dentry->d_parent);
	ovl_dentry_update(dentry, created);
	ovl_copyattr(created->d_inode, inode);
	d_instantiate(dentry, inode);

out_unlock:
	mutex_unlock(&udir->i_mutex);
	return err;
}
/*
 * Take the rename locks on @workdir and @upperdir.
 *
 * Returns 0 with both locked, or -EIO (nothing locked) when the two are the
 * same directory or one is an ancestor of the other.
 */
static int ovl_lock_rename_workdir(struct dentry *workdir,
				   struct dentry *upperdir)
{
	/* Workdir should not be the same as upperdir */
	if (workdir != upperdir) {
		/* Workdir should not be subdir of upperdir and vice versa */
		if (lock_rename(workdir, upperdir) == NULL)
			return 0;

		unlock_rename(workdir, upperdir);
	}

	pr_err("overlayfs: failed to lock workdir+upperdir\n");
	return -EIO;
}
/*
 * Swap the upper directory behind @dentry for a new, empty, opaque directory
 * carrying the same xattrs and attributes, then remove the old directory
 * together with the whiteouts recorded in @list.
 *
 * Returns the new opaque directory (the caller drops the reference),
 * ERR_PTR(-EROFS) when there is no workdir, ERR_PTR(-ESTALE) when the upper
 * object is not a directory or has moved out of the expected parent, or the
 * error of a failing step.
 */
static struct dentry *ovl_clear_empty(struct dentry *dentry,
				      struct list_head *list)
{
	struct dentry *workdir = ovl_workdir(dentry);
	struct inode *wdir;
	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
	struct inode *udir = upperdir->d_inode;
	struct path upperpath;
	struct dentry *upper;
	struct dentry *opaquedir;
	struct kstat stat;
	int err;

	if (WARN_ON(!workdir))
		return ERR_PTR(-EROFS);

	/* Only dereference workdir once it is known to be non-NULL. */
	wdir = workdir->d_inode;

	err = ovl_lock_rename_workdir(workdir, upperdir);
	if (err)
		goto out;

	ovl_path_upper(dentry, &upperpath);
	err = vfs_getattr(&upperpath, &stat);
	if (err)
		goto out_unlock;

	err = -ESTALE;
	if (!S_ISDIR(stat.mode))
		goto out_unlock;
	upper = upperpath.dentry;
	if (upper->d_parent->d_inode != udir)
		goto out_unlock;

	opaquedir = ovl_lookup_temp(workdir, dentry);
	err = PTR_ERR(opaquedir);
	if (IS_ERR(opaquedir))
		goto out_unlock;

	err = ovl_create_real(wdir, opaquedir, &stat, NULL, NULL, true);
	if (err)
		goto out_dput;

	err = ovl_copy_xattr(upper, opaquedir);
	if (err)
		goto out_cleanup;

	err = ovl_set_opaque(opaquedir);
	if (err)
		goto out_cleanup;

	mutex_lock(&opaquedir->d_inode->i_mutex);
	err = ovl_set_attr(opaquedir, &stat);
	mutex_unlock(&opaquedir->d_inode->i_mutex);
	if (err)
		goto out_cleanup;

	/* Exchange: the old directory ends up in workdir under the temp name. */
	err = ovl_do_rename(wdir, opaquedir, udir, upper, RENAME_EXCHANGE);
	if (err)
		goto out_cleanup;

	ovl_cleanup_whiteouts(upper, list);
	ovl_cleanup(wdir, upper);
	unlock_rename(workdir, upperdir);

	/* dentry's upper doesn't match now, get rid of it */
	d_drop(dentry);

	return opaquedir;

out_cleanup:
	ovl_cleanup(wdir, opaquedir);
out_dput:
	dput(opaquedir);
out_unlock:
	unlock_rename(workdir, upperdir);
out:
	return ERR_PTR(err);
}
/*
 * Check that the directory @dentry is empty and, when it has an upper
 * dentry, replace that with an empty opaque directory.
 *
 * Returns an ERR_PTR when the emptiness check or the clearing fails, NULL
 * when there is no upper dentry, otherwise the new opaque directory.
 */
static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry)
{
	LIST_HEAD(list);
	struct dentry *ret = NULL;
	int err = ovl_check_empty_dir(dentry, &list);

	/*
	 * If no upperdentry then skip clearing whiteouts.
	 *
	 * Can race with copy-up, since we don't hold the upperdir
	 * mutex.  Doesn't matter, since copy-up can't create a
	 * non-empty directory from an empty one.
	 */
	if (err)
		ret = ERR_PTR(err);
	else if (ovl_dentry_upper(dentry))
		ret = ovl_clear_empty(dentry, &list);

	ovl_cache_free(&list);
	return ret;
}
/*
 * Create a new object where the parent's upper directory currently holds a
 * whiteout: build it under a temporary name in workdir, then rename it over
 * the whiteout.  New directories are marked opaque and exchanged with the
 * whiteout, which is removed afterwards.
 *
 * Returns 0 on success, -EROFS when there is no workdir, or the error of a
 * failing step.
 */
static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
				    struct kstat *stat, const char *link,
				    struct dentry *hardlink)
{
	struct dentry *workdir = ovl_workdir(dentry);
	struct inode *wdir;
	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
	struct inode *udir = upperdir->d_inode;
	struct dentry *upper;
	struct dentry *newdentry;
	int err;

	if (WARN_ON(!workdir))
		return -EROFS;

	/* Only dereference workdir once it is known to be non-NULL. */
	wdir = workdir->d_inode;

	err = ovl_lock_rename_workdir(workdir, upperdir);
	if (err)
		goto out;

	newdentry = ovl_lookup_temp(workdir, dentry);
	err = PTR_ERR(newdentry);
	if (IS_ERR(newdentry))
		goto out_unlock;

	upper = lookup_one_len(dentry->d_name.name, upperdir,
			       dentry->d_name.len);
	err = PTR_ERR(upper);
	if (IS_ERR(upper))
		goto out_dput;

	err = ovl_create_real(wdir, newdentry, stat, link, hardlink, true);
	if (err)
		goto out_dput2;

	if (S_ISDIR(stat->mode)) {
		err = ovl_set_opaque(newdentry);
		if (err)
			goto out_cleanup;

		err = ovl_do_rename(wdir, newdentry, udir, upper,
				    RENAME_EXCHANGE);
		if (err)
			goto out_cleanup;

		/* After the exchange the old whiteout sits in workdir. */
		ovl_cleanup(wdir, upper);
	} else {
		err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
		if (err)
			goto out_cleanup;
	}
	ovl_dentry_version_inc(dentry->d_parent);
	/* The overlay dentry takes over the reference to newdentry. */
	ovl_dentry_update(dentry, newdentry);
	ovl_copyattr(newdentry->d_inode, inode);
	d_instantiate(dentry, inode);
	newdentry = NULL;
out_dput2:
	dput(upper);
out_dput:
	dput(newdentry);
out_unlock:
	unlock_rename(workdir, upperdir);
out:
	return err;

out_cleanup:
	ovl_cleanup(wdir, newdentry);
	goto out_dput2;
}
/*
 * Common backend for create/mkdir/mknod/symlink/link: allocate the overlay
 * inode, copy up the parent, then create the object either directly in the
 * upper directory or, for an opaque dentry, over the existing whiteout with
 * raised capabilities.
 *
 * Returns 0 on success, -ENOMEM when the inode or the credentials cannot be
 * allocated, or the error of a failing step.
 */
static int ovl_create_or_link(struct dentry *dentry, int mode, dev_t rdev,
			      const char *link, struct dentry *hardlink)
{
	const struct cred *saved_cred;
	struct cred *raised_cred;
	struct inode *inode;
	struct kstat stat = {
		.mode = mode,
		.rdev = rdev,
	};
	int err;

	inode = ovl_new_inode(dentry->d_sb, mode, dentry->d_fsdata);
	if (!inode)
		return -ENOMEM;

	err = ovl_copy_up(dentry->d_parent);
	if (err)
		goto out_iput;

	if (ovl_dentry_is_opaque(dentry)) {
		raised_cred = prepare_creds();
		if (!raised_cred) {
			err = -ENOMEM;
			goto out_iput;
		}

		/*
		 * CAP_SYS_ADMIN for setting opaque xattr
		 * CAP_DAC_OVERRIDE for create in workdir, rename
		 * CAP_FOWNER for removing whiteout from sticky dir
		 */
		cap_raise(raised_cred->cap_effective, CAP_SYS_ADMIN);
		cap_raise(raised_cred->cap_effective, CAP_DAC_OVERRIDE);
		cap_raise(raised_cred->cap_effective, CAP_FOWNER);
		saved_cred = override_creds(raised_cred);

		err = ovl_create_over_whiteout(dentry, inode, &stat, link,
					       hardlink);

		revert_creds(saved_cred);
		put_cred(raised_cred);
	} else {
		err = ovl_create_upper(dentry, inode, &stat, link, hardlink);
	}

	/* On success the inode now belongs to the dentry; keep it. */
	if (!err)
		return 0;

out_iput:
	iput(inode);
	return err;
}
/*
 * Create a new object at @dentry with write access to the overlay held for
 * the duration.  Returns 0 or a negative errno.
 */
static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
			     const char *link)
{
	int err = ovl_want_write(dentry);

	if (err)
		return err;

	err = ovl_create_or_link(dentry, mode, rdev, link, NULL);
	ovl_drop_write(dentry);
	return err;
}
/* ->create: make a regular file, keeping only the permission bits of @mode. */
static int ovl_create(struct inode *dir, struct dentry *dentry, umode_t mode,
		      bool excl)
{
	int file_mode = (mode & 07777) | S_IFREG;

	return ovl_create_object(dentry, file_mode, 0, NULL);
}
/* ->mkdir: make a directory, keeping only the permission bits of @mode. */
static int ovl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
	int dir_mode = (mode & 07777) | S_IFDIR;

	return ovl_create_object(dentry, dir_mode, 0, NULL);
}
/*
 * ->mknod: create a special file.  A character device with the whiteout
 * device number is refused with -EPERM.
 */
static int ovl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
		     dev_t rdev)
{
	/* Don't allow creation of "whiteout" on overlay */
	bool looks_like_whiteout = S_ISCHR(mode) && rdev == WHITEOUT_DEV;

	return looks_like_whiteout ?
		-EPERM : ovl_create_object(dentry, mode, rdev, NULL);
}
/* ->symlink: create a symbolic link at @dentry pointing to @link. */
static int ovl_symlink(struct inode *dir, struct dentry *dentry,
		       const char *link)
{
	int err;

	err = ovl_create_object(dentry, S_IFLNK, 0, link);
	return err;
}
/*
 * ->link: copy up @old if necessary, then create @new as a hard link to the
 * upper dentry of @old.  Returns 0 or a negative errno.
 */
static int ovl_link(struct dentry *old, struct inode *newdir,
		    struct dentry *new)
{
	struct dentry *upper;
	int err;

	err = ovl_want_write(old);
	if (err)
		return err;

	err = ovl_copy_up(old);
	if (!err) {
		upper = ovl_dentry_upper(old);
		err = ovl_create_or_link(new, upper->d_inode->i_mode, 0, NULL,
					 upper);
	}

	ovl_drop_write(old);
	return err;
}
/*
 * Remove @dentry by putting a whiteout in its place in the parent's upper
 * directory.  For directories, emptiness is checked first; a merged or lower
 * directory is additionally replaced by an empty opaque one.
 *
 * Returns 0 on success, -EROFS when there is no workdir, -ESTALE when the
 * upper directory entry is not the one expected, or the error of a failing
 * step.
 */
static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir)
{
	struct dentry *workdir = ovl_workdir(dentry);
	struct inode *wdir;
	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
	struct inode *udir = upperdir->d_inode;
	struct dentry *whiteout;
	struct dentry *upper;
	struct dentry *opaquedir = NULL;
	int err;
	int flags = 0;

	if (WARN_ON(!workdir))
		return -EROFS;

	/* Only dereference workdir once it is known to be non-NULL. */
	wdir = workdir->d_inode;

	if (is_dir) {
		if (OVL_TYPE_MERGE_OR_LOWER(ovl_path_type(dentry))) {
			opaquedir = ovl_check_empty_and_clear(dentry);
			err = PTR_ERR(opaquedir);
			if (IS_ERR(opaquedir))
				goto out;
		} else {
			LIST_HEAD(list);

			/*
			 * When removing an empty opaque directory, then it
			 * makes no sense to replace it with an exact replica of
			 * itself.  But emptiness still needs to be checked.
			 */
			err = ovl_check_empty_dir(dentry, &list);
			ovl_cache_free(&list);
			if (err)
				goto out;
		}
	}

	err = ovl_lock_rename_workdir(workdir, upperdir);
	if (err)
		goto out_dput;

	upper = lookup_one_len(dentry->d_name.name, upperdir,
			       dentry->d_name.len);
	err = PTR_ERR(upper);
	if (IS_ERR(upper))
		goto out_unlock;

	/* The entry found by name must be the one we expect to replace. */
	err = -ESTALE;
	if ((opaquedir && upper != opaquedir) ||
	    (!opaquedir && ovl_dentry_upper(dentry) &&
	     upper != ovl_dentry_upper(dentry))) {
		goto out_dput_upper;
	}

	whiteout = ovl_whiteout(workdir, dentry);
	err = PTR_ERR(whiteout);
	if (IS_ERR(whiteout))
		goto out_dput_upper;

	/* A directory is exchanged with the whiteout and removed afterwards. */
	if (d_is_dir(upper))
		flags = RENAME_EXCHANGE;

	err = ovl_do_rename(wdir, whiteout, udir, upper, flags);
	if (err)
		goto kill_whiteout;

	if (flags)
		ovl_cleanup(wdir, upper);

	ovl_dentry_version_inc(dentry->d_parent);
out_d_drop:
	d_drop(dentry);
	dput(whiteout);
out_dput_upper:
	dput(upper);
out_unlock:
	unlock_rename(workdir, upperdir);
out_dput:
	dput(opaquedir);
out:
	return err;

kill_whiteout:
	ovl_cleanup(wdir, whiteout);
	goto out_d_drop;
}
/*
 * Remove an object that exists only on the upper layer, directly from the
 * parent's upper directory.
 *
 * Returns 0 on success, -ESTALE when the entry found by name is not the
 * dentry's upper dentry, or the lookup/rmdir/unlink error.
 */
static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
{
	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
	struct inode *dir = upperdir->d_inode;
	struct dentry *found;
	int err;

	mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);

	found = lookup_one_len(dentry->d_name.name, upperdir,
			       dentry->d_name.len);
	if (IS_ERR(found)) {
		err = PTR_ERR(found);
		goto out_unlock;
	}

	if (found != ovl_dentry_upper(dentry)) {
		err = -ESTALE;
	} else {
		err = is_dir ? vfs_rmdir(dir, found) :
			       vfs_unlink(dir, found, NULL);
		ovl_dentry_version_inc(dentry->d_parent);
	}
	dput(found);

	/*
	 * Keeping this dentry hashed would mean having to release
	 * upperpath/lowerpath, which could only be done if we are the
	 * sole user of this dentry.  Too tricky...  Just unhash for
	 * now.
	 */
	if (!err)
		d_drop(dentry);

out_unlock:
	mutex_unlock(&dir->i_mutex);
	return err;
}
/*
 * Apply the sticky-directory check using the real inodes behind @dentry and
 * its parent.  Returns -EPERM when the check refuses, 0 otherwise.
 */
static inline int ovl_check_sticky(struct dentry *dentry)
{
	struct inode *real_dir = ovl_dentry_real(dentry->d_parent)->d_inode;
	struct inode *real_inode = ovl_dentry_real(dentry)->d_inode;

	return check_sticky(real_dir, real_inode) ? -EPERM : 0;
}
/*
 * Common backend for unlink and rmdir.  Pure-upper objects are removed
 * directly; everything else is replaced by a whiteout under raised
 * capabilities.  Returns 0 or a negative errno.
 */
static int ovl_do_remove(struct dentry *dentry, bool is_dir)
{
	const struct cred *saved_cred;
	struct cred *raised_cred;
	enum ovl_path_type type;
	int err;

	err = ovl_check_sticky(dentry);
	if (err)
		return err;

	err = ovl_want_write(dentry);
	if (err)
		return err;

	err = ovl_copy_up(dentry->d_parent);
	if (err)
		goto out_drop_write;

	type = ovl_path_type(dentry);
	if (OVL_TYPE_PURE_UPPER(type)) {
		err = ovl_remove_upper(dentry, is_dir);
		goto out_drop_write;
	}

	raised_cred = prepare_creds();
	if (!raised_cred) {
		err = -ENOMEM;
		goto out_drop_write;
	}

	/*
	 * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
	 * CAP_DAC_OVERRIDE for create in workdir, rename
	 * CAP_FOWNER for removing whiteout from sticky dir
	 * CAP_FSETID for chmod of opaque dir
	 * CAP_CHOWN for chown of opaque dir
	 */
	cap_raise(raised_cred->cap_effective, CAP_SYS_ADMIN);
	cap_raise(raised_cred->cap_effective, CAP_DAC_OVERRIDE);
	cap_raise(raised_cred->cap_effective, CAP_FOWNER);
	cap_raise(raised_cred->cap_effective, CAP_FSETID);
	cap_raise(raised_cred->cap_effective, CAP_CHOWN);
	saved_cred = override_creds(raised_cred);

	err = ovl_remove_and_whiteout(dentry, is_dir);

	revert_creds(saved_cred);
	put_cred(raised_cred);

out_drop_write:
	ovl_drop_write(dentry);
	return err;
}
static int ovl_unlink(struct inode *dir, struct dentry *dentry)
{
return ovl_do_remove(dentry, false);
}
static int ovl_rmdir(struct inode *dir, struct dentry *dentry)
{
return ovl_do_remove(dentry, true);
}
/*
 * Rename @old to @new, or exchange the two when RENAME_EXCHANGE is set.
 *
 * Only RENAME_EXCHANGE and RENAME_NOREPLACE are accepted in @flags; anything
 * else yields -EINVAL.  Directories that are merged or lower-only are not
 * renamed (-EXDEV).  The source, the target's parent and (for an exchange)
 * the target are copied up first, then the rename is carried out on the
 * upper layer, with whiteouts and opaque markers adjusted as needed.
 *
 * Returns 0 on success or a negative errno.
 */
static int ovl_rename2(struct inode *olddir, struct dentry *old,
struct inode *newdir, struct dentry *new,
unsigned int flags)
{
int err;
enum ovl_path_type old_type;
enum ovl_path_type new_type;
struct dentry *old_upperdir;
struct dentry *new_upperdir;
struct dentry *olddentry;
struct dentry *newdentry;
struct dentry *trap;
bool old_opaque;
bool new_opaque;
bool new_create = false;
bool cleanup_whiteout = false;
bool overwrite = !(flags & RENAME_EXCHANGE);
bool is_dir = S_ISDIR(old->d_inode->i_mode);
bool new_is_dir = false;
struct dentry *opaquedir = NULL;
const struct cred *old_cred = NULL;
struct cred *override_cred = NULL;
err = -EINVAL;
if (flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE))
goto out;
/* RENAME_NOREPLACE is not passed on to the upper filesystem. */
flags &= ~RENAME_NOREPLACE;
err = ovl_check_sticky(old);
if (err)
goto out;
/* Don't copy up directory trees */
old_type = ovl_path_type(old);
err = -EXDEV;
if (OVL_TYPE_MERGE_OR_LOWER(old_type) && is_dir)
goto out;
if (new->d_inode) {
err = ovl_check_sticky(new);
if (err)
goto out;
if (S_ISDIR(new->d_inode->i_mode))
new_is_dir = true;
new_type = ovl_path_type(new);
err = -EXDEV;
if (!overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir)
goto out;
/* Source and target sharing one real inode: succeed without doing anything. */
err = 0;
if (!OVL_TYPE_UPPER(new_type) && !OVL_TYPE_UPPER(old_type)) {
if (ovl_dentry_lower(old)->d_inode ==
ovl_dentry_lower(new)->d_inode)
goto out;
}
if (OVL_TYPE_UPPER(new_type) && OVL_TYPE_UPPER(old_type)) {
if (ovl_dentry_upper(old)->d_inode ==
ovl_dentry_upper(new)->d_inode)
goto out;
}
} else {
/* Negative target: classify it by whether the dentry is opaque. */
if (ovl_dentry_is_opaque(new))
new_type = __OVL_PATH_UPPER;
else
new_type = __OVL_PATH_UPPER | __OVL_PATH_PURE;
}
err = ovl_want_write(old);
if (err)
goto out;
err = ovl_copy_up(old);
if (err)
goto out_drop_write;
err = ovl_copy_up(new->d_parent);
if (err)
goto out_drop_write;
if (!overwrite) {
err = ovl_copy_up(new);
if (err)
goto out_drop_write;
}
/* Anything that is not pure upper is treated as opaque from here on. */
old_opaque = !OVL_TYPE_PURE_UPPER(old_type);
new_opaque = !OVL_TYPE_PURE_UPPER(new_type);
if (old_opaque || new_opaque) {
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_drop_write;
/*
 * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
 * CAP_DAC_OVERRIDE for create in workdir
 * CAP_FOWNER for removing whiteout from sticky dir
 * CAP_FSETID for chmod of opaque dir
 * CAP_CHOWN for chown of opaque dir
 */
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
cap_raise(override_cred->cap_effective, CAP_FSETID);
cap_raise(override_cred->cap_effective, CAP_CHOWN);
old_cred = override_creds(override_cred);
}
/* Overwriting a merged/lower directory: it must be empty; clear it first. */
if (overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir) {
opaquedir = ovl_check_empty_and_clear(new);
err = PTR_ERR(opaquedir);
if (IS_ERR(opaquedir)) {
opaquedir = NULL;
goto out_revert_creds;
}
}
if (overwrite) {
if (old_opaque) {
if (new->d_inode || !new_opaque) {
/* Whiteout source */
flags |= RENAME_WHITEOUT;
} else {
/* Switch whiteouts */
flags |= RENAME_EXCHANGE;
}
} else if (is_dir && !new->d_inode && new_opaque) {
flags |= RENAME_EXCHANGE;
cleanup_whiteout = true;
}
}
old_upperdir = ovl_dentry_upper(old->d_parent);
new_upperdir = ovl_dentry_upper(new->d_parent);
trap = lock_rename(new_upperdir, old_upperdir);
/* Re-lookup both names under the lock and verify they are what we expect. */
olddentry = lookup_one_len(old->d_name.name, old_upperdir,
old->d_name.len);
err = PTR_ERR(olddentry);
if (IS_ERR(olddentry))
goto out_unlock;
err = -ESTALE;
if (olddentry != ovl_dentry_upper(old))
goto out_dput_old;
newdentry = lookup_one_len(new->d_name.name, new_upperdir,
new->d_name.len);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out_dput_old;
err = -ESTALE;
if (ovl_dentry_upper(new)) {
if (opaquedir) {
if (newdentry != opaquedir)
goto out_dput;
} else {
if (newdentry != ovl_dentry_upper(new))
goto out_dput;
}
} else {
new_create = true;
if (!d_is_negative(newdentry) &&
(!new_opaque || !ovl_is_whiteout(newdentry)))
goto out_dput;
}
/* Neither dentry may be the one lock_rename() returned as the trap. */
if (olddentry == trap)
goto out_dput;
if (newdentry == trap)
goto out_dput;
/* Set opaque markers before the rename; they are undone if it fails. */
if (is_dir && !old_opaque && new_opaque) {
err = ovl_set_opaque(olddentry);
if (err)
goto out_dput;
}
if (!overwrite && new_is_dir && old_opaque && !new_opaque) {
err = ovl_set_opaque(newdentry);
if (err)
goto out_dput;
}
if (old_opaque || new_opaque) {
err = ovl_do_rename(old_upperdir->d_inode, olddentry,
new_upperdir->d_inode, newdentry,
flags);
} else {
/* No debug for the plain case */
BUG_ON(flags & ~RENAME_EXCHANGE);
err = vfs_rename(old_upperdir->d_inode, olddentry,
new_upperdir->d_inode, newdentry,
NULL, flags);
}
if (err) {
if (is_dir && !old_opaque && new_opaque)
ovl_remove_opaque(olddentry);
if (!overwrite && new_is_dir && old_opaque && !new_opaque)
ovl_remove_opaque(newdentry);
goto out_dput;
}
/* Rename succeeded: drop opaque markers that are no longer wanted. */
if (is_dir && old_opaque && !new_opaque)
ovl_remove_opaque(olddentry);
if (!overwrite && new_is_dir && !old_opaque && new_opaque)
ovl_remove_opaque(newdentry);
if (old_opaque != new_opaque) {
ovl_dentry_set_opaque(old, new_opaque);
if (!overwrite)
ovl_dentry_set_opaque(new, old_opaque);
}
if (cleanup_whiteout)
ovl_cleanup(old_upperdir->d_inode, newdentry);
ovl_dentry_version_inc(old->d_parent);
ovl_dentry_version_inc(new->d_parent);
out_dput:
dput(newdentry);
out_dput_old:
dput(olddentry);
out_unlock:
unlock_rename(new_upperdir, old_upperdir);
out_revert_creds:
/* Same condition under which the credentials were overridden above. */
if (old_opaque || new_opaque) {
revert_creds(old_cred);
put_cred(override_cred);
}
out_drop_write:
ovl_drop_write(old);
out:
dput(opaquedir);
return err;
}
/* ->rename: plain rename, i.e. ovl_rename2() without any flags. */
static int ovl_rename(struct inode *olddir, struct dentry *old,
		      struct inode *newdir, struct dentry *new)
{
	const unsigned int no_flags = 0;

	return ovl_rename2(olddir, old, newdir, new, no_flags);
}
/* Inode operations installed on overlay directories. */
const struct inode_operations_wrapper ovl_dir_inode_operations = {
	.ops = {
		/* name lookup and permission checks */
		.lookup		= ovl_lookup,
		.permission	= ovl_permission,

		/* object creation */
		.create		= ovl_create,
		.mkdir		= ovl_mkdir,
		.mknod		= ovl_mknod,
		.symlink	= ovl_symlink,
		.link		= ovl_link,

		/* removal and rename */
		.unlink		= ovl_unlink,
		.rmdir		= ovl_rmdir,
		.rename		= ovl_rename,

		/* attributes */
		.getattr	= ovl_dir_getattr,
		.setattr	= ovl_setattr,

		/* extended attributes */
		.getxattr	= ovl_getxattr,
		.setxattr	= ovl_setxattr,
		.listxattr	= ovl_listxattr,
		.removexattr	= ovl_removexattr,
	},
	/* flag-taking rename lives in the wrapper, outside .ops */
	.rename2	= ovl_rename2,
};

View File

@ -0,0 +1,442 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/xattr.h>
#include "overlayfs.h"
/*
 * Copy up the parent chain of @dentry and then @dentry itself, applying
 * @attr as part of the copy-up.  With @no_data set, the size handed to the
 * copy-up is forced to 0.  Returns 0 or a negative errno.
 */
static int ovl_copy_up_last(struct dentry *dentry, struct iattr *attr,
			    bool no_data)
{
	struct dentry *parent = dget_parent(dentry);
	struct path lowerpath;
	struct kstat stat;
	int err;

	err = ovl_copy_up(parent);
	if (!err) {
		ovl_path_lower(dentry, &lowerpath);
		err = vfs_getattr(&lowerpath, &stat);
	}

	if (!err) {
		if (no_data)
			stat.size = 0;

		err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat, attr);
	}

	dput(parent);
	return err;
}
/*
 * ->setattr: copy @dentry up if necessary, then apply @attr to its upper
 * dentry under that inode's i_mutex.  Returns 0 or a negative errno.
 */
int ovl_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct dentry *upper;
	int err;

	err = ovl_want_write(dentry);
	if (err)
		return err;

	err = ovl_copy_up(dentry);
	if (!err) {
		upper = ovl_dentry_upper(dentry);

		mutex_lock(&upper->d_inode->i_mutex);
		err = notify_change(upper, attr, NULL);
		mutex_unlock(&upper->d_inode->i_mutex);
	}

	ovl_drop_write(dentry);
	return err;
}
static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
struct path realpath;
ovl_path_real(dentry, &realpath);
return vfs_getattr(&realpath, stat);
}
/*
 * ->permission: check @mask against the real inode behind the overlay
 * @inode.
 *
 * Returns the result of __inode_permission() on the real inode, -ECHILD for
 * a non-directory when MAY_NOT_BLOCK is set, -ENOENT when no alias or no
 * real inode can be found, or -EROFS for a write to an upper object whose
 * filesystem is read-only while the overlay is not.
 */
int ovl_permission(struct inode *inode, int mask)
{
struct ovl_entry *oe;
struct dentry *alias = NULL;
struct inode *realinode;
struct dentry *realdentry;
bool is_upper;
int err;
if (S_ISDIR(inode->i_mode)) {
/* Directories carry their overlay entry in i_private. */
oe = inode->i_private;
} else if (mask & MAY_NOT_BLOCK) {
return -ECHILD;
} else {
/*
 * For non-directories find an alias and get the info
 * from there.
 */
alias = d_find_any_alias(inode);
if (WARN_ON(!alias))
return -ENOENT;
oe = alias->d_fsdata;
}
realdentry = ovl_entry_real(oe, &is_upper);
/* Careful in RCU walk mode */
realinode = ACCESS_ONCE(realdentry->d_inode);
if (!realinode) {
WARN_ON(!(mask & MAY_NOT_BLOCK));
err = -ENOENT;
goto out_dput;
}
if (mask & MAY_WRITE) {
umode_t mode = realinode->i_mode;
/*
 * Writes will always be redirected to upper layer, so
 * ignore lower layer being read-only.
 *
 * If the overlay itself is read-only then proceed
 * with the permission check, don't return EROFS.
 * This will only happen if this is the lower layer of
 * another overlayfs.
 *
 * If upper fs becomes read-only after the overlay was
 * constructed return EROFS to prevent modification of
 * upper layer.
 */
err = -EROFS;
if (is_upper && !IS_RDONLY(inode) && IS_RDONLY(realinode) &&
(S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
goto out_dput;
}
err = __inode_permission(realinode, mask);
out_dput:
/* alias is NULL on the directory path. */
dput(alias);
return err;
}
/*
 * Cookie wrapper allocated by ovl_follow_link() and released by
 * ovl_put_link() when the real inode implements ->put_link.
 */
struct ovl_link_data {
/* real dentry whose ->follow_link was called */
struct dentry *realdentry;
/* value returned by the real ->follow_link, handed back to ->put_link */
void *cookie;
};
/*
 * Follow a symlink by forwarding to the real inode's ->follow_link.
 *
 * If the real inode also has ->put_link, the real dentry and the cookie it
 * returned are remembered in an ovl_link_data that becomes our own cookie.
 * Otherwise NULL is returned as the cookie.  Errors are returned as
 * ERR_PTR() values.
 */
static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd)
{
	struct dentry *real = ovl_dentry_real(dentry);
	struct inode *real_inode = real->d_inode;
	struct ovl_link_data *link = NULL;
	void *cookie;

	if (WARN_ON(!real_inode->i_op->follow_link))
		return ERR_PTR(-EPERM);

	/* Allocate up front so a failure needs no ->put_link unwinding. */
	if (real_inode->i_op->put_link) {
		link = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL);
		if (!link)
			return ERR_PTR(-ENOMEM);
		link->realdentry = real;
	}

	cookie = real_inode->i_op->follow_link(real, nd);
	if (IS_ERR(cookie)) {
		kfree(link);
		return cookie;
	}

	if (link)
		link->cookie = cookie;

	return link;
}
/*
 * Release what ovl_follow_link() set up.  @c is the ovl_link_data it
 * returned, or NULL when there is nothing to release.
 */
static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
{
	struct ovl_link_data *link = c;

	if (link) {
		struct inode *real_inode = link->realdentry->d_inode;

		real_inode->i_op->put_link(link->realdentry, nd, link->cookie);
		kfree(link);
	}
}
/*
 * readlink on an overlay symlink: touch atime on the real path and forward
 * to the real inode's ->readlink.  Returns -EINVAL if the real inode has no
 * ->readlink.
 */
static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
{
	struct path real;
	struct inode *real_inode;

	ovl_path_real(dentry, &real);
	real_inode = real.dentry->d_inode;

	if (real_inode->i_op->readlink) {
		touch_atime(&real);
		return real_inode->i_op->readlink(real.dentry, buf, bufsiz);
	}

	return -EINVAL;
}
/* True if @name starts with the overlay-private xattr prefix. */
static bool ovl_is_private_xattr(const char *name)
{
	return !strncmp(name, OVL_XATTR_PRE_NAME, OVL_XATTR_PRE_LEN);
}
/*
 * Set an extended attribute on an overlay object.
 *
 * Overlay-private names are rejected with -EPERM.  Otherwise the object is
 * copied up and the attribute is set on the upper dentry.
 * Returns 0 on success or a negative errno.
 */
int ovl_setxattr(struct dentry *dentry, const char *name,
		 const void *value, size_t size, int flags)
{
	int rc = ovl_want_write(dentry);

	if (rc)
		return rc;

	if (ovl_is_private_xattr(name)) {
		rc = -EPERM;
	} else {
		rc = ovl_copy_up(dentry);
		if (!rc)
			rc = vfs_setxattr(ovl_dentry_upper(dentry), name,
					  value, size, flags);
	}

	ovl_drop_write(dentry);
	return rc;
}
/*
 * Private xattrs are hidden only on directories whose type has the UPPER
 * bit set and the PURE bit clear.
 */
static bool ovl_need_xattr_filter(struct dentry *dentry,
				  enum ovl_path_type type)
{
	const int bits = __OVL_PATH_PURE | __OVL_PATH_UPPER;

	if ((type & bits) != __OVL_PATH_UPPER)
		return false;

	return S_ISDIR(dentry->d_inode->i_mode);
}
/*
 * Read an extended attribute from the real dentry.  Overlay-private names
 * are reported as absent (-ENODATA) when filtering applies to @dentry.
 */
ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
		     void *value, size_t size)
{
	struct path real;
	enum ovl_path_type type = ovl_path_real(dentry, &real);
	bool hidden = ovl_need_xattr_filter(dentry, type) &&
		      ovl_is_private_xattr(name);

	if (hidden)
		return -ENODATA;

	return vfs_getxattr(real.dentry, name, value, size);
}
/*
 * ovl_listxattr - list extended attribute names of an overlay object.
 * @dentry: overlay dentry
 * @list:   buffer receiving the NUL-separated name list
 * @size:   size of @list; 0 means the caller only wants the required size
 *
 * Forwards to the real dentry and, where ovl_need_xattr_filter() says so,
 * removes overlay-private names from the returned list in place.
 *
 * The list comes from the underlying filesystem, so each name's length is
 * measured with strnlen() bounded by the bytes that remain.  A list whose
 * last name is not NUL-terminated is reported as -EIO with a warning
 * instead of reading past the buffer or crashing the kernel.
 *
 * Returns the (possibly reduced) list length, or a negative errno.
 */
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
{
	struct path realpath;
	enum ovl_path_type type = ovl_path_real(dentry, &realpath);
	ssize_t res;
	size_t len;
	char *s;

	res = vfs_listxattr(realpath.dentry, list, size);
	if (res <= 0 || size == 0)
		return res;

	if (!ovl_need_xattr_filter(dentry, type))
		return res;

	/* filter out private xattrs */
	for (s = list, len = res; len;) {
		size_t slen = strnlen(s, len) + 1;

		/* underlying fs handed us a broken xattr list? */
		if (WARN_ON(slen > len))
			return -EIO;

		/* len now counts the bytes that follow this name */
		len -= slen;
		if (ovl_is_private_xattr(s)) {
			res -= slen;
			memmove(s, s + slen, len);
		} else {
			s += slen;
		}
	}

	return res;
}
int ovl_removexattr(struct dentry *dentry, const char *name)
{
int err;
struct path realpath;
enum ovl_path_type type = ovl_path_real(dentry, &realpath);
err = ovl_want_write(dentry);
if (err)
goto out;
err = -ENODATA;
if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
goto out_drop_write;
if (!OVL_TYPE_UPPER(type)) {
err = vfs_getxattr(realpath.dentry, name, NULL, 0);
if (err < 0)
goto out_drop_write;
err = ovl_copy_up(dentry);
if (err)
goto out_drop_write;
ovl_path_upper(dentry, &realpath);
}
err = vfs_removexattr(realpath.dentry, name);
out_drop_write:
ovl_drop_write(dentry);
out:
return err;
}
/*
 * Decide whether opening with @flags requires a copy-up: only for objects
 * that are not already upper, are not special files, and are opened for
 * writing or with O_TRUNC.
 */
static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type,
				  struct dentry *realdentry)
{
	bool modifies = (OPEN_FMODE(flags) & FMODE_WRITE) || (flags & O_TRUNC);

	/* realdentry is only dereferenced for non-upper objects */
	return !OVL_TYPE_UPPER(type) &&
	       !special_file(realdentry->d_inode->i_mode) &&
	       modifies;
}
static int ovl_dentry_open(struct dentry *dentry, struct file *file,
const struct cred *cred)
{
int err;
struct path realpath;
enum ovl_path_type type;
bool want_write = false;
type = ovl_path_real(dentry, &realpath);
if (!ovl_is_nocopyupw(dentry)) {
if (ovl_open_need_copy_up(file->f_flags, type,
realpath.dentry)) {
want_write = true;
err = ovl_want_write(dentry);
if (err)
goto out;
if (file->f_flags & O_TRUNC)
err = ovl_copy_up_last(dentry, NULL, true);
else
err = ovl_copy_up(dentry);
if (err)
goto out_drop_write;
ovl_path_upper(dentry, &realpath);
}
}
err = vfs_open(&realpath, file, cred);
out_drop_write:
if (want_write)
ovl_drop_write(dentry);
out:
return err;
}
/*
 * Inode operations for every non-directory, non-symlink overlay inode
 * (see ovl_new_inode()).  Wrapped in inode_operations_wrapper so that
 * ->dentry_open can be provided; ovl_new_inode() sets S_IOPS_WRAPPER on
 * inodes that use this table.
 */
static const struct inode_operations_wrapper ovl_file_inode_operations = {
.ops = {
.setattr = ovl_setattr,
.permission = ovl_permission,
.getattr = ovl_getattr,
.setxattr = ovl_setxattr,
.getxattr = ovl_getxattr,
.listxattr = ovl_listxattr,
.removexattr = ovl_removexattr,
},
.dentry_open = ovl_dentry_open,
};
/*
 * Inode operations for overlay symlinks (see ovl_new_inode()).  Note that
 * no ->permission hook is installed here.
 */
static const struct inode_operations ovl_symlink_inode_operations = {
.setattr = ovl_setattr,
.follow_link = ovl_follow_link,
.put_link = ovl_put_link,
.readlink = ovl_readlink,
.getattr = ovl_getattr,
.setxattr = ovl_setxattr,
.getxattr = ovl_getxattr,
.listxattr = ovl_listxattr,
.removexattr = ovl_removexattr,
};
/*
 * Allocate an overlay inode on @sb for file type @mode.
 *
 * Only the S_IFMT bits of @mode are kept.  Directories get @oe stored in
 * i_private plus the directory operation tables; symlinks and the
 * remaining file types get their own inode operations.  Returns NULL if
 * allocation fails or the file type is not recognised.
 */
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
			    struct ovl_entry *oe)
{
	struct inode *inode = new_inode(sb);

	if (!inode)
		return NULL;

	mode &= S_IFMT;

	inode->i_ino = get_next_ino();
	inode->i_mode = mode;
	inode->i_flags |= S_NOATIME | S_NOCMTIME;

	if (mode == S_IFDIR) {
		inode->i_private = oe;
		inode->i_op = &ovl_dir_inode_operations.ops;
		inode->i_fop = &ovl_dir_operations;
		inode->i_flags |= S_IOPS_WRAPPER;
	} else if (mode == S_IFLNK) {
		inode->i_op = &ovl_symlink_inode_operations;
	} else if (mode == S_IFREG || mode == S_IFSOCK || mode == S_IFBLK ||
		   mode == S_IFCHR || mode == S_IFIFO) {
		inode->i_op = &ovl_file_inode_operations.ops;
		inode->i_flags |= S_IOPS_WRAPPER;
	} else {
		WARN(1, "illegal file type: %i\n", mode);
		iput(inode);
		inode = NULL;
	}

	return inode;
}

View File

@ -0,0 +1,200 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/kernel.h>
struct ovl_entry;
/*
 * Bit flags describing an overlay dentry.  Test them through the
 * OVL_TYPE_*() macros rather than comparing values directly.
 */
enum ovl_path_type {
/* tested by OVL_TYPE_PURE_UPPER() */
__OVL_PATH_PURE = (1 << 0),
/* tested by OVL_TYPE_UPPER() */
__OVL_PATH_UPPER = (1 << 1),
/* tested by OVL_TYPE_MERGE() */
__OVL_PATH_MERGE = (1 << 2),
};
/* Flag tests for enum ovl_path_type; each yields non-zero if the bit is set. */
#define OVL_TYPE_UPPER(type) ((type) & __OVL_PATH_UPPER)
#define OVL_TYPE_MERGE(type) ((type) & __OVL_PATH_MERGE)
#define OVL_TYPE_PURE_UPPER(type) ((type) & __OVL_PATH_PURE)
/* True for merge types and for anything without the UPPER bit. */
#define OVL_TYPE_MERGE_OR_LOWER(type) \
(OVL_TYPE_MERGE(type) || !OVL_TYPE_UPPER(type))
/* Prefix of overlay-private xattr names. */
#define OVL_XATTR_PRE_NAME "trusted.overlay."
/* Must equal strlen(OVL_XATTR_PRE_NAME); used as the strncmp() bound. */
#define OVL_XATTR_PRE_LEN 16
#define OVL_XATTR_OPAQUE OVL_XATTR_PRE_NAME"opaque"
/* vfs_rmdir() wrapper that logs the outcome via pr_debug(). */
static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry)
{
	int rc;

	rc = vfs_rmdir(dir, dentry);
	pr_debug("rmdir(%pd2) = %i\n", dentry, rc);

	return rc;
}
/* vfs_unlink() wrapper that logs the outcome via pr_debug(). */
static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry)
{
	int rc;

	rc = vfs_unlink(dir, dentry, NULL);
	pr_debug("unlink(%pd2) = %i\n", dentry, rc);

	return rc;
}
/* vfs_link() wrapper; the outcome is logged only when @debug is set. */
static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir,
			      struct dentry *new_dentry, bool debug)
{
	int rc;

	rc = vfs_link(old_dentry, dir, new_dentry, NULL);
	if (debug) {
		pr_debug("link(%pd2, %pd2) = %i\n",
			 old_dentry, new_dentry, rc);
	}

	return rc;
}
/* vfs_create() wrapper; the outcome is logged only when @debug is set. */
static inline int ovl_do_create(struct inode *dir, struct dentry *dentry,
				umode_t mode, bool debug)
{
	int rc;

	rc = vfs_create(dir, dentry, mode, true);
	if (debug) {
		pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, rc);
	}

	return rc;
}
/* vfs_mkdir() wrapper; the outcome is logged only when @debug is set. */
static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry,
			       umode_t mode, bool debug)
{
	int rc;

	rc = vfs_mkdir(dir, dentry, mode);
	if (debug) {
		pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, rc);
	}

	return rc;
}
/* vfs_mknod() wrapper; the outcome is logged only when @debug is set. */
static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry,
			       umode_t mode, dev_t dev, bool debug)
{
	int rc;

	rc = vfs_mknod(dir, dentry, mode, dev);
	if (debug) {
		pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n",
			 dentry, mode, dev, rc);
	}

	return rc;
}
/* vfs_symlink() wrapper; the outcome is logged only when @debug is set. */
static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry,
				 const char *oldname, bool debug)
{
	int rc;

	rc = vfs_symlink(dir, dentry, oldname);
	if (debug) {
		pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, rc);
	}

	return rc;
}
/*
 * vfs_setxattr() wrapper that logs the call via pr_debug().
 *
 * @value is an arbitrary byte buffer of @size bytes and need not be
 * NUL-terminated, so the debug message prints it with a precision
 * ("%.*s") that caps the number of bytes read at @size.  A plain field
 * width ("%*s") only sets a minimum and would let the formatter read
 * past the end of the buffer.
 */
static inline int ovl_do_setxattr(struct dentry *dentry, const char *name,
				  const void *value, size_t size, int flags)
{
	int err = vfs_setxattr(dentry, name, value, size, flags);

	pr_debug("setxattr(%pd2, \"%s\", \"%.*s\", 0x%x) = %i\n",
		 dentry, name, (int) size, (char *) value, flags, err);
	return err;
}
/* vfs_removexattr() wrapper that logs the outcome via pr_debug(). */
static inline int ovl_do_removexattr(struct dentry *dentry, const char *name)
{
	int rc;

	rc = vfs_removexattr(dentry, name);
	pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, rc);

	return rc;
}
/*
 * vfs_rename() wrapper.  The request is logged before the call; a second
 * message is logged only if the rename fails.
 */
static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry,
				struct inode *newdir, struct dentry *newdentry,
				unsigned int flags)
{
	int rc;

	pr_debug("rename2(%pd2, %pd2, 0x%x)\n",
		 olddentry, newdentry, flags);

	rc = vfs_rename(olddir, olddentry, newdir, newdentry, NULL, flags);
	if (rc != 0) {
		pr_debug("...rename2(%pd2, %pd2, ...) = %i\n",
			 olddentry, newdentry, rc);
	}

	return rc;
}
/* vfs_whiteout() wrapper that logs the outcome via pr_debug(). */
static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry)
{
	int rc;

	rc = vfs_whiteout(dir, dentry);
	pr_debug("whiteout(%pd2) = %i\n", dentry, rc);

	return rc;
}
/*
 * Helpers implemented outside this header (presumably in super.c, which is
 * not shown here -- confirm).
 */
/* When this returns true, ovl_dentry_open() opens the real path without copy-up. */
bool ovl_is_nocopyupw(struct dentry *dentry);
enum ovl_path_type ovl_path_type(struct dentry *dentry);
/* Directory version; readdir.c compares it against a cache's version. */
u64 ovl_dentry_version_get(struct dentry *dentry);
void ovl_dentry_version_inc(struct dentry *dentry);
void ovl_path_upper(struct dentry *dentry, struct path *path);
void ovl_path_lower(struct dentry *dentry, struct path *path);
enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path);
/* Layer iterator; readdir.c treats a return value of -1 as "last layer". */
int ovl_path_next(int idx, struct dentry *dentry, struct path *path);
struct dentry *ovl_dentry_upper(struct dentry *dentry);
struct dentry *ovl_dentry_lower(struct dentry *dentry);
struct dentry *ovl_dentry_real(struct dentry *dentry);
struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper);
struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry);
void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache);
struct dentry *ovl_workdir(struct dentry *dentry);
/* Callers in inode.c pair every successful ovl_want_write() with ovl_drop_write(). */
int ovl_want_write(struct dentry *dentry);
void ovl_drop_write(struct dentry *dentry);
bool ovl_dentry_is_opaque(struct dentry *dentry);
void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque);
bool ovl_is_whiteout(struct dentry *dentry);
void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry);
struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags);
/* Result is checked with IS_ERR() by the callers in readdir.c. */
struct file *ovl_path_open(struct path *path, int flags);
struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry,
struct kstat *stat, const char *link);
/* readdir.c */
extern const struct file_operations ovl_dir_operations;
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list);
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list);
void ovl_cache_free(struct list_head *list);
/* inode.c */
int ovl_setattr(struct dentry *dentry, struct iattr *attr);
int ovl_permission(struct inode *inode, int mask);
int ovl_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags);
ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
void *value, size_t size);
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
int ovl_removexattr(struct dentry *dentry, const char *name);
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
struct ovl_entry *oe);
/* Copy the owner (uid and gid) of @from into @to; nothing else is touched. */
static inline void ovl_copyattr(struct inode *from, struct inode *to)
{
	to->i_gid = from->i_gid;
	to->i_uid = from->i_uid;
}
/* dir.c */
/* Wrapped table; ovl_new_inode() installs its .ops member on directories. */
extern const struct inode_operations_wrapper ovl_dir_inode_operations;
struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry);
int ovl_create_real(struct inode *dir, struct dentry *newdentry,
struct kstat *stat, const char *link,
struct dentry *hardlink, bool debug);
void ovl_cleanup(struct inode *dir, struct dentry *dentry);
/* copy_up.c */
/* Both copy-up entry points return 0 on success; callers treat non-zero as an errno. */
int ovl_copy_up(struct dentry *dentry);
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
struct path *lowerpath, struct kstat *stat,
struct iattr *attr);
int ovl_copy_xattr(struct dentry *old, struct dentry *new);
int ovl_set_attr(struct dentry *upper, struct kstat *stat);

View File

@ -0,0 +1,588 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/file.h>
#include <linux/xattr.h>
#include <linux/rbtree.h>
#include <linux/security.h>
#include <linux/cred.h>
#include "overlayfs.h"
/* One directory entry collected while reading a (merged) directory. */
struct ovl_cache_entry {
/* length of name, not counting the terminating NUL */
unsigned int len;
/* d_type as reported by the underlying readdir */
unsigned int type;
/* inode number as reported by the underlying readdir */
u64 ino;
/* link in the list of entries handed out by ovl_readdir() */
struct list_head l_node;
/* link in the name-ordered rb-tree used to detect duplicates */
struct rb_node node;
/* chain of DT_CHR entries still to be checked by ovl_check_whiteouts() */
struct ovl_cache_entry *next_maybe_whiteout;
/* set by ovl_check_whiteouts(); such entries are skipped by readers */
bool is_whiteout;
/* NUL-terminated entry name, allocated together with the struct */
char name[];
};
/* Reference-counted snapshot of a merged directory's entries. */
struct ovl_dir_cache {
/* users of this cache; it is freed when this drops to zero */
long refcount;
/* ovl_dentry_version_get() value at the time the cache was built */
u64 version;
/* list of struct ovl_cache_entry, linked through l_node */
struct list_head entries;
};
/*
 * Local definition of dir_context holding only the filldir callback.
 * NOTE(review): presumably a stand-in for kernels whose headers lack
 * struct dir_context -- confirm against the targeted kernel versions.
 */
struct dir_context {
/* callback handed to vfs_readdir() by ovl_dir_read() */
const filldir_t actor;
//loff_t pos;
};
/* State shared between ovl_dir_read() and its filldir callback. */
struct ovl_readdir_data {
/*
 * ovl_dir_read() passes the whole struct as the callback's buf and
 * ovl_fill_merge() recovers it with container_of() on this member.
 */
struct dir_context ctx;
/* true while reading the last layer (see ovl_dir_read_merged()) */
bool is_merge;
/* name-ordered tree of the entries added so far */
struct rb_root root;
/* caller's list that receives the entries */
struct list_head *list;
/* insertion point used for entries of the last layer */
struct list_head middle;
/* head of the DT_CHR chain for ovl_check_whiteouts() */
struct ovl_cache_entry *first_maybe_whiteout;
/* number of callback invocations during the current vfs_readdir() */
int count;
/* error recorded by the callback, 0 if none */
int err;
};
/* Per-open state of an overlay directory (file->private_data). */
struct ovl_dir_file {
/* true: operations are forwarded to realfile, no merged cache is used */
bool is_real;
/* true if the directory was of an UPPER type when it was opened */
bool is_upper;
/* merged entry cache, NULL until first needed */
struct ovl_dir_cache *cache;
/* current position inside cache->entries */
struct list_head *cursor;
/* file opened on the real directory at open time */
struct file *realfile;
/* upper directory opened lazily by ovl_dir_fsync(), may be NULL */
struct file *upperfile;
};
/* Map an rb-tree node back to the cache entry that embeds it. */
static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n)
{
	return rb_entry(n, struct ovl_cache_entry, node);
}
/*
 * Look up the entry called @name (@len bytes) in the tree at @root.
 * Returns the entry, or NULL if there is none.
 */
static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root,
						    const char *name, int len)
{
	struct rb_node *cur;

	for (cur = root->rb_node; cur; ) {
		struct ovl_cache_entry *entry = ovl_cache_entry_from_node(cur);
		int order = strncmp(name, entry->name, len);

		if (order > 0)
			cur = entry->node.rb_right;
		else if (order < 0 || len < entry->len)
			/* equal prefix but shorter name sorts first */
			cur = entry->node.rb_left;
		else
			return entry;
	}

	return NULL;
}
/*
 * Allocate a cache entry for @name (@len bytes, copied and NUL-terminated).
 * DT_CHR entries are additionally pushed onto @rdd's maybe-whiteout chain.
 * Returns NULL if the allocation fails.
 */
static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd,
						   const char *name, int len,
						   u64 ino, unsigned int d_type)
{
	struct ovl_cache_entry *entry;

	/* header plus name plus terminating NUL */
	entry = kmalloc(offsetof(struct ovl_cache_entry, name[len + 1]),
			GFP_KERNEL);
	if (!entry)
		return NULL;

	entry->len = len;
	entry->type = d_type;
	entry->ino = ino;
	entry->is_whiteout = false;
	memcpy(entry->name, name, len);
	entry->name[len] = '\0';

	if (d_type == DT_CHR) {
		entry->next_maybe_whiteout = rdd->first_maybe_whiteout;
		rdd->first_maybe_whiteout = entry;
	}

	return entry;
}
/*
 * Add @name to both the rb-tree and the tail of @rdd->list unless an entry
 * with the same name is already in the tree.
 * Returns 0 on success (including the duplicate case) or -ENOMEM.
 */
static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
				  const char *name, int len, u64 ino,
				  unsigned int d_type)
{
	struct rb_node **link = &rdd->root.rb_node;
	struct rb_node *parent = NULL;
	struct ovl_cache_entry *entry;

	/* Walk down to the insertion slot, bailing out on a duplicate. */
	while (*link) {
		struct ovl_cache_entry *cur = ovl_cache_entry_from_node(*link);
		int order = strncmp(name, cur->name, len);

		parent = *link;
		if (order > 0)
			link = &cur->node.rb_right;
		else if (order < 0 || len < cur->len)
			link = &cur->node.rb_left;
		else
			return 0;
	}

	entry = ovl_cache_entry_new(rdd, name, len, ino, d_type);
	if (!entry)
		return -ENOMEM;

	list_add_tail(&entry->l_node, rdd->list);
	rb_link_node(&entry->node, parent, link);
	rb_insert_color(&entry->node, &rdd->root);

	return 0;
}
/*
 * Handle one entry while reading the last layer: a name already in the
 * tree is moved to the tail of the "middle" list, a new name gets a fresh
 * entry appended there.  Allocation failure is recorded in @rdd->err, which
 * is also the return value.
 */
static int ovl_fill_lower(struct ovl_readdir_data *rdd,
			  const char *name, int namelen,
			  loff_t offset, u64 ino, unsigned int d_type)
{
	struct ovl_cache_entry *entry;

	entry = ovl_cache_entry_find(&rdd->root, name, namelen);
	if (entry) {
		list_move_tail(&entry->l_node, &rdd->middle);
		return rdd->err;
	}

	entry = ovl_cache_entry_new(rdd, name, namelen, ino, d_type);
	if (entry)
		list_add_tail(&entry->l_node, &rdd->middle);
	else
		rdd->err = -ENOMEM;

	return rdd->err;
}
void ovl_cache_free(struct list_head *list)
{
struct ovl_cache_entry *p;
struct ovl_cache_entry *n;
list_for_each_entry_safe(p, n, list, l_node)
kfree(p);
INIT_LIST_HEAD(list);
}
/*
 * Drop @od's reference on its cache.  The last reference frees the cache
 * and, if @dentry still points at it, detaches it from the dentry first.
 */
static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry)
{
	struct ovl_dir_cache *cache = od->cache;

	WARN_ON(cache->refcount <= 0);

	if (--cache->refcount)
		return;

	if (ovl_dir_cache(dentry) == cache)
		ovl_set_dir_cache(dentry, NULL);

	ovl_cache_free(&cache->entries);
	kfree(cache);
}
/*
 * filldir callback used while building a merged listing.  @buf is the
 * dir_context embedded in an ovl_readdir_data.  Every call is counted; the
 * entry then goes to the last-layer handler or the rb-tree inserter
 * depending on rdd->is_merge.
 */
static int ovl_fill_merge(void *buf, const char *name, int namelen,
			  loff_t offset, u64 ino, unsigned int d_type)
{
	struct dir_context *ctx = buf;
	struct ovl_readdir_data *rdd;

	rdd = container_of(ctx, struct ovl_readdir_data, ctx);
	rdd->count++;

	if (rdd->is_merge)
		return ovl_fill_lower(rdd, name, namelen, offset, ino, d_type);

	return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type);
}
/*
 * Resolve every entry on @rdd's maybe-whiteout chain by looking it up in
 * @dir and recording whether it is a whiteout.  The lookups run with
 * CAP_DAC_OVERRIDE raised and with @dir's i_mutex held.
 *
 * Returns 0, -ENOMEM if credentials cannot be prepared, or the error from
 * mutex_lock_killable().  Failed lookups are skipped silently.
 */
static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd)
{
	struct cred *override_cred = prepare_creds();
	const struct cred *old_cred;
	struct ovl_cache_entry *entry;
	int err;

	if (!override_cred)
		return -ENOMEM;

	/* CAP_DAC_OVERRIDE for lookup */
	cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
	old_cred = override_creds(override_cred);

	err = mutex_lock_killable(&dir->d_inode->i_mutex);
	if (err)
		goto out_revert;

	/* Pop entries off the chain one at a time. */
	for (entry = rdd->first_maybe_whiteout; entry;
	     entry = rdd->first_maybe_whiteout) {
		struct dentry *found;

		rdd->first_maybe_whiteout = entry->next_maybe_whiteout;

		found = lookup_one_len(entry->name, dir, entry->len);
		if (IS_ERR(found))
			continue;

		entry->is_whiteout = ovl_is_whiteout(found);
		dput(found);
	}
	mutex_unlock(&dir->d_inode->i_mutex);

out_revert:
	revert_creds(old_cred);
	put_cred(override_cred);

	return err;
}
/*
 * Read the whole directory at @realpath through @rdd's callback, then
 * classify any possible whiteouts that were collected.
 * Returns 0 on success or a negative errno.
 */
static inline int ovl_dir_read(struct path *realpath,
			       struct ovl_readdir_data *rdd)
{
	struct file *dir_file;
	int err;

	dir_file = ovl_path_open(realpath, O_RDONLY | O_DIRECTORY);
	if (IS_ERR(dir_file))
		return PTR_ERR(dir_file);

	rdd->first_maybe_whiteout = NULL;
	//rdd->ctx.pos = 0;

	/* Keep reading until a pass fails or delivers no entries. */
	for (;;) {
		rdd->count = 0;
		rdd->err = 0;

		err = vfs_readdir(dir_file, rdd->ctx.actor, rdd);
		if (err >= 0)
			err = rdd->err;

		if (err || !rdd->count)
			break;
	}

	if (!err && rdd->first_maybe_whiteout)
		err = ovl_check_whiteouts(realpath->dentry, rdd);

	fput(dir_file);

	return err;
}
static void ovl_dir_reset(struct file *file)
{
struct ovl_dir_file *od = file->private_data;
struct ovl_dir_cache *cache = od->cache;
struct dentry *dentry = file->f_path.dentry;
enum ovl_path_type type = ovl_path_type(dentry);
if (cache && ovl_dentry_version_get(dentry) != cache->version) {
ovl_cache_put(od, dentry);
od->cache = NULL;
od->cursor = NULL;
}
WARN_ON(!od->is_real && !OVL_TYPE_MERGE(type));
if (od->is_real && OVL_TYPE_MERGE(type))
od->is_real = false;
}
/*
 * Build the merged listing of @dentry in @list by reading each layer that
 * ovl_path_next() yields.  Reading stops at the first failing layer.
 * Returns 0 on success or a negative errno.
 */
static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list)
{
	struct path realpath;
	struct ovl_readdir_data rdd = {
		.ctx.actor = ovl_fill_merge,
		.list = list,
		.root = RB_ROOT,
		.is_merge = false,
	};
	int idx = 0;
	int err;

	do {
		int next = ovl_path_next(idx, dentry, &realpath);

		if (next == -1) {
			/*
			 * Last layer: its entries are placed in front of the
			 * ones gathered so far, which keeps offsets
			 * reasonably constant.
			 */
			list_add(&rdd.middle, rdd.list);
			rdd.is_merge = true;
			err = ovl_dir_read(&realpath, &rdd);
			list_del(&rdd.middle);
		} else {
			err = ovl_dir_read(&realpath, &rdd);
			if (err)
				break;
		}

		idx = next;
	} while (idx != -1);

	return err;
}
/*
 * Point od->cursor at the @pos-th entry of the cache, or at the list head
 * if the cache holds fewer entries than that.
 */
static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos)
{
	struct list_head *head = &od->cache->entries;
	struct list_head *it = head->next;
	loff_t skipped = 0;

	while (it != head && skipped < pos) {
		it = it->next;
		skipped++;
	}

	/* Cursor is safe since the cache is stable */
	od->cursor = it;
}
/*
 * Get a referenced directory cache for @dentry: reuse the attached one if
 * its version is current, otherwise build a new one and attach it.
 * Returns the cache or an ERR_PTR().
 */
static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
{
	struct ovl_dir_cache *cache = ovl_dir_cache(dentry);
	int rc;

	if (cache && ovl_dentry_version_get(dentry) == cache->version) {
		cache->refcount++;
		return cache;
	}

	/* Stale or missing: detach whatever is there and rebuild. */
	ovl_set_dir_cache(dentry, NULL);

	cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL);
	if (!cache)
		return ERR_PTR(-ENOMEM);

	cache->refcount = 1;
	INIT_LIST_HEAD(&cache->entries);

	rc = ovl_dir_read_merged(dentry, &cache->entries);
	if (!rc) {
		cache->version = ovl_dentry_version_get(dentry);
		ovl_set_dir_cache(dentry, cache);
		return cache;
	}

	ovl_cache_free(&cache->entries);
	kfree(cache);

	return ERR_PTR(rc);
}
/*
 * readdir for overlay directories.
 *
 * "Real" directories are forwarded to the underlying file.  Merged ones
 * are served from the entry cache, skipping whiteouts, and f_pos counts
 * cache entries (whiteouts included).
 */
static int ovl_readdir(struct file *file, void *buf, filldir_t filler)
{
	struct ovl_dir_file *od = file->private_data;
	struct dentry *dentry = file->f_path.dentry;

	if (!file->f_pos)
		ovl_dir_reset(file);

	if (od->is_real) {
		int res = vfs_readdir(od->realfile, filler, buf);

		file->f_pos = od->realfile->f_pos;
		return res;
	}

	if (!od->cache) {
		struct ovl_dir_cache *cache = ovl_cache_get(dentry);

		if (IS_ERR(cache))
			return PTR_ERR(cache);

		od->cache = cache;
		ovl_seek_cursor(od, file->f_pos);
	}

	while (od->cursor != &od->cache->entries) {
		struct ovl_cache_entry *entry;

		entry = list_entry(od->cursor, struct ovl_cache_entry, l_node);

		/* Stop without advancing once the caller's buffer is full. */
		if (!entry->is_whiteout &&
		    filler(buf, entry->name, entry->len, file->f_pos,
			   entry->ino, entry->type))
			break;

		od->cursor = entry->l_node.next;
		file->f_pos++;
	}

	return 0;
}
/*
 * llseek for overlay directories, serialised by the inode's i_mutex.
 *
 * "Real" directories forward to the underlying file.  Merged ones accept
 * only SEEK_SET and SEEK_CUR with a non-negative result; anything else
 * yields -EINVAL.
 */
static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin)
{
	struct ovl_dir_file *od = file->private_data;
	struct inode *inode = file_inode(file);
	loff_t res;

	mutex_lock(&inode->i_mutex);

	if (!file->f_pos)
		ovl_dir_reset(file);

	if (od->is_real) {
		res = vfs_llseek(od->realfile, offset, origin);
		file->f_pos = od->realfile->f_pos;
		goto out_unlock;
	}

	res = -EINVAL;

	if (origin == SEEK_CUR)
		offset += file->f_pos;
	else if (origin != SEEK_SET)
		goto out_unlock;

	if (offset < 0)
		goto out_unlock;

	if (offset != file->f_pos) {
		file->f_pos = offset;
		/* Without a cache the cursor is set up on the next readdir. */
		if (od->cache)
			ovl_seek_cursor(od, offset);
	}
	res = offset;

out_unlock:
	mutex_unlock(&inode->i_mutex);

	return res;
}
/*
 * fsync for overlay directories.
 *
 * Syncs od->realfile unless the directory was not upper when it was opened
 * but is of an UPPER type now; in that case the upper directory is opened
 * once, remembered in od->upperfile, and synced instead.
 * Returns the result of vfs_fsync_range(), or the error from opening the
 * upper directory.
 */
static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
int datasync)
{
struct ovl_dir_file *od = file->private_data;
struct dentry *dentry = file->f_path.dentry;
struct file *realfile = od->realfile;
/*
* Need to check if we started out being a lower dir, but got copied up
*/
if (!od->is_upper && OVL_TYPE_UPPER(ovl_path_type(dentry))) {
struct inode *inode = file_inode(file);
/* unlocked peek; the locked re-check below settles races */
realfile = lockless_dereference(od->upperfile);
if (!realfile) {
struct path upperpath;
ovl_path_upper(dentry, &upperpath);
/* open before taking i_mutex; the result may still be an ERR_PTR */
realfile = ovl_path_open(&upperpath, O_RDONLY);
smp_mb__before_spinlock();
mutex_lock(&inode->i_mutex);
if (!od->upperfile) {
/* we are the first: publish our file or report the open error */
if (IS_ERR(realfile)) {
mutex_unlock(&inode->i_mutex);
return PTR_ERR(realfile);
}
od->upperfile = realfile;
} else {
/* somebody has beaten us to it */
if (!IS_ERR(realfile))
fput(realfile);
realfile = od->upperfile;
}
mutex_unlock(&inode->i_mutex);
}
}
return vfs_fsync_range(realfile, start, end, datasync);
}
/*
 * Release an overlay directory: drop the cache reference (under i_mutex),
 * put the real file and, if one was opened, the upper file, then free the
 * per-open state.  Always returns 0.
 */
static int ovl_dir_release(struct inode *inode, struct file *file)
{
	struct ovl_dir_file *dir_state = file->private_data;

	if (dir_state->cache) {
		mutex_lock(&inode->i_mutex);
		ovl_cache_put(dir_state, file->f_path.dentry);
		mutex_unlock(&inode->i_mutex);
	}

	fput(dir_state->realfile);
	if (dir_state->upperfile)
		fput(dir_state->upperfile);

	kfree(dir_state);

	return 0;
}
static int ovl_dir_open(struct inode *inode, struct file *file)
{
struct path realpath;
struct file *realfile;
struct ovl_dir_file *od;
enum ovl_path_type type;
od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL);
if (!od)
return -ENOMEM;
type = ovl_path_real(file->f_path.dentry, &realpath);
realfile = ovl_path_open(&realpath, file->f_flags);
if (IS_ERR(realfile)) {
kfree(od);
return PTR_ERR(realfile);
}
od->realfile = realfile;
od->is_real = !OVL_TYPE_MERGE(type);
od->is_upper = OVL_TYPE_UPPER(type);
file->private_data = od;
return 0;
}
/*
 * File operations for overlay directories; installed as i_fop by
 * ovl_new_inode().  Uses the ->readdir hook rather than ->iterate.
 */
const struct file_operations ovl_dir_operations = {
.read = generic_read_dir,
.open = ovl_dir_open,
.readdir = ovl_readdir,
.llseek = ovl_dir_llseek,
.fsync = ovl_dir_fsync,
.release = ovl_dir_release,
};
/*
 * Read the merged contents of @dentry into @list and check that nothing
 * but whiteouts, "." and ".." is present.
 * Returns 0 if empty, -ENOTEMPTY otherwise, or the read error.  The entries
 * stay on @list in every case.
 */
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list)
{
	struct ovl_cache_entry *entry;
	int err;

	err = ovl_dir_read_merged(dentry, list);
	if (err)
		return err;

	list_for_each_entry(entry, list, l_node) {
		bool dot, dotdot;

		if (entry->is_whiteout)
			continue;

		dot = entry->len == 1 && entry->name[0] == '.';
		dotdot = entry->len == 2 && entry->name[0] == '.' &&
			 entry->name[1] == '.';
		if (dot || dotdot)
			continue;

		return -ENOTEMPTY;
	}

	return 0;
}
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
{
struct ovl_cache_entry *p;
mutex_lock_nested(&upper->d_inode->i_mutex, I_MUTEX_CHILD);
list_for_each_entry(p, list, l_node) {
struct dentry *dentry;
if (!p->is_whiteout)
continue;
dentry = lookup_one_len(p->name, upper, p->len);
if (IS_ERR(dentry)) {
pr_err("overlayfs: lookup '%s/%.*s' failed (%i)\n",
upper->d_name.name, p->len, p->name,
(int) PTR_ERR(dentry));
continue;
}
ovl_cleanup(upper->d_inode, dentry);
dput(dentry);
}
mutex_unlock(&upper->d_inode->i_mutex);
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,21 @@
# Build glue for the mcoverlay.ko kernel module.
# NOTE(review): the @...@ tokens look like configure-time substitutions --
# confirm against the project's configure script.
# Kernel build tree used by the "modules" target (overridable).
KDIR ?= @KDIR@
ARCH ?= @ARCH@
# Directory that "install" copies the module into.
KMODDIR = @KMODDIR@
src = @abs_srcdir@
# mcoverlay.ko is linked from the objects listed in mcoverlay-y.
obj-m += mcoverlay.o
mcoverlay-y := copy_up.o dir.o inode.o readdir.o super.o
.PHONY: clean install modules
# Delegate the build to the kernel's build system in $(KDIR).
modules:
$(MAKE) -C $(KDIR) M=$(PWD) SUBDIRS=$(PWD) ARCH=$(ARCH) modules
# Remove build products from the current directory.
clean:
$(RM) .*.cmd *.mod.c *.o *.ko* Module.symvers modules.order -r .tmp*
# Create $(KMODDIR) if needed and copy the built module there.
install:
mkdir -p -m 755 $(KMODDIR)
install -m 644 mcoverlay.ko $(KMODDIR)

View File

@ -167,6 +167,7 @@ enum {
CURRENT_OFFSET,
RUNQ_OFFSET,
CPU_STATUS_OFFSET,
IDLE_THREAD_OFFSET,
/* process */
CTX_OFFSET,
@ -204,6 +205,7 @@ static int setup_constants(void) {
printf("CURRENT_OFFSET: %ld\n", K(CURRENT_OFFSET));
printf("RUNQ_OFFSET: %ld\n", K(RUNQ_OFFSET));
printf("CPU_STATUS_OFFSET: %ld\n", K(CPU_STATUS_OFFSET));
printf("IDLE_THREAD_OFFSET: %ld\n", K(IDLE_THREAD_OFFSET));
printf("CTX_OFFSET: %ld\n", K(CTX_OFFSET));
printf("SCHED_LIST_OFFSET: %ld\n", K(SCHED_LIST_OFFSET));
printf("PROC_OFFSET: %ld\n", K(PROC_OFFSET));
@ -251,6 +253,64 @@ static int setup_threads(void) {
ihk_mc_switch_context = lookup_symbol("ihk_mc_switch_context");
if (0) printf("ihk_mc_switch_context: %lx\n", ihk_mc_switch_context);
/* Set up idle threads first */
for (cpu = 0; cpu < num_processors; ++cpu) {
uintptr_t v;
uintptr_t thread;
uintptr_t proc;
int pid;
int tid;
struct thread_info *ti;
int status;
v = clv + (cpu * K(CPU_LOCAL_VAR_SIZE));
ti = malloc(sizeof(*ti));
if (!ti) {
perror("malloc");
return 1;
}
thread = v+K(IDLE_THREAD_OFFSET);
error = read_64(thread+K(PROC_OFFSET), &proc);
if (error) {
perror("proc");
return 1;
}
error = read_32(thread+K(STATUS_OFFSET), &status);
if (error) {
perror("status");
return 1;
}
error = read_32(proc+K(PID_OFFSET), &pid);
if (error) {
perror("pid");
return 1;
}
error = read_32(thread+K(TID_OFFSET), &tid);
if (error) {
perror("tid");
return 1;
}
ti->next = NULL;
ti->status = status;
ti->pid = pid;
ti->tid = tid;
ti->cpu = cpu;
ti->lcpu = cpu;
ti->process = thread;
ti->clv = v;
ti->x86_clv = locals + locals_span*cpu;
*titailp = ti;
titailp = &ti->next;
}
for (cpu = 0; cpu < num_processors; ++cpu) {
uintptr_t v;
uintptr_t head;

View File

@ -101,6 +101,19 @@ int __glob_argc = -1;
char **__glob_argv = 0;
#endif
#ifdef ENABLE_MCOVERLAYFS
#undef ENABLE_MCOVERLAYFS
#ifndef RHEL_RELEASE_CODE
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,0,0) && LINUX_VERSION_CODE < KERNEL_VERSION(4,1,0)
#define ENABLE_MCOVERLAYFS 1
#endif // LINUX_VERSION_CODE == 4.0
#else
#if RHEL_RELEASE_CODE == RHEL_RELEASE_VERSION(7,2)
#define ENABLE_MCOVERLAYFS 1
#endif // RHEL_RELEASE_CODE == 7.2
#endif // RHEL_RELEASE_CODE
#endif // ENABLE_MCOVERLAYFS
typedef unsigned char cc_t;
typedef unsigned int speed_t;
typedef unsigned int tcflag_t;
@ -375,7 +388,7 @@ struct program_load_desc *load_interp(struct program_load_desc *desc0, FILE *fp)
unsigned char *dma_buf;
int lookup_exec_path(char *filename, char *path, int max_len)
int lookup_exec_path(char *filename, char *path, int max_len, int execvp)
{
int found;
int error;
@ -393,28 +406,27 @@ retry:
char *token, *string, *tofree;
char *PATH = getenv("COKERNEL_PATH");
if (!PATH) {
if (!execvp) {
if (strlen(filename) + 1 > max_len) {
return ENAMETOOLONG;
}
strcpy(path, filename);
error = access(path, X_OK);
if (error) {
return errno;
}
found = 1;
break;
}
if (!(PATH = getenv("COKERNEL_PATH"))) {
PATH = getenv("PATH");
}
if (strlen(filename) >= 255) {
return ENAMETOOLONG;
}
/* See first whether file is available in current working dir */
error = access(filename, X_OK);
if (error == 0) {
__dprintf("lookup_exec_path(): found %s in cwd\n", filename);
error = snprintf(path, max_len, "%s", filename);
if (error < 0 || error >= max_len) {
fprintf(stderr, "lookup_exec_path(): array too small?\n");
return ENOMEM;
}
found = 1;
break;
}
__dprintf("PATH: %s\n", PATH);
@ -442,6 +454,9 @@ retry:
}
free(tofree);
if(!found){
return ENOENT;
}
break;
}
@ -654,7 +669,7 @@ int load_elf_desc(char *filename, struct program_load_desc **desc_p,
return 0;
}
void transfer_image(int fd, struct program_load_desc *desc)
int transfer_image(int fd, struct program_load_desc *desc)
{
struct remote_transfer pt;
unsigned long s, e, flen, rpa;
@ -668,7 +683,10 @@ void transfer_image(int fd, struct program_load_desc *desc)
+ PAGE_SIZE - 1) & PAGE_MASK;
rpa = desc->sections[i].remote_pa;
fseek(fp, desc->sections[i].offset, SEEK_SET);
if (fseek(fp, desc->sections[i].offset, SEEK_SET) != 0) {
fprintf(stderr, "transfer_image(): error: seeking file position\n");
return -1;
}
flen = desc->sections[i].filesz;
__dprintf("seeked to %lx | size %ld\n",
@ -690,7 +708,20 @@ void transfer_image(int fd, struct program_load_desc *desc)
if (lr > flen) {
lr = flen;
}
fread(dma_buf + l, 1, lr, fp);
if (fread(dma_buf + l, 1, lr, fp) != lr) {
if (ferror(fp) > 0) {
fprintf(stderr, "transfer_image(): error: accessing file\n");
return -EINVAL;
}
else if (feof(fp) > 0) {
fprintf(stderr, "transfer_image(): file too short?\n");
return -EINVAL;
}
else {
/* TODO: handle smaller reads.. */
return -EINVAL;
}
}
flen -= lr;
}
else if (flen > 0) {
@ -699,7 +730,20 @@ void transfer_image(int fd, struct program_load_desc *desc)
} else {
lr = flen;
}
fread(dma_buf, 1, lr, fp);
if (fread(dma_buf, 1, lr, fp) != lr) {
if (ferror(fp) > 0) {
fprintf(stderr, "transfer_image(): error: accessing file\n");
return -EINVAL;
}
else if (feof(fp) > 0) {
fprintf(stderr, "transfer_image(): file too short?\n");
return -EINVAL;
}
else {
/* TODO: handle smaller reads.. */
return -EINVAL;
}
}
flen -= lr;
}
s += PAGE_SIZE;
@ -715,6 +759,8 @@ void transfer_image(int fd, struct program_load_desc *desc)
}
}
}
return 0;
}
void print_desc(struct program_load_desc *desc)
@ -837,7 +883,10 @@ struct thread_data_s {
pthread_mutex_t *lock;
pthread_barrier_t *init_ready;
} *thread_data;
int ncpu;
int n_threads;
pid_t master_tid;
pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
@ -848,7 +897,7 @@ static void *main_loop_thread_func(void *arg)
struct thread_data_s *td = (struct thread_data_s *)arg;
td->tid = gettid();
td->remote_tid = (int)td->tid;
td->remote_tid = -1;
pthread_barrier_wait(&init_ready);
td->ret = main_loop(td->fd, td->cpu, td->lock);
@ -931,7 +980,10 @@ act_signalfd4(struct syscall_wait_desc *w)
flags |= O_NONBLOCK;
if(tmp & SFD_CLOEXEC)
flags |= O_CLOEXEC;
pipe2(sfd->sigpipe, flags);
if (pipe2(sfd->sigpipe, flags) < 0) {
perror("pipe2 failed:");
return -1;
}
sfd->next = sigfdtop;
sigfdtop = sfd;
rc = sfd->sigpipe[0];
@ -962,7 +1014,11 @@ act_signalfd4(struct syscall_wait_desc *w)
rc = -EBADF;
else{
info = (struct signalfd_siginfo *)w->sr.args[2];
write(sfd->sigpipe[1], info, sizeof(struct signalfd_siginfo));
if (write(sfd->sigpipe[1], info, sizeof(struct signalfd_siginfo))
!= sizeof(struct signalfd_siginfo)) {
fprintf(stderr, "error: writing sigpipe\n");
rc = -EBADF;
}
}
break;
}
@ -1068,9 +1124,9 @@ void init_worker_threads(int fd)
int i;
pthread_mutex_init(&lock, NULL);
pthread_barrier_init(&init_ready, NULL, ncpu + 2);
pthread_barrier_init(&init_ready, NULL, n_threads + 2);
for (i = 0; i <= ncpu; ++i) {
for (i = 0; i <= n_threads; ++i) {
int ret;
thread_data[i].fd = fd;
@ -1091,7 +1147,6 @@ void init_worker_threads(int fd)
}
#ifdef ENABLE_MCOVERLAYFS
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,0,0) && LINUX_VERSION_CODE < KERNEL_VERSION(4,1,0)
#define READ_BUFSIZE 1024
static int isunshare(void)
{
@ -1163,7 +1218,6 @@ static int isunshare(void)
__dprintf("err=%d\n", err);
return err;
}
#endif
#endif // ENABLE_MCOVERLAYFS
#define MCK_RLIMIT_AS 0
@ -1353,7 +1407,6 @@ int main(int argc, char **argv)
}
#ifdef ENABLE_MCOVERLAYFS
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,0,0) && LINUX_VERSION_CODE < KERNEL_VERSION(4,1,0)
__dprintf("mcoverlay enable\n");
char mcos_procdir[PATH_MAX];
char mcos_sysdir[PATH_MAX];
@ -1401,12 +1454,11 @@ int main(int argc, char **argv)
} else if (error == -1) {
return 1;
}
#endif
#else
__dprintf("mcoverlay disable\n");
#endif // ENABLE_MCOVERLAYFS
if (lookup_exec_path(argv[optind], path, sizeof(path)) != 0) {
if (lookup_exec_path(argv[optind], path, sizeof(path), 1) != 0) {
fprintf(stderr, "error: finding file: %s\n", argv[optind]);
return 1;
}
@ -1418,7 +1470,7 @@ int main(int argc, char **argv)
/* Check whether shell script */
if (shell) {
if (lookup_exec_path(shell, shell_path, sizeof(shell_path)) != 0) {
if (lookup_exec_path(shell, shell_path, sizeof(shell_path), 0) != 0) {
fprintf(stderr, "error: finding file: %s\n", shell);
return 1;
}
@ -1480,6 +1532,19 @@ int main(int argc, char **argv)
return 1;
}
n_threads = ncpu;
if (ncpu > 16) {
n_threads = 16;
}
/*
* XXX: keep thread_data sized for ncpu entries even though there are only
* n_threads worker threads in the pool, so that the signaling code
* keeps working.
*
* TODO: fix signaling code to be independent of TIDs.
* TODO: implement dynamic thread pool resizing.
*/
thread_data = (struct thread_data_s *)malloc(sizeof(struct thread_data_s) * (ncpu + 1));
memset(thread_data, '\0', sizeof(struct thread_data_s) * (ncpu + 1));
@ -1522,7 +1587,10 @@ int main(int argc, char **argv)
}
print_desc(desc);
transfer_image(fd, desc);
if (transfer_image(fd, desc) < 0) {
fprintf(stderr, "error: transferring image\n");
return -1;
}
fflush(stdout);
fflush(stderr);
@ -1561,7 +1629,7 @@ int main(int argc, char **argv)
return 1;
}
for (i = 0; i <= ncpu; ++i) {
for (i = 0; i <= n_threads; ++i) {
pthread_join(thread_data[i].thread_id, NULL);
}
@ -1623,16 +1691,14 @@ do_generic_syscall(
}
static void
kill_thread(unsigned long cpu)
kill_thread(unsigned long tid)
{
if(cpu >= 0 && cpu < ncpu){
pthread_kill(thread_data[cpu].thread_id, LOCALSIG);
}
else{
int i;
int i;
for (i = 0; i < ncpu; ++i) {
for (i = 0; i < n_threads; ++i) {
if(thread_data[i].remote_tid == tid){
pthread_kill(thread_data[i].thread_id, LOCALSIG);
break;
}
}
}
@ -1738,9 +1804,7 @@ char *
chgpath(char *in, char *buf)
{
#ifdef ENABLE_MCOVERLAYFS
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,0,0) && LINUX_VERSION_CODE < KERNEL_VERSION(4,1,0)
return in;
#endif
#endif // ENABLE_MCOVERLAYFS
char *fn = in;
struct stat sb;
@ -1791,6 +1855,8 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock)
//pthread_mutex_lock(lock);
thread_data[cpu].remote_tid = w.sr.rtid;
switch (w.sr.number) {
case __NR_open:
ret = do_strncpy_from_user(fd, pathbuf, (void *)w.sr.args[0], PATH_MAX);
@ -1829,13 +1895,13 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock)
sig = 0;
term = 0;
do_syscall_return(fd, cpu, 0, 0, 0, 0, 0);
/* Drop executable file */
if ((ret = ioctl(fd, MCEXEC_UP_CLOSE_EXEC)) != 0) {
fprintf(stderr, "WARNING: close_exec() couldn't find exec file?\n");
}
do_syscall_return(fd, cpu, 0, 0, 0, 0, 0);
__dprintf("__NR_exit/__NR_exit_group: %ld (cpu_id: %d)\n",
w.sr.args[0], cpu);
if(w.sr.number == __NR_exit_group){
@ -1903,6 +1969,39 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock)
thread_data[oldcpuid].remote_tid = wtid;
}
/*
* Number of TIDs and the remote physical address where TIDs are
* expected are passed in arg 4 and 5, respectively.
*/
if (w.sr.args[4] > 0) {
	/*
	 * The requester asked for nr_tids TIDs (arg 4) to be written to the
	 * remote physical address in arg 5.
	 */
	struct remote_transfer trans;
	unsigned long nr_tids = w.sr.args[4];
	/* Number of thread_data[] entries whose TID is copied out */
	unsigned long nr_copy = (ncpu > 0) ? (unsigned long)ncpu : 0;
	unsigned long i;
	/*
	 * calloc() zero-fills the buffer, so every slot past the copied TIDs
	 * is 0 without a second loop, and it rejects a count * size overflow.
	 */
	int *tids = calloc(nr_tids, sizeof(int));

	if (!tids) {
		fprintf(stderr, "__NR_gettid(): error allocating TIDs\n");
		goto gettid_out;
	}

	/* Never index past the nr_tids slots that were allocated */
	if (nr_copy > nr_tids) {
		nr_copy = nr_tids;
	}

	for (i = 0; i < nr_copy; ++i) {
		tids[i] = thread_data[i].tid;
	}

	trans.userp = (void*)tids;
	trans.rphys = w.sr.args[5];
	trans.size = sizeof(int) * nr_tids;
	trans.direction = MCEXEC_UP_TRANSFER_TO_REMOTE;

	/* Best effort: a failed transfer is reported but the syscall still returns */
	if (ioctl(fd, MCEXEC_UP_TRANSFER, &trans) != 0) {
		fprintf(stderr, "__NR_gettid(): error transfering TIDs\n");
	}

	free(tids);
}
gettid_out:
do_syscall_return(fd, cpu, thread_data[newcpuid].remote_tid, 0, 0, 0, 0);
break;
}
@ -1945,7 +2044,9 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock)
close(pipefds[0]);
pid = fork();
if(pid != 0){
write(pipefds[1], &pid, sizeof pid);
if (write(pipefds[1], &pid, sizeof pid) != sizeof(pid)) {
fprintf(stderr, "error: writing pipefds\n");
}
exit(0);
}
}
@ -1954,7 +2055,9 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock)
int st;
close(pipefds[1]);
read(pipefds[0], &npid, sizeof npid);
if (read(pipefds[0], &npid, sizeof npid) != sizeof(npid)) {
fprintf(stderr, "error: reading pipefds\n");
}
close(pipefds[0]);
waitpid(pid, &st, 0);
pid = npid;
@ -1994,7 +2097,6 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock)
/* Reinit signals and syscall threads */
init_sigaction();
init_worker_threads(fd);
__dprintf("pid(%d): signals and syscall threads OK\n",
getpid());
@ -2008,6 +2110,8 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock)
goto fork_child_sync_pipe;
}
init_worker_threads(fd);
fork_child_sync_pipe:
sem_post(&fs->sem);
if (fs->status)
@ -2118,7 +2222,7 @@ fork_err:
shell = NULL;
filename = (char *)w.sr.args[1];
if ((ret = lookup_exec_path(filename, path, sizeof(path)))
if ((ret = lookup_exec_path(filename, path, sizeof(path), 0))
!= 0) {
goto return_execve1;
}
@ -2132,7 +2236,7 @@ fork_err:
/* Check whether shell script */
if (shell) {
if ((ret = lookup_exec_path(shell, shell_path,
sizeof(shell_path))) != 0) {
sizeof(shell_path), 0)) != 0) {
fprintf(stderr, "execve(): error: finding file: %s\n", shell);
goto return_execve1;
}
@ -2153,6 +2257,7 @@ fork_err:
strcpy(desc->shell_path, shell_path);
}
desc->enable_vdso = enable_vdso;
__dprintf("execve(): load_elf_desc() for %s OK, num sections: %d\n",
path, desc->num_sections);
@ -2210,7 +2315,10 @@ return_execve1:
__dprintf("%s", "execve(): transfer ELF desc OK\n");
transfer_image(fd, desc);
if (transfer_image(fd, desc) != 0) {
fprintf(stderr, "error: transferring image\n");
return -1;
}
__dprintf("%s", "execve(): image transferred\n");
if (close_cloexec_fds(fd) < 0) {
@ -2262,6 +2370,53 @@ return_execve2:
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;
case __NR_setresuid:
ret = setresuid(w.sr.args[0], w.sr.args[1], w.sr.args[2]);
if(ret == -1)
ret = -errno;
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;
case __NR_setreuid:
ret = setreuid(w.sr.args[0], w.sr.args[1]);
if(ret == -1)
ret = -errno;
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;
case __NR_setuid:
ret = setuid(w.sr.args[0]);
if(ret == -1)
ret = -errno;
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;
case __NR_setresgid:
ret = setresgid(w.sr.args[0], w.sr.args[1], w.sr.args[2]);
if(ret == -1)
ret = -errno;
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;
case __NR_setregid:
ret = setregid(w.sr.args[0], w.sr.args[1]);
if(ret == -1)
ret = -errno;
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;
case __NR_setgid:
ret = setgid(w.sr.args[0]);
if(ret == -1)
ret = -errno;
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;
case __NR_setfsgid:
ret = setfsgid(w.sr.args[0]);
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;
case __NR_close:
if(w.sr.args[0] == fd)
ret = -EBADF;
@ -2295,7 +2450,9 @@ return_execve2:
break;
}
thread_data[cpu].remote_tid = -1;
//pthread_mutex_unlock(lock);
}
__dprint("timed out.\n");

View File

@ -110,6 +110,7 @@ int __kprintf(const char *format, ...)
char buf[KPRINTF_LOCAL_BUF_LEN];
/* Copy into the local buf */
len = sprintf(buf, "[%3d]: ", ihk_mc_get_processor_id());
va_start(va, format);
len += vsnprintf(buf + len, KPRINTF_LOCAL_BUF_LEN - len - 2, format, va);
va_end(va);

View File

@ -78,51 +78,52 @@ static struct memobj *to_memobj(struct devobj *devobj)
/***********************************************************************
* devobj
*/
int devobj_create(int fd, size_t len, off_t off, struct memobj **objp, int *maxprotp)
int devobj_create(int fd, size_t len, off_t off, struct memobj **objp, int *maxprotp,
int prot, int populate_flags)
{
ihk_mc_user_context_t ctx;
struct pager_map_result result; // XXX: assumes contiguous physical
int error;
struct devobj *obj = NULL;
const size_t npages = (len + PAGE_SIZE - 1) / PAGE_SIZE;
const size_t pfn_npages = (npages / (PAGE_SIZE / sizeof(uintptr_t))) + 1;
dkprintf("devobj_create(%d,%lx,%lx)\n", fd, len, off);
#define MAX_PAGES_IN_DEVOBJ (PAGE_SIZE / sizeof(uintptr_t))
if (npages > MAX_PAGES_IN_DEVOBJ) {
error = -EFBIG;
kprintf("devobj_create(%d,%lx,%lx):too large len. %d\n", fd, len, off, error);
goto out;
}
dkprintf("%s: fd: %d, len: %lu, off: %lu \n", __FUNCTION__, fd, len, off);
obj = kmalloc(sizeof(*obj), IHK_MC_AP_NOWAIT);
if (!obj) {
error = -ENOMEM;
kprintf("devobj_create(%d,%lx,%lx):kmalloc failed. %d\n", fd, len, off, error);
kprintf("%s: error: fd: %d, len: %lu, off: %lu kmalloc failed.\n",
__FUNCTION__, fd, len, off);
goto out;
}
memset(obj, 0, sizeof(*obj));
obj->pfn_table = allocate_pages(1, IHK_MC_AP_NOWAIT);
obj->pfn_table = ihk_mc_alloc_pages(pfn_npages, IHK_MC_AP_NOWAIT);
if (!obj->pfn_table) {
error = -ENOMEM;
kprintf("devobj_create(%d,%lx,%lx):allocate_pages failed. %d\n", fd, len, off, error);
kprintf("%s: error: fd: %d, len: %lu, off: %lu allocating PFN failed.\n",
__FUNCTION__, fd, len, off);
goto out;
}
memset(obj->pfn_table, 0, 1*PAGE_SIZE);
memset(obj->pfn_table, 0, pfn_npages * PAGE_SIZE);
ihk_mc_syscall_arg0(&ctx) = PAGER_REQ_MAP;
ihk_mc_syscall_arg1(&ctx) = fd;
ihk_mc_syscall_arg2(&ctx) = len;
ihk_mc_syscall_arg3(&ctx) = off;
ihk_mc_syscall_arg4(&ctx) = virt_to_phys(&result);
ihk_mc_syscall_arg5(&ctx) = prot | populate_flags;
error = syscall_generic_forwarding(__NR_mmap, &ctx);
if (error) {
kprintf("devobj_create(%d,%lx,%lx):map failed. %d\n", fd, len, off, error);
kprintf("%s: error: fd: %d, len: %lu, off: %lu map failed.\n",
__FUNCTION__, fd, len, off);
goto out;
}
dkprintf("devobj_create:handle: %lx\n", result.handle);
dkprintf("devobj_create:maxprot: %x\n", result.maxprot);
dkprintf("%s: fd: %d, len: %lu, off: %lu, handle: %p, maxprot: %x\n",
__FUNCTION__, fd, len, off, result.handle, result.maxprot);
obj->memobj.ops = &devobj_ops;
obj->memobj.flags = MF_HAS_PAGER;
@ -140,11 +141,12 @@ int devobj_create(int fd, size_t len, off_t off, struct memobj **objp, int *maxp
out:
if (obj) {
if (obj->pfn_table) {
free_pages(obj->pfn_table, 1);
ihk_mc_free_pages(obj->pfn_table, pfn_npages);
}
kfree(obj);
}
dkprintf("devobj_create(%d,%lx,%lx): %d %p %x%d\n", fd, len, off, error, *objp, *maxprotp);
dkprintf("%s: ret: %d, fd: %d, len: %lu, off: %lu, handle: %p, maxprot: %x \n",
__FUNCTION__, error, fd, len, off, result.handle, result.maxprot);
return error;
}
@ -164,6 +166,8 @@ static void devobj_release(struct memobj *memobj)
struct devobj *obj = to_devobj(memobj);
struct devobj *free_obj = NULL;
uintptr_t handle;
const size_t pfn_npages =
(obj->npages / (PAGE_SIZE / sizeof(uintptr_t))) + 1;
dkprintf("devobj_release(%p %lx)\n", obj, obj->handle);
@ -192,7 +196,7 @@ static void devobj_release(struct memobj *memobj)
}
if (obj->pfn_table) {
free_pages(obj->pfn_table, 1);
ihk_mc_free_pages(obj->pfn_table, pfn_npages);
}
kfree(free_obj);
}
@ -204,7 +208,7 @@ static void devobj_release(struct memobj *memobj)
static int devobj_get_page(struct memobj *memobj, off_t off, int p2align, uintptr_t *physp, unsigned long *flag)
{
const off_t pgoff = off >> PAGE_SHIFT;
const off_t pgoff = off / PAGE_SIZE;
struct devobj *obj = to_devobj(memobj);
int error;
uintptr_t pfn;
@ -216,7 +220,7 @@ static int devobj_get_page(struct memobj *memobj, off_t off, int p2align, uintpt
if ((pgoff < obj->pfn_pgoff) || ((obj->pfn_pgoff + obj->npages) <= pgoff)) {
error = -EFBIG;
kprintf("devobj_get_page(%p %lx,%lx,%d): out of range. %d\n", memobj, obj->handle, off, p2align, error);
kprintf("%s: error: out of range: off: %lu, page off: %lu obj->npages: %d\n", __FUNCTION__, off, pgoff, obj->npages);
goto out;
}
ix = pgoff - obj->pfn_pgoff;

View File

@ -182,7 +182,7 @@ int fileobj_create(int fd, struct memobj **objp, int *maxprotp)
error = syscall_generic_forwarding(__NR_mmap, &ctx);
if (error) {
kprintf("fileobj_create(%d):create failed. %d\n", fd, error);
dkprintf("fileobj_create(%d):create failed. %d\n", fd, error);
goto out;
}

View File

@ -79,8 +79,6 @@
#define dkprintf(...) do { if (0) kprintf(__VA_ARGS__); } while (0)
#endif
extern struct sigpending *hassigpending(struct thread *thread);
int futex_cmpxchg_enabled;
/**

View File

@ -332,6 +332,9 @@ int prepare_process_ranges_args_envs(struct thread *thread,
goto err;
}
}
else {
vm->vdso_addr = NULL;
}
p->rprocess = (unsigned long)thread;
p->rpgtable = virt_to_phys(as->page_table);
@ -373,10 +376,16 @@ static int process_msg_prepare_process(unsigned long rphys)
}
n = p->num_sections;
if (n > 16) {
kprintf("%s: ERROR: more ELF sections than 16??\n",
__FUNCTION__);
return -ENOMEM;
}
dkprintf("# of sections: %d\n", n);
if((pn = ihk_mc_allocate(sizeof(struct program_load_desc)
+ sizeof(struct program_image_section) * n, IHK_MC_AP_NOWAIT)) == NULL){
if((pn = kmalloc(sizeof(struct program_load_desc)
+ sizeof(struct program_image_section) * n,
IHK_MC_AP_NOWAIT)) == NULL){
ihk_mc_unmap_virtual(p, npages, 0);
ihk_mc_unmap_memory(NULL, phys, sz);
return -ENOMEM;
@ -385,7 +394,7 @@ static int process_msg_prepare_process(unsigned long rphys)
+ sizeof(struct program_image_section) * n);
if((thread = create_thread(p->entry)) == NULL){
ihk_mc_free(pn);
kfree(pn);
ihk_mc_unmap_virtual(p, npages, 1);
ihk_mc_unmap_memory(NULL, phys, sz);
return -ENOMEM;
@ -435,7 +444,7 @@ static int process_msg_prepare_process(unsigned long rphys)
dkprintf("new process : %p [%d] / table : %p\n", proc, proc->pid,
vm->address_space->page_table);
ihk_mc_free(pn);
kfree(pn);
ihk_mc_unmap_virtual(p, npages, 1);
ihk_mc_unmap_memory(NULL, phys, sz);
@ -443,7 +452,7 @@ static int process_msg_prepare_process(unsigned long rphys)
return 0;
err:
ihk_mc_free(pn);
kfree(pn);
ihk_mc_unmap_virtual(p, npages, 1);
ihk_mc_unmap_memory(NULL, phys, sz);
destroy_thread(thread);
@ -452,7 +461,7 @@ err:
static void process_msg_init(struct ikc_scd_init_param *pcp, struct syscall_params *lparam)
{
lparam->response_va = allocate_pages(RESPONSE_PAGE_COUNT, 0);
lparam->response_va = ihk_mc_alloc_pages(RESPONSE_PAGE_COUNT, 0);
lparam->response_pa = virt_to_phys(lparam->response_va);
pcp->request_page = 0;
@ -521,12 +530,7 @@ static void syscall_channel_send(struct ihk_ikc_channel_desc *c,
}
extern unsigned long do_kill(struct thread *, int, int, int, struct siginfo *, int ptracecont);
extern void settid(struct thread *proc, int mode, int newcpuid, int oldcpuid);
extern void process_procfs_request(unsigned long rarg);
extern int memcheckall();
extern int freecheck(int runcount);
extern int runcount;
extern void terminate_host(int pid);
extern void debug_log(long);
@ -561,6 +565,7 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
struct ikc_scd_packet *packet = __packet;
struct ikc_scd_packet pckt;
int rc;
struct mcs_rwlock_node_irqsave lock;
struct thread *thread;
struct process *proc;
struct mcctrl_signal {
@ -572,22 +577,17 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
} *sp, info;
unsigned long pp;
int cpuid;
int ret = 0;
switch (packet->msg) {
case SCD_MSG_INIT_CHANNEL_ACKED:
dkprintf("SCD_MSG_INIT_CHANNEL_ACKED\n");
process_msg_init_acked(c, packet->arg);
return 0;
ret = 0;
break;
case SCD_MSG_PREPARE_PROCESS:
if (find_command_line("memdebug")) {
memcheckall();
if (runcount)
freecheck(runcount);
runcount++;
}
if((rc = process_msg_prepare_process(packet->arg)) == 0){
pckt.msg = SCD_MSG_PREPARE_PROCESS_ACKED;
pckt.err = 0;
@ -600,19 +600,21 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
pckt.arg = packet->arg;
syscall_channel_send(c, &pckt);
return 0;
ret = 0;
break;
case SCD_MSG_SCHEDULE_PROCESS:
cpuid = obtain_clone_cpuid();
if(cpuid == -1){
kprintf("No CPU available\n");
return -1;
ret = -1;
break;
}
dkprintf("SCD_MSG_SCHEDULE_PROCESS: %lx\n", packet->arg);
thread = (struct thread *)packet->arg;
proc = thread->proc;
settid(thread, 0, cpuid, -1);
settid(thread, 0, cpuid, -1, 0, NULL);
proc->status = PS_RUNNING;
thread->status = PS_RUNNING;
chain_thread(thread);
@ -620,7 +622,29 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
runq_add_thread(thread, cpuid);
//cpu_local_var(next) = (struct thread *)packet->arg;
return 0;
ret = 0;
break;
/*
* Used for syscall offload reply message to explicitly schedule in
* the waiting thread
*/
case SCD_MSG_WAKE_UP_SYSCALL_THREAD:
thread = find_thread(0, packet->ttid, &lock);
if (!thread) {
kprintf("%s: WARNING: no thread for SCD reply? TID: %d\n",
__FUNCTION__, packet->ttid);
ret = -EINVAL;
break;
}
thread_unlock(thread, &lock);
dkprintf("%s: SCD_MSG_WAKE_UP_SYSCALL_THREAD: waking up tid %d\n",
__FUNCTION__, packet->ttid);
waitq_wakeup(&thread->scd_wq);
ret = 0;
break;
case SCD_MSG_SEND_SIGNAL:
pp = ihk_mc_map_memory(NULL, packet->arg, sizeof(struct mcctrl_signal));
sp = (struct mcctrl_signal *)ihk_mc_map_virtual(pp, 1, PTATTR_WRITABLE | PTATTR_ACTIVE);
@ -635,18 +659,25 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
rc = do_kill(NULL, info.pid, info.tid, info.sig, &info.info, 0);
kprintf("SCD_MSG_SEND_SIGNAL: do_kill(pid=%d, tid=%d, sig=%d)=%d\n", info.pid, info.tid, info.sig, rc);
return 0;
ret = 0;
break;
case SCD_MSG_PROCFS_REQUEST:
process_procfs_request(packet->arg);
return 0;
ret = 0;
break;
case SCD_MSG_CLEANUP_PROCESS:
dkprintf("SCD_MSG_CLEANUP_PROCESS pid=%d\n", packet->pid);
terminate_host(packet->pid);
return 0;
ret = 0;
break;
case SCD_MSG_DEBUG_LOG:
dkprintf("SCD_MSG_DEBUG_LOG code=%lx\n", packet->arg);
debug_log(packet->arg);
return 0;
ret = 0;
break;
case SCD_MSG_SYSFS_REQ_SHOW:
case SCD_MSG_SYSFS_REQ_STORE:
@ -654,7 +685,8 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
sysfss_packet_handler(c, packet->msg, packet->err,
packet->sysfs_arg1, packet->sysfs_arg2,
packet->sysfs_arg3);
return 0;
ret = 0;
break;
case SCD_MSG_GET_CPU_MAPPING:
req_get_cpu_mapping(packet->arg);
@ -662,17 +694,21 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
pckt.msg = SCD_MSG_REPLY_GET_CPU_MAPPING;
pckt.arg = packet->arg;
syscall_channel_send(c, &pckt);
return 0;
ret = 0;
break;
default:
kprintf("syscall_pakcet_handler:unknown message "
"(%d.%d.%d.%d.%d.%#lx)\n",
packet->msg, packet->ref, packet->osnum,
packet->pid, packet->err, packet->arg);
return 0;
ret = 0;
break;
}
return 0;
ihk_ikc_release_packet((struct ihk_ikc_free_packet *)packet, c);
return ret;
}
void init_host_syscall_channel(void)

View File

@ -19,11 +19,13 @@
* CPU Local Storage (cls)
*/
struct malloc_header {
unsigned int check;
struct kmalloc_header {
unsigned int front_magic;
unsigned int cpu_id;
struct malloc_header *next;
unsigned long size;
struct list_head list;
int size; /* The size of this chunk without the header */
unsigned int end_magic;
/* 32 bytes */
};
#include <ihk/lock.h>
@ -38,8 +40,9 @@ extern ihk_spinlock_t cpu_status_lock;
struct cpu_local_var {
/* malloc */
struct malloc_header free_list;
struct malloc_header *remote_free_list;
struct list_head free_list;
struct list_head remote_free_list;
ihk_spinlock_t remote_free_list_lock;
struct thread idle;
struct process idle_proc;
@ -73,6 +76,7 @@ struct cpu_local_var {
int in_interrupt;
int no_preempt;
int timer_enabled;
int kmalloc_initialized;
} __attribute__((aligned(64)));

View File

@ -32,11 +32,10 @@ void *_kmalloc(int size, enum ihk_mc_ap_flag flag, char *file, int line);
void _kfree(void *ptr, char *file, int line);
void *__kmalloc(int size, enum ihk_mc_ap_flag flag);
void __kfree(void *ptr);
void *___kmalloc(int size, enum ihk_mc_ap_flag flag);
void ___kfree(void *ptr);
int _memcheck(void *ptr, char *msg, char *file, int line, int free);
int memcheckall();
int freecheck(int runcount);
void kmalloc_consolidate_free_list(void);
#endif

View File

@ -141,6 +141,7 @@ int fileobj_create(int fd, struct memobj **objp, int *maxprotp);
struct shmid_ds;
int shmobj_create(struct shmid_ds *ds, struct memobj **objp);
int zeroobj_create(struct memobj **objp);
int devobj_create(int fd, size_t len, off_t off, struct memobj **objp, int *maxprotp);
int devobj_create(int fd, size_t len, off_t off, struct memobj **objp, int *maxprotp,
int prot, int populate_flags);
#endif /* HEADER_MEMOBJ_H */

View File

@ -29,6 +29,7 @@
#define VR_IO_NOCACHE 0x100
#define VR_REMOTE 0x200
#define VR_WRITE_COMBINED 0x400
#define VR_DONTFORK 0x800
#define VR_DEMAND_PAGING 0x1000
#define VR_PRIVATE 0x2000
#define VR_LOCKED 0x4000
@ -160,7 +161,7 @@
#endif
#define USER_STACK_NR_PAGES 8192
#define KERNEL_STACK_NR_PAGES 25
#define KERNEL_STACK_NR_PAGES 32
#define NOPHYS ((uintptr_t)-1)
@ -319,12 +320,14 @@ struct process_vm;
/*
 * Node of a singly linked list (via `next`) describing one file descriptor
 * together with a set of per-operation callbacks. Every callback receives
 * the node itself and a user context.
 * NOTE(review): presumably these are descriptors handled inside McKernel
 * rather than by the proxy process — confirm against the users of this struct.
 */
struct mckfd {
struct mckfd *next;
int fd;
/* NOTE(review): presumably a signal number associated with this fd — confirm */
int sig_no;
/* Callback-private values; their meaning is not visible here */
long data;
void *opt;
/* Per-operation hooks, named after read/ioctl/mmap/close/fcntl */
long (*read_cb)(struct mckfd *, ihk_mc_user_context_t *);
int (*ioctl_cb)(struct mckfd *, ihk_mc_user_context_t *);
long (*mmap_cb)(struct mckfd *, ihk_mc_user_context_t *);
int (*close_cb)(struct mckfd *, ihk_mc_user_context_t *);
int (*fcntl_cb)(struct mckfd *, ihk_mc_user_context_t *);
};
#define SFD_CLOEXEC 02000000
@ -346,6 +349,11 @@ struct sig_pending {
typedef void pgio_func_t(void *arg);
/*
 * One element of the per-process array `struct process::tids`: a TID paired
 * with a thread pointer.
 * NOTE(review): presumably `thread` is the McKernel thread currently bound to
 * this proxy-process TID — confirm against settid().
 */
struct mcexec_tid {
int tid;
struct thread *thread;
};
/* Represents a node in the process fork tree, it may exist even after the
* corresponding process exited due to references from the parent and/or
* children and is used for implementing wait/waitpid without having a
@ -360,6 +368,9 @@ struct process {
// threads and children
struct list_head threads_list;
mcs_rwlock_lock_t threads_lock; // lock for threads_list
/* TID set of proxy process */
struct mcexec_tid *tids;
int nr_tids;
/* The ptracing process behave as the parent of the ptraced process
after using PTRACE_ATTACH except getppid. So we save it here. */
@ -556,6 +567,9 @@ struct thread {
struct itimerval itimer_prof;
struct timespec itimer_virtual_value;
struct timespec itimer_prof_value;
/* Syscall offload wait queue head */
struct waitq scd_wq;
};
struct process_vm {
@ -675,5 +689,8 @@ void chain_process(struct process *);
void chain_thread(struct thread *);
void proc_init();
void set_timer();
struct sig_pending *hassigpending(struct thread *thread);
void settid(struct thread *thread, int mode, int newcpuid, int oldcpuid,
int nr_tids, int *tids);
#endif

View File

@ -31,6 +31,7 @@
#define SCD_MSG_PREPARE_PROCESS_ACKED 0x2
#define SCD_MSG_PREPARE_PROCESS_NACKED 0x7
#define SCD_MSG_SCHEDULE_PROCESS 0x3
#define SCD_MSG_WAKE_UP_SYSCALL_THREAD 0x14
#define SCD_MSG_INIT_CHANNEL 0x5
#define SCD_MSG_INIT_CHANNEL_ACKED 0x6
@ -117,28 +118,6 @@ struct user_desc {
unsigned int lm:1;
};
struct ikc_scd_packet {
int msg;
int err;
union {
/* for traditional SCD_MSG_* */
struct {
int ref;
int osnum;
int pid;
int padding;
unsigned long arg;
};
/* for SCD_MSG_SYSFS_* */
struct {
long sysfs_arg1;
long sysfs_arg2;
long sysfs_arg3;
};
};
};
struct program_image_section {
unsigned long vaddr;
unsigned long len;
@ -210,13 +189,58 @@ struct ikc_scd_init_param {
};
/*
 * A single offloaded system call request: the syscall number, up to six
 * arguments, and the TIDs of the requesting and target threads.
 */
struct syscall_request {
/* TID of requesting thread */
int rtid;
/*
* TID of target thread. Remote page fault response needs to designate the
* thread that must serve the request, 0 indicates any thread from the pool
*/
int ttid;
/* NOTE(review): presumably a flag marking the request slot as populated — confirm */
unsigned long valid;
/* System call number (compared against __NR_* values by the handler) */
unsigned long number;
/* System call arguments */
unsigned long args[6];
};
/*
 * Packet passed to the SCD message handler. `msg` holds one of the SCD_MSG_*
 * values and selects which member of the anonymous union is meaningful.
 */
struct ikc_scd_packet {
/* SCD_MSG_* message type */
int msg;
/* Error/status value carried with the message (0 on the ACK paths) */
int err;
union {
/* for traditional SCD_MSG_* */
struct {
int ref;
int osnum;
int pid;
unsigned long arg;
/* Embedded system call request */
struct syscall_request req;
/* NOTE(review): presumably the physical address of the response area — confirm */
unsigned long resp_pa;
};
/* for SCD_MSG_SYSFS_* */
struct {
long sysfs_arg1;
long sysfs_arg2;
long sysfs_arg3;
};
/* for SCD_MSG_WAKE_UP_SYSCALL_THREAD: TID of the thread to wake up */
struct {
int ttid;
};
};
/* NOTE(review): purpose of this trailing padding is not visible here — confirm */
char padding[12];
};
#define IHK_SCD_REQ_THREAD_SPINNING 0
#define IHK_SCD_REQ_THREAD_TO_BE_WOKEN 1
#define IHK_SCD_REQ_THREAD_DESCHEDULED 2
struct syscall_response {
/* TID of the thread that requested the service */
int ttid;
/* TID of the mcexec thread that is serving the request */
int stid;
unsigned long status;
unsigned long req_thread_status;
long ret;
unsigned long fault_address;
unsigned long fault_reason;

View File

@ -371,7 +371,7 @@ int main(void)
}
kmsg_init(mode);
kputs("MCK started.\n");
kputs("IHK/McKernel started.\n");
arch_init();
@ -393,7 +393,7 @@ int main(void)
futex_init();
kputs("MCK/IHK booted.\n");
kputs("IHK/McKernel booted.\n");
#ifdef DCFA_KMOD
mc_cmd_client_init();

View File

@ -156,13 +156,17 @@ void sbox_write(int offset, unsigned int value);
static void query_free_mem_interrupt_handler(void *priv)
{
#ifdef ATTACHED_MIC
dkprintf("query free mem handler!\n");
int pages = ihk_pagealloc_query_free(pa_allocator);
dkprintf("free pages: %d\n", pages);
kprintf("McKernel free pages: %d\n", pages);
if (find_command_line("memdebug")) {
extern void kmalloc_memcheck(void);
kmalloc_memcheck();
}
#ifdef ATTACHED_MIC
sbox_write(SBOX_SCRATCH0, pages);
sbox_write(SBOX_SCRATCH1, 1);
#endif
@ -265,6 +269,13 @@ void remote_flush_tlb_cpumask(struct process_vm *vm,
unsigned long tsc;
tsc = rdtsc() + 12884901888; /* 1.2GHz =>10 sec */
#endif
if (flush_entry->addr) {
flush_tlb_single(flush_entry->addr & PAGE_MASK);
}
/* Zero address denotes full TLB flush */
else {
flush_tlb();
}
/* Wait for all cores */
while (ihk_atomic_read(&flush_entry->pending) != 0) {
@ -335,10 +346,9 @@ static void page_fault_handler(void *fault_addr, uint64_t reason, void *regs)
// no return
}
kprintf("[%d]page_fault_handler(%p,%lx,%p):"
"fault vm failed. %d, TID: %d\n",
ihk_mc_get_processor_id(), fault_addr,
reason, regs, error, thread->tid);
kprintf("%s fault VM failed for TID: %d, addr: 0x%lx, "
"reason: %d, error: %d\n", __FUNCTION__,
thread->tid, fault_addr, reason, error);
unhandled_page_fault(thread, fault_addr, regs);
preempt_enable();
memset(&info, '\0', sizeof info);
@ -425,8 +435,9 @@ static void page_allocator_init(void)
ihk_mc_reserve_arch_pages(pa_start, pa_end, reserve_pages);
kprintf("Available pages: %ld pages\n",
ihk_pagealloc_count(pa_allocator));
kprintf("Available memory: %ld bytes in %ld pages\n",
(ihk_pagealloc_count(pa_allocator) * PAGE_SIZE),
ihk_pagealloc_count(pa_allocator));
/* Notify the ihk to use my page allocator */
ihk_mc_set_page_allocator(&allocator);
@ -507,6 +518,9 @@ static void page_init(void)
static char *memdebug = NULL;
static void *___kmalloc(int size, enum ihk_mc_ap_flag flag);
static void ___kfree(void *ptr);
void register_kmalloc(void)
{
if(memdebug){
@ -636,60 +650,100 @@ void mem_init(void)
}
}
struct location {
struct location *next;
int line;
int cnt;
char file[0];
};
#define KMALLOC_TRACK_HASH_SHIFT (8)
#define KMALLOC_TRACK_HASH_SIZE (1 << KMALLOC_TRACK_HASH_SHIFT)
#define KMALLOC_TRACK_HASH_MASK (KMALLOC_TRACK_HASH_SIZE - 1)
struct alloc {
struct alloc *next;
struct malloc_header *p;
struct location *loc;
int size;
struct list_head kmalloc_track_hash[KMALLOC_TRACK_HASH_SIZE];
ihk_spinlock_t kmalloc_track_hash_locks[KMALLOC_TRACK_HASH_SIZE];
struct list_head kmalloc_addr_hash[KMALLOC_TRACK_HASH_SIZE];
ihk_spinlock_t kmalloc_addr_hash_locks[KMALLOC_TRACK_HASH_SIZE];
int kmalloc_track_initialized = 0;
int kmalloc_runcount = 0;
/*
 * Tracking record for one address. It is linked both on the address list of
 * its owning kmalloc_track_entry and on an address hash chain.
 * NOTE(review): presumably `addr` is the pointer handed out by the allocator
 * and `runcount` the value of kmalloc_runcount at allocation time — confirm.
 */
struct kmalloc_track_addr_entry {
void *addr;
int runcount;
struct list_head list; /* track_entry's list */
/* Owning tracking entry */
struct kmalloc_track_entry *entry;
struct list_head hash; /* address hash */
};
#define HASHNUM 129
/*
 * Tracking entry keyed by (file, line, size); lookups in
 * __kmalloc_track_find_entry() match on exactly these three fields.
 */
struct kmalloc_track_entry {
/* Separately allocated copy of the source file name */
char *file;
int line;
int size;
/* Incremented by _kmalloc() for each allocation matching this entry */
ihk_atomic_t alloc_count;
/* Link in the kmalloc_track_hash[] bucket */
struct list_head hash;
/* NOTE(review): presumably the list of kmalloc_track_addr_entry records — confirm */
struct list_head addr_list;
ihk_spinlock_t addr_list_lock;
};
static struct alloc *allochash[HASHNUM];
static struct location *lochash[HASHNUM];
static ihk_spinlock_t alloclock;
int runcount;
static unsigned char *page;
static int space;
static void *dalloc(unsigned long size)
void kmalloc_init(void)
{
void *r;
static int pos = 0;
unsigned long irqstate;
struct cpu_local_var *v = get_this_cpu_local_var();
irqstate = ihk_mc_spinlock_lock(&alloclock);
size = (size + 7) & 0xfffffffffffffff8L;
if (pos + size > space) {
page = allocate_pages(1, IHK_MC_AP_NOWAIT);
space = 4096;
pos = 0;
register_kmalloc();
INIT_LIST_HEAD(&v->free_list);
INIT_LIST_HEAD(&v->remote_free_list);
ihk_mc_spinlock_init(&v->remote_free_list_lock);
v->kmalloc_initialized = 1;
if (!kmalloc_track_initialized) {
int i;
memdebug = find_command_line("memdebug");
kmalloc_track_initialized = 1;
for (i = 0; i < KMALLOC_TRACK_HASH_SIZE; ++i) {
ihk_mc_spinlock_init(&kmalloc_track_hash_locks[i]);
INIT_LIST_HEAD(&kmalloc_track_hash[i]);
ihk_mc_spinlock_init(&kmalloc_addr_hash_locks[i]);
INIT_LIST_HEAD(&kmalloc_addr_hash[i]);
}
}
r = page + pos;
pos += size;
ihk_mc_spinlock_unlock(&alloclock, irqstate);
return r;
}
/* NOTE: Hash lock must be held */
struct kmalloc_track_entry *__kmalloc_track_find_entry(
int size, char *file, int line)
{
struct kmalloc_track_entry *entry_iter, *entry = NULL;
int hash = (strlen(file) + line + size) & KMALLOC_TRACK_HASH_MASK;
list_for_each_entry(entry_iter, &kmalloc_track_hash[hash], hash) {
if (!strcmp(entry_iter->file, file) &&
entry_iter->size == size &&
entry_iter->line == line) {
entry = entry_iter;
break;
}
}
if (entry) {
dkprintf("%s found entry %s:%d size: %d\n", __FUNCTION__,
file, line, size);
}
else {
dkprintf("%s couldn't find entry %s:%d size: %d\n", __FUNCTION__,
file, line, size);
}
return entry;
}
/* Top level routines called from macro */
void *_kmalloc(int size, enum ihk_mc_ap_flag flag, char *file, int line)
{
char *r = ___kmalloc(size, flag);
struct malloc_header *h;
unsigned long hash;
char *t;
struct location *lp;
struct alloc *ap;
unsigned long alcsize;
unsigned long chksize;
unsigned long irqflags;
struct kmalloc_track_entry *entry;
struct kmalloc_track_addr_entry *addr_entry;
int hash, addr_hash;
void *r = ___kmalloc(size, flag);
if (!memdebug)
return r;
@ -697,177 +751,177 @@ void *_kmalloc(int size, enum ihk_mc_ap_flag flag, char *file, int line)
if (!r)
return r;
h = ((struct malloc_header *)r) - 1;
alcsize = h->size * sizeof(struct malloc_header);
chksize = alcsize - size;
memset(r + size, '\x5a', chksize);
hash = (strlen(file) + line + size) & KMALLOC_TRACK_HASH_MASK;
irqflags = ihk_mc_spinlock_lock(&kmalloc_track_hash_locks[hash]);
for (hash = 0, t = file; *t; t++) {
hash <<= 1;
hash += *t;
entry = __kmalloc_track_find_entry(size, file, line);
if (!entry) {
entry = ___kmalloc(sizeof(*entry), IHK_MC_AP_NOWAIT);
if (!entry) {
kprintf("%s: ERROR: allocating tracking entry\n");
goto out;
}
entry->line = line;
entry->size = size;
ihk_atomic_set(&entry->alloc_count, 0);
ihk_mc_spinlock_init(&entry->addr_list_lock);
INIT_LIST_HEAD(&entry->addr_list);
entry->file = ___kmalloc(strlen(file) + 1, IHK_MC_AP_NOWAIT);
if (!entry->file) {
kprintf("%s: ERROR: allocating file string\n");
___kfree(entry);
ihk_mc_spinlock_unlock(&kmalloc_track_hash_locks[hash], irqflags);
goto out;
}
strcpy(entry->file, file);
entry->file[strlen(file)] = 0;
list_add(&entry->hash, &kmalloc_track_hash[hash]);
dkprintf("%s entry %s:%d size: %d added\n", __FUNCTION__,
file, line, size);
}
hash += line;
hash %= HASHNUM;
for (lp = lochash[hash]; lp; lp = lp->next)
if (lp->line == line &&
!strcmp(lp->file, file))
break;
if (!lp) {
lp = dalloc(sizeof(struct location) + strlen(file) + 1);
memset(lp, '\0', sizeof(struct location));
lp->line = line;
strcpy(lp->file, file);
do {
lp->next = lochash[hash];
} while (!compare_and_swap(lochash + hash, (unsigned long)lp->next, (unsigned long)lp));
ihk_mc_spinlock_unlock(&kmalloc_track_hash_locks[hash], irqflags);
ihk_atomic_inc(&entry->alloc_count);
/* Add new addr entry for this allocation entry */
addr_entry = ___kmalloc(sizeof(*addr_entry), IHK_MC_AP_NOWAIT);
if (!addr_entry) {
kprintf("%s: ERROR: allocating addr entry\n");
goto out;
}
hash = (unsigned long)h % HASHNUM;
do {
for (ap = allochash[hash]; ap; ap = ap->next)
if (!ap->p)
break;
} while (ap && !compare_and_swap(&ap->p, 0UL, (unsigned long)h));
if (!ap) {
ap = dalloc(sizeof(struct alloc));
memset(ap, '\0', sizeof(struct alloc));
ap->p = h;
do {
ap->next = allochash[hash];
} while (!compare_and_swap(allochash + hash, (unsigned long)ap->next, (unsigned long)ap));
}
addr_entry->addr = r;
addr_entry->runcount = kmalloc_runcount;
addr_entry->entry = entry;
ap->loc = lp;
ap->size = size;
ap->runcount = runcount;
irqflags = ihk_mc_spinlock_lock(&entry->addr_list_lock);
list_add(&addr_entry->list, &entry->addr_list);
ihk_mc_spinlock_unlock(&entry->addr_list_lock, irqflags);
return r;
}
/* Add addr entry to address hash */
addr_hash = ((unsigned long)r >> 5) & KMALLOC_TRACK_HASH_MASK;
irqflags = ihk_mc_spinlock_lock(&kmalloc_addr_hash_locks[addr_hash]);
list_add(&addr_entry->hash, &kmalloc_addr_hash[addr_hash]);
ihk_mc_spinlock_unlock(&kmalloc_addr_hash_locks[addr_hash], irqflags);
/*
 * Validate the debug bookkeeping around the allocation at ptr.
 *
 * ptr:   payload pointer; the struct malloc_header directly in front of it
 *        is inspected
 * msg:   prefix used in every diagnostic line
 * file:  caller's source file for diagnostics, may be NULL
 * line:  caller's source line for diagnostics
 * flags: when bit 0 is set, the matching allochash slot is cleared after a
 *        clean check
 *
 * Returns 1 when ptr is not registered in allochash or when the padding
 * after the payload / the check word of the following header no longer
 * carries the 0x5a pattern; returns 0 otherwise.
 */
int _memcheck(void *ptr, char *msg, char *file, int line, int flags)
{
	struct malloc_header *h = ((struct malloc_header *)ptr) - 1;
	struct malloc_header *next;
	unsigned long hash = (unsigned long)h % HASHNUM;
	struct alloc *ap;
	static unsigned long check = 0x5a5a5a5a5a5a5a5aUL;
	unsigned long alcsize;
	unsigned long chksize;

	if (h->check != 0x5a5a5a5a) {
		int i;
		unsigned long max = 0;
		unsigned long cur = (unsigned long)h;
		struct alloc *maxap = NULL;

		/* Find the tracked allocation with the highest address below h */
		for (i = 0; i < HASHNUM; i++) {
			for (ap = allochash[i]; ap; ap = ap->next) {
				if ((unsigned long)ap->p < cur &&
				    (unsigned long)ap->p > max) {
					max = (unsigned long)ap->p;
					maxap = ap;
				}
			}
		}

		/* maxap stays NULL when nothing tracked lies below h; do not
		 * dereference it in that case */
		if (maxap) {
			kprintf("%s: detect buffer overrun, alc=%s:%d size=%ld h=%p, s=%ld\n", msg, maxap->loc->file, maxap->loc->line, maxap->size, maxap->p, maxap->p->size);
		}
		else {
			kprintf("%s: detect buffer overrun, no preceding allocation found h=%p\n", msg, h);
		}
		kprintf("broken header: h=%p next=%p size=%ld cpu_id=%d\n", h, h->next, h->size, h->cpu_id);
	}

	/* Locate the bookkeeping record of this allocation */
	for (ap = allochash[hash]; ap; ap = ap->next) {
		if (ap->p == h)
			break;
	}

	if (!ap) {
		if (file)
			kprintf("%s: address not found, %s:%d p=%p\n", msg, file, line, ptr);
		else
			kprintf("%s: address not found p=%p\n", msg, ptr);
		return 1;
	}

	/* At most 8 bytes of the padding behind the requested size are compared */
	alcsize = h->size * sizeof(struct malloc_header);
	chksize = alcsize - ap->size;
	if (chksize > 8)
		chksize = 8;

	next = (struct malloc_header *)((char *)ptr + alcsize);
	if (next->check != 0x5a5a5a5a ||
	    memcmp((char *)ptr + ap->size, &check, chksize)) {
		unsigned long buf = 0x5a5a5a5a5a5a5a5aUL;
		unsigned char *p;
		unsigned char *q;

		/* Copy the padding into buf so 8 bytes can always be printed */
		memcpy(&buf, (char *)ptr + ap->size, chksize);
		p = (unsigned char *)&(next->check);
		q = (unsigned char *)&buf;

		if (file)
			kprintf("%s: broken, %s:%d alc=%s:%d %02x%02x%02x%02x%02x%02x%02x%02x %02x%02x%02x%02x size=%ld\n", msg, file, line, ap->loc->file, ap->loc->line, q[0], q[1], q[2], q[3], q[4], q[5], q[6], q[7], p[0], p[1], p[2], p[3], ap->size);
		else
			kprintf("%s: broken, alc=%s:%d %02x%02x%02x%02x%02x%02x%02x%02x %02x%02x%02x%02x size=%ld\n", msg, ap->loc->file, ap->loc->line, q[0], q[1], q[2], q[3], q[4], q[5], q[6], q[7], p[0], p[1], p[2], p[3], ap->size);

		if (next->check != 0x5a5a5a5a)
			kprintf("next->HEADER: next=%p size=%ld cpu_id=%d\n", next->next, next->size, next->cpu_id);

		return 1;
	}

	/* Bit 0 of flags: release the bookkeeping slot */
	if (flags & 1) {
		ap->p = NULL;
		ap->loc = NULL;
		ap->size = 0;
	}

	return 0;
}
/*
 * Run _memcheck() (with flags 2 and the "memcheck" prefix) on every
 * allochash record whose pointer is set.
 * Returns the bitwise OR of all individual _memcheck() results.
 */
int memcheckall()
{
	int bucket = 0;
	int result = 0;
	struct alloc *node;

	while (bucket < HASHNUM) {
		node = allochash[bucket];
		while (node) {
			if (node->p) {
				/* node->p is the header; the payload follows it */
				result |= _memcheck(node->p + 1, "memcheck", NULL, 0, 2);
			}
			node = node->next;
		}
		bucket++;
	}

	return result;
}
int freecheck(int runcount)
{
int i;
struct alloc *ap;
struct location *lp;
int r = 0;
for (i = 0; i < HASHNUM; i++)
for (lp = lochash[i]; lp; lp = lp->next)
lp->cnt = 0;
for (i = 0; i < HASHNUM; i++)
for (ap = allochash[i]; ap; ap = ap->next)
if (ap->p && ap->runcount == runcount) {
ap->loc->cnt++;
r++;
}
if (r) {
kprintf("memory leak?\n");
for (i = 0; i < HASHNUM; i++)
for (lp = lochash[i]; lp; lp = lp->next)
if (lp->cnt)
kprintf(" alc=%s:%d cnt=%d\n", lp->file, lp->line, lp->cnt);
}
dkprintf("%s addr_entry %p added\n", __FUNCTION__, r);
out:
return r;
}
/*
 * Free ptr through ___kfree(); file and line identify the caller.
 *
 * When memdebug is not set the function goes straight to ___kfree(ptr).
 * Otherwise the address record for ptr is removed from kmalloc_addr_hash
 * and from its tracking entry's address list first, and the tracking entry
 * itself is released when ihk_atomic_dec_and_test() on its alloc_count
 * returns non-zero. A ptr without an address record leads to panic().
 */
void _kfree(void *ptr, char *file, int line)
{
/* NOTE(review): this _memcheck() call sits alongside the address-hash
 * tracking below -- confirm that both checks are meant to be active. */
if (memdebug)
_memcheck(ptr, "KFREE", file, line, 1);
unsigned long irqflags;
struct kmalloc_track_entry *entry;
struct kmalloc_track_addr_entry *addr_entry_iter, *addr_entry = NULL;
int hash;
/* No tracking data to maintain without memdebug */
if (!memdebug) {
goto out;
}
/* Address hash bucket: pointer value shifted right by 5, masked */
hash = ((unsigned long)ptr >> 5) & KMALLOC_TRACK_HASH_MASK;
irqflags = ihk_mc_spinlock_lock(&kmalloc_addr_hash_locks[hash]);
list_for_each_entry(addr_entry_iter,
&kmalloc_addr_hash[hash], hash) {
if (addr_entry_iter->addr == ptr) {
addr_entry = addr_entry_iter;
break;
}
}
/* Unlink the record while the bucket lock is still held */
if (addr_entry) {
list_del(&addr_entry->hash);
}
ihk_mc_spinlock_unlock(&kmalloc_addr_hash_locks[hash], irqflags);
/* ptr has no address record in its bucket */
if (!addr_entry) {
kprintf("%s: ERROR: kfree()ing invalid pointer\n", __FUNCTION__);
panic("panic");
}
/* Detach the record from its tracking entry's address list */
entry = addr_entry->entry;
irqflags = ihk_mc_spinlock_lock(&entry->addr_list_lock);
list_del(&addr_entry->list);
ihk_mc_spinlock_unlock(&entry->addr_list_lock, irqflags);
dkprintf("%s addr_entry %p removed\n", __FUNCTION__, addr_entry->addr);
___kfree(addr_entry);
/* Do we need to remove tracking entry as well? */
/* presumably dec_and_test is true only once alloc_count reaches zero --
 * TODO confirm against the ihk_atomic implementation */
if (!ihk_atomic_dec_and_test(&entry->alloc_count)) {
goto out;
}
/* Tracking hash bucket is derived from strlen(file) + line + size */
hash = (strlen(entry->file) + entry->line + entry->size) &
KMALLOC_TRACK_HASH_MASK;
irqflags = ihk_mc_spinlock_lock(&kmalloc_track_hash_locks[hash]);
list_del(&entry->hash);
ihk_mc_spinlock_unlock(&kmalloc_track_hash_locks[hash], irqflags);
dkprintf("%s entry %s:%d size: %d removed\n", __FUNCTION__,
entry->file, entry->line, entry->size);
/* The file string is a separate allocation owned by the entry */
___kfree(entry->file);
___kfree(entry);
out:
/* The actual memory is released on every non-panicking path */
___kfree(ptr);
}
/*
 * Walk every tracking entry in kmalloc_track_hash and, for each entry,
 * count the address records whose runcount equals the current
 * kmalloc_runcount. Entries with a non-zero count are reported through
 * kprintf() as "memory leak". kmalloc_runcount is incremented once the
 * whole table has been walked.
 */
void kmalloc_memcheck(void)
{
	int bucket;
	unsigned long flags;
	struct kmalloc_track_entry *te = NULL;

	for (bucket = 0; bucket < KMALLOC_TRACK_HASH_SIZE; ++bucket) {
		flags = ihk_mc_spinlock_lock(&kmalloc_track_hash_locks[bucket]);

		list_for_each_entry(te, &kmalloc_track_hash[bucket], hash) {
			struct kmalloc_track_addr_entry *ae = NULL;
			int leaked = 0;

			/* The bucket lock above was taken with the irq-saving
			 * variant; the per-entry list uses the _noirq variant */
			ihk_mc_spinlock_lock_noirq(&te->addr_list_lock);
			list_for_each_entry(ae, &te->addr_list, list) {
				dkprintf("%s memory leak: %p @ %s:%d size: %d runcount: %d\n",
						__FUNCTION__,
						ae->addr,
						te->file,
						te->line,
						te->size,
						ae->runcount);

				/* Only records of the current run are counted */
				if (ae->runcount == kmalloc_runcount) {
					leaked++;
				}
			}
			ihk_mc_spinlock_unlock_noirq(&te->addr_list_lock);

			if (leaked) {
				kprintf("%s memory leak: %s:%d size: %d cnt: %d, runcount: %d\n",
						__FUNCTION__,
						te->file,
						te->line,
						te->size,
						leaked,
						kmalloc_runcount);
			}
		}

		ihk_mc_spinlock_unlock(&kmalloc_track_hash_locks[bucket], flags);
	}

	++kmalloc_runcount;
}
/* Redirection routines registered in alloc structure */
void *__kmalloc(int size, enum ihk_mc_ap_flag flag)
{
return kmalloc(size, flag);
@ -878,160 +932,199 @@ void __kfree(void *ptr)
kfree(ptr);
}
void kmalloc_init(void)
static void ___kmalloc_insert_chunk(struct list_head *free_list,
struct kmalloc_header *chunk)
{
struct cpu_local_var *v = get_this_cpu_local_var();
struct malloc_header *h = &v->free_list;
int i;
struct kmalloc_header *chunk_iter, *next_chunk = NULL;
h->check = 0x5a5a5a5a;
h->next = &v->free_list;
h->size = 0;
register_kmalloc();
memdebug = find_command_line("memdebug");
for (i = 0; i < HASHNUM; i++) {
allochash[i] = NULL;
lochash[i] = NULL;
}
page = allocate_pages(16, IHK_MC_AP_NOWAIT);
space = 16 * 4096;
ihk_mc_spinlock_init(&alloclock);
}
void ____kfree(struct cpu_local_var *v, struct malloc_header *p)
{
struct malloc_header *h = &v->free_list;
int combined = 0;
h = h->next;
while ((p < h || p > h->next) && h != &v->free_list) {
h = h->next;
}
if (h + h->size + 1 == p && h->size != 0) {
combined = 1;
h->size += p->size + 1;
h->check = 0x5a5a5a5a;
}
if (h->next == p + p->size + 1 && h->next->size != 0) {
if (combined) {
h->check = 0x5a5a5a5a;
h->size += h->next->size + 1;
h->next = h->next->next;
} else {
p->check = 0x5a5a5a5a;
p->size += h->next->size + 1;
p->next = h->next->next;
h->next = p;
/* Find out where to insert */
list_for_each_entry(chunk_iter, free_list, list) {
if ((void *)chunk < (void *)chunk_iter) {
next_chunk = chunk_iter;
break;
}
} else if (!combined) {
p->next = h->next;
h->next = p;
}
/* Add in front of next */
if (next_chunk) {
list_add_tail(&chunk->list, &next_chunk->list);
}
/* Add after the head */
else {
list_add(&chunk->list, free_list);
}
return;
}
void *___kmalloc(int size, enum ihk_mc_ap_flag flag)
static void ___kmalloc_init_chunk(struct kmalloc_header *h, int size)
{
struct cpu_local_var *v = get_this_cpu_local_var();
struct malloc_header *h = &v->free_list, *prev, *p;
int u, req_page;
h->size = size;
h->front_magic = 0x5c5c5c5c;
h->end_magic = 0x6d6d6d6d;
h->cpu_id = ihk_mc_get_processor_id();
}
p = (struct malloc_header *)xchg8((unsigned long *)&v->remote_free_list, 0L);
while(p){
struct malloc_header *n = p->next;
____kfree(v, p);
p = n;
static void ___kmalloc_consolidate_list(struct list_head *list)
{
struct kmalloc_header *chunk_iter, *chunk, *next_chunk;
reiterate:
chunk_iter = NULL;
chunk = NULL;
list_for_each_entry(next_chunk, list, list) {
if (chunk_iter && (((void *)chunk_iter + sizeof(struct kmalloc_header)
+ chunk_iter->size) == (void *)next_chunk)) {
chunk = chunk_iter;
break;
}
chunk_iter = next_chunk;
}
if (size >= PAGE_SIZE * 4) {
if (!chunk) {
return;
}
chunk->size += (next_chunk->size + sizeof(struct kmalloc_header));
list_del(&next_chunk->list);
goto reiterate;
}
void kmalloc_consolidate_free_list(void)
{
struct kmalloc_header *chunk, *tmp;
unsigned long irqflags =
ihk_mc_spinlock_lock(&cpu_local_var(remote_free_list_lock));
/* Clean up remotely deallocated chunks */
list_for_each_entry_safe(chunk, tmp,
&cpu_local_var(remote_free_list), list) {
list_del(&chunk->list);
___kmalloc_insert_chunk(&cpu_local_var(free_list), chunk);
}
/* Free list lock ensures IRQs are disabled */
___kmalloc_consolidate_list(&cpu_local_var(free_list));
ihk_mc_spinlock_unlock(&cpu_local_var(remote_free_list_lock), irqflags);
}
/*
 * Allocation granularity of the low-level kmalloc routines: request sizes
 * are rounded up to a multiple of KMALLOC_MIN_SIZE.
 *
 * KMALLOC_MIN_SIZE is derived from KMALLOC_MIN_SHIFT. It was previously
 * derived from KMALLOC_TRACK_HASH_SHIFT, which describes the tracking hash
 * table, left KMALLOC_MIN_SHIFT unused and tied the granularity to the
 * hash table size.
 */
#define KMALLOC_MIN_SHIFT (5)
#define KMALLOC_MIN_SIZE (1 << KMALLOC_MIN_SHIFT)
#define KMALLOC_MIN_MASK (KMALLOC_MIN_SIZE - 1)
/* Actual low-level allocation routines */
static void *___kmalloc(int size, enum ihk_mc_ap_flag flag)
{
struct kmalloc_header *chunk_iter;
struct kmalloc_header *chunk = NULL;
int npages;
unsigned long kmalloc_irq_flags = cpu_disable_interrupt_save();
/* KMALLOC_MIN_SIZE bytes aligned size. */
if (size & KMALLOC_MIN_MASK) {
size = ((size + KMALLOC_MIN_SIZE - 1) & ~(KMALLOC_MIN_MASK));
}
chunk = NULL;
/* Find a chunk that is big enough */
list_for_each_entry(chunk_iter, &cpu_local_var(free_list), list) {
if (chunk_iter->size >= size) {
chunk = chunk_iter;
break;
}
}
split_and_return:
/* Did we find one? */
if (chunk) {
/* Do we need to split it? Only if there is enough space for
* another header and some actual content */
if (chunk->size > (size + sizeof(struct kmalloc_header))) {
struct kmalloc_header *leftover;
leftover = (struct kmalloc_header *)
((void *)chunk + sizeof(struct kmalloc_header) + size);
___kmalloc_init_chunk(leftover,
(chunk->size - size - sizeof(struct kmalloc_header)));
list_add(&leftover->list, &chunk->list);
chunk->size = size;
}
list_del(&chunk->list);
cpu_restore_interrupt(kmalloc_irq_flags);
return ((void *)chunk + sizeof(struct kmalloc_header));
}
/* Allocate new memory and add it to free list */
npages = (size + sizeof(struct kmalloc_header) + (PAGE_SIZE - 1))
>> PAGE_SHIFT;
chunk = ihk_mc_alloc_pages(npages, flag);
if (!chunk) {
cpu_restore_interrupt(kmalloc_irq_flags);
return NULL;
}
u = (size + sizeof(*h) - 1) / sizeof(*h);
___kmalloc_init_chunk(chunk,
(npages * PAGE_SIZE - sizeof(struct kmalloc_header)));
___kmalloc_insert_chunk(&cpu_local_var(free_list), chunk);
prev = h;
h = h->next;
while (1) {
if (h == &v->free_list) {
req_page = ((u + 2) * sizeof(*h) + PAGE_SIZE - 1)
>> PAGE_SHIFT;
h = allocate_pages(req_page, flag);
if(h == NULL) {
kprintf("kmalloc(%#x,%#x): out of memory\n", size, flag);
return NULL;
}
h->check = 0x5a5a5a5a;
prev->next = h;
h->size = (req_page * PAGE_SIZE) / sizeof(*h) - 2;
/* Guard entry */
p = h + h->size + 1;
p->check = 0x5a5a5a5a;
p->next = &v->free_list;
p->size = 0;
h->next = p;
}
if (h->size >= u) {
if (h->size == u || h->size == u + 1) {
prev->next = h->next;
h->cpu_id = ihk_mc_get_processor_id();
return h + 1;
} else { /* Divide */
h->size -= u + 1;
p = h + h->size + 1;
p->check = 0x5a5a5a5a;
p->size = u;
p->cpu_id = ihk_mc_get_processor_id();
return p + 1;
}
}
prev = h;
h = h->next;
}
goto split_and_return;
}
void ___kfree(void *ptr)
static void ___kfree(void *ptr)
{
struct malloc_header *p = (struct malloc_header *)ptr;
struct cpu_local_var *v = get_cpu_local_var((--p)->cpu_id);
struct kmalloc_header *chunk =
(struct kmalloc_header*)(ptr - sizeof(struct kmalloc_header));
unsigned long kmalloc_irq_flags = cpu_disable_interrupt_save();
if(p->cpu_id == ihk_mc_get_processor_id()){
____kfree(v, p);
/* Sanity check */
if (chunk->front_magic != 0x5c5c5c5c || chunk->end_magic != 0x6d6d6d6d) {
kprintf("%s: memory corruption at address 0x%p\n", __FUNCTION__, ptr);
panic("panic");
}
else{
unsigned long oldval;
unsigned long newval;
unsigned long rval;
do{
p->next = v->remote_free_list;
oldval = (unsigned long)p->next;
newval = (unsigned long)p;
rval = atomic_cmpxchg8(
(unsigned long *)&v->remote_free_list,
oldval, newval);
}while(rval != oldval);
/* Does this chunk belong to this CPU? */
if (chunk->cpu_id == ihk_mc_get_processor_id()) {
___kmalloc_insert_chunk(&cpu_local_var(free_list), chunk);
___kmalloc_consolidate_list(&cpu_local_var(free_list));
}
else {
struct cpu_local_var *v = get_cpu_local_var(chunk->cpu_id);
unsigned long irqflags;
irqflags = ihk_mc_spinlock_lock(&v->remote_free_list_lock);
list_add(&chunk->list, &v->remote_free_list);
ihk_mc_spinlock_unlock(&v->remote_free_list_lock, irqflags);
}
cpu_restore_interrupt(kmalloc_irq_flags);
}
void print_free_list(void)
void ___kmalloc_print_free_list(struct list_head *list)
{
struct cpu_local_var *v = get_this_cpu_local_var();
struct malloc_header *h = &v->free_list;
struct kmalloc_header *chunk_iter;
unsigned long irqflags = kprintf_lock();
h = h->next;
kprintf("free_list : \n");
while (h != &v->free_list) {
kprintf(" %p : %p, %d ->\n", h, h->next, h->size);
h = h->next;
__kprintf("%s: [ \n", __FUNCTION__);
list_for_each_entry(chunk_iter, &cpu_local_var(free_list), list) {
__kprintf("%s: 0x%lx:%d (VA PFN: %lu, off: %lu)\n", __FUNCTION__,
(unsigned long)chunk_iter,
chunk_iter->size,
(unsigned long)chunk_iter >> PAGE_SHIFT,
(unsigned long)chunk_iter % PAGE_SIZE);
}
kprintf("\n");
__kprintf("%s: ] \n", __FUNCTION__);
kprintf_unlock(irqflags);
}

View File

@ -53,7 +53,6 @@ static int copy_user_ranges(struct process_vm *vm, struct process_vm *orgvm);
extern void release_fp_regs(struct thread *proc);
extern void save_fp_regs(struct thread *proc);
extern void restore_fp_regs(struct thread *proc);
void settid(struct thread *proc, int mode, int newcpuid, int oldcpuid);
extern void __runq_add_proc(struct thread *proc, int cpu_id);
extern void terminate_host(int pid);
extern void lapic_timer_enable(unsigned int clocks);
@ -338,6 +337,10 @@ clone_thread(struct thread *org, unsigned long pc, unsigned long sp,
proc = org->proc;
thread->vm = org->vm;
thread->proc = proc;
thread->sigstack.ss_sp = NULL;
thread->sigstack.ss_flags = SS_DISABLE;
thread->sigstack.ss_size = 0;
}
/* fork() */
else {
@ -383,9 +386,15 @@ clone_thread(struct thread *org, unsigned long pc, unsigned long sp,
goto err_free_proc;
}
thread->vm->vdso_addr = org->vm->vdso_addr;
thread->vm->vvar_addr = org->vm->vvar_addr;
thread->proc->maxrss = org->proc->maxrss;
thread->vm->currss = org->vm->currss;
thread->sigstack.ss_sp = org->sigstack.ss_sp;
thread->sigstack.ss_flags = org->sigstack.ss_flags;
thread->sigstack.ss_size = org->sigstack.ss_size;
dkprintf("fork(): copy_user_ranges() OK\n");
}
@ -413,9 +422,6 @@ clone_thread(struct thread *org, unsigned long pc, unsigned long sp,
INIT_LIST_HEAD(&thread->sigcommon->sigpending);
// TODO: copy signalfd
}
thread->sigstack.ss_sp = NULL;
thread->sigstack.ss_flags = SS_DISABLE;
thread->sigstack.ss_size = 0;
ihk_mc_spinlock_init(&thread->sigpendinglock);
INIT_LIST_HEAD(&thread->sigpending);
thread->sigmask = org->sigmask;
@ -566,6 +572,9 @@ static int copy_user_ranges(struct process_vm *vm, struct process_vm *orgvm)
break;
}
if(src_range->flag & VR_DONTFORK)
continue;
range = kmalloc(sizeof(struct vm_range), IHK_MC_AP_NOWAIT);
if (!range) {
goto err_rollback;
@ -735,7 +744,7 @@ int join_process_memory_range(struct process_vm *vm,
memobj_release(merging->memobj);
}
list_del(&merging->list);
ihk_mc_free(merging);
kfree(merging);
error = 0;
out:
@ -831,8 +840,9 @@ int free_process_memory_range(struct process_vm *vm, struct vm_range *range)
if (range->memobj) {
memobj_release(range->memobj);
}
list_del(&range->list);
ihk_mc_free(range);
kfree(range);
dkprintf("free_process_memory_range(%p,%lx-%lx): 0\n",
vm, start0, end0);
@ -958,7 +968,6 @@ enum ihk_mc_pt_attribute common_vrflag_to_ptattr(unsigned long flag, uint64_t fa
return attr;
}
/* XXX: the indentation needs to be aligned */
int add_process_memory_range(struct process_vm *vm,
unsigned long start, unsigned long end,
unsigned long phys, unsigned long flag,
@ -1529,6 +1538,8 @@ retry:
kprintf("page_fault_process_memory_range(%p,%lx-%lx %lx,%lx,%lx):cannot allocate new page. %d\n", vm, range->start, range->end, range->flag, fault_addr, reason, error);
goto out;
}
dkprintf("%s: clearing 0x%lx:%lu\n",
__FUNCTION__, pgaddr, pgsize);
memset(virt, 0, pgsize);
phys = virt_to_phys(virt);
page_map(phys_to_page(phys));
@ -1561,6 +1572,8 @@ retry:
kprintf("page_fault_process_memory_range(%p,%lx-%lx %lx,%lx,%lx):cannot allocate copy page. %d\n", vm, range->start, range->end, range->flag, fault_addr, reason, error);
goto out;
}
dkprintf("%s: copying 0x%lx:%lu\n",
__FUNCTION__, pgaddr, pgsize);
memcpy(virt, phys_to_virt(phys), pgsize);
phys = virt_to_phys(virt);
@ -1641,6 +1654,18 @@ static int do_page_fault_process_vm(struct process_vm *vm, void *fault_addr0, ui
"access denied. %d\n",
ihk_mc_get_processor_id(), vm,
fault_addr0, reason, error);
kprintf("%s: reason: %s%s%s%s%s%s%s\n", __FUNCTION__,
(reason & PF_PROT) ? "PF_PROT " : "",
(reason & PF_WRITE) ? "PF_WRITE " : "",
(reason & PF_USER) ? "PF_USER " : "",
(reason & PF_RSVD) ? "PF_RSVD " : "",
(reason & PF_INSTR) ? "PF_INSTR " : "",
(reason & PF_PATCH) ? "PF_PATCH " : "",
(reason & PF_POPULATE) ? "PF_POPULATE " : "");
kprintf("%s: range->flag & (%s%s%s)\n", __FUNCTION__,
(range->flag & VR_PROT_READ) ? "VR_PROT_READ " : "",
(range->flag & VR_PROT_WRITE) ? "VR_PROT_WRITE " : "",
(range->flag & VR_PROT_EXEC) ? "VR_PROT_EXEC " : "");
if (((range->flag & VR_PROT_MASK) == VR_PROT_NONE))
kprintf("if (((range->flag & VR_PROT_MASK) == VR_PROT_NONE))\n");
if (((reason & PF_WRITE) && !(reason & PF_PATCH)))
@ -1868,14 +1893,14 @@ unsigned long extend_process_region(struct process_vm *vm,
aligned_end = (aligned_end + (LARGE_PAGE_SIZE - 1)) & LARGE_PAGE_MASK;
/* Fill in the gap between old_aligned_end and aligned_end
* with regular pages */
if((p = allocate_pages((aligned_end - old_aligned_end) >> PAGE_SHIFT,
if((p = ihk_mc_alloc_pages((aligned_end - old_aligned_end) >> PAGE_SHIFT,
IHK_MC_AP_NOWAIT)) == NULL){
return end;
}
if((rc = add_process_memory_range(vm, old_aligned_end,
aligned_end, virt_to_phys(p), flag,
LARGE_PAGE_SHIFT)) != 0){
free_pages(p, (aligned_end - old_aligned_end) >> PAGE_SHIFT);
ihk_mc_free_pages(p, (aligned_end - old_aligned_end) >> PAGE_SHIFT);
return end;
}
@ -1888,7 +1913,7 @@ unsigned long extend_process_region(struct process_vm *vm,
(LARGE_PAGE_SIZE - 1)) & LARGE_PAGE_MASK;
address = aligned_new_end;
if((p = allocate_pages((aligned_new_end - aligned_end + LARGE_PAGE_SIZE) >> PAGE_SHIFT,
if((p = ihk_mc_alloc_pages((aligned_new_end - aligned_end + LARGE_PAGE_SIZE) >> PAGE_SHIFT,
IHK_MC_AP_NOWAIT)) == NULL){
return end;
}
@ -1896,16 +1921,16 @@ unsigned long extend_process_region(struct process_vm *vm,
p_aligned = ((unsigned long)p + (LARGE_PAGE_SIZE - 1)) & LARGE_PAGE_MASK;
if (p_aligned > (unsigned long)p) {
free_pages(p, (p_aligned - (unsigned long)p) >> PAGE_SHIFT);
ihk_mc_free_pages(p, (p_aligned - (unsigned long)p) >> PAGE_SHIFT);
}
free_pages(
ihk_mc_free_pages(
(void *)(p_aligned + aligned_new_end - aligned_end),
(LARGE_PAGE_SIZE - (p_aligned - (unsigned long)p)) >> PAGE_SHIFT);
if((rc = add_process_memory_range(vm, aligned_end,
aligned_new_end, virt_to_phys((void *)p_aligned),
flag, LARGE_PAGE_SHIFT)) != 0){
free_pages(p, (aligned_new_end - aligned_end + LARGE_PAGE_SIZE) >> PAGE_SHIFT);
ihk_mc_free_pages(p, (aligned_new_end - aligned_end + LARGE_PAGE_SIZE) >> PAGE_SHIFT);
return end;
}
@ -1923,7 +1948,7 @@ unsigned long extend_process_region(struct process_vm *vm,
p=0;
}else{
p = allocate_pages((aligned_new_end - aligned_end) >> PAGE_SHIFT, IHK_MC_AP_NOWAIT);
p = ihk_mc_alloc_pages((aligned_new_end - aligned_end) >> PAGE_SHIFT, IHK_MC_AP_NOWAIT);
if (!p) {
return end;
@ -1932,7 +1957,7 @@ unsigned long extend_process_region(struct process_vm *vm,
if((rc = add_process_memory_range(vm, aligned_end, aligned_new_end,
(p==0?0:virt_to_phys(p)), flag, NULL, 0,
PAGE_SHIFT)) != 0){
free_pages(p, (aligned_new_end - aligned_end) >> PAGE_SHIFT);
ihk_mc_free_pages(p, (aligned_new_end - aligned_end) >> PAGE_SHIFT);
return end;
}
@ -2045,6 +2070,7 @@ release_process(struct process *proc)
mcs_rwlock_writer_unlock(&parent->children_lock, &lock);
}
if (proc->tids) kfree(proc->tids);
kfree(proc);
}
@ -2150,6 +2176,23 @@ release_sigcommon(struct sig_common *sigcommon)
kfree(sigcommon);
}
/*
 * Clear the slot in proc->tids that refers to the given thread.
 * Only the first matching slot is cleared; nothing happens when no slot
 * among the first proc->nr_tids refers to the thread.
 * NOTE: threads_lock must be held.
 */
void __release_tid(struct process *proc, struct thread *thread) {
	int idx = 0;

	/* Advance to the first slot owned by this thread */
	while (idx < proc->nr_tids && proc->tids[idx].thread != thread) {
		++idx;
	}

	if (idx >= proc->nr_tids) {
		return;
	}

	proc->tids[idx].thread = NULL;
	dkprintf("%s: tid %d has been released by %p\n",
			__FUNCTION__, thread->tid, thread);
}
void destroy_thread(struct thread *thread)
{
struct sig_pending *pending;
@ -2166,6 +2209,7 @@ void destroy_thread(struct thread *thread)
mcs_rwlock_writer_lock(&proc->threads_lock, &lock);
list_del(&thread->siblings_list);
__release_tid(proc, thread);
mcs_rwlock_writer_unlock(&proc->threads_lock, &lock);
cpu_clear(thread->cpu_id, &thread->vm->address_space->cpu_set,
@ -2303,6 +2347,8 @@ static void idle(void)
}
if (v->status == CPU_STATUS_IDLE ||
v->status == CPU_STATUS_RESERVED) {
/* No work to do? Consolidate the kmalloc free list */
kmalloc_consolidate_free_list();
cpu_safe_halt();
}
else {
@ -2491,7 +2537,6 @@ static void do_migrate(void)
cur_v->runq_len -= 1;
old_cpu_id = req->thread->cpu_id;
req->thread->cpu_id = cpu_id;
settid(req->thread, 2, cpu_id, old_cpu_id);
list_add_tail(&req->thread->sched_list, &v->runq);
v->runq_len += 1;
@ -2506,6 +2551,7 @@ static void do_migrate(void)
v->flags |= CPU_FLAG_NEED_RESCHED;
ihk_mc_interrupt_cpu(get_x86_cpu_local_variable(cpu_id)->apic_id, 0xd1);
double_rq_unlock(cur_v, v, irqstate);
//settid(req->thread, 2, cpu_id, old_cpu_id, 0, NULL);
ack:
waitq_wakeup(&req->wq);
@ -2541,13 +2587,8 @@ void schedule(void)
struct thread *last;
if (cpu_local_var(no_preempt)) {
dkprintf("no schedule() while no preemption! \n");
return;
}
if (cpu_local_var(current)
&& cpu_local_var(current)->in_syscall_offload) {
dkprintf("no schedule() while syscall offload!\n");
kprintf("%s: WARNING can't schedule() while no preemption, cnt: %d\n",
__FUNCTION__, cpu_local_var(no_preempt));
return;
}
@ -2576,9 +2617,10 @@ redo:
if (v->flags & CPU_FLAG_NEED_MIGRATE) {
next = &cpu_local_var(idle);
} else {
/* Pick a new running process */
/* Pick a new running process or one that has a pending signal */
list_for_each_entry_safe(thread, tmp, &(v->runq), sched_list) {
if (thread->status == PS_RUNNING) {
if (thread->status == PS_RUNNING ||
(thread->status == PS_INTERRUPTIBLE && hassigpending(thread))) {
next = thread;
break;
}
@ -2704,9 +2746,11 @@ sched_wakeup_thread(struct thread *thread, int valid_states)
int spin_slept = 0;
unsigned long irqstate;
struct cpu_local_var *v = get_cpu_local_var(thread->cpu_id);
struct process *proc = thread->proc;
struct mcs_rwlock_node updatelock;
dkprintf("sched_wakeup_process,proc->pid=%d,valid_states=%08x,proc->status=%08x,proc->cpu_id=%d,my cpu_id=%d\n",
thread->proc->pid, valid_states, thread->status, thread->cpu_id, ihk_mc_get_processor_id());
proc->pid, valid_states, thread->status, thread->cpu_id, ihk_mc_get_processor_id());
irqstate = ihk_mc_spinlock_lock(&(thread->spin_sleep_lock));
if (thread->spin_sleep > 0) {
@ -2726,7 +2770,10 @@ sched_wakeup_thread(struct thread *thread, int valid_states)
irqstate = ihk_mc_spinlock_lock(&(v->runq_lock));
if (thread->status & valid_states) {
xchg4((int *)(&thread->proc->status), PS_RUNNING);
mcs_rwlock_writer_lock_noirq(&proc->update_lock, &updatelock);
if(proc->status != PS_EXITED)
proc->status = PS_RUNNING;
mcs_rwlock_writer_unlock_noirq(&proc->update_lock, &updatelock);
xchg4((int *)(&thread->status), PS_RUNNING);
status = 0;
}

View File

@ -281,6 +281,13 @@ process_procfs_request(unsigned long rarg)
ans = -EIO;
goto end;
}
if (pa < ihk_mc_get_memory_address(IHK_MC_GMA_MAP_START, 0) ||
pa >= ihk_mc_get_memory_address(IHK_MC_GMA_MAP_END, 0)) {
ans = -EIO;
goto end;
}
va = phys_to_virt(pa);
if(readwrite)
memcpy(va, buf + ans, size);

View File

@ -105,7 +105,6 @@ static void calculate_time_from_tsc(struct timespec *ts);
void check_signal(unsigned long, void *, int);
void do_signal(long rc, void *regs, struct thread *thread, struct sig_pending *pending, int num);
extern unsigned long do_kill(struct thread *thread, int pid, int tid, int sig, struct siginfo *info, int ptracecont);
extern struct sigpending *hassigpending(struct thread *thread);
extern long alloc_debugreg(struct thread *thread);
extern int num_processors;
extern unsigned long ihk_mc_get_ns_per_tsc(void);
@ -128,11 +127,9 @@ int prepare_process_ranges_args_envs(struct thread *thread,
static void do_mod_exit(int status);
#endif
static void send_syscall(struct syscall_request *req, int cpu, int pid)
static void send_syscall(struct syscall_request *req, int cpu, int pid, struct syscall_response *res)
{
struct ikc_scd_packet packet;
struct syscall_response *res;
struct syscall_params *scp;
struct ikc_scd_packet packet IHK_DMA_ALIGN;
struct ihk_ikc_channel_desc *syscall_channel;
int ret;
@ -141,7 +138,6 @@ static void send_syscall(struct syscall_request *req, int cpu, int pid)
req->number == __NR_kill){ // interrupt syscall
extern int num_processors;
scp = &get_cpu_local_var(0)->scp2;
syscall_channel = get_cpu_local_var(0)->syscall_channel2;
/* XXX: is this really going to work if multiple processes
@ -153,34 +149,22 @@ static void send_syscall(struct syscall_request *req, int cpu, int pid)
pid = req->args[1];
}
else{
scp = &get_cpu_local_var(cpu)->scp;
syscall_channel = get_cpu_local_var(cpu)->syscall_channel;
}
res = scp->response_va;
res->status = 0;
req->valid = 0;
#ifdef USE_DMA
memcpy_async(scp->request_pa,
virt_to_phys(req), sizeof(*req), 0, &fin);
memcpy_async_wait(&scp->post_fin);
scp->post_va->v[0] = scp->post_idx;
memcpy_async_wait(&fin);
#else
memcpy(scp->request_va, req, sizeof(*req));
#endif
memcpy(&packet.req, req, sizeof(*req));
barrier();
scp->request_va->valid = 1;
*(unsigned int *)scp->doorbell_va = cpu + 1;
packet.req.valid = 1;
#ifdef SYSCALL_BY_IKC
packet.msg = SCD_MSG_SYSCALL_ONESIDE;
packet.ref = cpu;
packet.pid = pid ? pid : cpu_local_var(current)->proc->pid;
packet.arg = scp->request_rpa;
packet.resp_pa = virt_to_phys(res);
dkprintf("send syscall, nr: %d, pid: %d\n", req->number, packet.pid);
ret = ihk_ikc_send(syscall_channel, &packet, 0);
@ -194,9 +178,8 @@ ihk_spinlock_t syscall_lock;
long do_syscall(struct syscall_request *req, int cpu, int pid)
{
struct syscall_response *res;
struct syscall_response res;
struct syscall_request req2 IHK_DMA_ALIGN;
struct syscall_params *scp;
int error;
long rc;
int islock = 0;
@ -207,6 +190,9 @@ long do_syscall(struct syscall_request *req, int cpu, int pid)
dkprintf("SC(%d)[%3d] sending syscall\n",
ihk_mc_get_processor_id(),
req->number);
irqstate = 0; /* for avoidance of warning */
barrier();
if(req->number != __NR_exit_group){
if(proc->nohost && // host is down
@ -216,55 +202,102 @@ long do_syscall(struct syscall_request *req, int cpu, int pid)
++thread->in_syscall_offload;
}
irqstate = 0; /* for avoidance of warning */
if(req->number == __NR_exit_group ||
req->number == __NR_gettid ||
req->number == __NR_kill){ // interrupt syscall
scp = &get_cpu_local_var(0)->scp2;
islock = 1;
irqstate = ihk_mc_spinlock_lock(&syscall_lock);
}
else{
scp = &get_cpu_local_var(cpu)->scp;
}
res = scp->response_va;
/* The current thread is the requester and any thread from
* the pool may serve the request */
req->rtid = cpu_local_var(current)->tid;
req->ttid = 0;
res.req_thread_status = IHK_SCD_REQ_THREAD_SPINNING;
send_syscall(req, cpu, pid, &res);
send_syscall(req, cpu, pid);
dkprintf("SC(%d)[%3d] waiting for host.. \n",
ihk_mc_get_processor_id(),
req->number);
dkprintf("%s: syscall num: %d waiting for Linux.. \n",
__FUNCTION__, req->number);
#define STATUS_IN_PROGRESS 0
#define STATUS_COMPLETED 1
#define STATUS_PAGE_FAULT 3
while (res->status != STATUS_COMPLETED) {
while (res->status == STATUS_IN_PROGRESS) {
while (res.status != STATUS_COMPLETED) {
while (res.status == STATUS_IN_PROGRESS) {
struct cpu_local_var *v;
int do_schedule = 0;
long runq_irqstate;
unsigned long flags;
DECLARE_WAITQ_ENTRY(scd_wq_entry, cpu_local_var(current));
cpu_pause();
/* Spin if not preemptable */
if (cpu_local_var(no_preempt) || !thread->tid) {
continue;
}
/* Spin by default, but if re-schedule is requested let
* the other thread run */
runq_irqstate =
ihk_mc_spinlock_lock(&(get_this_cpu_local_var()->runq_lock));
v = get_this_cpu_local_var();
if (v->flags & CPU_FLAG_NEED_RESCHED) {
do_schedule = 1;
}
ihk_mc_spinlock_unlock(&v->runq_lock, runq_irqstate);
if (!do_schedule) {
continue;
}
flags = cpu_disable_interrupt_save();
/* Try to sleep until notified */
if (__sync_bool_compare_and_swap(&res.req_thread_status,
IHK_SCD_REQ_THREAD_SPINNING,
IHK_SCD_REQ_THREAD_DESCHEDULED)) {
dkprintf("%s: tid %d waiting for syscall reply...\n",
__FUNCTION__, thread->tid);
waitq_init(&thread->scd_wq);
waitq_prepare_to_wait(&thread->scd_wq, &scd_wq_entry,
PS_INTERRUPTIBLE);
cpu_restore_interrupt(flags);
schedule();
waitq_finish_wait(&thread->scd_wq, &scd_wq_entry);
}
cpu_restore_interrupt(flags);
}
if (res->status == STATUS_PAGE_FAULT) {
if (res.status == STATUS_PAGE_FAULT) {
dkprintf("STATUS_PAGE_FAULT in syscall, pid: %d\n",
cpu_local_var(current)->proc->pid);
error = page_fault_process_vm(thread->vm,
(void *)res->fault_address,
res->fault_reason|PF_POPULATE);
(void *)res.fault_address,
res.fault_reason|PF_POPULATE);
/* send result */
req2.number = __NR_mmap;
#define PAGER_RESUME_PAGE_FAULT 0x0101
req2.args[0] = PAGER_RESUME_PAGE_FAULT;
req2.args[1] = error;
/* The current thread is the requester and only the waiting thread
* may serve the request */
req2.rtid = cpu_local_var(current)->tid;
req2.ttid = res.stid;
send_syscall(&req2, cpu, pid);
res.req_thread_status = IHK_SCD_REQ_THREAD_SPINNING;
send_syscall(&req2, cpu, pid, &res);
}
}
dkprintf("SC(%d)[%3d] got host reply: %d \n",
ihk_mc_get_processor_id(),
req->number, res->ret);
dkprintf("%s: syscall num: %d got host reply: %d \n",
__FUNCTION__, req->number, res.ret);
rc = res->ret;
rc = res.ret;
if(islock){
ihk_mc_spinlock_unlock(&syscall_lock, irqstate);
}
@ -795,7 +828,8 @@ terminate(int rc, int sig)
release_thread(mythread);
release_process_vm(vm);
schedule();
// no return
kprintf("%s: ERROR: returned from terminate() -> schedule()\n", __FUNCTION__);
panic("panic");
}
void
@ -813,14 +847,15 @@ terminate_host(int pid)
}
void
interrupt_syscall(int pid, int cpuid)
interrupt_syscall(int pid, int tid)
{
dkprintf("interrupt_syscall,target pid=%d,target cpuid=%d\n", pid, cpuid);
dkprintf("interrupt_syscall,target pid=%d,target tid=%d\n", pid, tid);
ihk_mc_user_context_t ctx;
long lerror;
kprintf("interrupt_syscall pid=%d tid=%d\n", pid, tid);
ihk_mc_syscall_arg0(&ctx) = pid;
ihk_mc_syscall_arg1(&ctx) = cpuid;
ihk_mc_syscall_arg1(&ctx) = tid;
lerror = syscall_generic_forwarding(__NR_kill, &ctx);
if (lerror) {
@ -883,8 +918,6 @@ static int do_munmap(void *addr, size_t len)
begin_free_pages_pending();
error = remove_process_memory_range(cpu_local_var(current)->vm,
(intptr_t)addr, (intptr_t)addr+len, &ro_freed);
// XXX: TLB flush
flush_tlb();
if (error || !ro_freed) {
clear_host_pte((uintptr_t)addr, len);
}
@ -896,6 +929,8 @@ static int do_munmap(void *addr, size_t len)
}
}
finish_free_pages_pending();
dkprintf("%s: 0x%lx:%lu, error: %ld\n",
__FUNCTION__, addr, len, error);
return error;
}
@ -1043,31 +1078,29 @@ do_mmap(const intptr_t addr0, const size_t len0, const int prot,
vrflags |= PROT_TO_VR_FLAG(prot);
vrflags |= (flags & MAP_PRIVATE)? VR_PRIVATE: 0;
vrflags |= (flags & MAP_LOCKED)? VR_LOCKED: 0;
vrflags |= VR_DEMAND_PAGING;
if (flags & MAP_ANONYMOUS) {
if (0) {
/* dummy */
if (!anon_on_demand) {
populated_mapping = 1;
}
#ifdef USE_NOCACHE_MMAP
#define X_MAP_NOCACHE MAP_32BIT
else if (flags & X_MAP_NOCACHE) {
vrflags &= ~VR_DEMAND_PAGING;
vrflags |= VR_IO_NOCACHE;
}
#endif
else {
vrflags |= VR_DEMAND_PAGING;
if (!anon_on_demand) {
populated_mapping = 1;
}
}
}
else {
vrflags |= VR_DEMAND_PAGING;
}
if (flags & (MAP_POPULATE | MAP_LOCKED)) {
populated_mapping = 1;
}
/* XXX: Intel MPI 128MB mapping.. */
if (len == 134217728) {
populated_mapping = 0;
}
if (!(prot & PROT_WRITE)) {
error = set_host_vma(addr, len, PROT_READ);
if (error) {
@ -1097,7 +1130,7 @@ do_mmap(const intptr_t addr0, const size_t len0, const int prot,
}
#endif
if (error == -ESRCH) {
kprintf("do_mmap:hit non VREG\n");
dkprintf("do_mmap:hit non VREG\n");
/*
* XXX: temporary:
*
@ -1108,10 +1141,17 @@ do_mmap(const intptr_t addr0, const size_t len0, const int prot,
vrflags &= ~VR_MEMTYPE_MASK;
vrflags |= VR_MEMTYPE_UC;
}
error = devobj_create(fd, len, off, &memobj, &maxprot);
error = devobj_create(fd, len, off, &memobj, &maxprot,
prot, (flags & (MAP_POPULATE | MAP_LOCKED)));
if (!error) {
dkprintf("%s: device fd: %d off: %lu mapping at %p - %p\n",
__FUNCTION__, fd, off, addr, addr + len);
}
}
if (error) {
ekprintf("do_mmap:fileobj_create failed. %d\n", error);
kprintf("%s: error: file mapping failed, fd: %d, error: %d\n",
__FUNCTION__, error);
goto out;
}
}
@ -1125,6 +1165,8 @@ do_mmap(const intptr_t addr0, const size_t len0, const int prot,
error = -ENOMEM;
goto out;
}
dkprintf("%s: 0x%x:%lu allocated %d pages, p2align: %lx\n",
__FUNCTION__, addr, len, npages, p2align);
phys = virt_to_phys(p);
}
else if (flags & MAP_SHARED) {
@ -1160,10 +1202,10 @@ do_mmap(const intptr_t addr0, const size_t len0, const int prot,
error = add_process_memory_range(thread->vm, addr, addr+len, phys,
vrflags, memobj, off, pgshift);
if (error) {
ekprintf("do_mmap:add_process_memory_range"
"(%p,%lx,%lx,%lx,%lx,%d) failed %d\n",
thread->vm, addr, addr+len,
virt_to_phys(p), vrflags, pgshift, error);
kprintf("%s: add_process_memory_range failed for 0x%lx:%lu"
" flags: %lx, vrflags: %lx, pgshift: %d, error: %d\n",
__FUNCTION__, addr, addr+len,
flags, vrflags, pgshift, error);
goto out;
}
@ -1181,9 +1223,12 @@ out:
if (!error && populated_mapping) {
error = populate_process_memory(thread->vm, (void *)addr, len);
if (error) {
ekprintf("do_mmap:populate_process_memory"
"(%p,%p,%lx) failed %d\n",
thread->vm, (void *)addr, len, error);
ekprintf("%s: error :populate_process_memory"
"vm: %p, addr: %p, len: %d (flags: %s%s) failed %d\n", __FUNCTION__,
thread->vm, (void *)addr, len,
(flags & MAP_POPULATE) ? "MAP_POPULATE " : "",
(flags & MAP_LOCKED) ? "MAP_LOCKED ": "",
error);
/*
* In this case,
* the mapping established by this call should be unmapped
@ -1206,8 +1251,12 @@ out:
if (memobj) {
memobj_release(memobj);
}
dkprintf("do_mmap(%lx,%lx,%x,%x,%d,%lx): %ld %lx\n",
addr0, len0, prot, flags, fd, off0, error, addr);
dkprintf("%s: 0x%lx:%8lu, (req: 0x%lx:%lu), prot: %x, flags: %x, "
"fd: %d, off: %lu, error: %ld, addr: 0x%lx\n",
__FUNCTION__,
addr, len, addr0, len0, prot, flags,
fd, off0, error, addr);
return (!error)? addr: error;
}
@ -1438,8 +1487,8 @@ SYSCALL_DECLARE(getppid)
return thread->proc->ppid_parent->pid;
}
void
settid(struct thread *thread, int mode, int newcpuid, int oldcpuid)
void settid(struct thread *thread, int mode, int newcpuid, int oldcpuid,
int nr_tids, int *tids)
{
struct syscall_request request IHK_DMA_ALIGN;
unsigned long rc;
@ -1449,6 +1498,12 @@ settid(struct thread *thread, int mode, int newcpuid, int oldcpuid)
request.args[1] = thread->proc->pid;
request.args[2] = newcpuid;
request.args[3] = oldcpuid;
/*
* If nr_tids is non-zero, tids should point to an array of ints
* where the thread ids of the mcexec process are expected.
*/
request.args[4] = nr_tids;
request.args[5] = virt_to_phys(tids);
rc = do_syscall(&request, ihk_mc_get_processor_id(), thread->proc->pid);
if (mode != 2) {
thread->tid = rc;
@ -1853,7 +1908,61 @@ unsigned long do_fork(int clone_flags, unsigned long newsp,
&new->vm->address_space->cpu_set_lock);
if (clone_flags & CLONE_VM) {
settid(new, 1, cpuid, -1);
int *tids = NULL;
int i;
struct mcs_rwlock_node_irqsave lock;
mcs_rwlock_writer_lock(&newproc->threads_lock, &lock);
/* Obtain mcexec TIDs if not known yet */
if (!newproc->nr_tids) {
tids = kmalloc(sizeof(int) * num_processors, IHK_MC_AP_NOWAIT);
if (!tids) {
mcs_rwlock_writer_unlock(&newproc->threads_lock, &lock);
release_cpuid(cpuid);
return -ENOMEM;
}
newproc->tids = kmalloc(sizeof(struct mcexec_tid) * num_processors, IHK_MC_AP_NOWAIT);
if (!newproc->tids) {
mcs_rwlock_writer_unlock(&newproc->threads_lock, &lock);
kfree(tids);
release_cpuid(cpuid);
return -ENOMEM;
}
settid(new, 1, cpuid, -1, num_processors, tids);
for (i = 0; (i < num_processors) && tids[i]; ++i) {
dkprintf("%s: tid[%d]: %d\n", __FUNCTION__, i, tids[i]);
newproc->tids[i].tid = tids[i];
newproc->tids[i].thread = NULL;
++newproc->nr_tids;
}
kfree(tids);
}
/* Find an unused TID */
retry_tid:
for (i = 0; i < newproc->nr_tids; ++i) {
if (!newproc->tids[i].thread) {
if (!__sync_bool_compare_and_swap(
&newproc->tids[i].thread, NULL, new)) {
goto retry_tid;
}
new->tid = newproc->tids[i].tid;
dkprintf("%s: tid %d assigned to %p\n", __FUNCTION__, new->tid, new);
break;
}
}
/* TODO: spawn more mcexec threads */
if (!new->tid) {
kprintf("%s: no more TIDs available\n");
panic("");
}
mcs_rwlock_writer_unlock(&newproc->threads_lock, &lock);
}
/* fork() a new process on the host */
else {
@ -1873,7 +1982,7 @@ unsigned long do_fork(int clone_flags, unsigned long newsp,
}
/* In a single threaded process TID equals to PID */
settid(new, 0, cpuid, -1);
new->tid = newproc->pid;
new->vm->address_space->pids[0] = new->proc->pid;
dkprintf("fork(): new pid: %d\n", new->proc->pid);
@ -2242,9 +2351,8 @@ SYSCALL_DECLARE(setfsgid)
unsigned long newfsgid;
struct syscall_request request IHK_DMA_ALIGN;
request.number = __NR_setfsuid;
request.number = __NR_setfsgid;
request.args[0] = fsgid;
request.args[1] = 0;
newfsgid = do_syscall(&request, ihk_mc_get_processor_id(), 0);
do_setresgid();
return newfsgid;
@ -2470,6 +2578,31 @@ SYSCALL_DECLARE(close)
return rc;
}
/*
 * fcntl(2) handler.
 *
 * Walks the calling process's mckfd list, under mckfd_lock, looking for an
 * entry whose fd equals the first syscall argument. When such an entry
 * exists and has an fcntl_cb hook, the request is handed to that hook;
 * in every other case it is passed to syscall_generic_forwarding().
 *
 * Returns whatever the hook or the forwarding call returned.
 */
SYSCALL_DECLARE(fcntl)
{
	int fd = ihk_mc_syscall_arg0(ctx);
	struct process *proc = cpu_local_var(current)->proc;
	struct mckfd *entry;
	long irqstate;
	long rc;

	/* The list itself is only traversed while the lock is held. */
	irqstate = ihk_mc_spinlock_lock(&proc->mckfd_lock);
	entry = proc->mckfd;
	while (entry && entry->fd != fd) {
		entry = entry->next;
	}
	ihk_mc_spinlock_unlock(&proc->mckfd_lock, irqstate);

	/* No matching descriptor, or no hook installed: forward the call. */
	if (!entry || !entry->fcntl_cb) {
		rc = syscall_generic_forwarding(__NR_fcntl, ctx);
		return rc;
	}

	rc = entry->fcntl_cb(entry, ctx);
	return rc;
}
SYSCALL_DECLARE(rt_sigprocmask)
{
int how = ihk_mc_syscall_arg0(ctx);
@ -2621,26 +2754,12 @@ perf_counter_alloc(struct mc_perf_event *event)
struct perf_event_attr *attr = &event->attr;
struct mc_perf_event *leader = event->group_leader;
if(attr->type == PERF_TYPE_HARDWARE) {
event->counter_id = ihk_mc_perfctr_alloc_counter(leader->pmc_status);
} else if(attr->type == PERF_TYPE_RAW) {
// PAPI_REF_CYC counted by fixed counter
if((attr->config & 0x0000ffff) == 0x00000300) {
event->counter_id = 2 + X86_IA32_BASE_FIXED_PERF_COUNTERS;
return ret;
}
event->counter_id = ihk_mc_perfctr_alloc_counter(leader->pmc_status);
} else {
// Not supported type.
ret = -1;
}
ret = ihk_mc_perfctr_alloc_counter(&attr->type, &attr->config, leader->pmc_status);
if(ret >= 0) {
leader->pmc_status |= 1UL << event->counter_id;
leader->pmc_status |= 1UL << ret;
}
event->counter_id = ret;
return ret;
}
@ -2649,7 +2768,6 @@ int
perf_counter_start(struct mc_perf_event *event)
{
int ret = 0;
enum ihk_perfctr_type type;
struct perf_event_attr *attr = &event->attr;
int mode = 0x00;
@ -2660,52 +2778,34 @@ perf_counter_start(struct mc_perf_event *event)
mode |= PERFCTR_USER_MODE;
}
if(attr->type == PERF_TYPE_HARDWARE) {
switch(attr->config){
case PERF_COUNT_HW_CPU_CYCLES :
type = APT_TYPE_CYCLE;
break;
case PERF_COUNT_HW_INSTRUCTIONS :
type = APT_TYPE_INSTRUCTIONS;
break;
default :
// Not supported config.
type = PERFCTR_MAX_TYPE;
}
ret = ihk_mc_perfctr_init(event->counter_id, type, mode);
ihk_mc_perfctr_set(event->counter_id, event->sample_freq * -1);
ihk_mc_perfctr_start(1UL << event->counter_id);
} else if(attr->type == PERF_TYPE_RAW) {
// PAPI_REF_CYC counted by fixed counter
if(event->counter_id >= X86_IA32_BASE_FIXED_PERF_COUNTERS) {
ret = ihk_mc_perfctr_fixed_init(event->counter_id, mode);
ihk_mc_perfctr_set(event->counter_id, event->sample_freq * -1);
ihk_mc_perfctr_start(1UL << event->counter_id);
return ret;
}
if(event->counter_id >= 0 && event->counter_id < X86_IA32_NUM_PERF_COUNTERS) {
ret = ihk_mc_perfctr_init_raw(event->counter_id, attr->config, mode);
ihk_mc_perfctr_set(event->counter_id, event->sample_freq * -1);
ihk_mc_perfctr_start(1UL << event->counter_id);
} else {
// Not supported type.
}
else if(event->counter_id >= X86_IA32_BASE_FIXED_PERF_COUNTERS &&
event->counter_id < X86_IA32_BASE_FIXED_PERF_COUNTERS + X86_IA32_NUM_FIXED_PERF_COUNTERS) {
ret = ihk_mc_perfctr_fixed_init(event->counter_id, mode);
ihk_mc_perfctr_start(1UL << event->counter_id);
}
else {
ret = -1;
}
return ret;
}
unsigned long perf_event_read_value(struct mc_perf_event *event)
{
unsigned long rtn_count = 0;
unsigned long pmc_count = 0;
int counter_id = event->counter_id;
if(event->pid == 0)
event->count = ihk_mc_perfctr_read(counter_id);
if(event->pid == 0) {
pmc_count = ihk_mc_perfctr_read(counter_id) + event->attr.sample_freq;
pmc_count &= 0x000000ffffffffffL; // 40bit MASK
}
rtn_count += event->count;
rtn_count += event->count + pmc_count;
if(event->attr.inherit)
rtn_count += event->child_count_total;
@ -2922,11 +3022,21 @@ perf_ioctl(struct mckfd *sfd, ihk_mc_user_context_t *ctx)
break;
case PERF_EVENT_IOC_RESET:
// TODO: reset other process
ihk_mc_perfctr_reset(counter_id);
ihk_mc_perfctr_set(counter_id, event->attr.sample_freq * -1);
event->count = 0L;
break;
case PERF_EVENT_IOC_REFRESH:
// TODO: refresh other process
ihk_mc_perfctr_set(counter_id, event->sample_freq * -1);
// not supported on inherited events
if(event->attr.inherit)
return -EINVAL;
event->count += event->attr.sample_freq;
ihk_mc_perfctr_set(counter_id, event->attr.sample_freq * -1);
perf_start(event);
break;
default :
return -1;
@ -2945,6 +3055,28 @@ perf_close(struct mckfd *sfd, ihk_mc_user_context_t *ctx)
return 0;
}
/*
 * fcntl hook for a perf event descriptor.
 *
 * Reads the command (syscall argument 1) and its argument (syscall
 * argument 2). For command 10 (F_SETSIG) the argument is stored in
 * sfd->sig_no. Every command, recorded or not, is then passed on through
 * syscall_generic_forwarding() and that call's result is returned.
 */
static int
perf_fcntl(struct mckfd *sfd, ihk_mc_user_context_t *ctx)
{
	int cmd = ihk_mc_syscall_arg1(ctx);
	long arg = ihk_mc_syscall_arg2(ctx);
	int rc;

	/* 10 == F_SETSIG: remember the requested signal number. */
	if (cmd == 10) {
		sfd->sig_no = arg;
	}
	/* All other commands, 0xf (F_SETOWN_EX) included, keep no local state. */

	rc = syscall_generic_forwarding(__NR_fcntl, ctx);
	return rc;
}
static long
perf_mmap(struct mckfd *sfd, ihk_mc_user_context_t *ctx)
{
@ -2963,6 +3095,7 @@ perf_mmap(struct mckfd *sfd, ihk_mc_user_context_t *ctx)
// setup perf_event_mmap_page
page = (struct perf_event_mmap_page *)rc;
page->data_head = 16;
page->cap_user_rdpmc = 1;
return rc;
@ -3014,7 +3147,7 @@ SYSCALL_DECLARE(perf_event_open)
event->sample_freq = attr->sample_freq;
event->nr_siblings = 0;
event->count = 0;
event->count = 0L;
event->child_count_total = 0;
event->parent = NULL;
event->pid = pid;
@ -3050,10 +3183,12 @@ SYSCALL_DECLARE(perf_event_open)
if(!sfd)
return -ENOMEM;
sfd->fd = fd;
sfd->sig_no = -1;
sfd->read_cb = perf_read;
sfd->ioctl_cb = perf_ioctl;
sfd->close_cb = perf_close;
sfd->mmap_cb = perf_mmap;
sfd->fcntl_cb = perf_fcntl;
sfd->data = (long)event;
irqstate = ihk_mc_spinlock_lock(&proc->mckfd_lock);
@ -3413,6 +3548,90 @@ SYSCALL_DECLARE(mincore)
return 0;
} /* sys_mincore() */
/*
 * Range callback: turn on every bit of arg in range->flag.
 * Always returns 0.
 */
static int
set_memory_range_flag(struct vm_range *range, unsigned long arg)
{
	range->flag = range->flag | arg;

	return 0;
}
/*
 * Range callback: turn off every bit of arg in range->flag.
 * Always returns 0.
 */
static int
clear_memory_range_flag(struct vm_range *range, unsigned long arg)
{
	range->flag = range->flag & ~arg;

	return 0;
}
/*
 * Apply change_proc(range, arg) to the memory ranges of vm that overlap
 * [start, end).
 *
 * A range that begins below the current position or extends beyond end is
 * split first, so the callback is only invoked on pieces lying inside the
 * interval. After the walk, the ranges from the one preceding the first
 * visited range up to the last visited one are handed pairwise to
 * join_process_memory_range() so that neighbours can be merged again.
 *
 * The walk stops quietly (without an error) when no further range exists
 * or the next range begins at or beyond end; gaps between ranges are
 * stepped over.
 *
 * Returns 0 on success, -ENOMEM when no range is found at start, or the
 * first non-zero value returned by split_process_memory_range() or by
 * change_proc.
 *
 * NOTE(review): presumably the caller must hold the lock protecting the
 * range list of vm -- confirm against the call sites.
 */
static int
change_attr_process_memory_range(struct process_vm *vm,
                                 uintptr_t start, uintptr_t end,
                                 int (*change_proc)(struct vm_range *,
                                                    unsigned long),
                                 unsigned long arg)
{
	uintptr_t addr;
	int error;
	struct vm_range *range;
	struct vm_range *prev;
	struct vm_range *next;
	int join_flag = 0;

	error = 0;
	range = lookup_process_memory_range(vm, start, start + PAGE_SIZE);
	if (!range) {
		error = -ENOMEM;
		goto out;
	}

	/* Remember where the re-join pass below has to begin. */
	prev = previous_process_memory_range(vm, range);
	if (!prev)
		prev = range;

	addr = start;
	while (addr < end) {
		struct vm_range *following;

		/* Cut off the part of the range lying below addr. */
		if (range->start < addr) {
			error = split_process_memory_range(vm, range, addr, &range);
			if (error) {
				break;
			}
		}
		/* Cut off the part of the range lying beyond end. */
		if (end < range->end) {
			error = split_process_memory_range(vm, range, end, NULL);
			if (error) {
				break;
			}
		}

		/*
		 * Only a failing callback ends the walk.  Leaving the loop
		 * on success would restrict the change to the first range.
		 */
		error = change_proc(range, arg);
		if (error) {
			break;
		}

		/*
		 * Advance using the range that was just processed, and keep
		 * "range" pointing at a valid entry for the join pass.
		 */
		addr = range->end;
		if (addr >= end) {
			break;
		}
		following = next_process_memory_range(vm, range);
		if (!following || following->start >= end) {
			break;
		}
		range = following;
	}

	/* Pick the last range the join pass should consider. */
	if (error) {
		next = next_process_memory_range(vm, range);
		if (!next)
			next = range;
	}
	else {
		next = range;
	}

	/* Try to merge neighbouring ranges from prev up to next. */
	while (prev != next) {
		int wkerr;

		range = next_process_memory_range(vm, prev);
		if (!range)
			break;
		wkerr = join_process_memory_range(vm, prev, range);
		if (range == next)
			join_flag = 1;
		if (wkerr) {
			/* Nothing more to merge once next has been reached. */
			if (join_flag)
				break;
			prev = range;
		}
	}

out:
	return error;
}
SYSCALL_DECLARE(madvise)
{
const uintptr_t start = (uintptr_t)ihk_mc_syscall_arg0(ctx);
@ -3521,6 +3740,7 @@ SYSCALL_DECLARE(madvise)
goto out;
}
}
else if(advice == MADV_DONTFORK || advice == MADV_DOFORK);
else if (!range->memobj || !memobj_has_pager(range->memobj)) {
dkprintf("[%d]sys_madvise(%lx,%lx,%x):has not pager"
"[%lx-%lx) %lx\n",
@ -3565,6 +3785,27 @@ SYSCALL_DECLARE(madvise)
}
}
if(advice == MADV_DONTFORK){
error = change_attr_process_memory_range(thread->vm, start, end,
set_memory_range_flag,
VR_DONTFORK);
if(error){
goto out;
}
}
if(advice == MADV_DOFORK){
error = change_attr_process_memory_range(thread->vm, start, end,
clear_memory_range_flag,
VR_DONTFORK);
if(error){
goto out;
}
}
if(advice == MADV_DONTFORK ||
advice == MADV_DOFORK){
error = syscall_generic_forwarding(__NR_madvise, ctx);
}
error = 0;
out:
ihk_mc_spinlock_unlock_noirq(&thread->vm->memory_range_lock);
@ -5540,6 +5781,10 @@ SYSCALL_DECLARE(sched_setaffinity)
int empty_set = 1;
extern int num_processors;
if (!u_cpu_set) {
return -EINVAL;
}
if (sizeof(k_cpu_set) > len) {
memset(&k_cpu_set, 0, sizeof(k_cpu_set));
}
@ -5547,7 +5792,7 @@ SYSCALL_DECLARE(sched_setaffinity)
len = MIN2(len, sizeof(k_cpu_set));
if (copy_from_user(&k_cpu_set, u_cpu_set, len)) {
kprintf("%s:%d copy_from_user failed.\n", __FILE__, __LINE__);
dkprintf("%s: error: copy_from_user failed for %p:%d\n", __FUNCTION__, u_cpu_set, len);
return -EFAULT;
}

View File

@ -75,7 +75,7 @@ sysfs_createf(struct sysfs_ops *ops, void *instance, int mode,
dkprintf("sysfs_createf(%p,%p,%#o,%s,...)\n",
ops, instance, mode, fmt);
param = allocate_pages(1, IHK_MC_AP_NOWAIT);
param = ihk_mc_alloc_pages(1, IHK_MC_AP_NOWAIT);
if (!param) {
error = -ENOMEM;
ekprintf("sysfs_createf:allocate_pages failed. %d\n", error);
@ -134,7 +134,7 @@ sysfs_createf(struct sysfs_ops *ops, void *instance, int mode,
error = 0;
out:
if (param) {
free_pages(param, 1);
ihk_mc_free_pages(param, 1);
}
if (error) {
ekprintf("sysfs_createf(%p,%p,%#o,%s,...): %d\n",
@ -156,7 +156,7 @@ sysfs_mkdirf(sysfs_handle_t *dirhp, const char *fmt, ...)
dkprintf("sysfs_mkdirf(%p,%s,...)\n", dirhp, fmt);
param = allocate_pages(1, IHK_MC_AP_NOWAIT);
param = ihk_mc_alloc_pages(1, IHK_MC_AP_NOWAIT);
if (!param) {
error = -ENOMEM;
ekprintf("sysfs_mkdirf:allocate_pages failed. %d\n", error);
@ -208,7 +208,7 @@ sysfs_mkdirf(sysfs_handle_t *dirhp, const char *fmt, ...)
out:
if (param) {
free_pages(param, 1);
ihk_mc_free_pages(param, 1);
}
if (error) {
ekprintf("sysfs_mkdirf(%p,%s,...): %d\n", dirhp, fmt, error);
@ -229,7 +229,7 @@ sysfs_symlinkf(sysfs_handle_t targeth, const char *fmt, ...)
dkprintf("sysfs_symlinkf(%#lx,%s,...)\n", targeth.handle, fmt);
param = allocate_pages(1, IHK_MC_AP_NOWAIT);
param = ihk_mc_alloc_pages(1, IHK_MC_AP_NOWAIT);
if (!param) {
error = -ENOMEM;
ekprintf("sysfs_symlinkf:allocate_pages failed. %d\n", error);
@ -279,7 +279,7 @@ sysfs_symlinkf(sysfs_handle_t targeth, const char *fmt, ...)
error = 0;
out:
if (param) {
free_pages(param, 1);
ihk_mc_free_pages(param, 1);
}
if (error) {
ekprintf("sysfs_symlinkf(%#lx,%s,...): %d\n",
@ -301,7 +301,7 @@ sysfs_lookupf(sysfs_handle_t *objhp, const char *fmt, ...)
dkprintf("sysfs_lookupf(%p,%s,...)\n", objhp, fmt);
param = allocate_pages(1, IHK_MC_AP_NOWAIT);
param = ihk_mc_alloc_pages(1, IHK_MC_AP_NOWAIT);
if (!param) {
error = -ENOMEM;
ekprintf("sysfs_lookupf:allocate_pages failed. %d\n", error);
@ -353,7 +353,7 @@ sysfs_lookupf(sysfs_handle_t *objhp, const char *fmt, ...)
out:
if (param) {
free_pages(param, 1);
ihk_mc_free_pages(param, 1);
}
if (error) {
ekprintf("sysfs_lookupf(%p,%s,...): %d\n", objhp, fmt, error);
@ -374,7 +374,7 @@ sysfs_unlinkf(int flags, const char *fmt, ...)
dkprintf("sysfs_unlinkf(%#x,%s,...)\n", flags, fmt);
param = allocate_pages(1, IHK_MC_AP_NOWAIT);
param = ihk_mc_alloc_pages(1, IHK_MC_AP_NOWAIT);
if (!param) {
error = -ENOMEM;
ekprintf("sysfs_unlinkf:allocate_pages failed. %d\n", error);
@ -423,7 +423,7 @@ sysfs_unlinkf(int flags, const char *fmt, ...)
error = 0;
out:
if (param) {
free_pages(param, 1);
ihk_mc_free_pages(param, 1);
}
if (error) {
ekprintf("sysfs_unlinkf(%#x,%s,...): %d\n", flags, fmt, error);
@ -601,14 +601,14 @@ sysfs_init(void)
}
sysfs_data_bufsize = PAGE_SIZE;
sysfs_data_buf = allocate_pages(1, IHK_MC_AP_NOWAIT);
sysfs_data_buf = ihk_mc_alloc_pages(1, IHK_MC_AP_NOWAIT);
if (!sysfs_data_buf) {
error = -ENOMEM;
ekprintf("sysfs_init:allocate_pages(buf) failed. %d\n", error);
goto out;
}
param = allocate_pages(1, IHK_MC_AP_NOWAIT);
param = ihk_mc_alloc_pages(1, IHK_MC_AP_NOWAIT);
if (!param) {
error = -ENOMEM;
ekprintf("sysfs_init:allocate_pages(param) failed. %d\n",
@ -644,7 +644,7 @@ sysfs_init(void)
error = 0;
out:
if (param) {
free_pages(param, 1);
ihk_mc_free_pages(param, 1);
}
if (error) {
ekprintf("sysfs_init(): %d\n", error);

View File

@ -172,6 +172,10 @@ static int zeroobj_get_page(struct memobj *memobj, off_t off, int p2align,
struct zeroobj *obj = to_zeroobj(memobj);
struct page *page;
/* Don't bother about zero page, page fault handler will
* allocate and clear pages */
return 0;
dkprintf("zeroobj_get_page(%p,%#lx,%d,%p)\n",
memobj, off, p2align, physp);
if (off & ~PAGE_MASK) {

View File

@ -103,7 +103,7 @@ void ihk_mc_clean_micpa(void);
void *ihk_mc_alloc_aligned_pages(int npages, int p2align, enum ihk_mc_ap_flag flag);
void *ihk_mc_alloc_pages(int npages, enum ihk_mc_ap_flag flag);
void ihk_mc_free_pages(void *p, int npages);
void *ihk_mc_allocate(int size, enum ihk_mc_ap_flag flag);
void *ihk_mc_allocate(int size, int flag);
void ihk_mc_free(void *p);
void *arch_alloc_page(enum ihk_mc_ap_flag flag);

View File

@ -54,11 +54,11 @@ int ihk_mc_perfctr_start(unsigned long counter_mask);
int ihk_mc_perfctr_stop(unsigned long counter_mask);
int ihk_mc_perfctr_fixed_init(int counter, int mode);
int ihk_mc_perfctr_reset(int counter);
int ihk_mc_perfctr_set(int counter, unsigned long value);
int ihk_mc_perfctr_set(int counter, long value);
int ihk_mc_perfctr_read_mask(unsigned long counter_mask, unsigned long *value);
unsigned long ihk_mc_perfctr_read(int counter);
unsigned long ihk_mc_perfctr_read_msr(int counter);
int ihk_mc_perfctr_alloc_counter(unsigned long pmc_status);
int ihk_mc_perfctr_alloc_counter(unsigned int *type, unsigned long *config, unsigned long pmc_status);
#endif