Compare commits

...

80 Commits
1.1.2 ... 1.2.2

Author SHA1 Message Date
ddc33821cf sched_yield(): avoid schedule for single thread 2016-12-05 18:10:20 +09:00
0ab7d02994 disable syscall tracker and eliminate interrupt_syscall debug msg 2016-12-05 18:10:20 +09:00
a8c4ab221b use MCS locks in signal handling code 2016-12-05 18:10:20 +09:00
87d36a7752 mcreboot-smp-x86: -t to enable turbo boost 2016-12-05 18:10:20 +09:00
998ded414c mcreboot-smp-x86: shorter sleep in waiting for /proc 2016-12-05 18:10:20 +09:00
f78d031e64 syscall and offload tracking (disabled by default) 2016-12-05 18:10:20 +09:00
4ab37dd34a schedule(): only load page table during context switch if it's different 2016-12-05 18:10:20 +09:00
8129dec2f7 Fix out-of-tree build
<build>/ihk/cokernel/Makefile.common is not found when
<build>/mckernel/kernel/Makfile tries to perform
"Make -C <build>/ihk/{cokernel,ikc}" from mckernel/kernel
2016-12-01 16:44:01 +09:00
a1035a1878 fix out of tree build 2016-12-01 12:55:34 +09:00
db169c5f90 add gcc options (-ffreestanding -fno-tree-loop-distribute-patterns)
refs #299
2016-11-29 16:28:18 +09:00
bbb55ef261 sched_setparam: thread lock is necessary when update other thread data 2016-11-28 14:04:44 +09:00
1130cafe41 ptrace: fixed for threads. 2016-11-28 11:19:30 +09:00
a1cf27e232 sched_getaffinity(): fix error code for special invalid input 2016-11-28 05:50:01 +09:00
5a1ce99d87 mcexec: fix number of threads not to exceed thread_data array 2016-11-27 07:31:52 +09:00
c7db296e1b getcpu(): expose correct NUMA id 2016-11-26 09:29:09 +09:00
f634a750c5 sched_{set/get}affinity(): fix error codes (also fixes KMP_AFFINITY behavior) 2016-11-24 21:25:16 +09:00
d07a196c8e mcexec: enable the same number of threads as CPU cores 2016-11-24 16:40:52 +09:00
8c56c75d2c process_vm_read_writev(): fix base address check for EFAULT 2016-11-24 10:40:41 +09:00
e54895efde set_mempolicy(): debug msg 2016-11-23 08:53:26 +09:00
2f8cca2d6d memcpy(): faster version using ASM rep; movsl 2016-11-23 08:51:22 +09:00
64607152ee VM: introduction of range lookup cache 2016-11-23 08:48:44 +09:00
20383ad3d0 do_process_vm_read_writev(): page size awareness optimization 2016-11-23 08:47:32 +09:00
787d34f650 introduction of ihk_mc_pt_virt_to_phys_size() 2016-11-23 08:40:33 +09:00
ae618a0c68 mcexec: remount /proc in mcexec's file NS after exec() 2016-11-22 13:22:59 +09:00
f480376153 mcoverlayfs: supported Linux kernel 4.6
add mcoverlayfs(linux-4.6.7 base)
2016-11-17 18:09:27 +09:00
e4b3a88fc6 mcexec_sys_umount(): remove debug print 2016-11-10 15:05:45 +09:00
69a5c53074 NUMA: hide non-existing nodes from /sys/devices/system/node listing 2016-11-05 16:12:08 +09:00
259583e936 mcreboot-smp-x86.sh: more white out of invalid NUMA info 2016-11-05 13:35:53 +09:00
0f826290d0 NUMA: get_mempolicy(), set_mempolicy() and mbind() implementation 2016-11-05 13:32:02 +09:00
e46f027894 mcexec/mcctrl: unmount cgroups (privately) which expose invalid NUMA info 2016-11-04 17:02:48 +09:00
3e093f6a40 sysfs: fix /sys/devices/system/node/online value 2016-11-03 16:10:29 +09:00
00996b551f mcreboot: white out non-existing NUMA information 2016-11-03 16:09:27 +09:00
24d8697cef mcexec: workaround for overlayed /sys FS directory lseek() bug
lseek() on directories under /sys filesystem that are part of an
overlayed filesystem behave differently than in the original /sys.
This causes segfault in libnuma when discovering topology
information. The patch fakes return value as it is supposed to be,
which also fixes the Intel MPI 2017 MPI_Init() crash.
2016-11-03 13:41:25 +09:00
be4f6741f9 sysfs: fix /sys/devices/system/cpu/cpuXX/online value 2016-11-03 13:39:21 +09:00
7a2f67f5f0 sysfs: eliminate unnecessary new line from /sys/devices/system/node/nodeX/distance 2016-11-03 13:37:53 +09:00
bba0425267 sysfs: fix /sys/devices/system/cpu/online value 2016-11-03 13:36:29 +09:00
beaf96b375 mcreboot/mcstop: proper error handling (revert previous state) 2016-10-28 14:29:10 +09:00
f1af1ffb8f NUMA: expose correct NUMA distances in sysfs 2016-10-27 14:29:15 +09:00
059fab2cc0 mcctrl: fix NULL pointer dereference for unbooted OS instance shutdown 2016-10-26 14:50:07 +09:00
f284a80656 Defrag memory in mcreboot.sh
Merge free physical pages to create large, physically contiguous
blocks with the following command.

    echo 1 > /proc/sys/vm/compact_memory
2016-10-25 16:35:43 +09:00
5f973ab51e IKC2: adjust master channel message queue size dynamically
Determine master channel's message queue size based on the number of
LWK CPUs so that all cores can communicate simultaneously during
syscall channel initialization.
2016-10-24 20:49:00 +09:00
60b6713957 IKC2: eliminate unused structures/fields of old IKC code 2016-10-24 15:41:27 +09:00
ebcf9a0d6d mcctrl: fix a bunch of -Wframe-larger-than warnings 2016-10-21 04:54:38 -04:00
942b7f8b78 mcreboot-smp-x86: eliminate unnecessary resource queries 2016-10-21 03:38:21 -04:00
0b0aa6c0e0 Start mcklogd before McKernel to avoid deadlock
McKernel blocks forever waiting for mcklogd to retrieve kmsg when
kmsg bufer is full with boot log and mcklogd isn't running.
2016-10-19 16:40:32 +09:00
9705a80c82 get/set_mempolicy(): support for query/set process level policy 2016-10-16 14:01:14 +09:00
99a02e2941 get_mempolicy(): store policy in per-process VM structure 2016-10-16 09:10:36 +09:00
b88d75720f __NR_gettid: use regular offloading channel (fixes unknown PID bug) 2016-10-15 11:46:01 +09:00
d2b677b6da get_mempolicy(): initial implementation 2016-10-14 21:34:32 +09:00
083645f203 mcreboot: purge Linux caches before reserving IHK resources 2016-10-14 21:34:32 +09:00
994b9a19ac NUMA: expose CPU and memory info in /proc/self/status 2016-10-14 21:34:32 +09:00
faa929e717 NUMA: add NUMA mask to process VM structure 2016-10-14 21:34:31 +09:00
3ee3a9df6d sysfs: fix bitmask and bitmask list-view display bug 2016-10-14 21:34:31 +09:00
73e1a4f1f9 NUMA: fill in /sys/devices/system/cpu/nodeX properly and sync with boot script 2016-10-14 21:34:31 +09:00
b068fde9cd NUMA: use IHK CPU and NUMA mappings for sysfs entries 2016-10-14 21:34:31 +09:00
167ea67dee NUMA: receive CPU info in array format 2016-10-14 21:34:31 +09:00
f33d85a27a eclair: support for multiple physical memory chunks 2016-10-14 21:34:31 +09:00
1e8239d72a kmalloc/pagealloc tracker: fix race condition bug 2016-10-14 21:34:31 +09:00
a51a0a6f13 page allocation tracker: support tracking partial deallocations 2016-10-14 21:34:31 +09:00
cc3f6e1a4f page_fault_process_memory_range(): fix double allocation leak 2016-10-14 21:34:31 +09:00
5db6c311f4 page alloc tracker: count freed pages in addr tracker objects 2016-10-14 21:34:31 +09:00
f4df713846 munmap(): fix memory leak in non page backed mappings 2016-10-14 21:34:31 +09:00
7176bb2a47 allow partial deallocation in page level allocation tracker 2016-10-14 21:34:30 +09:00
a6bd98cc02 MM: memory leak tracker for page level allocator 2016-10-14 21:34:30 +09:00
0f7462ae1c mm.h: eliminate global pa_allocator 2016-10-14 21:34:30 +09:00
0d8d915d82 fix KMALLOC_MIN_SIZE macro 2016-10-14 21:34:30 +09:00
8f4f68b877 eliminate arch_alloc_page() and move ihk_mc_alloc_pages() to arch independent code 2016-10-14 21:34:30 +09:00
8c0a5a5e61 page_hash_count_pages(): report page hash size in memory stat 2016-10-14 21:34:30 +09:00
ffd3f53785 page_unmap(): proper locking of hash table 2016-10-14 21:34:30 +09:00
f39fa54c39 NUMA: default policy: allocate from CPU's NUMA node 2016-10-14 21:34:30 +09:00
11125b0d68 fileobj and shmemobj: delete unused variables 2016-10-14 21:34:30 +09:00
3ae69d1290 NUMA: process CPU NUMA information 2016-10-14 21:34:30 +09:00
2929fbb803 NUMA: support multiple physical allocators 2016-10-14 21:34:30 +09:00
f4db8b96de fileobj/shmobj: release pages correctly according to dynamic page frame management 2016-10-14 21:34:30 +09:00
8eb3bf3559 physical page management: eliminate static page frame array and
maintain page structures dynamically covering only file mappings.
use hash table for address <-> page structure conversion.
2016-10-14 21:34:29 +09:00
326a4fcee4 mem_init(): parse NUMA information 2016-10-14 21:34:29 +09:00
9b82f1a52c use ihk_mc_alloc/free_pages() and eliminate direct calls to low level routines 2016-10-14 21:34:29 +09:00
f3da381752 ihk_mc_unmap_virtual: add flush_tlb_single
refs #778
2016-10-11 14:44:23 +09:00
8aa589a40c A signal may not sometimes arrive to a thread. 2016-10-04 14:35:25 +09:00
e03f377326 interrupt_syscall: interrupt valid thread 2016-10-03 00:49:56 +09:00
59 changed files with 8960 additions and 1005 deletions

View File

@ -148,7 +148,7 @@ extern char page_fault[], general_protection_exception[];
extern char debug_exception[], int3_exception[];
uint64_t boot_pat_state = 0;
int no_turbo = 0; /* May be updated by early parsing of kargs */
int no_turbo = 1; /* May be updated by early parsing of kargs */
extern int num_processors; /* kernel/ap.c */
struct pvclock_vsyscall_time_info *pvti = NULL;
@ -1079,6 +1079,10 @@ unhandled_page_fault(struct thread *thread, void *fault_addr, void *regs)
kprintf_unlock(irqflags);
if (!(error & PF_USER)) {
panic("panic: kernel mode PF");
}
/* TODO */
ihk_mc_debug_show_interrupt_context(regs);
@ -1736,7 +1740,7 @@ int arch_setup_pvclock(void)
npages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
pvti_npages = npages;
pvti = allocate_pages(npages, IHK_MC_AP_NOWAIT);
pvti = ihk_mc_alloc_pages(npages, IHK_MC_AP_NOWAIT);
if (!pvti) {
ekprintf("arch_setup_pvclock: allocate_pages failed.\n");
return -ENOMEM;
@ -1766,44 +1770,6 @@ void arch_start_pvclock(void)
return;
} /* arch_start_pvclock() */
static struct cpu_mapping *cpu_mapping = NULL;
int arch_get_cpu_mapping(struct cpu_mapping **buf, int *nelemsp)
{
int error;
size_t size;
int npages;
struct cpu_mapping *mapping;
int cpu;
struct x86_cpu_local_variables *v;
if (!cpu_mapping) {
size = sizeof(*mapping) * num_processors;
npages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
mapping = allocate_pages(npages, IHK_MC_AP_NOWAIT);
if (!mapping) {
error = -ENOMEM;
ekprintf("arch_get_cpu_mapping:allocate_pages failed. %d\n", error);
goto out;
}
for (cpu = 0; cpu < num_processors; ++cpu) {
v = get_x86_cpu_local_variable(cpu);
mapping[cpu].cpu_number = cpu;
mapping[cpu].hw_id = v->apic_id;
}
cpu_mapping = mapping;
}
error = 0;
*buf = cpu_mapping;
*nelemsp = num_processors;
out:
return error;
} /* arch_get_cpu_mapping() */
#define KVM_CPUID_SIGNATURE 0x40000000
int running_on_kvm(void) {

View File

@ -13,6 +13,8 @@
#ifndef HEADER_X86_COMMON_ARCH_BITOPS_H
#define HEADER_X86_COMMON_ARCH_BITOPS_H
#define ARCH_HAS_FAST_MULTIPLIER 1
static inline int fls(int x)
{
int r;

View File

@ -306,7 +306,7 @@ struct page_table;
void set_pte(pte_t *ppte, unsigned long phys, enum ihk_mc_pt_attribute attr);
pte_t *get_pte(struct page_table *pt, void *virt, enum ihk_mc_pt_attribute attr);
void *early_alloc_page(void);
void *early_alloc_pages(int nr_pages);
void *get_last_early_heap(void);
void flush_tlb(void);
void flush_tlb_single(unsigned long addr);

View File

@ -0,0 +1,23 @@
#ifndef _ASM_X86_STRING_H
#define _ASM_X86_STRING_H
#define ARCH_FAST_MEMCPY
static inline void *__inline_memcpy(void *to, const void *from, size_t n)
{
unsigned long d0, d1, d2;
asm volatile("rep ; movsl\n\t"
"testb $2,%b4\n\t"
"je 1f\n\t"
"movsw\n"
"1:\ttestb $1,%b4\n\t"
"je 2f\n\t"
"movsb\n"
"2:"
: "=&c" (d0), "=&D" (d1), "=&S" (d2)
: "0" (n / 4), "q" (n), "1" ((long)to), "2" ((long)from)
: "memory");
return to;
}
#endif

View File

@ -150,5 +150,8 @@ SYSCALL_HANDLED(602, pmc_start)
SYSCALL_HANDLED(603, pmc_stop)
SYSCALL_HANDLED(604, pmc_reset)
SYSCALL_HANDLED(700, get_cpu_id)
#ifdef TRACK_SYSCALLS
SYSCALL_HANDLED(701, syscall_offload_clr_cntrs)
#endif // TRACK_SYSCALLS
/**** End of File ****/

View File

@ -31,11 +31,10 @@
static char *last_page;
extern char _head[], _end[];
static struct ihk_mc_pa_ops *pa_ops;
extern unsigned long x86_kernel_phys_base;
void *early_alloc_page(void)
/* Arch specific early allocation routine */
void *early_alloc_pages(int nr_pages)
{
void *p;
@ -48,41 +47,14 @@ void *early_alloc_page(void)
panic("Early allocator is already finalized. Do not use it.\n");
}
p = last_page;
last_page += PAGE_SIZE;
last_page += (nr_pages * PAGE_SIZE);
return p;
}
void *arch_alloc_page(enum ihk_mc_ap_flag flag)
void early_alloc_invalidate(void)
{
if (pa_ops)
return pa_ops->alloc_page(1, PAGE_P2ALIGN, flag);
else
return early_alloc_page();
}
void arch_free_page(void *ptr)
{
if (pa_ops)
pa_ops->free_page(ptr, 1);
}
void *ihk_mc_alloc_aligned_pages(int npages, int p2align, enum ihk_mc_ap_flag flag)
{
if (pa_ops)
return pa_ops->alloc_page(npages, p2align, flag);
else
return NULL;
}
void *ihk_mc_alloc_pages(int npages, enum ihk_mc_ap_flag flag)
{
return ihk_mc_alloc_aligned_pages(npages, PAGE_P2ALIGN, flag);
}
void ihk_mc_free_pages(void *p, int npages)
{
if (pa_ops)
pa_ops->free_page(p, npages);
last_page = (void *)-1;
}
void *ihk_mc_allocate(int size, int flag)
@ -175,7 +147,7 @@ static unsigned long setup_l3(struct page_table *pt,
pt->entry[i] = 0;
continue;
}
pt_phys = setup_l2(arch_alloc_page(IHK_MC_AP_CRITICAL), phys, start, end);
pt_phys = setup_l2(ihk_mc_alloc_pages(1, IHK_MC_AP_CRITICAL), phys, start, end);
pt->entry[i] = pt_phys | PFL3_PDIR_ATTR;
}
@ -199,7 +171,7 @@ static void init_normal_area(struct page_table *pt)
for (phys = (map_start & ~(PTL4_SIZE - 1)); phys < map_end;
phys += PTL4_SIZE) {
pt_phys = setup_l3(arch_alloc_page(IHK_MC_AP_CRITICAL), phys,
pt_phys = setup_l3(ihk_mc_alloc_pages(1, IHK_MC_AP_CRITICAL), phys,
map_start, map_end);
pt->entry[ident_index++] = pt_phys | PFL4_PDIR_ATTR;
@ -209,7 +181,7 @@ static void init_normal_area(struct page_table *pt)
static struct page_table *__alloc_new_pt(enum ihk_mc_ap_flag ap_flag)
{
struct page_table *newpt = arch_alloc_page(ap_flag);
struct page_table *newpt = ihk_mc_alloc_pages(1, ap_flag);
if(newpt)
memset(newpt, 0, sizeof(struct page_table));
@ -518,8 +490,10 @@ uint64_t ihk_mc_pt_virt_to_pagemap(struct page_table *pt, unsigned long virt)
return pagemap;
}
int ihk_mc_pt_virt_to_phys(struct page_table *pt,
const void *virt, unsigned long *phys)
int ihk_mc_pt_virt_to_phys_size(struct page_table *pt,
const void *virt,
unsigned long *phys,
unsigned long *size)
{
int l4idx, l3idx, l2idx, l1idx;
unsigned long v = (unsigned long)virt;
@ -541,6 +515,7 @@ int ihk_mc_pt_virt_to_phys(struct page_table *pt,
if ((pt->entry[l3idx] & PFL3_SIZE)) {
*phys = pte_get_phys(&pt->entry[l3idx])
| (v & (PTL3_SIZE - 1));
if (size) *size = PTL3_SIZE;
return 0;
}
pt = phys_to_virt(pte_get_phys(&pt->entry[l3idx]));
@ -551,6 +526,7 @@ int ihk_mc_pt_virt_to_phys(struct page_table *pt,
if ((pt->entry[l2idx] & PFL2_SIZE)) {
*phys = pte_get_phys(&pt->entry[l2idx])
| (v & (PTL2_SIZE - 1));
if (size) *size = PTL2_SIZE;
return 0;
}
pt = phys_to_virt(pte_get_phys(&pt->entry[l2idx]));
@ -560,9 +536,17 @@ int ihk_mc_pt_virt_to_phys(struct page_table *pt,
}
*phys = pte_get_phys(&pt->entry[l1idx]) | (v & (PTL1_SIZE - 1));
if (size) *size = PTL1_SIZE;
return 0;
}
int ihk_mc_pt_virt_to_phys(struct page_table *pt,
const void *virt, unsigned long *phys)
{
return ihk_mc_pt_virt_to_phys_size(pt, virt, phys, NULL);
}
int ihk_mc_pt_print_pte(struct page_table *pt, void *virt)
{
int l4idx, l3idx, l2idx, l1idx;
@ -718,7 +702,7 @@ static void destroy_page_table(int level, struct page_table *pt)
}
}
arch_free_page(pt);
ihk_mc_free_pages(pt, 1);
return;
}
@ -1112,7 +1096,7 @@ static int clear_range_l1(void *args0, pte_t *ptep, uint64_t base,
}
if (!(old & PFL1_FILEOFF) && args->free_physical) {
if (page && page_unmap(page)) {
if (!page || (page && page_unmap(page))) {
ihk_mc_free_pages(phys_to_virt(phys), 1);
dkprintf("%s: freeing regular page at 0x%lx\n", __FUNCTION__, base);
}
@ -1161,7 +1145,7 @@ static int clear_range_l2(void *args0, pte_t *ptep, uint64_t base,
}
if (!(old & PFL2_FILEOFF) && args->free_physical) {
if (page && page_unmap(page)) {
if (!page || (page && page_unmap(page))) {
ihk_mc_free_pages(phys_to_virt(phys), PTL2_SIZE/PTL1_SIZE);
dkprintf("%s: freeing large page at 0x%lx\n", __FUNCTION__, base);
}
@ -1181,7 +1165,7 @@ static int clear_range_l2(void *args0, pte_t *ptep, uint64_t base,
*ptep = PTE_NULL;
remote_flush_tlb_cpumask(args->vm, base,
ihk_mc_get_processor_id());
arch_free_page(pt);
ihk_mc_free_pages(pt, 1);
}
return 0;
@ -1226,7 +1210,7 @@ static int clear_range_l3(void *args0, pte_t *ptep, uint64_t base,
}
if (!(old & PFL3_FILEOFF) && args->free_physical) {
if (page && page_unmap(page)) {
if (!page || (page && page_unmap(page))) {
ihk_mc_free_pages(phys_to_virt(phys), PTL3_SIZE/PTL1_SIZE);
}
args->vm->currss -= PTL3_SIZE;
@ -1245,7 +1229,7 @@ static int clear_range_l3(void *args0, pte_t *ptep, uint64_t base,
*ptep = PTE_NULL;
remote_flush_tlb_cpumask(args->vm, base,
ihk_mc_get_processor_id());
arch_free_page(pt);
ihk_mc_free_pages(pt, 1);
}
return 0;
@ -1596,7 +1580,7 @@ retry:
error = 0;
out:
if (newpt) {
arch_free_page(newpt);
ihk_mc_free_pages(newpt, 1);
}
dkprintf("set_range_l2(%lx,%lx,%lx): %d %lx\n",
base, start, end, error, *ptep);
@ -1679,7 +1663,7 @@ retry:
error = 0;
out:
if (newpt) {
arch_free_page(newpt);
ihk_mc_free_pages(newpt, 1);
}
dkprintf("set_range_l3(%lx,%lx,%lx): %d\n",
base, start, end, error, *ptep);
@ -1737,7 +1721,7 @@ retry:
error = 0;
out:
if (newpt) {
arch_free_page(newpt);
ihk_mc_free_pages(newpt, 1);
}
dkprintf("set_range_l4(%lx,%lx,%lx): %d %lx\n",
base, start, end, error, *ptep);
@ -2094,7 +2078,7 @@ static void init_vsyscall_area(struct page_table *pt)
void init_page_table(void)
{
check_available_page_size();
init_pt = arch_alloc_page(IHK_MC_AP_CRITICAL);
init_pt = ihk_mc_alloc_pages(1, IHK_MC_AP_CRITICAL);
ihk_mc_spinlock_init(&init_pt_lock);
memset(init_pt, 0, sizeof(PAGE_SIZE));
@ -2111,27 +2095,27 @@ void init_page_table(void)
}
extern void __reserve_arch_pages(unsigned long, unsigned long,
void (*)(unsigned long, unsigned long, int));
void (*)(struct ihk_page_allocator_desc *,
unsigned long, unsigned long, int));
void ihk_mc_reserve_arch_pages(unsigned long start, unsigned long end,
void (*cb)(unsigned long, unsigned long, int))
void ihk_mc_reserve_arch_pages(struct ihk_page_allocator_desc *pa_allocator,
unsigned long start, unsigned long end,
void (*cb)(struct ihk_page_allocator_desc *,
unsigned long, unsigned long, int))
{
/* Reserve Text + temporal heap */
cb(virt_to_phys(_head), virt_to_phys(get_last_early_heap()), 0);
cb(pa_allocator, virt_to_phys(_head), virt_to_phys(get_last_early_heap()), 0);
/* Reserve trampoline area to boot the second ap */
cb(ap_trampoline, ap_trampoline + AP_TRAMPOLINE_SIZE, 0);
cb(pa_allocator, ap_trampoline, ap_trampoline + AP_TRAMPOLINE_SIZE, 0);
/* Reserve the null page */
cb(0, PAGE_SIZE, 0);
/* Micro-arch specific */
cb(pa_allocator, 0, PAGE_SIZE, 0);
/*
* Micro-arch specific
* TODO: this does nothing in SMP mode, update it for KNC if necessary
*/
__reserve_arch_pages(start, end, cb);
}
void ihk_mc_set_page_allocator(struct ihk_mc_pa_ops *ops)
{
last_page = (void *)-1;
pa_ops = ops;
}
unsigned long virt_to_phys(void *v)
{
unsigned long va = (unsigned long)v;
@ -2383,8 +2367,18 @@ int write_process_vm(struct process_vm *vm, void *udst, const void *ksrc, size_t
return error;
}
va = phys_to_virt(pa);
memcpy(va, from, cpsize);
if (pa < ihk_mc_get_memory_address(IHK_MC_GMA_MAP_START, 0) ||
pa >= ihk_mc_get_memory_address(IHK_MC_GMA_MAP_END, 0)) {
dkprintf("%s: pa is outside of LWK memory, from: %p,"
"pa: %p, cpsize: %d\n", __FUNCTION__, from, pa, cpsize);
va = ihk_mc_map_virtual(pa, 1, PTATTR_ACTIVE);
memcpy(va, from, cpsize);
ihk_mc_unmap_virtual(va, 1, 1);
}
else {
va = phys_to_virt(pa);
memcpy(va, from, cpsize);
}
from += cpsize;
to += cpsize;

View File

@ -16,6 +16,7 @@
#include <memory.h>
#include <string.h>
extern int num_processors;
extern void arch_set_mikc_queue(void *r, void *w);
ihk_ikc_ph_t arch_master_channel_packet_handler;
@ -23,17 +24,23 @@ int ihk_mc_ikc_init_first_local(struct ihk_ikc_channel_desc *channel,
ihk_ikc_ph_t packet_handler)
{
struct ihk_ikc_queue_head *rq, *wq;
size_t mikc_queue_pages;
ihk_ikc_system_init(NULL);
memset(channel, 0, sizeof(struct ihk_ikc_channel_desc));
/* Place both sides in this side */
rq = arch_alloc_page(IHK_MC_AP_CRITICAL);
wq = arch_alloc_page(IHK_MC_AP_CRITICAL);
mikc_queue_pages = ((num_processors * MASTER_IKCQ_PKTSIZE)
+ (PAGE_SIZE - 1)) / PAGE_SIZE;
ihk_ikc_init_queue(rq, 0, 0, PAGE_SIZE, MASTER_IKCQ_PKTSIZE);
ihk_ikc_init_queue(wq, 0, 0, PAGE_SIZE, MASTER_IKCQ_PKTSIZE);
/* Place both sides in this side */
rq = ihk_mc_alloc_pages(mikc_queue_pages, IHK_MC_AP_CRITICAL);
wq = ihk_mc_alloc_pages(mikc_queue_pages, IHK_MC_AP_CRITICAL);
ihk_ikc_init_queue(rq, 0, 0,
mikc_queue_pages * PAGE_SIZE, MASTER_IKCQ_PKTSIZE);
ihk_ikc_init_queue(wq, 0, 0,
mikc_queue_pages * PAGE_SIZE, MASTER_IKCQ_PKTSIZE);
arch_master_channel_packet_handler = packet_handler;

View File

@ -544,14 +544,14 @@ void ptrace_report_signal(struct thread *thread, int sig)
int parent_pid;
struct siginfo info;
dkprintf("ptrace_report_signal,pid=%d\n", thread->proc->pid);
dkprintf("ptrace_report_signal, tid=%d, pid=%d\n", thread->tid, thread->proc->pid);
mcs_rwlock_writer_lock(&proc->update_lock, &lock);
if(!(proc->ptrace & PT_TRACED)){
mcs_rwlock_writer_unlock(&proc->update_lock, &lock);
return;
}
proc->exit_status = sig;
thread->exit_status = sig;
/* Transition thread state */
proc->status = PS_TRACED;
thread->status = PS_TRACED;
@ -569,8 +569,8 @@ void ptrace_report_signal(struct thread *thread, int sig)
memset(&info, '\0', sizeof info);
info.si_signo = SIGCHLD;
info.si_code = CLD_TRAPPED;
info._sifields._sigchld.si_pid = thread->proc->pid;
info._sifields._sigchld.si_status = thread->proc->exit_status;
info._sifields._sigchld.si_pid = thread->tid;
info._sifields._sigchld.si_status = thread->exit_status;
do_kill(cpu_local_var(current), parent_pid, -1, SIGCHLD, &info, 0);
/* Wake parent (if sleeping in wait4()) */
waitq_wakeup(&proc->parent->waitpid_q);
@ -695,10 +695,10 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
int orgsig;
int ptraceflag = 0;
struct mcs_rwlock_node_irqsave lock;
unsigned long irqstate;
struct mcs_rwlock_node_irqsave mcs_rw_node;
for(w = pending->sigmask.__val[0], sig = 0; w; sig++, w >>= 1);
dkprintf("do_signal,pid=%d,sig=%d\n", proc->pid, sig);
dkprintf("do_signal(): tid=%d, pid=%d, sig=%d\n", thread->tid, proc->pid, sig);
orgsig = sig;
if((proc->ptrace & PT_TRACED) &&
@ -718,12 +718,12 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
rc = regs->gpr.rax;
}
irqstate = ihk_mc_spinlock_lock(&thread->sigcommon->lock);
mcs_rwlock_writer_lock(&thread->sigcommon->lock, &mcs_rw_node);
k = thread->sigcommon->action + sig - 1;
if(k->sa.sa_handler == SIG_IGN){
kfree(pending);
ihk_mc_spinlock_unlock(&thread->sigcommon->lock, irqstate);
mcs_rwlock_writer_unlock(&thread->sigcommon->lock, &mcs_rw_node);
return;
}
else if(k->sa.sa_handler){
@ -808,7 +808,7 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
if(copy_to_user(sigsp, &ksigsp, sizeof ksigsp)){
kfree(pending);
ihk_mc_spinlock_unlock(&thread->sigcommon->lock, irqstate);
mcs_rwlock_writer_unlock(&thread->sigcommon->lock, &mcs_rw_node);
kprintf("do_signal,write_process_vm failed\n");
terminate(0, sig);
return;
@ -827,7 +827,7 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
if(!(k->sa.sa_flags & SA_NODEFER))
thread->sigmask.__val[0] |= pending->sigmask.__val[0];
kfree(pending);
ihk_mc_spinlock_unlock(&thread->sigcommon->lock, irqstate);
mcs_rwlock_writer_unlock(&thread->sigcommon->lock, &mcs_rw_node);
if(regs->gpr.rflags & RFLAGS_TF){
struct siginfo info;
@ -853,7 +853,7 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
}
else
kfree(pending);
ihk_mc_spinlock_unlock(&thread->sigcommon->lock, irqstate);
mcs_rwlock_writer_unlock(&thread->sigcommon->lock, &mcs_rw_node);
switch (sig) {
case SIGSTOP:
case SIGTSTP:
@ -885,7 +885,8 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
/* Wake up the parent who tried wait4 and sleeping */
waitq_wakeup(&proc->parent->waitpid_q);
dkprintf("do_signal,SIGSTOP,sleeping\n");
dkprintf("do_signal(): pid: %d, tid: %d SIGSTOP, sleeping\n",
proc->pid, thread->tid);
/* Sleep */
schedule();
dkprintf("SIGSTOP(): woken up\n");
@ -899,7 +900,7 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
/* Update thread state in fork tree */
mcs_rwlock_writer_lock(&proc->update_lock, &lock);
proc->exit_status = SIGTRAP;
thread->exit_status = SIGTRAP;
proc->status = PS_TRACED;
thread->status = PS_TRACED;
mcs_rwlock_writer_unlock(&proc->update_lock, &lock);
@ -953,11 +954,11 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
static struct sig_pending *
getsigpending(struct thread *thread, int delflag){
struct list_head *head;
ihk_spinlock_t *lock;
mcs_rwlock_lock_t *lock;
struct mcs_rwlock_node_irqsave mcs_rw_node;
struct sig_pending *next;
struct sig_pending *pending;
__sigset_t w;
int irqstate;
__sigset_t x;
int sig;
struct k_sigaction *k;
@ -966,8 +967,12 @@ getsigpending(struct thread *thread, int delflag){
lock = &thread->sigcommon->lock;
head = &thread->sigcommon->sigpending;
for(;;){
irqstate = ihk_mc_spinlock_lock(lock);
for(;;) {
if (delflag)
mcs_rwlock_writer_lock(lock, &mcs_rw_node);
else
mcs_rwlock_reader_lock(lock, &mcs_rw_node);
list_for_each_entry_safe(pending, next, head, list){
for(x = pending->sigmask.__val[0], sig = 0; x; sig++, x >>= 1);
k = thread->sigcommon->action + sig - 1;
@ -976,17 +981,26 @@ getsigpending(struct thread *thread, int delflag){
(k->sa.sa_handler != (void *)1 &&
k->sa.sa_handler != NULL)){
if(!(pending->sigmask.__val[0] & w)){
if(delflag)
if(delflag)
list_del(&pending->list);
ihk_mc_spinlock_unlock(lock, irqstate);
if (delflag)
mcs_rwlock_writer_unlock(lock, &mcs_rw_node);
else
mcs_rwlock_reader_unlock(lock, &mcs_rw_node);
return pending;
}
}
}
ihk_mc_spinlock_unlock(lock, irqstate);
if (delflag)
mcs_rwlock_writer_unlock(lock, &mcs_rw_node);
else
mcs_rwlock_reader_unlock(lock, &mcs_rw_node);
if(lock == &thread->sigpendinglock)
return NULL;
lock = &thread->sigpendinglock;
head = &thread->sigpending;
}
@ -1034,22 +1048,25 @@ check_signal(unsigned long rc, void *regs0, int num)
}
}
ihk_mc_spinlock_unlock(&(cpu_local_var(runq_lock)), irqstate);
return;
goto out;
}
if(regs != NULL && !interrupt_from_user(regs)) {
return;
goto out;
}
for(;;){
pending = getsigpending(thread, 1);
if(!pending) {
dkprintf("check_signal,queue is empty\n");
return;
goto out;
}
do_signal(rc, regs, thread, pending, num);
}
out:
return;
}
unsigned long
@ -1063,7 +1080,8 @@ do_kill(struct thread *thread, int pid, int tid, int sig, siginfo_t *info,
struct thread *tthread = NULL;
int i;
__sigset_t mask;
ihk_spinlock_t *savelock = NULL;
mcs_rwlock_lock_t *savelock = NULL;
struct mcs_rwlock_node mcs_rw_node;
struct list_head *head = NULL;
int rc;
unsigned long irqstate = 0;
@ -1247,7 +1265,7 @@ done:
doint = 0;
ihk_mc_spinlock_lock_noirq(savelock);
mcs_rwlock_writer_lock_noirq(savelock, &mcs_rw_node);
/* Put signal event even when handler is SIG_IGN or SIG_DFL
because target ptraced thread must call ptrace_report_signal
@ -1286,7 +1304,7 @@ done:
}
}
}
ihk_mc_spinlock_unlock_noirq(savelock);
mcs_rwlock_writer_unlock_noirq(savelock, &mcs_rw_node);
cpu_restore_interrupt(irqstate);
if (doint && !(mask & tthread->sigmask.__val[0])) {

View File

@ -3,13 +3,13 @@
# IHK SMP-x86 example boot script.
# author: Balazs Gerofi <bgerofi@riken.jp>
# Copyright (C) 2014 RIKEN AICS
#
# This is an example script for loading IHK, configuring a partition and
# booting McKernel on it.
# The script reserves half of the CPU cores and 512MB of RAM from NUMA node 0
# when IHK is loaded for the first time, otherwise it destroys the current
# McKernel instance and reboots it using the same set of resources as it used
# previously.
#
# This is an example script for loading IHK, configuring a partition and
# booting McKernel on it. Unless specific CPUs and memory are requested,
# the script reserves half of the CPU cores and 512MB of RAM from
# NUMA node 0 when IHK is loaded for the first time.
# Otherwise, it destroys the current McKernel instance and reboots it using
# the same set of resources as it used previously.
# Note that the script does not output anything unless an error occurs.
prefix="@prefix@"
@ -23,18 +23,25 @@ ENABLE_MCOVERLAYFS="@ENABLE_MCOVERLAYFS@"
mem="512M@0"
cpus=""
if [ "${BASH_VERSINFO[0]}" -lt 4 ]; then
echo "You need at least bash-4.0 to run this script." >&2
exit 1
fi
INTERVAL=1
LOGMODE=0
facility="LOG_LOCAL6"
chown_option=`logname 2> /dev/null`
if [ "`systemctl status irqbalance_mck.service 2> /dev/null |grep -E 'Active: active'`" != "" -o "`systemctl status irqbalance.service 2> /dev/null |grep -E 'Active: active'`" != "" ]; then
irqbalance_used="yes"
irqbalance_used="yes"
else
irqbalance_used="no"
irqbalance_used="no"
fi
while getopts :i:k:c:m:o:f: OPT
turbo=""
while getopts :ti:k:c:m:o:f: OPT
do
case ${OPT} in
f) facility=${OPTARG}
@ -71,11 +78,110 @@ do
;;
m) mem=${OPTARG}
;;
t) turbo="turbo"
;;
*) echo "invalid option -${OPT}" >&2
exit 1
esac
done
#
# Revert any state that has been initialized before the error occured.
#
error_exit() {
local status=$1
case $status in
mcos_sys_mounted)
if [ "$enable_mcoverlay" == "yes" ]; then
umount /tmp/mcos/mcos0_sys
fi
;&
mcos_proc_mounted)
if [ "$enable_mcoverlay" == "yes" ]; then
umount /tmp/mcos/mcos0_proc
fi
;&
mcoverlayfs_loaded)
if [ "$enable_mcoverlay" == "yes" ]; then
rmmod mcoverlay
fi
;&
linux_proc_bind_mounted)
if [ "$enable_mcoverlay" == "yes" ]; then
umount /tmp/mcos/linux_proc
fi
;&
tmp_mcos_mounted)
if [ "$enable_mcoverlay" == "yes" ]; then
umount /tmp/mcos
fi
;&
tmp_mcos_created)
if [ "$enable_mcoverlay" == "yes" ]; then
rm -rf /tmp/mcos
fi
;&
os_created)
# Destroy all LWK instances
if ls /dev/mcos* 1>/dev/null 2>&1; then
for i in /dev/mcos*; do
ind=`echo $i|cut -c10-`;
if ! ${SBINDIR}/ihkconfig 0 destroy $ind; then
echo "warning: failed to destroy LWK instance $ind" >&2
fi
done
fi
;&
mcctrl_loaded)
rmmod mcctrl || echo "warning: failed to remove mcctrl" >&2
;&
mem_reserved)
mem=`${SBINDIR}/ihkconfig 0 query mem`
if [ "${mem}" != "" ]; then
if ! ${SBINDIR}/ihkconfig 0 release mem $mem > /dev/null; then
echo "warning: failed to release memory" >&2
fi
fi
;&
cpus_reserved)
cpus=`${SBINDIR}/ihkconfig 0 query cpu`
if [ "${cpus}" != "" ]; then
if ! ${SBINDIR}/ihkconfig 0 release cpu $cpus > /dev/null; then
echo "warning: failed to release CPUs" >&2
fi
fi
;&
ihk_smp_loaded)
rmmod ihk_smp_x86 || echo "warning: failed to remove ihk_smp_x86" >&2
;&
ihk_loaded)
rmmod ihk || echo "warning: failed to remove ihk" >&2
;&
irqbalance_stopped)
if [ "`systemctl status irqbalance_mck.service 2> /dev/null |grep -E 'Active: active'`" != "" ]; then
if ! systemctl stop irqbalance_mck.service 2>/dev/null; then
echo "warning: failed to stop irqbalance_mck" >&2
fi
if ! systemctl disable irqbalance_mck.service >/dev/null 2>/dev/null; then
echo "warning: failed to disable irqbalance_mck" >&2
fi
if ! etcdir=@ETCDIR@ perl -e '$etcdir=$ENV{'etcdir'}; @files = grep { -f } glob "$etcdir/proc/irq/*/smp_affinity"; foreach $file (@files) { $dest = substr($file, length($etcdir)); if(0) {print "cp $file $dest\n";} system("cp $file $dest 2>/dev/null"); }'; then
echo "warning: failed to restore /proc/irq/*/smp_affinity" >&2
fi
if ! systemctl start irqbalance.service; then
echo "warning: failed to start irqbalance" >&2;
fi
fi
;&
initial)
# Nothing more to revert
;;
esac
exit 1
}
ihk_ikc_irq_core=0
release=`uname -r`
@ -84,13 +190,20 @@ minor=`echo ${release} | sed -e 's/^[0-9]*.\([0-9]*\).*/\1/'`
patch=`echo ${release} | sed -e 's/^[0-9]*.[0-9]*.\([0-9]*\).*/\1/'`
linux_version_code=`expr \( ${major} \* 65536 \) + \( ${minor} \* 256 \) + ${patch}`
rhel_release=`echo ${release} | sed -e 's/^[0-9]*.[0-9]*.[0-9]*-\([0-9]*\).*/\1/'`
if [ "${release}" == "${rhel_release}" ]; then rhel_release=""; fi
if [ "${release}" == "${rhel_release}" ]; then
rhel_release="";
fi
enable_mcoverlay="no"
if [ "${ENABLE_MCOVERLAYFS}" == "yes" ]; then
if [ "${rhel_release}" == "" ]; then
if [ ${linux_version_code} -ge 262144 -a ${linux_version_code} -lt 262400 ]; then
enable_mcoverlay="yes"
fi
if [ ${linux_version_code} -ge 263680 -a ${linux_version_code} -lt 263936 ]; then
enable_mcoverlay="yes"
fi
else
if [ ${linux_version_code} -eq 199168 -a ${rhel_release} -ge 327 ]; then
enable_mcoverlay="yes"
@ -98,6 +211,7 @@ if [ "${ENABLE_MCOVERLAYFS}" == "yes" ]; then
fi
fi
# Figure out CPUs if not requested by user
if [ "$cpus" == "" ]; then
# Get the number of CPUs on NUMA node 0
nr_cpus=`lscpu --parse | awk -F"," '{if ($4 == 0) print $4}' | wc -l`
@ -105,7 +219,10 @@ if [ "$cpus" == "" ]; then
# Use the second half of the cores
let nr_cpus="$nr_cpus / 2"
cpus=`lscpu --parse | awk -F"," '{if ($4 == 0) print $1}' | tail -n $nr_cpus | xargs echo -n | sed 's/ /,/g'`
if [ "$cpus" == "" ]; then echo "error: no available CPUs on NUMA node 0?" >&2; exit 1; fi
if [ "$cpus" == "" ]; then
echo "error: no available CPUs on NUMA node 0?" >&2
exit 1
fi
fi
# Remove mcoverlay if loaded
@ -116,135 +233,243 @@ if [ "$enable_mcoverlay" == "yes" ]; then
if [ "`cat /proc/mounts | grep /tmp/mcos/linux_proc`" != "" ]; then umount -l /tmp/mcos/linux_proc; fi
if [ "`cat /proc/mounts | grep /tmp/mcos`" != "" ]; then umount -l /tmp/mcos; fi
if [ -e /tmp/mcos ]; then rm -rf /tmp/mcos; fi
if ! rmmod mcoverlay; then echo "error: removing mcoverlay" >&2; exit 1; fi
if ! rmmod mcoverlay; then
echo "error: removing mcoverlay" >&2
error_exit "initial"
fi
fi
fi
# Stop irqbalance
if [ "${irqbalance_used}" == "yes" ]; then
systemctl stop irqbalance_mck.service 2>/dev/null
if ! systemctl stop irqbalance.service 2>/dev/null ; then echo "error: stopping irqbalance" >&2; exit 1; fi;
if ! systemctl stop irqbalance.service 2>/dev/null ; then
echo "error: stopping irqbalance" >&2
error_exit "initial"
fi;
fi
# Start mcklogd. Note that McKernel blocks when kmsg buffer is full
# with '-k 1' until mcklogd unblocks it so starting mcklogd must preceed
# booting McKernel
if [ ${LOGMODE} -ne 0 ]; then
# Stop mcklogd which has survived McKernel shutdown because
# mcstop+release.sh is not used
pkill mcklogd
SBINDIR=${SBINDIR} ${SBINDIR}/mcklogd -i ${INTERVAL} -f ${facility}
fi
# Load IHK if not loaded
if [ "`lsmod | grep ihk`" == "" ]; then
if ! insmod ${KMODDIR}/ihk.ko; then echo "error: loading ihk" >&2; exit 1; fi;
if ! insmod ${KMODDIR}/ihk.ko; then
echo "error: loading ihk" >&2
error_exit "irqbalance_stopped"
fi
fi
# Drop Linux caches to free memory
sync && echo 3 > /proc/sys/vm/drop_caches
# Merge free memory areas into large, physically contigous ones
echo 1 > /proc/sys/vm/compact_memory 2>/dev/null
# Load IHK-SMP if not loaded and reserve CPUs and memory
if [ "`lsmod | grep ihk_smp_x86`" == "" ]; then
ihk_irq=""
for i in `seq 64 255`; do
if [ ! -d /proc/irq/$i ] && [ "`cat /proc/interrupts | grep ":" | awk '{print $1}' | grep -o '[0-9]*' | grep -e '^$i$'`" == "" ]; then
ihk_irq=$i
break
fi
done
if [ "$ihk_irq" == "" ]; then echo "error: no IRQ available" >&2; exit 1; fi
if ! insmod ${KMODDIR}/ihk-smp-x86.ko ihk_start_irq=$ihk_irq ihk_ikc_irq_core=$ihk_ikc_irq_core; then echo "error: loading ihk-smp-x86" >&2; exit 1; fi;
if ! ${SBINDIR}/ihkconfig 0 reserve cpu ${cpus}; then echo "error: reserving CPUs" >&2; exit 1; fi
if ! ${SBINDIR}/ihkconfig 0 reserve mem ${mem}; then echo "error: reserving memory" >&2; exit 1; fi
# If loaded, but no resources allocated, get CPUs and memory
else
if ! ${SBINDIR}/ihkconfig 0 query cpu > /dev/null; then echo "error: querying cpus" >&2; exit 1; fi
cpus_allocated=`${SBINDIR}/ihkosctl 0 query cpu`
if [ "$cpus_allocated" == "" ]; then
if ! ${SBINDIR}/ihkconfig 0 reserve cpu ${cpus}; then echo "error: reserving CPUs" >&2; exit 1; fi
fi
if ! ${SBINDIR}/ihkosctl 0 query mem > /dev/null; then echo "error: querying memory" >&2; exit 1; fi
mem_allocated=`${SBINDIR}/ihkosctl 0 query mem`
if [ "$mem_allocated" == "" ]; then
if ! ${SBINDIR}/ihkconfig 0 reserve mem ${mem}; then echo "error: reserving memory" >&2; exit 1; fi
ihk_irq=""
for i in `seq 64 255`; do
if [ ! -d /proc/irq/$i ] && [ "`cat /proc/interrupts | grep ":" | awk '{print $1}' | grep -o '[0-9]*' | grep -e '^$i$'`" == "" ]; then
ihk_irq=$i
break
fi
done
if [ "$ihk_irq" == "" ]; then
echo "error: no IRQ available" >&2
error_exit "ihk_loaded"
fi
if ! insmod ${KMODDIR}/ihk-smp-x86.ko ihk_start_irq=$ihk_irq ihk_ikc_irq_core=$ihk_ikc_irq_core; then
echo "error: loading ihk-smp-x86" >&2
error_exit "ihk_loaded"
fi
if ! ${SBINDIR}/ihkconfig 0 reserve cpu ${cpus}; then
echo "error: reserving CPUs" >&2;
error_exit "ihk_smp_loaded"
fi
if ! ${SBINDIR}/ihkconfig 0 reserve mem ${mem}; then
echo "error: reserving memory" >&2
error_exit "cpus_reserved"
fi
fi
# Load mcctrl if not loaded
if [ "`lsmod | grep mcctrl`" == "" ]; then
if ! insmod ${KMODDIR}/mcctrl.ko; then echo "error: inserting mcctrl.ko" >&2; exit 1; fi
if ! insmod ${KMODDIR}/mcctrl.ko; then
echo "error: inserting mcctrl.ko" >&2
error_exit "mem_reserved"
fi
fi
# Check for existing OS instance and destroy
if [ -c /dev/mcos0 ]; then
# Query CPU cores and memory of OS instance so that the same values are used as previously
if ! ${SBINDIR}/ihkosctl 0 query cpu > /dev/null; then echo "error: querying cpus" >&2; exit 1; fi
cpus=`${SBINDIR}/ihkosctl 0 query cpu`
if ! ${SBINDIR}/ihkosctl 0 query mem > /dev/null; then echo "error: querying memory" >&2; exit 1; fi
mem=`${SBINDIR}/ihkosctl 0 query mem`
if ! ${SBINDIR}/ihkconfig 0 destroy 0; then echo "warning: destroy failed" >&2; fi
else
# Otherwise query IHK-SMP for resources
if ! ${SBINDIR}/ihkconfig 0 query cpu > /dev/null; then echo "error: querying cpus" >&2; exit 1; fi
cpus=`${SBINDIR}/ihkconfig 0 query cpu`
if ! ${SBINDIR}/ihkconfig 0 query mem > /dev/null; then echo "error: querying memory" >&2; exit 1; fi
mem=`${SBINDIR}/ihkconfig 0 query mem`
# Destroy all LWK instances
if ls /dev/mcos* 1>/dev/null 2>&1; then
for i in /dev/mcos*; do
ind=`echo $i|cut -c10-`;
if ! ${SBINDIR}/ihkconfig 0 destroy $ind; then
echo "error: destroying LWK instance $ind failed" >&2
error_exit "mcctrl_loaded"
fi
done
fi
if ! ${SBINDIR}/ihkconfig 0 create; then echo "error: create" >&2; exit; fi
if ! ${SBINDIR}/ihkosctl 0 assign cpu ${cpus}; then echo "error: assign CPUs" >&2; exit 1; fi
if ! ${SBINDIR}/ihkosctl 0 assign mem ${mem}; then echo "error: assign memory" >&2; exit 1; fi
if ! ${SBINDIR}/ihkosctl 0 load ${KERNDIR}/mckernel.img; then echo "error: loading kernel image" >&2; exit 1; fi
if ! ${SBINDIR}/ihkosctl 0 kargs "hidos ksyslogd=${LOGMODE}"; then echo "error: setting kernel arguments" >&2; exit 1; fi
if ! ${SBINDIR}/ihkosctl 0 boot; then echo "error: booting" >&2; exit 1; fi
if ! chown ${chown_option} /dev/mcd* /dev/mcos*; then echo "error: chowning device files" >&2; exit 1; fi
# Create OS instance
if ! ${SBINDIR}/ihkconfig 0 create; then
echo "error: creating OS instance" >&2
error_exit "mcctrl_loaded"
fi
# Assign CPUs
if ! ${SBINDIR}/ihkosctl 0 assign cpu ${cpus}; then
echo "error: assign CPUs" >&2
error_exit "os_created"
fi
# Assign memory
if ! ${SBINDIR}/ihkosctl 0 assign mem ${mem}; then
echo "error: assign memory" >&2
error_exit "os_created"
fi
# Load kernel image
if ! ${SBINDIR}/ihkosctl 0 load ${KERNDIR}/mckernel.img; then
echo "error: loading kernel image: ${KERNDIR}/mckernel.img" >&2
error_exit "os_created"
fi
# Set kernel arguments
if ! ${SBINDIR}/ihkosctl 0 kargs "hidos ksyslogd=${LOGMODE} $turbo"; then
echo "error: setting kernel arguments" >&2
error_exit "os_created"
fi
# Boot OS instance
if ! ${SBINDIR}/ihkosctl 0 boot; then
echo "error: booting" >&2
error_exit "os_created"
fi
# Set device file ownership
if ! chown ${chown_option} /dev/mcd* /dev/mcos*; then
echo "warning: failed to chown device files" >&2
fi
# Overlay /proc, /sys with McKernel specific contents
if [ "$enable_mcoverlay" == "yes" ]; then
if [ ! -e /tmp/mcos ]; then mkdir -p /tmp/mcos; fi
if ! mount -t tmpfs tmpfs /tmp/mcos; then echo "error: mount /tmp/mcos" >&2; exit 1; fi
if ! mount -t tmpfs tmpfs /tmp/mcos; then
echo "error: mount /tmp/mcos" >&2
error_exit "tmp_mcos_created"
fi
if [ ! -e /tmp/mcos/linux_proc ]; then mkdir -p /tmp/mcos/linux_proc; fi
if ! mount --bind /proc /tmp/mcos/linux_proc; then echo "error: mount /tmp/mcos/linux_proc" >&2; exit 1; fi
if ! insmod ${KMODDIR}/mcoverlay.ko; then echo "error: inserting mcoverlay.ko" >&2; exit 1; fi
if ! mount --bind /proc /tmp/mcos/linux_proc; then
echo "error: mount /tmp/mcos/linux_proc" >&2
error_exit "tmp_mcos_mounted"
fi
if ! insmod ${KMODDIR}/mcoverlay.ko; then
echo "error: inserting mcoverlay.ko" >&2
error_exit "linux_proc_bind_mounted"
fi
while [ ! -e /proc/mcos0 ]
do
sleep 1
sleep 0.1
done
if [ ! -e /tmp/mcos/mcos0_proc ]; then mkdir -p /tmp/mcos/mcos0_proc; fi
if [ ! -e /tmp/mcos/mcos0_proc_upper ]; then mkdir -p /tmp/mcos/mcos0_proc_upper; fi
if [ ! -e /tmp/mcos/mcos0_proc_work ]; then mkdir -p /tmp/mcos/mcos0_proc_work; fi
if ! mount -t mcoverlay mcoverlay -o lowerdir=/proc/mcos0:/proc,upperdir=/tmp/mcos/mcos0_proc_upper,workdir=/tmp/mcos/mcos0_proc_work,nocopyupw,nofscheck /tmp/mcos/mcos0_proc; then echo "error: mount /tmp/mcos/mcos0_proc" >&2; exit 1; fi
if ! mount -t mcoverlay mcoverlay -o lowerdir=/proc/mcos0:/proc,upperdir=/tmp/mcos/mcos0_proc_upper,workdir=/tmp/mcos/mcos0_proc_work,nocopyupw,nofscheck /tmp/mcos/mcos0_proc; then
echo "error: mounting /tmp/mcos/mcos0_proc" >&2
error_exit "mcoverlayfs_loaded"
fi
# TODO: How de we revert this in case of failure??
mount --make-rprivate /proc
while [ ! -e /sys/devices/virtual/mcos/mcos0/sys ]
while [ ! -e /sys/devices/virtual/mcos/mcos0/sys/setup_complete ]
do
sleep 1
sleep 0.1
done
if [ ! -e /tmp/mcos/mcos0_sys ]; then mkdir -p /tmp/mcos/mcos0_sys; fi
if [ ! -e /tmp/mcos/mcos0_sys_upper ]; then mkdir -p /tmp/mcos/mcos0_sys_upper; fi
if [ ! -e /tmp/mcos/mcos0_sys_work ]; then mkdir -p /tmp/mcos/mcos0_sys_work; fi
if ! mount -t mcoverlay mcoverlay -o lowerdir=/sys/devices/virtual/mcos/mcos0/sys:/sys,upperdir=/tmp/mcos/mcos0_sys_upper,workdir=/tmp/mcos/mcos0_sys_work,nocopyupw,nofscheck /tmp/mcos/mcos0_sys; then echo "error: mount /tmp/mcos/mcos0_sys" >&2; exit 1; fi
if ! mount -t mcoverlay mcoverlay -o lowerdir=/sys/devices/virtual/mcos/mcos0/sys:/sys,upperdir=/tmp/mcos/mcos0_sys_upper,workdir=/tmp/mcos/mcos0_sys_work,nocopyupw,nofscheck /tmp/mcos/mcos0_sys; then
echo "error: mount /tmp/mcos/mcos0_sys" >&2
error_exit "mcos_proc_mounted"
fi
# TODO: How de we revert this in case of failure??
mount --make-rprivate /sys
rm -rf /tmp/mcos/mcos0_sys/setup_complete
# Hide NUMA related files which are outside the LWK partition
for cpuid in `find /sys/devices/system/cpu/* -maxdepth 0 -name "cpu[0123456789]*" -printf "%f "`; do
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/devices/system/cpu/$cpuid" ]; then
rm -rf /tmp/mcos/mcos0_sys/devices/system/cpu/$cpuid
rm -rf /tmp/mcos/mcos0_sys/bus/cpu/devices/$cpuid
rm -rf /tmp/mcos/mcos0_sys/bus/cpu/drivers/processor/$cpuid
else
for nodeid in `find /sys/devices/system/cpu/$cpuid/* -maxdepth 0 -name "node[0123456789]*" -printf "%f "`; do
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/devices/system/cpu/$cpuid/$nodeid" ]; then
rm -f /tmp/mcos/mcos0_sys/devices/system/cpu/$cpuid/$nodeid
fi
done
fi
done
for nodeid in `find /sys/devices/system/node/* -maxdepth 0 -name "node[0123456789]*" -printf "%f "`; do
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/devices/system/node/$nodeid" ]; then
rm -rf /tmp/mcos/mcos0_sys/devices/system/node/$nodeid/*
rm -rf /tmp/mcos/mcos0_sys/bus/node/devices/$nodeid
else
# Delete non-existent symlinks
for cpuid in `find /sys/devices/system/node/$nodeid/* -maxdepth 0 -name "cpu[0123456789]*" -printf "%f "`; do
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/devices/system/node/$nodeid/$cpuid" ]; then
rm -f /tmp/mcos/mcos0_sys/devices/system/node/$nodeid/$cpuid
fi
done
rm -f /tmp/mcos/mcos0_sys/devices/system/node/$nodeid/memory*
fi
done
rm -f /tmp/mcos/mcos0_sys/devices/system/node/has_*
for cpuid in `find /sys/bus/cpu/devices/* -maxdepth 0 -name "cpu[0123456789]*" -printf "%f "`; do
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/bus/cpu/devices/$cpuid" ]; then
rm -rf /tmp/mcos/mcos0_sys/bus/cpu/devices/$cpuid
fi
done
fi
if [ ${LOGMODE} -ne 0 ]
then
# mcklogd survives when McKernel isn't shut down by mcstop+release.sh
pkill mcklogd
SBINDIR=${SBINDIR} ${SBINDIR}/mcklogd -i ${INTERVAL} -f ${facility}
fi
# Start irqbalance with CPUs and IRQ for McKernel banned
if [ "${irqbalance_used}" == "yes" ]; then
if ! etcdir=@ETCDIR@ perl -e 'use File::Copy qw(copy); $etcdir=$ENV{'etcdir'}; @files = grep { -f } glob "/proc/irq/*/smp_affinity"; foreach $file (@files) { $rel = substr($file, 1); $dir=substr($rel, 0, length($rel)-length("/smp_affinity")); if(0) { print "cp $file $etcdir/$rel\n";} if(system("mkdir -p $etcdir/$dir")){ exit 1;} if(!copy($file,"$etcdir/$rel")){ exit 1;} }' ; then echo "error: saving /proc/irq/*/smp_affinity" >&2; exit 1; fi;
if ! etcdir=@ETCDIR@ perl -e 'use File::Copy qw(copy); $etcdir=$ENV{'etcdir'}; @files = grep { -f } glob "/proc/irq/*/smp_affinity"; foreach $file (@files) { $rel = substr($file, 1); $dir=substr($rel, 0, length($rel)-length("/smp_affinity")); if(0) { print "cp $file $etcdir/$rel\n";} if(system("mkdir -p $etcdir/$dir")){ exit 1;} if(!copy($file,"$etcdir/$rel")){ exit 1;} }'; then
echo "error: saving /proc/irq/*/smp_affinity" >&2
error_exit "mcos_sys_mounted"
fi;
ncpus=`lscpu | grep -E '^CPU\(s\):' | awk '{print $2}'`
smp_affinity_mask=`echo $cpus | ncpus=$ncpus perl -e 'while(<>){@tokens = split /,/;foreach $token (@tokens) {@nums = split /-/,$token; for($num = $nums[0]; $num <= $nums[$#nums]; $num++) {$ndx=int($num/32); $mask[$ndx] |= (1<<($num % 32))}}} $nint32s = int(($ENV{'ncpus'}+31)/32); for($j = $nint32s - 1; $j >= 0; $j--) { if($j != $nint32s - 1){print ",";} $nblks = $j == $nint32s - 1 ? int(($ENV{'ncpus'} % 32)/4) : 8; for($i = $nblks - 1;$i >= 0;$i--){ printf("%01x",($mask[$j] >> ($i*4)) & 0xf);}}'`
if ! ncpus=$ncpus smp_affinity_mask=$smp_affinity_mask perl -e '@dirs = grep { -d } glob "/proc/irq/*"; foreach $dir (@dirs) { $hit = 0; $affinity_str = `cat $dir/smp_affinity`; chomp $affinity_str; @int32strs = split /,/, $affinity_str; @int32strs_mask=split /,/, $ENV{'smp_affinity_mask'}; for($i=0;$i <= $#int32strs_mask; $i++) { $int32strs_inv[$i] = sprintf("%08x",hex($int32strs_mask[$i])^0xffffffff); if($i == 0) { $len = int((($ENV{'ncpus'}%32)+3)/4); $int32strs_inv[$i] = substr($int32strs_inv[$i], -$len, $len); } } $inv = join(",", @int32strs_inv); $nint32s = int(($ENV{'ncpus'}+31)/32); for($j = $nint32s - 1; $j >= 0; $j--) { if(hex($int32strs[$nint32s - 1 - $j]) & hex($int32strs_mask[$nint32s - 1 - $j])) { $hit = 1; }} if($hit == 1) { $cmd = "echo $inv > $dir/smp_affinity 2>/dev/null"; system $cmd;}}'; then echo "error: modifying /proc/irq/*/smp_affinity" >&2; exit 1; fi;
if ! ncpus=$ncpus smp_affinity_mask=$smp_affinity_mask perl -e '@dirs = grep { -d } glob "/proc/irq/*"; foreach $dir (@dirs) { $hit = 0; $affinity_str = `cat $dir/smp_affinity`; chomp $affinity_str; @int32strs = split /,/, $affinity_str; @int32strs_mask=split /,/, $ENV{'smp_affinity_mask'}; for($i=0;$i <= $#int32strs_mask; $i++) { $int32strs_inv[$i] = sprintf("%08x",hex($int32strs_mask[$i])^0xffffffff); if($i == 0) { $len = int((($ENV{'ncpus'}%32)+3)/4); $int32strs_inv[$i] = substr($int32strs_inv[$i], -$len, $len); } } $inv = join(",", @int32strs_inv); $nint32s = int(($ENV{'ncpus'}+31)/32); for($j = $nint32s - 1; $j >= 0; $j--) { if(hex($int32strs[$nint32s - 1 - $j]) & hex($int32strs_mask[$nint32s - 1 - $j])) { $hit = 1; }} if($hit == 1) { $cmd = "echo $inv > $dir/smp_affinity 2>/dev/null"; system $cmd;}}'; then
echo "error: modifying /proc/irq/*/smp_affinity" >&2
error_exit "mcos_sys_mounted"
fi
banirq=`cat /proc/interrupts| perl -e 'while(<>) { if(/^\s*(\d+).*IHK\-SMP\s*$/) {print $1;}}'`
sed "s/%mask%/$smp_affinity_mask/g" $ETCDIR/irqbalance_mck.in | sed "s/%banirq%/$banirq/g" > $ETCDIR/irqbalance_mck
if ! systemctl link $ETCDIR/irqbalance_mck.service >/dev/null 2>/dev/null; then echo "error: linking irqbalance_mck" >&2; exit 1; fi;
if ! systemctl start irqbalance_mck.service 2>/dev/null ; then echo "error: starting irqbalance_mck" >&2; exit 1; fi;
if ! systemctl link $ETCDIR/irqbalance_mck.service >/dev/null 2>/dev/null; then
echo "error: linking irqbalance_mck" >&2
error_exit "mcos_sys_mounted"
fi
if ! systemctl start irqbalance_mck.service 2>/dev/null ; then
echo "error: starting irqbalance_mck" >&2
error_exit "mcos_sys_mounted"
fi
# echo cpus=$cpus mask=$smp_affinity_mask banirq=$banirq
fi

View File

@ -22,34 +22,76 @@ if [ "`lsmod | grep ihk_smp_x86`" == "" ]; then exit 0; fi
# Destroy all LWK instances
if ls /dev/mcos* 1>/dev/null 2>&1; then
for i in /dev/mcos*; do
ind=`echo $i|cut -c10-`;
if ! ${SBINDIR}/ihkconfig 0 destroy $ind; then echo "error: destroying LWK instance $ind failed" >&2; exit 1; fi
done
for i in /dev/mcos*; do
ind=`echo $i|cut -c10-`;
if ! ${SBINDIR}/ihkconfig 0 destroy $ind; then
echo "error: destroying LWK instance $ind failed" >&2
exit 1
fi
done
fi
# Query IHK-SMP resources and release them
if ! ${SBINDIR}/ihkconfig 0 query cpu > /dev/null; then echo "error: querying cpus" >&2; exit 1; fi
cpus=`${SBINDIR}/ihkconfig 0 query cpu`
if ! ${SBINDIR}/ihkconfig 0 release cpu $cpus > /dev/null; then echo "error: releasing CPUs" >&2; exit 1; fi
if ! ${SBINDIR}/ihkconfig 0 query cpu > /dev/null; then
echo "error: querying cpus" >&2
exit 1
fi
if ! ${SBINDIR}/ihkconfig 0 query mem > /dev/null; then echo "error: querying memory" >&2; exit 1; fi
mem=`${SBINDIR}/ihkconfig 0 query mem`
if ! ${SBINDIR}/ihkconfig 0 release mem $mem > /dev/null; then echo "error: releasing memory" >&2; exit 1; fi
cpus=`${SBINDIR}/ihkconfig 0 query cpu`
if [ "${cpus}" != "" ]; then
if ! ${SBINDIR}/ihkconfig 0 release cpu $cpus > /dev/null; then
echo "error: releasing CPUs" >&2
exit 1
fi
fi
if ! ${SBINDIR}/ihkconfig 0 query mem > /dev/null; then
echo "error: querying memory" >&2
exit 1
fi
mem=`${SBINDIR}/ihkconfig 0 query mem`
if [ "${mem}" != "" ]; then
if ! ${SBINDIR}/ihkconfig 0 release mem $mem > /dev/null; then
echo "error: releasing memory" >&2
exit 1
fi
fi
# Remove delegator if loaded
if [ "`lsmod | grep mcctrl`" != "" ]; then
if ! rmmod mcctrl; then echo "error: removing mcctrl" >&2; exit 1; fi
if ! rmmod mcctrl; then
echo "error: removing mcctrl" >&2
exit 1
fi
fi
# Remove mcoverlay if loaded
if [ "`lsmod | grep mcoverlay`" != "" ]; then
if [ "`cat /proc/mounts | grep /tmp/mcos/mcos0_sys`" != "" ]; then umount -l /tmp/mcos/mcos0_sys; fi
if [ "`cat /proc/mounts | grep /tmp/mcos/mcos0_proc`" != "" ]; then umount -l /tmp/mcos/mcos0_proc; fi
if [ "`cat /proc/mounts | grep /tmp/mcos/linux_proc`" != "" ]; then umount -l /tmp/mcos/linux_proc; fi
if [ "`cat /proc/mounts | grep /tmp/mcos`" != "" ]; then umount -l /tmp/mcos; fi
if [ -e /tmp/mcos ]; then rm -rf /tmp/mcos; fi
if ! rmmod mcoverlay; then
echo "warning: failed to remove mcoverlay" >&2
fi
fi
# Remove SMP module
if [ "`lsmod | grep ihk_smp_x86`" != "" ]; then
if ! rmmod ihk_smp_x86; then echo "error: removing ihk_smp_x86" >&2; exit 1; fi
if ! rmmod ihk_smp_x86; then
echo "error: removing ihk_smp_x86" >&2
exit 1
fi
fi
# Remove core module
if [ "`lsmod | grep -E 'ihk\s' | awk '{print $1}'`" != "" ]; then
if ! rmmod ihk; then echo "error: removing ihk" >&2; exit 1; fi
if ! rmmod ihk; then
echo "error: removing ihk" >&2
exit 1
fi
fi
# Stop mcklogd
@ -57,8 +99,17 @@ pkill mcklogd
# Start irqbalance with the original settings
if [ "`systemctl status irqbalance_mck.service 2> /dev/null |grep -E 'Active: active'`" != "" ]; then
if ! systemctl stop irqbalance_mck.service 2>/dev/null ; then echo "error: stopping irqbalance_mck" >&2; exit 1; fi;
if ! systemctl disable irqbalance_mck.service >/dev/null 2>/dev/null; then echo "error: disabling irqbalance_mck" >&2; exit 1; fi;
if ! etcdir=@ETCDIR@ perl -e '$etcdir=$ENV{'etcdir'}; @files = grep { -f } glob "$etcdir/proc/irq/*/smp_affinity"; foreach $file (@files) { $dest = substr($file, length($etcdir)); if(0) {print "cp $file $dest\n";} system("cp $file $dest 2>/dev/null"); }' ; then echo "error: restoring /proc/irq/*/smp_affinity" >&2; exit 1; fi;
if ! systemctl start irqbalance.service; then echo "error: starting irqbalance" >&2; exit 1; fi;
if ! systemctl stop irqbalance_mck.service 2>/dev/null; then
echo "warning: failed to stop irqbalance_mck" >&2
fi
if ! systemctl disable irqbalance_mck.service >/dev/null 2>/dev/null; then
echo "warning: failed to disable irqbalance_mck" >&2
fi
if ! etcdir=@ETCDIR@ perl -e '$etcdir=$ENV{'etcdir'}; @files = grep { -f } glob "$etcdir/proc/irq/*/smp_affinity"; foreach $file (@files) { $dest = substr($file, length($etcdir)); if(0) {print "cp $file $dest\n";} system("cp $file $dest 2>/dev/null"); }'; then
echo "warning: failed to restore /proc/irq/*/smp_affinity" >&2
fi
if ! systemctl start irqbalance.service; then
echo "warning: failed to start irqbalance" >&2;
fi
fi

30
configure vendored
View File

@ -2922,6 +2922,7 @@ ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $
ac_compiler_gnu=$ac_cv_c_compiler_gnu
XCC=$CC
CFLAGS="$CFLAGS -ffreestanding -fno-tree-loop-distribute-patterns"
;;
builtin-mic)
ARCH=k1om
@ -3117,6 +3118,31 @@ _ACEOF
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking System.map for symbol sys_umount" >&5
$as_echo_n "checking System.map for symbol sys_umount... " >&6; }
mcctrl_addr=`eval $MCCTRL_LINUX_SYMTAB_CMD | grep " sys_umount\$" | cut -d\ -f1`
if test -z $mcctrl_addr; then
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: not found" >&5
$as_echo "not found" >&6; }
else
mcctrl_result=$mcctrl_addr
mcctrl_addr="0x$mcctrl_addr"
if `eval $MCCTRL_LINUX_SYMTAB_CMD | grep " __ksymtab_sys_umount\$" >/dev/null`; then
mcctrl_result="exported"
mcctrl_addr="0"
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $mcctrl_result" >&5
$as_echo "$mcctrl_result" >&6; }
cat >>confdefs.h <<_ACEOF
#define MCCTRL_KSYM_sys_umount $mcctrl_addr
_ACEOF
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking System.map for symbol sys_unshare" >&5
$as_echo_n "checking System.map for symbol sys_unshare... " >&6; }
mcctrl_addr=`eval $MCCTRL_LINUX_SYMTAB_CMD | grep " sys_unshare\$" | cut -d\ -f1`
@ -3887,11 +3913,12 @@ fi
ac_config_headers="$ac_config_headers executer/config.h"
ac_config_files="$ac_config_files Makefile executer/user/Makefile executer/kernel/mcctrl/Makefile executer/kernel/mcctrl/arch/x86_64/Makefile executer/kernel/mcoverlayfs/Makefile executer/kernel/mcoverlayfs/linux-3.10.0-327.36.1.el7/Makefile executer/kernel/mcoverlayfs/linux-4.0.9/Makefile kernel/Makefile kernel/Makefile.build arch/x86/tools/mcreboot-attached-mic.sh arch/x86/tools/mcshutdown-attached-mic.sh arch/x86/tools/mcreboot-builtin-x86.sh arch/x86/tools/mcreboot-smp-x86.sh arch/x86/tools/mcstop+release-smp-x86.sh arch/x86/tools/mcshutdown-builtin-x86.sh arch/x86/tools/mcreboot.1:arch/x86/tools/mcreboot.1in arch/x86/tools/irqbalance_mck.service arch/x86/tools/irqbalance_mck.in"
ac_config_files="$ac_config_files Makefile executer/user/Makefile executer/kernel/mcctrl/Makefile executer/kernel/mcctrl/arch/x86_64/Makefile executer/kernel/mcoverlayfs/Makefile executer/kernel/mcoverlayfs/linux-3.10.0-327.36.1.el7/Makefile executer/kernel/mcoverlayfs/linux-4.0.9/Makefile executer/kernel/mcoverlayfs/linux-4.6.7/Makefile kernel/Makefile kernel/Makefile.build arch/x86/tools/mcreboot-attached-mic.sh arch/x86/tools/mcshutdown-attached-mic.sh arch/x86/tools/mcreboot-builtin-x86.sh arch/x86/tools/mcreboot-smp-x86.sh arch/x86/tools/mcstop+release-smp-x86.sh arch/x86/tools/mcshutdown-builtin-x86.sh arch/x86/tools/mcreboot.1:arch/x86/tools/mcreboot.1in arch/x86/tools/irqbalance_mck.service arch/x86/tools/irqbalance_mck.in"
if test "x$enable_dcfa" = xyes; then :
@ -4597,6 +4624,7 @@ do
"executer/kernel/mcoverlayfs/Makefile") CONFIG_FILES="$CONFIG_FILES executer/kernel/mcoverlayfs/Makefile" ;;
"executer/kernel/mcoverlayfs/linux-3.10.0-327.36.1.el7/Makefile") CONFIG_FILES="$CONFIG_FILES executer/kernel/mcoverlayfs/linux-3.10.0-327.36.1.el7/Makefile" ;;
"executer/kernel/mcoverlayfs/linux-4.0.9/Makefile") CONFIG_FILES="$CONFIG_FILES executer/kernel/mcoverlayfs/linux-4.0.9/Makefile" ;;
"executer/kernel/mcoverlayfs/linux-4.6.7/Makefile") CONFIG_FILES="$CONFIG_FILES executer/kernel/mcoverlayfs/linux-4.6.7/Makefile" ;;
"kernel/Makefile") CONFIG_FILES="$CONFIG_FILES kernel/Makefile" ;;
"kernel/Makefile.build") CONFIG_FILES="$CONFIG_FILES kernel/Makefile.build" ;;
"arch/x86/tools/mcreboot-attached-mic.sh") CONFIG_FILES="$CONFIG_FILES arch/x86/tools/mcreboot-attached-mic.sh" ;;

View File

@ -70,6 +70,7 @@ case $WITH_TARGET in
ARCH=`uname -m`
AC_PROG_CC
XCC=$CC
CFLAGS="$CFLAGS -ffreestanding -fno-tree-loop-distribute-patterns"
;;
builtin-mic)
ARCH=k1om
@ -221,6 +222,7 @@ AC_DEFUN([MCCTRL_FIND_KSYM],[
])
MCCTRL_FIND_KSYM([sys_mount])
MCCTRL_FIND_KSYM([sys_umount])
MCCTRL_FIND_KSYM([sys_unshare])
MCCTRL_FIND_KSYM([zap_page_range])
MCCTRL_FIND_KSYM([vdso_image_64])
@ -285,6 +287,7 @@ AC_SUBST(ETCDIR)
AC_SUBST(KMODDIR)
AC_SUBST(KERNDIR)
AC_SUBST(MANDIR)
AC_SUBST(CFLAGS)
AC_SUBST(ENABLE_MCOVERLAYFS)
AC_SUBST(IHK_VERSION)
@ -304,6 +307,7 @@ AC_CONFIG_FILES([
executer/kernel/mcoverlayfs/Makefile
executer/kernel/mcoverlayfs/linux-3.10.0-327.36.1.el7/Makefile
executer/kernel/mcoverlayfs/linux-4.0.9/Makefile
executer/kernel/mcoverlayfs/linux-4.6.7/Makefile
kernel/Makefile
kernel/Makefile.build
arch/x86/tools/mcreboot-attached-mic.sh

View File

@ -51,6 +51,9 @@
/* Define to address of kernel symbol sys_readlink, or 0 if exported */
#undef MCCTRL_KSYM_sys_readlink
/* Define to address of kernel symbol sys_umount, or 0 if exported */
#undef MCCTRL_KSYM_sys_umount
/* Define to address of kernel symbol sys_unshare, or 0 if exported */
#undef MCCTRL_KSYM_sys_unshare

View File

@ -41,6 +41,7 @@
#define MCEXEC_UP_NEW_PROCESS 0x30a02909
#define MCEXEC_UP_GET_CRED 0x30a0290a
#define MCEXEC_UP_GET_CREDV 0x30a0290b
#define MCEXEC_UP_GET_NODES 0x30a0290c
#define MCEXEC_UP_PREPARE_DMA 0x30a02910
#define MCEXEC_UP_FREE_DMA 0x30a02911
@ -49,7 +50,8 @@
#define MCEXEC_UP_CLOSE_EXEC 0x30a02913
#define MCEXEC_UP_SYS_MOUNT 0x30a02914
#define MCEXEC_UP_SYS_UNSHARE 0x30a02915
#define MCEXEC_UP_SYS_UMOUNT 0x30a02915
#define MCEXEC_UP_SYS_UNSHARE 0x30a02916
#define MCEXEC_UP_DEBUG_LOG 0x40000000
@ -196,6 +198,10 @@ struct sys_mount_desc {
void *data;
};
struct sys_umount_desc {
char *dir_name;
};
struct sys_unshare_desc {
unsigned long unshare_flags;
};

View File

@ -75,7 +75,7 @@ static int load_elf(struct linux_binprm *bprm
char buf[32];
int l;
int pass;
char pbuf[1024];
char *pbuf;
const char *path;
if(bprm->envc == 0)
@ -88,6 +88,11 @@ static int load_elf(struct linux_binprm *bprm
if(elf_ex->e_ident[EI_CLASS] != ELFCLASS64)
return -ENOEXEC;
pbuf = kmalloc(1024, GFP_ATOMIC);
if (!pbuf) {
printk("%s: error: allocating pbuf\n", __FUNCTION__);
return -ENOMEM;
}
path = d_path(&bprm->file->f_path, pbuf, 1024);
if(!path || IS_ERR(path))
path = bprm->interp;
@ -96,8 +101,10 @@ static int load_elf(struct linux_binprm *bprm
if(!cp ||
!strcmp(cp, "/mcexec") ||
!strcmp(cp, "/ihkosctl") ||
!strcmp(cp, "/ihkconfig"))
!strcmp(cp, "/ihkconfig")) {
kfree(pbuf);
return -ENOEXEC;
}
cnt[0] = bprm->argc;
cnt[1] = bprm->envc;
@ -124,8 +131,10 @@ static int load_elf(struct linux_binprm *bprm
bprm->p, 1, 0, 1,
&page, NULL);
#endif
if(rc <= 0)
if(rc <= 0) {
kfree(pbuf);
return -EFAULT;
}
addr = kmap_atomic(page
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,4,0)
, KM_USER0
@ -199,21 +208,27 @@ static int load_elf(struct linux_binprm *bprm
for(ep = env; ep->name; ep++)
if(ep->val)
kfree(ep->val);
if(rc)
if(rc) {
kfree(pbuf);
return -ENOEXEC;
}
file = open_exec(MCEXEC_PATH);
if (IS_ERR(file))
if (IS_ERR(file)) {
kfree(pbuf);
return -ENOEXEC;
}
rc = remove_arg_zero(bprm);
if (rc){
fput(file);
kfree(pbuf);
return rc;
}
rc = copy_strings_kernel(1, &bprm->interp, bprm);
if (rc < 0){
fput(file);
kfree(pbuf);
return rc;
}
bprm->argc++;
@ -221,12 +236,14 @@ static int load_elf(struct linux_binprm *bprm
rc = copy_strings_kernel(1, &wp, bprm);
if (rc){
fput(file);
kfree(pbuf);
return rc;
}
bprm->argc++;
rc = bprm_change_interp(MCEXEC_PATH, bprm);
if (rc < 0){
fput(file);
kfree(pbuf);
return rc;
}
@ -236,8 +253,12 @@ static int load_elf(struct linux_binprm *bprm
rc = prepare_binprm(bprm);
if (rc < 0){
kfree(pbuf);
return rc;
}
kfree(pbuf);
return search_binary_handler(bprm
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,8,0)
, regs

View File

@ -66,7 +66,18 @@ int (*mcctrl_sys_mount)(char *dev_name,char *dir_name, char *type, unsigned long
(int_star_fn_char_char_char_ulong_void_t)
MCCTRL_KSYM_sys_mount;
#else // exported
int (*mcctrl_sys_mount)(char *dev_name,char *dir_name, char *type, unsigned long flags, void *data) = NULL;
int (*mcctrl_sys_mount)(char *dev_name,char *dir_name, char *type, unsigned long flags, void *data) = sys_mount;
#endif
#endif
#ifdef MCCTRL_KSYM_sys_umount
#if MCCTRL_KSYM_sys_umount
typedef int (*int_fn_char_star_int_t)(char *, int);
int (*mcctrl_sys_umount)(char *dir_name, int flags) =
(int_fn_char_star_int_t)
MCCTRL_KSYM_sys_umount;
#else // exported
int (*mcctrl_sys_umount)(char *dir_name, int flags) = sys_umount;
#endif
#endif
@ -77,32 +88,49 @@ int mcctrl_ikc_set_recv_cpu(ihk_os_t os, int cpu);
static long mcexec_prepare_image(ihk_os_t os,
struct program_load_desc * __user udesc)
{
struct program_load_desc desc, *pdesc;
struct program_load_desc *desc, *pdesc;
struct ikc_scd_packet isp;
void *args, *envs;
long ret = 0;
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
struct mcctrl_per_proc_data *ppd = NULL;
int num_sections;
if (copy_from_user(&desc, udesc,
desc = kmalloc(sizeof(*desc), GFP_KERNEL);
if (!desc) {
printk("%s: error: allocating program_load_desc\n",
__FUNCTION__);
return -ENOMEM;
}
if (copy_from_user(desc, udesc,
sizeof(struct program_load_desc))) {
printk("%s: error: copying program_load_desc\n",
__FUNCTION__);
kfree(desc);
return -EFAULT;
}
if (desc.num_sections <= 0 || desc.num_sections > 16) {
printk("# of sections: %d\n", desc.num_sections);
num_sections = desc->num_sections;
if (num_sections <= 0 || num_sections > 16) {
printk("# of sections: %d\n", num_sections);
return -EINVAL;
}
pdesc = kmalloc(sizeof(struct program_load_desc) +
sizeof(struct program_image_section)
* desc.num_sections, GFP_KERNEL);
memcpy(pdesc, &desc, sizeof(struct program_load_desc));
* num_sections, GFP_KERNEL);
memcpy(pdesc, desc, sizeof(struct program_load_desc));
if (copy_from_user(pdesc->sections, udesc->sections,
sizeof(struct program_image_section)
* desc.num_sections)) {
* num_sections)) {
kfree(desc);
kfree(pdesc);
return -EFAULT;
}
kfree(desc);
pdesc->pid = task_tgid_vnr(current);
if (reserve_user_space(usrdata, &pdesc->user_start, &pdesc->user_end)) {
@ -158,7 +186,7 @@ static long mcexec_prepare_image(ihk_os_t os,
ppd->rpgtable = pdesc->rpgtable;
if (copy_to_user(udesc, pdesc, sizeof(struct program_load_desc) +
sizeof(struct program_image_section) * desc.num_sections)) {
sizeof(struct program_image_section) * num_sections)) {
ret = -EFAULT;
goto free_out;
}
@ -315,33 +343,42 @@ static long mcexec_start_image(ihk_os_t os,
struct program_load_desc * __user udesc,
struct file *file)
{
struct program_load_desc desc;
struct program_load_desc *desc;
struct ikc_scd_packet isp;
struct mcctrl_channel *c;
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
struct handlerinfo *info;
if (copy_from_user(&desc, udesc,
desc = kmalloc(sizeof(*desc), GFP_KERNEL);
if (!desc) {
printk("%s: error: allocating program_load_desc\n",
__FUNCTION__);
return -ENOMEM;
}
if (copy_from_user(desc, udesc,
sizeof(struct program_load_desc))) {
kfree(desc);
return -EFAULT;
}
info = kmalloc(sizeof(struct handlerinfo), GFP_KERNEL);
info->pid = desc.pid;
info->pid = desc->pid;
ihk_os_register_release_handler(file, release_handler, info);
c = usrdata->channels + desc.cpu;
c = usrdata->channels + desc->cpu;
mcctrl_ikc_set_recv_cpu(os, desc.cpu);
mcctrl_ikc_set_recv_cpu(os, desc->cpu);
usrdata->last_thread_exec = desc.cpu;
usrdata->last_thread_exec = desc->cpu;
isp.msg = SCD_MSG_SCHEDULE_PROCESS;
isp.ref = desc.cpu;
isp.arg = desc.rprocess;
isp.ref = desc->cpu;
isp.arg = desc->rprocess;
mcctrl_ikc_send(os, desc.cpu, &isp);
mcctrl_ikc_send(os, desc->cpu, &isp);
kfree(desc);
return 0;
}
@ -413,6 +450,16 @@ static long mcexec_get_cpu(ihk_os_t os)
return info->n_cpus;
}
static long mcexec_get_nodes(ihk_os_t os)
{
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
if (!usrdata || !usrdata->mem_info)
return -EINVAL;
return usrdata->mem_info->n_numa_nodes;
}
int mcctrl_add_per_proc_data(struct mcctrl_usrdata *ud, int pid,
struct mcctrl_per_proc_data *ppd)
{
@ -501,9 +548,10 @@ int mcexec_syscall(struct mcctrl_usrdata *ud, struct ikc_scd_packet *packet)
ppd = mcctrl_get_per_proc_data(ud, pid);
if (unlikely(!ppd)) {
kprintf("%s: ERROR: no per-process structure for PID %d??\n",
__FUNCTION__, task_tgid_vnr(current));
return 0;
kprintf("%s: ERROR: no per-process structure for PID %d, "
"syscall nr: %lu\n",
__FUNCTION__, pid, packet->req.number);
return -1;
}
dprintk("%s: (packet_handler) rtid: %d, ttid: %d, sys nr: %d\n",
@ -1127,7 +1175,7 @@ long mcexec_sys_mount(struct sys_mount_desc *__user arg)
cap_raise(promoted->cap_effective, CAP_SYS_ADMIN);
original = override_creds(promoted);
#if MCCTRL_KSYM_sys_mount
#ifdef MCCTRL_KSYM_sys_mount
ret = mcctrl_sys_mount(desc.dev_name, desc.dir_name, desc.type,
desc.flags, desc.data);
#else
@ -1140,6 +1188,36 @@ long mcexec_sys_mount(struct sys_mount_desc *__user arg)
return ret;
}
long mcexec_sys_umount(struct sys_mount_desc *__user arg)
{
struct sys_umount_desc desc;
struct cred *promoted;
const struct cred *original;
int ret;
if (copy_from_user(&desc, arg, sizeof(desc))) {
return -EFAULT;
}
promoted = prepare_creds();
if (!promoted) {
return -ENOMEM;
}
cap_raise(promoted->cap_effective, CAP_SYS_ADMIN);
original = override_creds(promoted);
#ifdef MCCTRL_KSYM_sys_umount
ret = mcctrl_sys_umount(desc.dir_name, MNT_FORCE);
#else
ret = -EFAULT;
#endif
revert_creds(original);
put_cred(promoted);
return ret;
}
long mcexec_sys_unshare(struct sys_unshare_desc *__user arg)
{
struct sys_unshare_desc desc;
@ -1198,6 +1276,9 @@ long __mcctrl_control(ihk_os_t os, unsigned int req, unsigned long arg,
case MCEXEC_UP_GET_CPU:
return mcexec_get_cpu(os);
case MCEXEC_UP_GET_NODES:
return mcexec_get_nodes(os);
case MCEXEC_UP_STRNCPY_FROM_USER:
return mcexec_strncpy_from_user(os,
(struct strncpy_from_user_desc *)arg);
@ -1227,6 +1308,9 @@ long __mcctrl_control(ihk_os_t os, unsigned int req, unsigned long arg,
case MCEXEC_UP_SYS_MOUNT:
return mcexec_sys_mount((struct sys_mount_desc *)arg);
case MCEXEC_UP_SYS_UMOUNT:
return mcexec_sys_umount((struct sys_mount_desc *)arg);
case MCEXEC_UP_SYS_UNSHARE:
return mcexec_sys_unshare((struct sys_unshare_desc *)arg);

View File

@ -60,6 +60,7 @@ static struct ihk_os_user_call_handler mcctrl_uchs[] = {
{ .request = MCEXEC_UP_LOAD_SYSCALL, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_SEND_SIGNAL, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_GET_CPU, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_GET_NODES, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_STRNCPY_FROM_USER, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_NEW_PROCESS, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_PREPARE_DMA, .func = mcctrl_ioctl },
@ -69,6 +70,7 @@ static struct ihk_os_user_call_handler mcctrl_uchs[] = {
{ .request = MCEXEC_UP_GET_CRED, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_GET_CREDV, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_SYS_MOUNT, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_SYS_UMOUNT, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_SYS_UNSHARE, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_DEBUG_LOG, .func = mcctrl_ioctl },
};
@ -129,11 +131,15 @@ error_cleanup_channels:
int mcctrl_os_shutdown_notifier(int os_index)
{
sysfsm_cleanup(os[os_index]);
free_topology_info(os[os_index]);
ihk_os_unregister_user_call_handlers(os[os_index], mcctrl_uc + os_index);
destroy_ikc_channels(os[os_index]);
procfs_exit(os_index);
if (os[os_index]) {
sysfsm_cleanup(os[os_index]);
free_topology_info(os[os_index]);
ihk_os_unregister_user_call_handlers(os[os_index], mcctrl_uc + os_index);
destroy_ikc_channels(os[os_index]);
procfs_exit(os_index);
}
os[os_index] = NULL;
printk("mcctrl: OS ID %d shutdown event handled\n", os_index);
return 0;
@ -151,11 +157,16 @@ static struct ihk_os_notifier mcctrl_os_notifier = {
static int __init mcctrl_init(void)
{
int ret = 0;
int i;
#ifndef DO_USER_MODE
mcctrl_syscall_init();
#endif
for (i = 0; i < OS_MAX_MINOR; ++i) {
os[i] = NULL;
}
rus_page_hash_init();
binfmt_mcexec_init();

View File

@ -35,6 +35,16 @@
#define REQUEST_SHIFT 16
//#define DEBUG_IKC
#ifdef DEBUG_IKC
#define dkprintf(...) kprintf(__VA_ARGS__)
#define ekprintf(...) kprintf(__VA_ARGS__)
#else
#define dkprintf(...) do { if (0) printk(__VA_ARGS__); } while (0)
#define ekprintf(...) printk(__VA_ARGS__)
#endif
//int num_channels;
//struct mcctrl_channel *channels;
@ -99,10 +109,6 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
get_vdso_info(__os, pisp->arg);
break;
case SCD_MSG_REPLY_GET_CPU_MAPPING:
reply_get_cpu_mapping(pisp->arg);
break;
default:
printk(KERN_ERR "mcctrl:syscall_packet_handler:"
"unknown message (%d.%d.%d.%d.%d.%#lx)\n",
@ -168,93 +174,26 @@ int mcctrl_ikc_is_valid_thread(ihk_os_t os, int cpu)
}
}
//unsigned long *mcctrl_doorbell_va;
//unsigned long mcctrl_doorbell_pa;
static void mcctrl_ikc_init(ihk_os_t os, int cpu, unsigned long rphys, struct ihk_ikc_channel_desc *c)
{
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
struct ikc_scd_packet packet;
struct mcctrl_channel *pmc = usrdata->channels + cpu;
unsigned long phys;
struct ikc_scd_init_param *rpm;
if(c->port == 502)
if (c->port == 502) {
pmc = usrdata->channels + usrdata->num_channels - 1;
if (!pmc) {
return;
}
printk("IKC init: cpu=%d port=%d\n", cpu, c->port);
phys = ihk_device_map_memory(ihk_os_to_dev(os), rphys,
sizeof(struct ikc_scd_init_param));
#ifdef CONFIG_MIC
rpm = ioremap_wc(phys, sizeof(struct ikc_scd_init_param));
#else
rpm = ihk_device_map_virtual(ihk_os_to_dev(os), phys,
sizeof(struct ikc_scd_init_param),
NULL, 0);
#endif
pmc->param.request_va =
(void *)__get_free_pages(in_interrupt() ? GFP_ATOMIC : GFP_KERNEL,
REQUEST_SHIFT - PAGE_SHIFT);
pmc->param.request_pa = virt_to_phys(pmc->param.request_va);
pmc->param.doorbell_va = usrdata->mcctrl_doorbell_va;
pmc->param.doorbell_pa = usrdata->mcctrl_doorbell_pa;
pmc->param.post_va = (void *)__get_free_page(in_interrupt() ?
GFP_ATOMIC : GFP_KERNEL);
pmc->param.post_pa = virt_to_phys(pmc->param.post_va);
memset(pmc->param.doorbell_va, 0, PAGE_SIZE);
memset(pmc->param.request_va, 0, PAGE_SIZE);
memset(pmc->param.post_va, 0, PAGE_SIZE);
pmc->param.response_rpa = rpm->response_page;
pmc->param.response_pa
= ihk_device_map_memory(ihk_os_to_dev(os),
pmc->param.response_rpa,
PAGE_SIZE);
#ifdef CONFIG_MIC
pmc->param.response_va = ioremap_cache(pmc->param.response_pa,
PAGE_SIZE);
#else
pmc->param.response_va = ihk_device_map_virtual(ihk_os_to_dev(os),
pmc->param.response_pa,
PAGE_SIZE, NULL, 0);
#endif
pmc->dma_buf = (void *)__get_free_pages(in_interrupt() ?
GFP_ATOMIC : GFP_KERNEL,
DMA_PIN_SHIFT - PAGE_SHIFT);
rpm->request_page = pmc->param.request_pa;
rpm->doorbell_page = pmc->param.doorbell_pa;
rpm->post_page = pmc->param.post_pa;
if (!pmc) {
kprintf("%s: error: no channel found?\n", __FUNCTION__);
return;
}
packet.msg = SCD_MSG_INIT_CHANNEL_ACKED;
packet.ref = cpu;
packet.arg = rphys;
printk("Request: %lx, Response: %lx, Doorbell: %lx\n",
pmc->param.request_pa, pmc->param.response_rpa,
pmc->param.doorbell_pa);
printk("Request: %p, Response: %p, Doorbell: %p\n",
pmc->param.request_va, pmc->param.response_va,
pmc->param.doorbell_va);
ihk_ikc_send(pmc->c, &packet, 0);
#ifdef CONFIG_MIC
iounmap(rpm);
#else
ihk_device_unmap_virtual(ihk_os_to_dev(os), rpm,
sizeof(struct ikc_scd_init_param));
#endif
ihk_device_unmap_memory(ihk_os_to_dev(os), phys,
sizeof(struct ikc_scd_init_param));
}
static int connect_handler(struct ihk_ikc_channel_info *param)
@ -274,7 +213,7 @@ static int connect_handler(struct ihk_ikc_channel_info *param)
param->packet_handler = syscall_packet_handler;
usrdata->channels[cpu].c = c;
kprintf("syscall: MC CPU %d connected. c=%p\n", cpu, c);
dkprintf("syscall: MC CPU %d connected. c=%p\n", cpu, c);
return 0;
}
@ -292,7 +231,7 @@ static int connect_handler2(struct ihk_ikc_channel_info *param)
param->packet_handler = syscall_packet_handler;
usrdata->channels[cpu].c = c;
kprintf("syscall: MC CPU %d connected. c=%p\n", cpu, c);
dkprintf("syscall: MC CPU %d connected. c=%p\n", cpu, c);
return 0;
}
@ -315,27 +254,29 @@ static struct ihk_ikc_listen_param listen_param2 = {
int prepare_ikc_channels(ihk_os_t os)
{
struct ihk_cpu_info *info;
struct mcctrl_usrdata *usrdata;
struct mcctrl_usrdata *usrdata;
int i;
usrdata = kzalloc(sizeof(struct mcctrl_usrdata), GFP_KERNEL);
usrdata->mcctrl_doorbell_va = (void *)__get_free_page(GFP_KERNEL);
usrdata->mcctrl_doorbell_pa = virt_to_phys(usrdata->mcctrl_doorbell_va);
info = ihk_os_get_cpu_info(os);
if (!info) {
printk("Error: cannot retrieve CPU info.\n");
usrdata->cpu_info = ihk_os_get_cpu_info(os);
usrdata->mem_info = ihk_os_get_memory_info(os);
if (!usrdata->cpu_info || !usrdata->mem_info) {
printk("Error: cannot obtain OS CPU and memory information.\n");
return -EINVAL;
}
if (info->n_cpus < 1) {
if (usrdata->cpu_info->n_cpus < 1) {
printk("Error: # of cpu is invalid.\n");
return -EINVAL;
}
usrdata->num_channels = info->n_cpus + 1;
usrdata->channels = kzalloc(sizeof(struct mcctrl_channel) * usrdata->num_channels,
GFP_KERNEL);
usrdata->num_channels = usrdata->cpu_info->n_cpus + 1;
usrdata->channels = kzalloc(sizeof(struct mcctrl_channel) *
usrdata->num_channels,
GFP_KERNEL);
if (!usrdata->channels) {
printk("Error: cannot allocate channels.\n");
return -ENOMEM;
@ -362,20 +303,7 @@ int prepare_ikc_channels(ihk_os_t os)
void __destroy_ikc_channel(ihk_os_t os, struct mcctrl_channel *pmc)
{
free_pages((unsigned long)pmc->param.request_va,
REQUEST_SHIFT - PAGE_SHIFT);
free_page((unsigned long)pmc->param.post_va);
#ifdef CONFIG_MIC
iounmap(pmc->param.response_va);
#else
ihk_device_unmap_virtual(ihk_os_to_dev(os), pmc->param.response_va,
PAGE_SIZE);
#endif
ihk_device_unmap_memory(ihk_os_to_dev(os),
pmc->param.response_pa, PAGE_SIZE);
free_pages((unsigned long)pmc->dma_buf,
DMA_PIN_SHIFT - PAGE_SHIFT);
return;
}
void destroy_ikc_channels(ihk_os_t os)
@ -383,6 +311,11 @@ void destroy_ikc_channels(ihk_os_t os)
int i;
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
if (!usrdata) {
printk("%s: WARNING: no mcctrl_usrdata found\n", __FUNCTION__);
return;
}
ihk_host_os_set_usrdata(os, NULL);
for (i = 0; i < usrdata->num_channels; i++) {
@ -393,7 +326,6 @@ void destroy_ikc_channels(ihk_os_t os)
printk("Channel #%d freed.\n", i);
}
}
free_page((unsigned long)usrdata->mcctrl_doorbell_va);
kfree(usrdata->channels);
kfree(usrdata);

View File

@ -59,8 +59,8 @@
#define SCD_MSG_CLEANUP_PROCESS 0x9
#define SCD_MSG_GET_VDSO_INFO 0xa
#define SCD_MSG_GET_CPU_MAPPING 0xc
#define SCD_MSG_REPLY_GET_CPU_MAPPING 0xd
//#define SCD_MSG_GET_CPU_MAPPING 0xc
//#define SCD_MSG_REPLY_GET_CPU_MAPPING 0xd
#define SCD_MSG_PROCFS_CREATE 0x10
#define SCD_MSG_PROCFS_DELETE 0x11
@ -172,7 +172,6 @@ struct wait_queue_head_list_node {
struct mcctrl_channel {
struct ihk_ikc_channel_desc *c;
struct syscall_params param;
struct ikc_scd_init_param init;
void *dma_buf;
};
@ -226,11 +225,6 @@ static inline int sysfs_inited(struct sysfsm_data *sdp)
return !!(sdp->sysfs_buf);
} /* sysfs_inited() */
struct cpu_mapping {
int cpu_number;
int hw_id;
};
struct cache_topology {
struct ihk_cache_topology *saved;
cpumask_t shared_cpu_map;
@ -239,8 +233,9 @@ struct cache_topology {
};
struct cpu_topology {
struct cpu_mapping *cpu_mapping;
//struct mcctrl_usrdata *udp;
struct ihk_cpu_topology *saved;
int mckernel_cpu_id;
cpumask_t core_siblings;
cpumask_t thread_siblings;
@ -248,8 +243,12 @@ struct cpu_topology {
struct list_head cache_list;
};
#define NODE_DISTANCE_S_SIZE 1024
struct node_topology {
struct ihk_node_topology *saved;
int mckernel_numa_id;
char mckernel_numa_distance_s[NODE_DISTANCE_S_SIZE];
cpumask_t cpumap;
struct list_head chain;
@ -266,9 +265,7 @@ struct mcctrl_usrdata {
struct ihk_ikc_listen_param listen_param2;
ihk_os_t os;
int num_channels;
struct mcctrl_channel *channels;
unsigned long *mcctrl_doorbell_va;
unsigned long mcctrl_doorbell_pa;
struct mcctrl_channel *channels;
int remaining_job;
int base_cpu;
int job_pos;
@ -282,10 +279,9 @@ struct mcctrl_usrdata {
void **keys;
struct sysfsm_data sysfsm_data;
unsigned long cpu_online[CPU_LONGS];
int cpu_mapping_elems;
int padding;
struct cpu_mapping *cpu_mapping;
long cpu_mapping_pa;
struct ihk_cpu_info *cpu_info;
struct ihk_mem_info *mem_info;
nodemask_t numa_online;
struct list_head cpu_topology_list;
struct list_head node_topology_list;
};
@ -322,7 +318,7 @@ inline struct mcctrl_per_thread_data *mcctrl_get_per_thread_data(
void __return_syscall(ihk_os_t os, struct ikc_scd_packet *packet,
long ret, int stid);
#define PROCFS_NAME_MAX 1000
#define PROCFS_NAME_MAX 768
struct procfs_read {
unsigned long pbuf; /* physical address of the host buffer (request) */

View File

@ -481,8 +481,9 @@ procfs_exit(int osnum)
down(&procfs_file_list_lock);
e = find_base_entry(osnum);
if(e)
if (e) {
delete_procfs_entries(e);
}
up(&procfs_file_list_lock);
}
@ -496,7 +497,7 @@ static ssize_t
mckernel_procfs_read(struct file *file, char __user *buf, size_t nbytes,
loff_t *ppos)
{
struct inode * inode = file->f_path.dentry->d_inode;
struct inode * inode = file->f_inode;
char *kern_buffer = NULL;
int order = 0;
volatile struct procfs_read *r = NULL;
@ -599,7 +600,7 @@ static ssize_t
mckernel_procfs_write(struct file *file, const char __user *buf, size_t nbytes,
loff_t *ppos)
{
struct inode * inode = file->f_path.dentry->d_inode;
struct inode * inode = file->f_inode;
char *kern_buffer = NULL;
int order = 0;
volatile struct procfs_read *r = NULL;

View File

@ -18,7 +18,7 @@
#include "mcctrl.h"
#include "sysfs_msg.h"
#define dprintk(...) do { if (0) printk(KERN_DEBUG __VA_ARGS__); } while (0)
#define dprintk(...) do { if (0) printk(__VA_ARGS__); } while (0)
#define wprintk(...) do { if (1) printk(KERN_WARNING __VA_ARGS__); } while (0)
#define eprintk(...) do { if (1) printk(KERN_ERR __VA_ARGS__); } while (0)
@ -1232,9 +1232,16 @@ sysfsm_cleanup(ihk_os_t os)
int error;
ihk_device_t dev = ihk_os_to_dev(os);
struct mcctrl_usrdata *udp = ihk_host_os_get_usrdata(os);
struct sysfsm_data *sdp = &udp->sysfsm_data;
struct sysfsm_data *sdp;
struct sysfsm_node *np;
if (!udp) {
printk("%s: WARNING: no mcctrl_usrdata found\n", __FUNCTION__);
return;
}
sdp = &udp->sysfsm_data;
dprintk("mcctrl:sysfsm_cleanup(%p)\n", os);
if (sdp->sysfs_buf) {
@ -2095,9 +2102,16 @@ struct sysfsm_ops snooping_local_ops_s = {
/**** local list ****/
static ssize_t snooping_local_show_pbl(struct sysfsm_ops *ops, void *instance, void *buf, size_t bufsize)
{
size_t ret;
const struct sysfsm_bitmap_param *p = instance;
return bitmap_scnlistprintf(buf, bufsize, p->ptr, p->nbits);
ret = bitmap_scnlistprintf(buf, bufsize, p->ptr, p->nbits);
if (ret < bufsize - 1) {
sprintf(buf + ret, "\n");
return ret + 1;
}
return 0;
} /* snooping_local_show_pbl() */
struct sysfsm_ops snooping_local_ops_pbl = {
@ -2108,9 +2122,16 @@ struct sysfsm_ops snooping_local_ops_pbl = {
/**** local map ****/
static ssize_t snooping_local_show_pb(struct sysfsm_ops *ops, void *instance, void *buf, size_t bufsize)
{
size_t ret;
const struct sysfsm_bitmap_param *p = instance;
return bitmap_scnprintf(buf, bufsize, p->ptr, p->nbits);
ret = bitmap_scnprintf(buf, bufsize, p->ptr, p->nbits);
if (ret < bufsize - 1) {
sprintf(buf + ret, "\n");
return ret + 1;
}
return 0;
} /* snooping_local_show_pb() */
struct sysfsm_ops snooping_local_ops_pb = {

View File

@ -18,7 +18,7 @@
#include "mcctrl.h"
#include "sysfs_msg.h"
#define dprintk(...) do { if (0) printk(KERN_DEBUG __VA_ARGS__); } while (0)
#define dprintk(...) do { if (0) printk(__VA_ARGS__); } while (0)
#define wprintk(...) do { if (1) printk(KERN_WARNING __VA_ARGS__); } while (0)
#define eprintk(...) do { if (1) printk(KERN_ERR __VA_ARGS__); } while (0)
@ -92,27 +92,19 @@ void setup_local_snooping_samples(ihk_os_t os)
void setup_local_snooping_files(ihk_os_t os)
{
struct ihk_cpu_info *info;
struct mcctrl_usrdata *udp = ihk_host_os_get_usrdata(os);
struct sysfsm_bitmap_param param;
static unsigned long cpu_offline = 0x0;
int i;
int error;
info = ihk_os_get_cpu_info(os);
if (!info) {
eprintk("mcctrl:ihk_os_get_cpu_info failed.\n");
return;
}
memset(udp->cpu_online, 0, sizeof(udp->cpu_online));
for (i = 0; i < info->n_cpus; i++) {
udp->cpu_online[i / BITS_PER_LONG] =
udp->cpu_online[i / BITS_PER_LONG] | (1 << (i % BITS_PER_LONG));
for (i = 0; i < udp->cpu_info->n_cpus; i++) {
set_bit(i, udp->cpu_online);
}
param.nbits = CPU_LONGS * BITS_PER_LONG;
param.ptr = udp->cpu_online;
param.ptr = &udp->cpu_online;
dprintk("mcctrl:setup_local_snooping_files: CPU_LONGS=%d, BITS_PER_LONG=%d\n",
CPU_LONGS, BITS_PER_LONG);
@ -187,141 +179,122 @@ static void free_cpu_topology(struct mcctrl_usrdata *udp)
return;
} /* free_cpu_topology() */
static void free_cpu_mapping(struct mcctrl_usrdata *udp)
{
ihk_device_t dev = ihk_os_to_dev(udp->os);
size_t size;
size = udp->cpu_mapping_elems * sizeof(struct cpu_mapping);
ihk_device_unmap_virtual(dev, udp->cpu_mapping, size);
ihk_device_unmap_memory(dev, udp->cpu_mapping_pa, size);
return;
} /* free_cpu_mapping() */
void free_topology_info(ihk_os_t os)
{
struct mcctrl_usrdata *udp = ihk_host_os_get_usrdata(os);
if (!udp) {
printk("%s: WARNING: no mcctrl_usrdata found\n", __FUNCTION__);
return;
}
free_node_topology(udp);
free_cpu_topology(udp);
free_cpu_mapping(udp);
return;
} /* free_topology_info() */
void reply_get_cpu_mapping(long req_pa)
/*
* CPU and NUMA node mapping conversion functions.
*/
static int mckernel_cpu_2_linux_cpu(struct mcctrl_usrdata *udp, int cpu_id)
{
struct get_cpu_mapping_req *req = phys_to_virt(req_pa);
return (cpu_id < udp->cpu_info->n_cpus) ?
udp->cpu_info->mapping[cpu_id] : -1;
}
req->busy = 0;
wake_up(&req->wq);
return;
} /* reply_get_cpu_mapping() */
static int get_cpu_mapping(struct mcctrl_usrdata *udp)
static int mckernel_cpu_2_hw_id(struct mcctrl_usrdata *udp, int cpu_id)
{
int error;
ihk_device_t dev = ihk_os_to_dev(udp->os);
struct get_cpu_mapping_req *req = NULL;
struct ikc_scd_packet packet;
size_t size;
return (cpu_id < udp->cpu_info->n_cpus) ?
udp->cpu_info->hw_ids[cpu_id] : -1;
}
dprintk("get_cpu_mapping(%p)\n", udp);
req = kmalloc(sizeof(*req), GFP_KERNEL);
if (!req) {
error = -ENOMEM;
eprintk("mcctrl:get_cpu_mapping:kmalloc failed. %d\n", error);
goto out;
}
req->busy = 1;
req->error = -1;
init_waitqueue_head(&req->wq);
packet.msg = SCD_MSG_GET_CPU_MAPPING;
packet.arg = virt_to_phys(req);
#define GET_CPU_MAPPING_CPU 0
error = mcctrl_ikc_send(udp->os, GET_CPU_MAPPING_CPU, &packet);
if (error) {
eprintk("mcctrl:get_cpu_mapping:"
"mcctrl_ikc_send failed. %d\n", error);
goto out;
}
error = wait_event_interruptible(req->wq, !req->busy);
if (error) {
eprintk("mcctrl:get_cpu_mapping:"
"wait_event_interruptible failed. %d\n", error);
req = NULL; /* XXX */
goto out;
}
if (req->error) {
error = req->error;
eprintk("mcctrl:get_cpu_mapping:"
"SCD_MSG_GET_CPU_MAPPING failed. %d\n", error);
goto out;
}
size = req->buf_elems * sizeof(struct cpu_mapping);
udp->cpu_mapping_elems = req->buf_elems;
udp->cpu_mapping_pa = ihk_device_map_memory(dev, req->buf_rpa, size);
udp->cpu_mapping = ihk_device_map_virtual(
dev, udp->cpu_mapping_pa, size, NULL, 0);
error = 0;
out:
dprintk("get_cpu_mapping(%p): %d\n", udp, error);
kfree(req);
return error;
} /* get_cpu_mapping() */
static int hwid_to_cpu(struct mcctrl_usrdata *udp, int hw_id)
static int linux_cpu_2_mckernel_cpu(struct mcctrl_usrdata *udp, int cpu_id)
{
int i;
for (i = 0; i < udp->cpu_mapping_elems; ++i) {
if (udp->cpu_mapping[i].hw_id == hw_id) {
return udp->cpu_mapping[i].cpu_number;
for (i = 0; i < udp->cpu_info->n_cpus; ++i) {
if (udp->cpu_info->mapping[i] == cpu_id)
return i;
}
return -1;
}
#if 0
static int hw_id_2_mckernel_cpu(struct mcctrl_usrdata *udp, int hw_id)
{
int i;
for (i = 0; i < udp->cpu_info->n_cpus; ++i) {
if (udp->cpu_info->hw_ids[i] == hw_id) {
return i;
}
}
return -1;
}
static int hw_id_2_linux_cpu(struct mcctrl_usrdata *udp, int hw_id)
{
int i;
for (i = 0; i < udp->cpu_info->n_cpus; ++i) {
if (udp->cpu_info->hw_ids[i] == hw_id) {
return mckernel_cpu_2_linux_cpu(udp, i);
}
}
return -1;
}
static int linux_cpu_2_hw_id(struct mcctrl_usrdata *udp, int cpu)
{
int mckernel_cpu = linux_cpu_2_mckernel_cpu(udp, cpu);
return (mckernel_cpu >= 0 && mckernel_cpu < udp->cpu_info->n_cpus) ?
udp->cpu_info->hw_ids[mckernel_cpu] : -1;
}
#endif
static int mckernel_numa_2_linux_numa(struct mcctrl_usrdata *udp, int numa_id)
{
return (numa_id < udp->mem_info->n_numa_nodes) ?
udp->mem_info->numa_mapping[numa_id] : -1;
}
static int linux_numa_2_mckernel_numa(struct mcctrl_usrdata *udp, int numa_id)
{
int i;
for (i = 0; i < udp->mem_info->n_numa_nodes; ++i) {
if (udp->mem_info->numa_mapping[i] == numa_id)
return i;
}
return -1;
}
static int translate_cpumap(struct mcctrl_usrdata *udp,
cpumask_t *linmap, cpumask_t *mckmap)
{
int error;
ihk_device_t dev = ihk_os_to_dev(udp->os);
int lincpu;
int hw_id;
int mckcpu;
dprintk("translate_cpumap(%p,%p,%p)\n", udp, linmap, mckmap);
cpumask_clear(mckmap);
for_each_cpu(lincpu, linmap) {
hw_id = ihk_device_linux_cpu_to_hw_id(dev, lincpu);
if (hw_id < 0) {
error = hw_id;
eprintk("mcctrl:translate_cpumap:"
"ihk_device_linux_cpu_to_hw_id failed."
" %d\n", error);
goto out;
}
mckcpu = linux_cpu_2_mckernel_cpu(udp, lincpu);
mckcpu = hwid_to_cpu(udp, hw_id);
if (mckcpu >= 0) {
cpumask_set_cpu(mckcpu, mckmap);
}
}
error = 0;
out:
dprintk("translate_cpumap(%p,%p,%p): %d\n", udp, linmap, mckmap, error);
return error;
} /* translate_cpumap() */
@ -361,7 +334,7 @@ out:
return (error)? ERR_PTR(error): topo;
} /* get_cache_topology() */
static struct cpu_topology *get_cpu_topology_one(struct mcctrl_usrdata *udp,
static struct cpu_topology *get_one_cpu_topology(struct mcctrl_usrdata *udp,
int index)
{
int error;
@ -370,41 +343,43 @@ static struct cpu_topology *get_cpu_topology_one(struct mcctrl_usrdata *udp,
struct cache_topology *cache;
struct ihk_cache_topology *saved_cache;
dprintk("get_cpu_topology_one(%p,%d)\n", udp, index);
dprintk("get_one_cpu_topology(%p,%d)\n", udp, index);
topology = kmalloc(sizeof(*topology), GFP_KERNEL);
if (!topology) {
error = -ENOMEM;
eprintk("mcctrl:get_cpu_topology_one:"
eprintk("mcctrl:get_one_cpu_topology:"
"kmalloc failed. %d\n", error);
goto out;
}
INIT_LIST_HEAD(&topology->cache_list);
topology->cpu_mapping = &udp->cpu_mapping[index];
topology->mckernel_cpu_id = index;
topology->saved = ihk_device_get_cpu_topology(dev,
mckernel_cpu_2_hw_id(udp, index));
topology->saved = ihk_device_get_cpu_topology(
dev, topology->cpu_mapping->hw_id);
if (IS_ERR(topology->saved)) {
error = PTR_ERR(topology->saved);
eprintk("mcctrl:get_cpu_topology_one:"
eprintk("mcctrl:get_one_cpu_topology:"
"ihk_device_get_cpu_topology failed. %d\n",
error);
goto out;
}
error = translate_cpumap(udp, &topology->saved->core_siblings,
error = translate_cpumap(udp,
&topology->saved->core_siblings,
&topology->core_siblings);
if (error) {
eprintk("mcctrl:get_cpu_topology_one:"
eprintk("mcctrl:get_one_cpu_topology:"
"translate_cpumap(core_siblings) failed."
" %d\n", error);
goto out;
}
error = translate_cpumap(udp, &topology->saved->thread_siblings,
error = translate_cpumap(udp,
&topology->saved->thread_siblings,
&topology->thread_siblings);
if (error) {
eprintk("mcctrl:get_cpu_topology_one:"
eprintk("mcctrl:get_one_cpu_topology:"
"translate_cpumap(thread_siblings) failed."
" %d\n", error);
goto out;
@ -415,7 +390,7 @@ static struct cpu_topology *get_cpu_topology_one(struct mcctrl_usrdata *udp,
cache = get_cache_topology(udp, topology, saved_cache);
if (IS_ERR(cache)) {
error = PTR_ERR(cache);
eprintk("mcctrl:get_cpu_topology_one:"
eprintk("mcctrl:get_one_cpu_topology:"
"get_cache_topology failed. %d\n",
error);
goto out;
@ -429,10 +404,10 @@ out:
if (error && !IS_ERR_OR_NULL(topology)) {
free_cpu_topology_one(udp, topology);
}
dprintk("get_cpu_topology_one(%p,%d): %d %p\n",
dprintk("get_one_cpu_topology(%p,%d): %d %p\n",
udp, index, error, topology);
return (error)? ERR_PTR(error): topology;
} /* get_cpu_topology_one() */
} /* get_one_cpu_topology() */
static int get_cpu_topology(struct mcctrl_usrdata *udp)
{
@ -441,12 +416,12 @@ static int get_cpu_topology(struct mcctrl_usrdata *udp)
struct cpu_topology *topology;
dprintk("get_cpu_topology(%p)\n", udp);
for (index = 0; index < udp->cpu_mapping_elems; ++index) {
topology = get_cpu_topology_one(udp, index);
for (index = 0; index < udp->cpu_info->n_cpus; ++index) {
topology = get_one_cpu_topology(udp, index);
if (IS_ERR(topology)) {
error = PTR_ERR(topology);
eprintk("mcctrl:get_cpu_topology:"
"get_cpu_topology_one failed. %d\n",
eprintk("mcctrl:get_cpu_topology: "
"get_one_cpu_topology failed. %d\n",
error);
goto out;
}
@ -460,15 +435,15 @@ out:
return error;
} /* get_cpu_topology() */
static void setup_one_cache_files(struct mcctrl_usrdata *udp,
static void setup_cpu_sysfs_cache_files(struct mcctrl_usrdata *udp,
struct cpu_topology *cpu, struct cache_topology *cache)
{
char *prefix = "/sys/devices/system/cpu";
int cpu_number = cpu->cpu_mapping->cpu_number;
int cpu_number = cpu->mckernel_cpu_id;
int index = cache->saved->index;
struct sysfsm_bitmap_param param;
dprintk("setup_one_cache_files(%p,%p,%p)\n", udp, cpu, cache);
dprintk("setup_cpu_sysfs_cache_files(%p,%p,%p)\n", udp, cpu, cache);
sysfsm_createf(udp->os, SYSFS_SNOOPING_OPS_d64,
&cache->saved->level, 0444,
@ -509,19 +484,19 @@ static void setup_one_cache_files(struct mcctrl_usrdata *udp,
"%s/cpu%d/cache/index%d/shared_cpu_list",
prefix, cpu_number, index);
dprintk("setup_one_cache_files(%p,%p,%p):\n", udp, cpu, cache);
dprintk("setup_cpu_sysfs_cache_files(%p,%p,%p):\n", udp, cpu, cache);
return;
} /* setup_one_cache_files() */
} /* setup_cpu_sysfs_cache_files() */
static void setup_one_cpu_files(struct mcctrl_usrdata *udp,
static void setup_cpu_sysfs_files(struct mcctrl_usrdata *udp,
struct cpu_topology *cpu)
{
char *prefix = "/sys/devices/system/cpu";
int cpu_number = cpu->cpu_mapping->cpu_number;
int cpu_number = cpu->mckernel_cpu_id;
struct sysfsm_bitmap_param param;
struct cache_topology *cache;
dprintk("setup_one_cpu_files(%p,%p)\n", udp, cpu);
dprintk("setup_cpu_sysfs_files(%p,%p)\n", udp, cpu);
sysfsm_createf(udp->os, SYSFS_SNOOPING_OPS_d32,
&cpu->saved->physical_package_id, 0444,
@ -553,41 +528,61 @@ static void setup_one_cpu_files(struct mcctrl_usrdata *udp,
prefix, cpu_number);
list_for_each_entry(cache, &cpu->cache_list, chain) {
setup_one_cache_files(udp, cpu, cache);
setup_cpu_sysfs_cache_files(udp, cpu, cache);
}
dprintk("setup_one_cpu_files(%p,%p):\n", udp, cpu);
dprintk("setup_cpu_sysfs_files(%p,%p):\n", udp, cpu);
return;
} /* setup_one_cpu_files() */
} /* setup_cpu_sysfs_files() */
static void setup_cpu_files(struct mcctrl_usrdata *udp)
static void setup_cpus_sysfs_files_node_link(struct mcctrl_usrdata *udp)
{
int error;
int cpu;
struct sysfs_handle handle;
for (cpu = 0; cpu < udp->cpu_info->n_cpus; ++cpu) {
int node = linux_numa_2_mckernel_numa(udp,
cpu_to_node(mckernel_cpu_2_linux_cpu(udp, cpu)));
error = sysfsm_lookupf(udp->os, &handle,
"/sys/devices/system/node/node%d", node);
if (error) {
panic("sysfsm_lookupf: node for CPU");
}
error = sysfsm_symlinkf(udp->os, handle,
"/sys/devices/system/cpu/cpu%d/node%d",
cpu, node);
if (error) {
panic("sysfsm_symlinkf(CPU in node)");
}
}
error = 0;
return;
}
static void setup_cpus_sysfs_files(struct mcctrl_usrdata *udp)
{
int error;
struct cpu_topology *cpu;
dprintk("setup_cpu_file(%p)\n", udp);
error = get_cpu_mapping(udp);
if (error) {
eprintk("mcctrl:setup_cpu_files:"
"get_cpu_mapping failed. %d\n", error);
goto out;
}
error = get_cpu_topology(udp);
if (error) {
eprintk("mcctrl:setup_cpu_files:"
eprintk("mcctrl:setup_cpus_sysfs_files:"
"get_cpu_topology failed. %d\n", error);
goto out;
}
list_for_each_entry(cpu, &udp->cpu_topology_list, chain) {
setup_one_cpu_files(udp, cpu);
setup_cpu_sysfs_files(udp, cpu);
}
error = 0;
out:
dprintk("setup_cpu_file(%p):\n", udp);
return;
} /* setup_cpu_files() */
} /* setup_cpus_sysfs_files() */
static struct node_topology *get_one_node_topology(struct mcctrl_usrdata *udp,
struct ihk_node_topology *saved)
@ -629,8 +624,10 @@ static int get_node_topology(struct mcctrl_usrdata *udp)
struct node_topology *topology;
dprintk("get_node_topology(%p)\n", udp);
for (node = 0; ; ++node) {
saved = ihk_device_get_node_topology(dev, node);
for (node = 0; node < udp->mem_info->n_numa_nodes; ++node) {
saved = ihk_device_get_node_topology(dev,
mckernel_numa_2_linux_numa(udp, node));
if (IS_ERR(saved)) {
break;
}
@ -647,6 +644,8 @@ static int get_node_topology(struct mcctrl_usrdata *udp)
goto out;
}
topology->mckernel_numa_id = node;
list_add(&topology->chain, &udp->node_topology_list);
}
@ -659,6 +658,7 @@ out:
static int setup_node_files(struct mcctrl_usrdata *udp)
{
int error;
int node;
struct node_topology *p;
struct sysfsm_bitmap_param param;
@ -670,16 +670,71 @@ static int setup_node_files(struct mcctrl_usrdata *udp)
goto out;
}
memset(&udp->numa_online, 0, sizeof(udp->numa_online));
for (node = 0; node < udp->mem_info->n_numa_nodes; ++node) {
node_set(node, udp->numa_online);
}
param.nbits = MAX_NUMNODES;
param.ptr = &udp->numa_online;
sysfsm_createf(udp->os, SYSFS_SNOOPING_OPS_pbl, &param, 0444,
"/sys/devices/system/node/online");
sysfsm_createf(udp->os, SYSFS_SNOOPING_OPS_pbl, &param, 0444,
"/sys/devices/system/node/possible");
list_for_each_entry(p, &udp->node_topology_list, chain) {
struct sysfs_handle handle;
int cpu;
size_t offset = 0;
param.nbits = nr_cpumask_bits;
param.ptr = &p->cpumap;
for (node = 0; node < udp->mem_info->n_numa_nodes; ++node) {
if (node > 0) {
offset += snprintf(&p->mckernel_numa_distance_s[offset],
NODE_DISTANCE_S_SIZE - offset, "%s", " ");
}
offset += snprintf(&p->mckernel_numa_distance_s[offset],
NODE_DISTANCE_S_SIZE - offset, "%d",
node_distance(
mckernel_numa_2_linux_numa(udp, p->mckernel_numa_id),
mckernel_numa_2_linux_numa(udp, node)
));
}
sysfsm_createf(udp->os, SYSFS_SNOOPING_OPS_s,
p->mckernel_numa_distance_s, 0444,
"/sys/devices/system/node/node%d/distance",
p->mckernel_numa_id);
sysfsm_createf(udp->os, SYSFS_SNOOPING_OPS_pb, &param, 0444,
"/sys/devices/system/node/node%d/cpumap",
p->saved->node_number);
p->mckernel_numa_id);
sysfsm_createf(udp->os, SYSFS_SNOOPING_OPS_pbl, &param, 0444,
"/sys/devices/system/node/node%d/cpulist",
p->saved->node_number);
p->mckernel_numa_id);
/* Add CPU symlinks for this node */
for (cpu = 0; cpu < udp->cpu_info->n_cpus; ++cpu) {
if (linux_numa_2_mckernel_numa(udp,
cpu_to_node(mckernel_cpu_2_linux_cpu(udp, cpu)))
!= p->mckernel_numa_id) {
continue;
}
error = sysfsm_lookupf(udp->os, &handle,
"/sys/devices/system/cpu/cpu%d", cpu);
if (error) {
panic("sysfsm_lookupf(CPU in node)");
}
error = sysfsm_symlinkf(udp->os, handle,
"/sys/devices/system/node/node%d/cpu%d",
p->mckernel_numa_id, cpu);
if (error) {
panic("sysfsm_symlinkf(CPU in node)");
}
}
}
error = 0;
@ -1026,11 +1081,18 @@ void setup_sysfs_files(ihk_os_t os)
panic("sysfsm_unlinkf");
}
setup_local_snooping_samples(os);
//setup_local_snooping_samples(os);
setup_local_snooping_files(os);
setup_cpu_files(udp);
setup_cpus_sysfs_files(udp);
setup_node_files(udp);
setup_pci_files(udp);
setup_cpus_sysfs_files_node_link(udp);
//setup_pci_files(udp);
/* Indicate sysfs files setup completion for boot script */
error = sysfsm_mkdirf(os, NULL, "/sys/setup_complete");
if (error) {
panic("sysfsm_mkdir(complete)");
}
return;
} /* setup_files() */

View File

@ -14,6 +14,9 @@ ifeq ($(BUILD_MODULE_TMP),org)
ifeq ($(BUILD_MODULE),none)
BUILD_MODULE=$(shell if [ ${LINUX_VERSION_CODE} -ge 262144 -a ${LINUX_VERSION_CODE} -lt 262400 ]; then echo "linux-4.0.9"; else echo "none"; fi)
endif
ifeq ($(BUILD_MODULE),none)
BUILD_MODULE=$(shell if [ ${LINUX_VERSION_CODE} -ge 243680 -a ${LINUX_VERSION_CODE} -lt 263936 ]; then echo "linux-4.6.7"; else echo "none"; fi)
endif
endif
ifeq ($(BUILD_MODULE_TMP),rhel)
ifeq ($(BUILD_MODULE),none)
@ -32,6 +35,7 @@ endif
clean:
@(cd linux-3.10.0-327.36.1.el7; make clean)
@(cd linux-4.0.9; make clean)
@(cd linux-4.6.7; make clean)
install:
ifneq ($(BUILD_MODULE),none)

View File

@ -0,0 +1,21 @@
KDIR ?= @KDIR@
ARCH ?= @ARCH@
KMODDIR = @KMODDIR@
src = @abs_srcdir@
obj-m += mcoverlay.o
mcoverlay-y := copy_up.o dir.o inode.o readdir.o super.o
.PHONY: clean install modules
modules:
$(MAKE) -C $(KDIR) M=$(PWD) SUBDIRS=$(PWD) ARCH=$(ARCH) modules
clean:
$(RM) .*.cmd *.mod.c *.o *.ko* Module.symvers modules.order -r .tmp*
install:
mkdir -p -m 755 $(KMODDIR)
install -m 644 mcoverlay.ko $(KMODDIR)

View File

@ -0,0 +1,460 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/splice.h>
#include <linux/xattr.h>
#include <linux/security.h>
#include <linux/uaccess.h>
#include <linux/sched.h>
#include <linux/namei.h>
#include <linux/fdtable.h>
#include <linux/ratelimit.h>
#include "overlayfs.h"
#define OVL_COPY_UP_CHUNK_SIZE (1 << 20)
static bool __read_mostly ovl_check_copy_up;
module_param_named(check_copy_up, ovl_check_copy_up, bool,
S_IWUSR | S_IRUGO);
MODULE_PARM_DESC(ovl_check_copy_up,
"Warn on copy-up when causing process also has a R/O fd open");
static int ovl_check_fd(const void *data, struct file *f, unsigned int fd)
{
const struct dentry *dentry = data;
if (f->f_inode == d_inode(dentry))
pr_warn_ratelimited("overlayfs: Warning: Copying up %pD, but open R/O on fd %u which will cease to be coherent [pid=%d %s]\n",
f, fd, current->pid, current->comm);
return 0;
}
/*
* Check the fds open by this process and warn if something like the following
* scenario is about to occur:
*
* fd1 = open("foo", O_RDONLY);
* fd2 = open("foo", O_RDWR);
*/
static void ovl_do_check_copy_up(struct dentry *dentry)
{
if (ovl_check_copy_up)
iterate_fd(current->files, 0, ovl_check_fd, dentry);
}
int ovl_copy_xattr(struct dentry *old, struct dentry *new, unsigned opt)
{
ssize_t list_size, size, value_size = 0;
char *buf, *name, *value = NULL;
int uninitialized_var(error);
if (!old->d_inode->i_op->getxattr ||
!new->d_inode->i_op->getxattr)
return 0;
list_size = vfs_listxattr(old, NULL, 0);
if (list_size <= 0) {
if (list_size == -EOPNOTSUPP)
return 0;
return list_size;
}
buf = kzalloc(list_size, GFP_KERNEL);
if (!buf)
return -ENOMEM;
list_size = vfs_listxattr(old, buf, list_size);
if (list_size <= 0) {
error = list_size;
goto out;
}
for (name = buf; name < (buf + list_size); name += strlen(name) + 1) {
retry:
size = vfs_getxattr(old, name, value, value_size);
if (size == -ERANGE)
size = vfs_getxattr(old, name, NULL, 0);
if (size < 0) {
if (OVL_OPT_NOFSCHECK(opt)) {
OVL_DEBUG("fail: old=%pd4, i_ino=%lu, name=%s\n",
old, old->d_inode->i_ino, name);
continue;
} else {
error = size;
break;
}
}
OVL_DEBUG("success: old=%pd4, i_ino=%lu, name=%s\n",
old, old->d_inode->i_ino, name);
if (size > value_size) {
void *new;
new = krealloc(value, size, GFP_KERNEL);
if (!new) {
error = -ENOMEM;
break;
}
value = new;
value_size = size;
goto retry;
}
error = vfs_setxattr(new, name, value, size, 0);
if (error)
break;
}
kfree(value);
out:
kfree(buf);
return error;
}
static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
{
struct file *old_file;
struct file *new_file;
loff_t old_pos = 0;
loff_t new_pos = 0;
int error = 0;
if (len == 0)
return 0;
old_file = ovl_path_open(old, O_LARGEFILE | O_RDONLY);
if (IS_ERR(old_file))
return PTR_ERR(old_file);
new_file = ovl_path_open(new, O_LARGEFILE | O_WRONLY);
if (IS_ERR(new_file)) {
error = PTR_ERR(new_file);
goto out_fput;
}
/* FIXME: copy up sparse files efficiently */
while (len) {
size_t this_len = OVL_COPY_UP_CHUNK_SIZE;
long bytes;
if (len < this_len)
this_len = len;
if (signal_pending_state(TASK_KILLABLE, current)) {
error = -EINTR;
break;
}
bytes = do_splice_direct(old_file, &old_pos,
new_file, &new_pos,
this_len, SPLICE_F_MOVE);
if (bytes <= 0) {
error = bytes;
break;
}
WARN_ON(old_pos != new_pos);
len -= bytes;
}
fput(new_file);
out_fput:
fput(old_file);
return error;
}
static char *ovl_read_symlink(struct dentry *realdentry)
{
int res;
char *buf;
struct inode *inode = realdentry->d_inode;
mm_segment_t old_fs;
res = -EINVAL;
if (!inode->i_op->readlink)
goto err;
res = -ENOMEM;
buf = (char *) __get_free_page(GFP_KERNEL);
if (!buf)
goto err;
old_fs = get_fs();
set_fs(get_ds());
/* The cast to a user pointer is valid due to the set_fs() */
res = inode->i_op->readlink(realdentry,
(char __user *)buf, PAGE_SIZE - 1);
set_fs(old_fs);
if (res < 0) {
free_page((unsigned long) buf);
goto err;
}
buf[res] = '\0';
return buf;
err:
return ERR_PTR(res);
}
static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
{
struct iattr attr = {
.ia_valid =
ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET,
.ia_atime = stat->atime,
.ia_mtime = stat->mtime,
};
return notify_change(upperdentry, &attr, NULL);
}
int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
{
int err = 0;
if (!S_ISLNK(stat->mode)) {
struct iattr attr = {
.ia_valid = ATTR_MODE,
.ia_mode = stat->mode,
};
err = notify_change(upperdentry, &attr, NULL);
}
if (!err) {
struct iattr attr = {
.ia_valid = ATTR_UID | ATTR_GID,
.ia_uid = stat->uid,
.ia_gid = stat->gid,
};
err = notify_change(upperdentry, &attr, NULL);
}
if (!err)
ovl_set_timestamps(upperdentry, stat);
return err;
}
static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
struct dentry *dentry, struct path *lowerpath,
struct kstat *stat, const char *link)
{
struct inode *wdir = workdir->d_inode;
struct inode *udir = upperdir->d_inode;
struct dentry *newdentry = NULL;
struct dentry *upper = NULL;
umode_t mode = stat->mode;
unsigned opt = ovl_get_config_opt(dentry);
int err;
newdentry = ovl_lookup_temp(workdir, dentry);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out;
upper = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto out1;
/* Can't properly set mode on creation because of the umask */
stat->mode &= S_IFMT;
err = ovl_create_real(wdir, newdentry, stat, link, NULL, true);
stat->mode = mode;
if (err)
goto out2;
if (S_ISREG(stat->mode)) {
struct path upperpath;
ovl_path_upper(dentry, &upperpath);
BUG_ON(upperpath.dentry != NULL);
upperpath.dentry = newdentry;
err = ovl_copy_up_data(lowerpath, &upperpath, stat->size);
if (err)
goto out_cleanup;
}
err = ovl_copy_xattr(lowerpath->dentry, newdentry, opt);
if (err)
goto out_cleanup;
inode_lock(newdentry->d_inode);
err = ovl_set_attr(newdentry, stat);
inode_unlock(newdentry->d_inode);
if (err)
goto out_cleanup;
err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
if (err)
goto out_cleanup;
ovl_dentry_update(dentry, newdentry);
newdentry = NULL;
/*
* Non-directores become opaque when copied up.
*/
if (!S_ISDIR(stat->mode))
ovl_dentry_set_opaque(dentry, true);
out2:
dput(upper);
out1:
dput(newdentry);
out:
return err;
out_cleanup:
ovl_cleanup(wdir, newdentry);
goto out2;
}
/*
* Copy up a single dentry
*
* Directory renames only allowed on "pure upper" (already created on
* upper filesystem, never copied up). Directories which are on lower or
* are merged may not be renamed. For these -EXDEV is returned and
* userspace has to deal with it. This means, when copying up a
* directory we can rely on it and ancestors being stable.
*
* Non-directory renames start with copy up of source if necessary. The
* actual rename will only proceed once the copy up was successful. Copy
* up uses upper parent i_mutex for exclusion. Since rename can change
* d_parent it is possible that the copy up will lock the old parent. At
* that point the file will have already been copied up anyway.
*/
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
struct path *lowerpath, struct kstat *stat)
{
struct dentry *workdir = ovl_workdir(dentry);
int err;
struct kstat pstat;
struct path parentpath;
struct dentry *upperdir;
struct dentry *upperdentry;
const struct cred *old_cred;
struct cred *override_cred;
char *link = NULL;
if (WARN_ON(!workdir))
return -EROFS;
ovl_do_check_copy_up(lowerpath->dentry);
ovl_path_upper(parent, &parentpath);
upperdir = parentpath.dentry;
err = vfs_getattr(&parentpath, &pstat);
if (err)
return err;
if (S_ISLNK(stat->mode)) {
link = ovl_read_symlink(lowerpath->dentry);
if (IS_ERR(link))
return PTR_ERR(link);
}
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_free_link;
override_cred->fsuid = stat->uid;
override_cred->fsgid = stat->gid;
/*
* CAP_SYS_ADMIN for copying up extended attributes
* CAP_DAC_OVERRIDE for create
* CAP_FOWNER for chmod, timestamp update
* CAP_FSETID for chmod
* CAP_CHOWN for chown
* CAP_MKNOD for mknod
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
cap_raise(override_cred->cap_effective, CAP_FSETID);
cap_raise(override_cred->cap_effective, CAP_CHOWN);
cap_raise(override_cred->cap_effective, CAP_MKNOD);
old_cred = override_creds(override_cred);
err = -EIO;
if (lock_rename(workdir, upperdir) != NULL) {
pr_err("overlayfs: failed to lock workdir+upperdir\n");
goto out_unlock;
}
upperdentry = ovl_dentry_upper(dentry);
if (upperdentry) {
/* Raced with another copy-up? Nothing to do, then... */
err = 0;
goto out_unlock;
}
err = ovl_copy_up_locked(workdir, upperdir, dentry, lowerpath,
stat, link);
if (!err) {
/* Restore timestamps on parent (best effort) */
ovl_set_timestamps(upperdir, &pstat);
}
out_unlock:
unlock_rename(workdir, upperdir);
revert_creds(old_cred);
put_cred(override_cred);
out_free_link:
if (link)
free_page((unsigned long) link);
return err;
}
int ovl_copy_up(struct dentry *dentry)
{
int err;
err = 0;
while (!err) {
struct dentry *next;
struct dentry *parent;
struct path lowerpath;
struct kstat stat;
enum ovl_path_type type = ovl_path_type(dentry);
if (OVL_TYPE_UPPER(type))
break;
next = dget(dentry);
/* find the topmost dentry not yet copied up */
for (;;) {
parent = dget_parent(next);
type = ovl_path_type(parent);
if (OVL_TYPE_UPPER(type))
break;
dput(next);
next = parent;
}
ovl_path_lower(next, &lowerpath);
err = vfs_getattr(&lowerpath, &stat);
if (!err)
err = ovl_copy_up_one(parent, next, &lowerpath, &stat);
dput(parent);
dput(next);
}
return err;
}

View File

@ -0,0 +1,969 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/xattr.h>
#include <linux/security.h>
#include <linux/cred.h>
#include "overlayfs.h"
void ovl_cleanup(struct inode *wdir, struct dentry *wdentry)
{
int err;
dget(wdentry);
if (d_is_dir(wdentry))
err = ovl_do_rmdir(wdir, wdentry);
else
err = ovl_do_unlink(wdir, wdentry);
dput(wdentry);
if (err) {
pr_err("overlayfs: cleanup of '%pd2' failed (%i)\n",
wdentry, err);
}
}
struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry)
{
struct dentry *temp;
char name[20];
snprintf(name, sizeof(name), "#%lx", (unsigned long) dentry);
temp = lookup_one_len(name, workdir, strlen(name));
if (!IS_ERR(temp) && temp->d_inode) {
pr_err("overlayfs: workdir/%s already exists\n", name);
dput(temp);
temp = ERR_PTR(-EIO);
}
return temp;
}
/* caller holds i_mutex on workdir */
static struct dentry *ovl_whiteout(struct dentry *workdir,
struct dentry *dentry)
{
int err;
struct dentry *whiteout;
struct inode *wdir = workdir->d_inode;
whiteout = ovl_lookup_temp(workdir, dentry);
if (IS_ERR(whiteout))
return whiteout;
err = ovl_do_whiteout(wdir, whiteout);
if (err) {
dput(whiteout);
whiteout = ERR_PTR(err);
}
return whiteout;
}
int ovl_create_real(struct inode *dir, struct dentry *newdentry,
struct kstat *stat, const char *link,
struct dentry *hardlink, bool debug)
{
int err;
if (newdentry->d_inode)
return -ESTALE;
if (hardlink) {
err = ovl_do_link(hardlink, dir, newdentry, debug);
} else {
switch (stat->mode & S_IFMT) {
case S_IFREG:
err = ovl_do_create(dir, newdentry, stat->mode, debug);
break;
case S_IFDIR:
err = ovl_do_mkdir(dir, newdentry, stat->mode, debug);
break;
case S_IFCHR:
case S_IFBLK:
case S_IFIFO:
case S_IFSOCK:
err = ovl_do_mknod(dir, newdentry,
stat->mode, stat->rdev, debug);
break;
case S_IFLNK:
err = ovl_do_symlink(dir, newdentry, link, debug);
break;
default:
err = -EPERM;
}
}
if (!err && WARN_ON(!newdentry->d_inode)) {
/*
* Not quite sure if non-instantiated dentry is legal or not.
* VFS doesn't seem to care so check and warn here.
*/
err = -ENOENT;
}
return err;
}
static int ovl_set_opaque(struct dentry *upperdentry)
{
return ovl_do_setxattr(upperdentry, OVL_XATTR_OPAQUE, "y", 1, 0);
}
static void ovl_remove_opaque(struct dentry *upperdentry)
{
int err;
err = ovl_do_removexattr(upperdentry, OVL_XATTR_OPAQUE);
if (err) {
pr_warn("overlayfs: failed to remove opaque from '%s' (%i)\n",
upperdentry->d_name.name, err);
}
}
static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
int err;
enum ovl_path_type type;
struct path realpath;
type = ovl_path_real(dentry, &realpath);
err = vfs_getattr(&realpath, stat);
if (err)
return err;
stat->dev = dentry->d_sb->s_dev;
stat->ino = dentry->d_inode->i_ino;
/*
* It's probably not worth it to count subdirs to get the
* correct link count. nlink=1 seems to pacify 'find' and
* other utilities.
*/
if (OVL_TYPE_MERGE(type))
stat->nlink = 1;
return 0;
}
static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
struct kstat *stat, const char *link,
struct dentry *hardlink)
{
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *udir = upperdir->d_inode;
struct dentry *newdentry;
int err;
inode_lock_nested(udir, I_MUTEX_PARENT);
newdentry = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out_unlock;
err = ovl_create_real(udir, newdentry, stat, link, hardlink, false);
if (err)
goto out_dput;
ovl_dentry_version_inc(dentry->d_parent);
ovl_dentry_update(dentry, newdentry);
ovl_copyattr(newdentry->d_inode, inode);
d_instantiate(dentry, inode);
newdentry = NULL;
out_dput:
dput(newdentry);
out_unlock:
inode_unlock(udir);
return err;
}
static int ovl_lock_rename_workdir(struct dentry *workdir,
struct dentry *upperdir)
{
/* Workdir should not be the same as upperdir */
if (workdir == upperdir)
goto err;
/* Workdir should not be subdir of upperdir and vice versa */
if (lock_rename(workdir, upperdir) != NULL)
goto err_unlock;
return 0;
err_unlock:
unlock_rename(workdir, upperdir);
err:
pr_err("overlayfs: failed to lock workdir+upperdir\n");
return -EIO;
}
static struct dentry *ovl_clear_empty(struct dentry *dentry,
struct list_head *list)
{
struct dentry *workdir = ovl_workdir(dentry);
struct inode *wdir = workdir->d_inode;
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *udir = upperdir->d_inode;
struct path upperpath;
struct dentry *upper;
struct dentry *opaquedir;
struct kstat stat;
unsigned opt = ovl_get_config_opt(dentry);
int err;
if (WARN_ON(!workdir))
return ERR_PTR(-EROFS);
err = ovl_lock_rename_workdir(workdir, upperdir);
if (err)
goto out;
ovl_path_upper(dentry, &upperpath);
err = vfs_getattr(&upperpath, &stat);
if (err)
goto out_unlock;
err = -ESTALE;
if (!S_ISDIR(stat.mode))
goto out_unlock;
upper = upperpath.dentry;
if (upper->d_parent->d_inode != udir)
goto out_unlock;
opaquedir = ovl_lookup_temp(workdir, dentry);
err = PTR_ERR(opaquedir);
if (IS_ERR(opaquedir))
goto out_unlock;
err = ovl_create_real(wdir, opaquedir, &stat, NULL, NULL, true);
if (err)
goto out_dput;
err = ovl_copy_xattr(upper, opaquedir, opt);
if (err)
goto out_cleanup;
err = ovl_set_opaque(opaquedir);
if (err)
goto out_cleanup;
inode_lock(opaquedir->d_inode);
err = ovl_set_attr(opaquedir, &stat);
inode_unlock(opaquedir->d_inode);
if (err)
goto out_cleanup;
err = ovl_do_rename(wdir, opaquedir, udir, upper, RENAME_EXCHANGE);
if (err)
goto out_cleanup;
ovl_cleanup_whiteouts(upper, list);
ovl_cleanup(wdir, upper);
unlock_rename(workdir, upperdir);
/* dentry's upper doesn't match now, get rid of it */
d_drop(dentry);
return opaquedir;
out_cleanup:
ovl_cleanup(wdir, opaquedir);
out_dput:
dput(opaquedir);
out_unlock:
unlock_rename(workdir, upperdir);
out:
return ERR_PTR(err);
}
static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry)
{
int err;
struct dentry *ret = NULL;
LIST_HEAD(list);
err = ovl_check_empty_dir(dentry, &list);
if (err)
ret = ERR_PTR(err);
else {
/*
* If no upperdentry then skip clearing whiteouts.
*
* Can race with copy-up, since we don't hold the upperdir
* mutex. Doesn't matter, since copy-up can't create a
* non-empty directory from an empty one.
*/
if (ovl_dentry_upper(dentry))
ret = ovl_clear_empty(dentry, &list);
}
ovl_cache_free(&list);
return ret;
}
static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
struct kstat *stat, const char *link,
struct dentry *hardlink)
{
struct dentry *workdir = ovl_workdir(dentry);
struct inode *wdir = workdir->d_inode;
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *udir = upperdir->d_inode;
struct dentry *upper;
struct dentry *newdentry;
int err;
if (WARN_ON(!workdir))
return -EROFS;
err = ovl_lock_rename_workdir(workdir, upperdir);
if (err)
goto out;
newdentry = ovl_lookup_temp(workdir, dentry);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out_unlock;
upper = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto out_dput;
err = ovl_create_real(wdir, newdentry, stat, link, hardlink, true);
if (err)
goto out_dput2;
if (S_ISDIR(stat->mode)) {
err = ovl_set_opaque(newdentry);
if (err)
goto out_cleanup;
err = ovl_do_rename(wdir, newdentry, udir, upper,
RENAME_EXCHANGE);
if (err)
goto out_cleanup;
ovl_cleanup(wdir, upper);
} else {
err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
if (err)
goto out_cleanup;
}
ovl_dentry_version_inc(dentry->d_parent);
ovl_dentry_update(dentry, newdentry);
ovl_copyattr(newdentry->d_inode, inode);
d_instantiate(dentry, inode);
newdentry = NULL;
out_dput2:
dput(upper);
out_dput:
dput(newdentry);
out_unlock:
unlock_rename(workdir, upperdir);
out:
return err;
out_cleanup:
ovl_cleanup(wdir, newdentry);
goto out_dput2;
}
static int ovl_create_or_link(struct dentry *dentry, int mode, dev_t rdev,
const char *link, struct dentry *hardlink)
{
int err;
struct inode *inode;
struct kstat stat = {
.mode = mode,
.rdev = rdev,
};
err = -ENOMEM;
inode = ovl_new_inode(dentry->d_sb, mode, dentry->d_fsdata);
if (!inode)
goto out;
err = ovl_copy_up(dentry->d_parent);
if (err)
goto out_iput;
if (!ovl_dentry_is_opaque(dentry)) {
err = ovl_create_upper(dentry, inode, &stat, link, hardlink);
} else {
const struct cred *old_cred;
struct cred *override_cred;
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_iput;
/*
* CAP_SYS_ADMIN for setting opaque xattr
* CAP_DAC_OVERRIDE for create in workdir, rename
* CAP_FOWNER for removing whiteout from sticky dir
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
old_cred = override_creds(override_cred);
err = ovl_create_over_whiteout(dentry, inode, &stat, link,
hardlink);
revert_creds(old_cred);
put_cred(override_cred);
}
if (!err)
inode = NULL;
out_iput:
iput(inode);
out:
return err;
}
static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
const char *link)
{
int err;
err = ovl_want_write(dentry);
if (!err) {
err = ovl_create_or_link(dentry, mode, rdev, link, NULL);
ovl_drop_write(dentry);
}
return err;
}
static int ovl_create(struct inode *dir, struct dentry *dentry, umode_t mode,
bool excl)
{
return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL);
}
static int ovl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL);
}
static int ovl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
dev_t rdev)
{
/* Don't allow creation of "whiteout" on overlay */
if (S_ISCHR(mode) && rdev == WHITEOUT_DEV)
return -EPERM;
return ovl_create_object(dentry, mode, rdev, NULL);
}
static int ovl_symlink(struct inode *dir, struct dentry *dentry,
const char *link)
{
return ovl_create_object(dentry, S_IFLNK, 0, link);
}
static int ovl_link(struct dentry *old, struct inode *newdir,
struct dentry *new)
{
int err;
struct dentry *upper;
err = ovl_want_write(old);
if (err)
goto out;
err = ovl_copy_up(old);
if (err)
goto out_drop_write;
upper = ovl_dentry_upper(old);
err = ovl_create_or_link(new, upper->d_inode->i_mode, 0, NULL, upper);
out_drop_write:
ovl_drop_write(old);
out:
return err;
}
static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir)
{
struct dentry *workdir = ovl_workdir(dentry);
struct inode *wdir = workdir->d_inode;
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *udir = upperdir->d_inode;
struct dentry *whiteout;
struct dentry *upper;
struct dentry *opaquedir = NULL;
int err;
int flags = 0;
if (WARN_ON(!workdir))
return -EROFS;
if (is_dir) {
if (OVL_TYPE_MERGE_OR_LOWER(ovl_path_type(dentry))) {
opaquedir = ovl_check_empty_and_clear(dentry);
err = PTR_ERR(opaquedir);
if (IS_ERR(opaquedir))
goto out;
} else {
LIST_HEAD(list);
/*
* When removing an empty opaque directory, then it
* makes no sense to replace it with an exact replica of
* itself. But emptiness still needs to be checked.
*/
err = ovl_check_empty_dir(dentry, &list);
ovl_cache_free(&list);
if (err)
goto out;
}
}
err = ovl_lock_rename_workdir(workdir, upperdir);
if (err)
goto out_dput;
upper = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto out_unlock;
err = -ESTALE;
if ((opaquedir && upper != opaquedir) ||
(!opaquedir && ovl_dentry_upper(dentry) &&
upper != ovl_dentry_upper(dentry))) {
goto out_dput_upper;
}
whiteout = ovl_whiteout(workdir, dentry);
err = PTR_ERR(whiteout);
if (IS_ERR(whiteout))
goto out_dput_upper;
if (d_is_dir(upper))
flags = RENAME_EXCHANGE;
err = ovl_do_rename(wdir, whiteout, udir, upper, flags);
if (err)
goto kill_whiteout;
if (flags)
ovl_cleanup(wdir, upper);
ovl_dentry_version_inc(dentry->d_parent);
out_d_drop:
d_drop(dentry);
dput(whiteout);
out_dput_upper:
dput(upper);
out_unlock:
unlock_rename(workdir, upperdir);
out_dput:
dput(opaquedir);
out:
return err;
kill_whiteout:
ovl_cleanup(wdir, whiteout);
goto out_d_drop;
}
static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
{
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *dir = upperdir->d_inode;
struct dentry *upper;
int err;
inode_lock_nested(dir, I_MUTEX_PARENT);
upper = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto out_unlock;
err = -ESTALE;
if (upper == ovl_dentry_upper(dentry)) {
if (is_dir)
err = vfs_rmdir(dir, upper);
else
err = vfs_unlink(dir, upper, NULL);
ovl_dentry_version_inc(dentry->d_parent);
}
dput(upper);
/*
* Keeping this dentry hashed would mean having to release
* upperpath/lowerpath, which could only be done if we are the
* sole user of this dentry. Too tricky... Just unhash for
* now.
*/
if (!err)
d_drop(dentry);
out_unlock:
inode_unlock(dir);
return err;
}
static inline int ovl_check_sticky(struct dentry *dentry)
{
struct inode *dir = ovl_dentry_real(dentry->d_parent)->d_inode;
struct inode *inode = ovl_dentry_real(dentry)->d_inode;
if (check_sticky(dir, inode))
return -EPERM;
return 0;
}
static int ovl_do_remove(struct dentry *dentry, bool is_dir)
{
enum ovl_path_type type;
int err;
err = ovl_check_sticky(dentry);
if (err)
goto out;
err = ovl_want_write(dentry);
if (err)
goto out;
err = ovl_copy_up(dentry->d_parent);
if (err)
goto out_drop_write;
type = ovl_path_type(dentry);
if (OVL_TYPE_PURE_UPPER(type)) {
err = ovl_remove_upper(dentry, is_dir);
} else {
const struct cred *old_cred;
struct cred *override_cred;
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_drop_write;
/*
* CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
* CAP_DAC_OVERRIDE for create in workdir, rename
* CAP_FOWNER for removing whiteout from sticky dir
* CAP_FSETID for chmod of opaque dir
* CAP_CHOWN for chown of opaque dir
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
cap_raise(override_cred->cap_effective, CAP_FSETID);
cap_raise(override_cred->cap_effective, CAP_CHOWN);
old_cred = override_creds(override_cred);
err = ovl_remove_and_whiteout(dentry, is_dir);
revert_creds(old_cred);
put_cred(override_cred);
}
out_drop_write:
ovl_drop_write(dentry);
out:
return err;
}
static int ovl_unlink(struct inode *dir, struct dentry *dentry)
{
return ovl_do_remove(dentry, false);
}
static int ovl_rmdir(struct inode *dir, struct dentry *dentry)
{
return ovl_do_remove(dentry, true);
}
static int ovl_rename2(struct inode *olddir, struct dentry *old,
struct inode *newdir, struct dentry *new,
unsigned int flags)
{
int err;
enum ovl_path_type old_type;
enum ovl_path_type new_type;
struct dentry *old_upperdir;
struct dentry *new_upperdir;
struct dentry *olddentry;
struct dentry *newdentry;
struct dentry *trap;
bool old_opaque;
bool new_opaque;
bool cleanup_whiteout = false;
bool overwrite = !(flags & RENAME_EXCHANGE);
bool is_dir = d_is_dir(old);
bool new_is_dir = false;
struct dentry *opaquedir = NULL;
const struct cred *old_cred = NULL;
struct cred *override_cred = NULL;
err = -EINVAL;
if (flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE))
goto out;
flags &= ~RENAME_NOREPLACE;
err = ovl_check_sticky(old);
if (err)
goto out;
/* Don't copy up directory trees */
old_type = ovl_path_type(old);
err = -EXDEV;
if (OVL_TYPE_MERGE_OR_LOWER(old_type) && is_dir)
goto out;
if (new->d_inode) {
err = ovl_check_sticky(new);
if (err)
goto out;
if (d_is_dir(new))
new_is_dir = true;
new_type = ovl_path_type(new);
err = -EXDEV;
if (!overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir)
goto out;
err = 0;
if (!OVL_TYPE_UPPER(new_type) && !OVL_TYPE_UPPER(old_type)) {
if (ovl_dentry_lower(old)->d_inode ==
ovl_dentry_lower(new)->d_inode)
goto out;
}
if (OVL_TYPE_UPPER(new_type) && OVL_TYPE_UPPER(old_type)) {
if (ovl_dentry_upper(old)->d_inode ==
ovl_dentry_upper(new)->d_inode)
goto out;
}
} else {
if (ovl_dentry_is_opaque(new))
new_type = __OVL_PATH_UPPER;
else
new_type = __OVL_PATH_UPPER | __OVL_PATH_PURE;
}
err = ovl_want_write(old);
if (err)
goto out;
err = ovl_copy_up(old);
if (err)
goto out_drop_write;
err = ovl_copy_up(new->d_parent);
if (err)
goto out_drop_write;
if (!overwrite) {
err = ovl_copy_up(new);
if (err)
goto out_drop_write;
}
old_opaque = !OVL_TYPE_PURE_UPPER(old_type);
new_opaque = !OVL_TYPE_PURE_UPPER(new_type);
if (old_opaque || new_opaque) {
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_drop_write;
/*
* CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
* CAP_DAC_OVERRIDE for create in workdir
* CAP_FOWNER for removing whiteout from sticky dir
* CAP_FSETID for chmod of opaque dir
* CAP_CHOWN for chown of opaque dir
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
cap_raise(override_cred->cap_effective, CAP_FSETID);
cap_raise(override_cred->cap_effective, CAP_CHOWN);
old_cred = override_creds(override_cred);
}
if (overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir) {
opaquedir = ovl_check_empty_and_clear(new);
err = PTR_ERR(opaquedir);
if (IS_ERR(opaquedir)) {
opaquedir = NULL;
goto out_revert_creds;
}
}
if (overwrite) {
if (old_opaque) {
if (new->d_inode || !new_opaque) {
/* Whiteout source */
flags |= RENAME_WHITEOUT;
} else {
/* Switch whiteouts */
flags |= RENAME_EXCHANGE;
}
} else if (is_dir && !new->d_inode && new_opaque) {
flags |= RENAME_EXCHANGE;
cleanup_whiteout = true;
}
}
old_upperdir = ovl_dentry_upper(old->d_parent);
new_upperdir = ovl_dentry_upper(new->d_parent);
trap = lock_rename(new_upperdir, old_upperdir);
olddentry = lookup_one_len(old->d_name.name, old_upperdir,
old->d_name.len);
err = PTR_ERR(olddentry);
if (IS_ERR(olddentry))
goto out_unlock;
err = -ESTALE;
if (olddentry != ovl_dentry_upper(old))
goto out_dput_old;
newdentry = lookup_one_len(new->d_name.name, new_upperdir,
new->d_name.len);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out_dput_old;
err = -ESTALE;
if (ovl_dentry_upper(new)) {
if (opaquedir) {
if (newdentry != opaquedir)
goto out_dput;
} else {
if (newdentry != ovl_dentry_upper(new))
goto out_dput;
}
} else {
if (!d_is_negative(newdentry) &&
(!new_opaque || !ovl_is_whiteout(newdentry)))
goto out_dput;
}
if (olddentry == trap)
goto out_dput;
if (newdentry == trap)
goto out_dput;
if (is_dir && !old_opaque && new_opaque) {
err = ovl_set_opaque(olddentry);
if (err)
goto out_dput;
}
if (!overwrite && new_is_dir && old_opaque && !new_opaque) {
err = ovl_set_opaque(newdentry);
if (err)
goto out_dput;
}
if (old_opaque || new_opaque) {
err = ovl_do_rename(old_upperdir->d_inode, olddentry,
new_upperdir->d_inode, newdentry,
flags);
} else {
/* No debug for the plain case */
BUG_ON(flags & ~RENAME_EXCHANGE);
err = vfs_rename(old_upperdir->d_inode, olddentry,
new_upperdir->d_inode, newdentry,
NULL, flags);
}
if (err) {
if (is_dir && !old_opaque && new_opaque)
ovl_remove_opaque(olddentry);
if (!overwrite && new_is_dir && old_opaque && !new_opaque)
ovl_remove_opaque(newdentry);
goto out_dput;
}
if (is_dir && old_opaque && !new_opaque)
ovl_remove_opaque(olddentry);
if (!overwrite && new_is_dir && !old_opaque && new_opaque)
ovl_remove_opaque(newdentry);
/*
* Old dentry now lives in different location. Dentries in
* lowerstack are stale. We cannot drop them here because
* access to them is lockless. This could be only pure upper
* or opaque directory - numlower is zero. Or upper non-dir
* entry - its pureness is tracked by flag opaque.
*/
if (old_opaque != new_opaque) {
ovl_dentry_set_opaque(old, new_opaque);
if (!overwrite)
ovl_dentry_set_opaque(new, old_opaque);
}
if (cleanup_whiteout)
ovl_cleanup(old_upperdir->d_inode, newdentry);
ovl_dentry_version_inc(old->d_parent);
ovl_dentry_version_inc(new->d_parent);
out_dput:
dput(newdentry);
out_dput_old:
dput(olddentry);
out_unlock:
unlock_rename(new_upperdir, old_upperdir);
out_revert_creds:
if (old_opaque || new_opaque) {
revert_creds(old_cred);
put_cred(override_cred);
}
out_drop_write:
ovl_drop_write(old);
out:
dput(opaquedir);
return err;
}
const struct inode_operations ovl_dir_inode_operations = {
.lookup = ovl_lookup,
.mkdir = ovl_mkdir,
.symlink = ovl_symlink,
.unlink = ovl_unlink,
.rmdir = ovl_rmdir,
.rename2 = ovl_rename2,
.link = ovl_link,
.setattr = ovl_setattr,
.create = ovl_create,
.mknod = ovl_mknod,
.permission = ovl_permission,
.getattr = ovl_dir_getattr,
.setxattr = ovl_setxattr,
.getxattr = ovl_getxattr,
.listxattr = ovl_listxattr,
.removexattr = ovl_removexattr,
};

View File

@ -0,0 +1,494 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/xattr.h>
#include "overlayfs.h"
static int ovl_copy_up_truncate(struct dentry *dentry)
{
int err;
struct dentry *parent;
struct kstat stat;
struct path lowerpath;
parent = dget_parent(dentry);
err = ovl_copy_up(parent);
if (err)
goto out_dput_parent;
ovl_path_lower(dentry, &lowerpath);
err = vfs_getattr(&lowerpath, &stat);
if (err)
goto out_dput_parent;
stat.size = 0;
err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat);
out_dput_parent:
dput(parent);
return err;
}
int ovl_setattr(struct dentry *dentry, struct iattr *attr)
{
int err;
struct dentry *upperdentry;
unsigned opt = ovl_get_config_opt(dentry);
if (OVL_OPT_NOCOPYUPW(opt)) {
return 0;
}
/*
* Check for permissions before trying to copy-up. This is redundant
* since it will be rechecked later by ->setattr() on upper dentry. But
* without this, copy-up can be triggered by just about anybody.
*
* We don't initialize inode->size, which just means that
* inode_newsize_ok() will always check against MAX_LFS_FILESIZE and not
* check for a swapfile (which this won't be anyway).
*/
err = inode_change_ok(dentry->d_inode, attr);
if (err)
return err;
err = ovl_want_write(dentry);
if (err)
goto out;
if (attr->ia_valid & ATTR_SIZE) {
struct inode *realinode = d_inode(ovl_dentry_real(dentry));
err = -ETXTBSY;
if (atomic_read(&realinode->i_writecount) < 0)
goto out_drop_write;
}
err = ovl_copy_up(dentry);
if (!err) {
struct inode *winode = NULL;
upperdentry = ovl_dentry_upper(dentry);
if (attr->ia_valid & ATTR_SIZE) {
winode = d_inode(upperdentry);
err = get_write_access(winode);
if (err)
goto out_drop_write;
}
if (attr->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID))
attr->ia_valid &= ~ATTR_MODE;
inode_lock(upperdentry->d_inode);
err = notify_change(upperdentry, attr, NULL);
if (!err)
ovl_copyattr(upperdentry->d_inode, dentry->d_inode);
inode_unlock(upperdentry->d_inode);
if (winode)
put_write_access(winode);
}
out_drop_write:
ovl_drop_write(dentry);
out:
return err;
}
static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
struct path realpath;
ovl_path_real(dentry, &realpath);
return vfs_getattr(&realpath, stat);
}
int ovl_permission(struct inode *inode, int mask)
{
struct ovl_entry *oe;
struct dentry *alias = NULL;
struct inode *realinode;
struct dentry *realdentry;
bool is_upper;
int err;
if (S_ISDIR(inode->i_mode)) {
oe = inode->i_private;
} else if (mask & MAY_NOT_BLOCK) {
return -ECHILD;
} else {
/*
* For non-directories find an alias and get the info
* from there.
*/
alias = d_find_any_alias(inode);
if (WARN_ON(!alias))
return -ENOENT;
oe = alias->d_fsdata;
ovl_reset_ovl_entry(&oe, alias);
}
realdentry = ovl_entry_real(oe, &is_upper);
if (ovl_is_default_permissions(inode)) {
struct kstat stat;
struct path realpath = { .dentry = realdentry };
if (mask & MAY_NOT_BLOCK)
return -ECHILD;
realpath.mnt = ovl_entry_mnt_real(oe, inode, is_upper);
err = vfs_getattr(&realpath, &stat);
if (err)
goto out_dput;
err = -ESTALE;
if ((stat.mode ^ inode->i_mode) & S_IFMT)
goto out_dput;
inode->i_mode = stat.mode;
inode->i_uid = stat.uid;
inode->i_gid = stat.gid;
err = generic_permission(inode, mask);
goto out_dput;
}
/* Careful in RCU walk mode */
realinode = ACCESS_ONCE(realdentry->d_inode);
if (!realinode) {
WARN_ON(!(mask & MAY_NOT_BLOCK));
err = -ENOENT;
goto out_dput;
}
if (mask & MAY_WRITE) {
umode_t mode = realinode->i_mode;
/*
* Writes will always be redirected to upper layer, so
* ignore lower layer being read-only.
*
* If the overlay itself is read-only then proceed
* with the permission check, don't return EROFS.
* This will only happen if this is the lower layer of
* another overlayfs.
*
* If upper fs becomes read-only after the overlay was
* constructed return EROFS to prevent modification of
* upper layer.
*/
err = -EROFS;
if (is_upper && !IS_RDONLY(inode) && IS_RDONLY(realinode) &&
(S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
goto out_dput;
}
err = __inode_permission(realinode, mask);
out_dput:
dput(alias);
return err;
}
static const char *ovl_get_link(struct dentry *dentry,
struct inode *inode,
struct delayed_call *done)
{
struct dentry *realdentry;
struct inode *realinode;
if (!dentry)
return ERR_PTR(-ECHILD);
realdentry = ovl_dentry_real(dentry);
realinode = realdentry->d_inode;
if (WARN_ON(!realinode->i_op->get_link))
return ERR_PTR(-EPERM);
return realinode->i_op->get_link(realdentry, realinode, done);
}
static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
{
struct path realpath;
struct inode *realinode;
ovl_path_real(dentry, &realpath);
realinode = realpath.dentry->d_inode;
if (!realinode->i_op->readlink)
return -EINVAL;
touch_atime(&realpath);
return realinode->i_op->readlink(realpath.dentry, buf, bufsiz);
}
static bool ovl_is_private_xattr(const char *name)
{
return strncmp(name, OVL_XATTR_PRE_NAME, OVL_XATTR_PRE_LEN) == 0;
}
int ovl_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags)
{
int err;
struct dentry *upperdentry;
unsigned opt = ovl_get_config_opt(dentry);
if (OVL_OPT_NOCOPYUPW(opt)) {
return 0;
}
err = ovl_want_write(dentry);
if (err)
goto out;
err = -EPERM;
if (ovl_is_private_xattr(name))
goto out_drop_write;
err = ovl_copy_up(dentry);
if (err)
goto out_drop_write;
upperdentry = ovl_dentry_upper(dentry);
err = vfs_setxattr(upperdentry, name, value, size, flags);
out_drop_write:
ovl_drop_write(dentry);
out:
return err;
}
static bool ovl_need_xattr_filter(struct dentry *dentry,
enum ovl_path_type type)
{
if ((type & (__OVL_PATH_PURE | __OVL_PATH_UPPER)) == __OVL_PATH_UPPER)
return S_ISDIR(dentry->d_inode->i_mode);
else
return false;
}
ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
void *value, size_t size)
{
struct path realpath;
enum ovl_path_type type = ovl_path_real(dentry, &realpath);
if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
return -ENODATA;
return vfs_getxattr(realpath.dentry, name, value, size);
}
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
{
struct path realpath;
enum ovl_path_type type = ovl_path_real(dentry, &realpath);
ssize_t res;
int off;
res = vfs_listxattr(realpath.dentry, list, size);
if (res <= 0 || size == 0)
return res;
if (!ovl_need_xattr_filter(dentry, type))
return res;
/* filter out private xattrs */
for (off = 0; off < res;) {
char *s = list + off;
size_t slen = strlen(s) + 1;
BUG_ON(off + slen > res);
if (ovl_is_private_xattr(s)) {
res -= slen;
memmove(s, s + slen, res - off);
} else {
off += slen;
}
}
return res;
}
int ovl_removexattr(struct dentry *dentry, const char *name)
{
int err;
struct path realpath;
enum ovl_path_type type = ovl_path_real(dentry, &realpath);
unsigned opt = ovl_get_config_opt(dentry);
if (OVL_OPT_NOCOPYUPW(opt)) {
return 0;
}
err = ovl_want_write(dentry);
if (err)
goto out;
err = -ENODATA;
if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
goto out_drop_write;
if (!OVL_TYPE_UPPER(type)) {
err = vfs_getxattr(realpath.dentry, name, NULL, 0);
if (err < 0)
goto out_drop_write;
err = ovl_copy_up(dentry);
if (err)
goto out_drop_write;
ovl_path_upper(dentry, &realpath);
}
err = vfs_removexattr(realpath.dentry, name);
out_drop_write:
ovl_drop_write(dentry);
out:
return err;
}
static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type,
struct dentry *realdentry)
{
if (OVL_TYPE_UPPER(type))
return false;
if (special_file(realdentry->d_inode->i_mode))
return false;
if (!(OPEN_FMODE(flags) & FMODE_WRITE) && !(flags & O_TRUNC))
return false;
return true;
}
struct inode *ovl_d_select_inode(struct dentry *dentry, unsigned file_flags)
{
int err;
struct path realpath;
enum ovl_path_type type;
unsigned opt = ovl_get_config_opt(dentry);
if (d_is_dir(dentry))
return d_backing_inode(dentry);
type = ovl_path_real(dentry, &realpath);
if (!OVL_OPT_NOCOPYUPW(opt) &&
ovl_open_need_copy_up(file_flags, type, realpath.dentry)) {
OVL_DEBUG("copyup: realpath.dentry=%pd4, i_ino=%lu\n",
realpath.dentry, realpath.dentry->d_inode->i_ino);
err = ovl_want_write(dentry);
if (err)
return ERR_PTR(err);
if (file_flags & O_TRUNC)
err = ovl_copy_up_truncate(dentry);
else
err = ovl_copy_up(dentry);
ovl_drop_write(dentry);
if (err)
return ERR_PTR(err);
ovl_path_upper(dentry, &realpath);
}
if (realpath.dentry->d_flags & DCACHE_OP_SELECT_INODE)
return realpath.dentry->d_op->d_select_inode(realpath.dentry, file_flags);
if (OVL_OPT_NOFSCHECK(opt)) {
if (realpath.dentry->d_inode->i_sb->s_magic == SYSFS_MAGIC) {
OVL_DEBUG("sysfs: dentry=%pd4, i_ino=%lu\n",
dentry, dentry->d_inode->i_ino);
OVL_DEBUG("sysfs: realpath.dentry=%pd4, i_ino=%lu\n",
realpath.dentry, realpath.dentry->d_inode->i_ino);
if (!dentry->d_inode->i_private) {
dentry->d_inode->i_private = dentry->d_fsdata;
dentry->d_fsdata = realpath.dentry->d_fsdata;
}
}
}
return d_backing_inode(realpath.dentry);
}
static const struct inode_operations ovl_file_inode_operations = {
.setattr = ovl_setattr,
.permission = ovl_permission,
.getattr = ovl_getattr,
.setxattr = ovl_setxattr,
.getxattr = ovl_getxattr,
.listxattr = ovl_listxattr,
.removexattr = ovl_removexattr,
};
static const struct inode_operations ovl_symlink_inode_operations = {
.setattr = ovl_setattr,
.get_link = ovl_get_link,
.readlink = ovl_readlink,
.getattr = ovl_getattr,
.setxattr = ovl_setxattr,
.getxattr = ovl_getxattr,
.listxattr = ovl_listxattr,
.removexattr = ovl_removexattr,
};
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
struct ovl_entry *oe)
{
struct inode *inode;
inode = new_inode(sb);
if (!inode)
return NULL;
inode->i_ino = get_next_ino();
inode->i_mode = mode;
inode->i_flags |= S_NOATIME | S_NOCMTIME;
mode &= S_IFMT;
switch (mode) {
case S_IFDIR:
inode->i_private = oe;
inode->i_op = &ovl_dir_inode_operations;
inode->i_fop = &ovl_dir_operations;
break;
case S_IFLNK:
inode->i_op = &ovl_symlink_inode_operations;
break;
case S_IFREG:
case S_IFSOCK:
case S_IFBLK:
case S_IFCHR:
case S_IFIFO:
inode->i_op = &ovl_file_inode_operations;
break;
default:
WARN(1, "illegal file type: %i\n", mode);
iput(inode);
inode = NULL;
}
return inode;
}

View File

@ -0,0 +1,222 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/kernel.h>
//#define DEBUG
#ifdef DEBUG
#define OVL_DEBUG(format, ...) pr_err("[DEBUG] %s(): " format, __FUNCTION__, ##__VA_ARGS__)
#else
#define OVL_DEBUG(format, ...) {}
#endif
struct ovl_entry;
enum ovl_path_type {
__OVL_PATH_PURE = (1 << 0),
__OVL_PATH_UPPER = (1 << 1),
__OVL_PATH_MERGE = (1 << 2),
};
#define OVL_TYPE_UPPER(type) ((type) & __OVL_PATH_UPPER)
#define OVL_TYPE_MERGE(type) ((type) & __OVL_PATH_MERGE)
#define OVL_TYPE_PURE_UPPER(type) ((type) & __OVL_PATH_PURE)
#define OVL_TYPE_MERGE_OR_LOWER(type) \
(OVL_TYPE_MERGE(type) || !OVL_TYPE_UPPER(type))
#define OVL_XATTR_PRE_NAME "trusted.overlay."
#define OVL_XATTR_PRE_LEN 16
#define OVL_XATTR_OPAQUE OVL_XATTR_PRE_NAME"opaque"
enum ovl_opt_bit {
__OVL_OPT_DEFAULT = 0,
__OVL_OPT_NOCOPYUPW = (1 << 0),
__OVL_OPT_NOFSCHECK = (1 << 1),
};
#define OVL_OPT_NOCOPYUPW(opt) ((opt) & __OVL_OPT_NOCOPYUPW)
#define OVL_OPT_NOFSCHECK(opt) ((opt) & __OVL_OPT_NOFSCHECK)
static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry)
{
int err = vfs_rmdir(dir, dentry);
pr_debug("rmdir(%pd2) = %i\n", dentry, err);
return err;
}
static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry)
{
int err = vfs_unlink(dir, dentry, NULL);
pr_debug("unlink(%pd2) = %i\n", dentry, err);
return err;
}
static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir,
struct dentry *new_dentry, bool debug)
{
int err = vfs_link(old_dentry, dir, new_dentry, NULL);
if (debug) {
pr_debug("link(%pd2, %pd2) = %i\n",
old_dentry, new_dentry, err);
}
return err;
}
static inline int ovl_do_create(struct inode *dir, struct dentry *dentry,
umode_t mode, bool debug)
{
int err = vfs_create(dir, dentry, mode, true);
if (debug)
pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err);
return err;
}
static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry,
umode_t mode, bool debug)
{
int err = vfs_mkdir(dir, dentry, mode);
if (debug)
pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err);
return err;
}
static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry,
umode_t mode, dev_t dev, bool debug)
{
int err = vfs_mknod(dir, dentry, mode, dev);
if (debug) {
pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n",
dentry, mode, dev, err);
}
return err;
}
static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry,
const char *oldname, bool debug)
{
int err = vfs_symlink(dir, dentry, oldname);
if (debug)
pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err);
return err;
}
static inline int ovl_do_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags)
{
int err = vfs_setxattr(dentry, name, value, size, flags);
pr_debug("setxattr(%pd2, \"%s\", \"%*s\", 0x%x) = %i\n",
dentry, name, (int) size, (char *) value, flags, err);
return err;
}
static inline int ovl_do_removexattr(struct dentry *dentry, const char *name)
{
int err = vfs_removexattr(dentry, name);
pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, err);
return err;
}
static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry,
struct inode *newdir, struct dentry *newdentry,
unsigned int flags)
{
int err;
pr_debug("rename2(%pd2, %pd2, 0x%x)\n",
olddentry, newdentry, flags);
err = vfs_rename(olddir, olddentry, newdir, newdentry, NULL, flags);
if (err) {
pr_debug("...rename2(%pd2, %pd2, ...) = %i\n",
olddentry, newdentry, err);
}
return err;
}
static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry)
{
int err = vfs_whiteout(dir, dentry);
pr_debug("whiteout(%pd2) = %i\n", dentry, err);
return err;
}
unsigned ovl_get_config_opt(struct dentry *dentry);
void ovl_reset_ovl_entry(struct ovl_entry **oe, struct dentry *dentry);
enum ovl_path_type ovl_path_type(struct dentry *dentry);
u64 ovl_dentry_version_get(struct dentry *dentry);
void ovl_dentry_version_inc(struct dentry *dentry);
void ovl_path_upper(struct dentry *dentry, struct path *path);
void ovl_path_lower(struct dentry *dentry, struct path *path);
enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path);
int ovl_path_next(int idx, struct dentry *dentry, struct path *path);
struct dentry *ovl_dentry_upper(struct dentry *dentry);
struct dentry *ovl_dentry_lower(struct dentry *dentry);
struct dentry *ovl_dentry_real(struct dentry *dentry);
struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper);
struct vfsmount *ovl_entry_mnt_real(struct ovl_entry *oe, struct inode *inode,
bool is_upper);
struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry);
bool ovl_is_default_permissions(struct inode *inode);
void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache);
struct dentry *ovl_workdir(struct dentry *dentry);
int ovl_want_write(struct dentry *dentry);
void ovl_drop_write(struct dentry *dentry);
bool ovl_dentry_is_opaque(struct dentry *dentry);
void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque);
bool ovl_is_whiteout(struct dentry *dentry);
void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry);
struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags);
struct file *ovl_path_open(struct path *path, int flags);
struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry,
struct kstat *stat, const char *link);
/* readdir.c */
extern const struct file_operations ovl_dir_operations;
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list);
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list);
void ovl_cache_free(struct list_head *list);
int ovl_check_d_type_supported(struct path *realpath);
/* inode.c */
int ovl_setattr(struct dentry *dentry, struct iattr *attr);
int ovl_permission(struct inode *inode, int mask);
int ovl_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags);
ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
void *value, size_t size);
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
int ovl_removexattr(struct dentry *dentry, const char *name);
struct inode *ovl_d_select_inode(struct dentry *dentry, unsigned file_flags);
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
struct ovl_entry *oe);
static inline void ovl_copyattr(struct inode *from, struct inode *to)
{
to->i_uid = from->i_uid;
to->i_gid = from->i_gid;
to->i_mode = from->i_mode;
}
/* dir.c */
extern const struct inode_operations ovl_dir_inode_operations;
struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry);
int ovl_create_real(struct inode *dir, struct dentry *newdentry,
struct kstat *stat, const char *link,
struct dentry *hardlink, bool debug);
void ovl_cleanup(struct inode *dir, struct dentry *dentry);
/* copy_up.c */
int ovl_copy_up(struct dentry *dentry);
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
struct path *lowerpath, struct kstat *stat);
int ovl_copy_xattr(struct dentry *old, struct dentry *new, unsigned opt);
int ovl_set_attr(struct dentry *upper, struct kstat *stat);

View File

@ -0,0 +1,616 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/file.h>
#include <linux/xattr.h>
#include <linux/rbtree.h>
#include <linux/security.h>
#include <linux/cred.h>
#include "overlayfs.h"
struct ovl_cache_entry {
unsigned int len;
unsigned int type;
u64 ino;
struct list_head l_node;
struct rb_node node;
struct ovl_cache_entry *next_maybe_whiteout;
bool is_whiteout;
char name[];
};
struct ovl_dir_cache {
long refcount;
u64 version;
struct list_head entries;
};
struct ovl_readdir_data {
struct dir_context ctx;
bool is_lowest;
struct rb_root root;
struct list_head *list;
struct list_head middle;
struct ovl_cache_entry *first_maybe_whiteout;
int count;
int err;
bool d_type_supported;
};
struct ovl_dir_file {
bool is_real;
bool is_upper;
struct ovl_dir_cache *cache;
struct list_head *cursor;
struct file *realfile;
struct file *upperfile;
};
static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n)
{
return container_of(n, struct ovl_cache_entry, node);
}
static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root,
const char *name, int len)
{
struct rb_node *node = root->rb_node;
int cmp;
while (node) {
struct ovl_cache_entry *p = ovl_cache_entry_from_node(node);
cmp = strncmp(name, p->name, len);
if (cmp > 0)
node = p->node.rb_right;
else if (cmp < 0 || len < p->len)
node = p->node.rb_left;
else
return p;
}
return NULL;
}
static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd,
const char *name, int len,
u64 ino, unsigned int d_type)
{
struct ovl_cache_entry *p;
size_t size = offsetof(struct ovl_cache_entry, name[len + 1]);
p = kmalloc(size, GFP_KERNEL);
if (!p)
return NULL;
memcpy(p->name, name, len);
p->name[len] = '\0';
p->len = len;
p->type = d_type;
p->ino = ino;
p->is_whiteout = false;
if (d_type == DT_CHR) {
p->next_maybe_whiteout = rdd->first_maybe_whiteout;
rdd->first_maybe_whiteout = p;
}
return p;
}
static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
const char *name, int len, u64 ino,
unsigned int d_type)
{
struct rb_node **newp = &rdd->root.rb_node;
struct rb_node *parent = NULL;
struct ovl_cache_entry *p;
while (*newp) {
int cmp;
struct ovl_cache_entry *tmp;
parent = *newp;
tmp = ovl_cache_entry_from_node(*newp);
cmp = strncmp(name, tmp->name, len);
if (cmp > 0)
newp = &tmp->node.rb_right;
else if (cmp < 0 || len < tmp->len)
newp = &tmp->node.rb_left;
else
return 0;
}
p = ovl_cache_entry_new(rdd, name, len, ino, d_type);
if (p == NULL)
return -ENOMEM;
list_add_tail(&p->l_node, rdd->list);
rb_link_node(&p->node, parent, newp);
rb_insert_color(&p->node, &rdd->root);
return 0;
}
static int ovl_fill_lowest(struct ovl_readdir_data *rdd,
const char *name, int namelen,
loff_t offset, u64 ino, unsigned int d_type)
{
struct ovl_cache_entry *p;
p = ovl_cache_entry_find(&rdd->root, name, namelen);
if (p) {
list_move_tail(&p->l_node, &rdd->middle);
} else {
p = ovl_cache_entry_new(rdd, name, namelen, ino, d_type);
if (p == NULL)
rdd->err = -ENOMEM;
else
list_add_tail(&p->l_node, &rdd->middle);
}
return rdd->err;
}
void ovl_cache_free(struct list_head *list)
{
struct ovl_cache_entry *p;
struct ovl_cache_entry *n;
list_for_each_entry_safe(p, n, list, l_node)
kfree(p);
INIT_LIST_HEAD(list);
}
static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry)
{
struct ovl_dir_cache *cache = od->cache;
WARN_ON(cache->refcount <= 0);
cache->refcount--;
if (!cache->refcount) {
if (ovl_dir_cache(dentry) == cache)
ovl_set_dir_cache(dentry, NULL);
ovl_cache_free(&cache->entries);
kfree(cache);
}
}
static int ovl_fill_merge(struct dir_context *ctx, const char *name,
int namelen, loff_t offset, u64 ino,
unsigned int d_type)
{
struct ovl_readdir_data *rdd =
container_of(ctx, struct ovl_readdir_data, ctx);
rdd->count++;
if (!rdd->is_lowest)
return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type);
else
return ovl_fill_lowest(rdd, name, namelen, offset, ino, d_type);
}
static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd)
{
int err;
struct ovl_cache_entry *p;
struct dentry *dentry;
const struct cred *old_cred;
struct cred *override_cred;
override_cred = prepare_creds();
if (!override_cred)
return -ENOMEM;
/*
* CAP_DAC_OVERRIDE for lookup
*/
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
old_cred = override_creds(override_cred);
err = mutex_lock_killable(&dir->d_inode->i_mutex);
if (!err) {
while (rdd->first_maybe_whiteout) {
p = rdd->first_maybe_whiteout;
rdd->first_maybe_whiteout = p->next_maybe_whiteout;
dentry = lookup_one_len(p->name, dir, p->len);
if (!IS_ERR(dentry)) {
p->is_whiteout = ovl_is_whiteout(dentry);
dput(dentry);
}
}
inode_unlock(dir->d_inode);
}
revert_creds(old_cred);
put_cred(override_cred);
return err;
}
static inline int ovl_dir_read(struct path *realpath,
struct ovl_readdir_data *rdd)
{
struct file *realfile;
int err;
realfile = ovl_path_open(realpath, O_RDONLY | O_DIRECTORY);
if (IS_ERR(realfile))
return PTR_ERR(realfile);
rdd->first_maybe_whiteout = NULL;
rdd->ctx.pos = 0;
do {
rdd->count = 0;
rdd->err = 0;
err = iterate_dir(realfile, &rdd->ctx);
if (err >= 0)
err = rdd->err;
} while (!err && rdd->count);
if (!err && rdd->first_maybe_whiteout)
err = ovl_check_whiteouts(realpath->dentry, rdd);
fput(realfile);
return err;
}
static void ovl_dir_reset(struct file *file)
{
struct ovl_dir_file *od = file->private_data;
struct ovl_dir_cache *cache = od->cache;
struct dentry *dentry = file->f_path.dentry;
enum ovl_path_type type = ovl_path_type(dentry);
if (cache && ovl_dentry_version_get(dentry) != cache->version) {
ovl_cache_put(od, dentry);
od->cache = NULL;
od->cursor = NULL;
}
WARN_ON(!od->is_real && !OVL_TYPE_MERGE(type));
if (od->is_real && OVL_TYPE_MERGE(type))
od->is_real = false;
}
static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list)
{
int err;
struct path realpath;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_fill_merge,
.list = list,
.root = RB_ROOT,
.is_lowest = false,
};
int idx, next;
for (idx = 0; idx != -1; idx = next) {
next = ovl_path_next(idx, dentry, &realpath);
if (next != -1) {
err = ovl_dir_read(&realpath, &rdd);
if (err)
break;
} else {
/*
* Insert lowest layer entries before upper ones, this
* allows offsets to be reasonably constant
*/
list_add(&rdd.middle, rdd.list);
rdd.is_lowest = true;
err = ovl_dir_read(&realpath, &rdd);
list_del(&rdd.middle);
}
}
return err;
}
static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos)
{
struct list_head *p;
loff_t off = 0;
list_for_each(p, &od->cache->entries) {
if (off >= pos)
break;
off++;
}
/* Cursor is safe since the cache is stable */
od->cursor = p;
}
static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
{
int res;
struct ovl_dir_cache *cache;
cache = ovl_dir_cache(dentry);
if (cache && ovl_dentry_version_get(dentry) == cache->version) {
cache->refcount++;
return cache;
}
ovl_set_dir_cache(dentry, NULL);
cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL);
if (!cache)
return ERR_PTR(-ENOMEM);
cache->refcount = 1;
INIT_LIST_HEAD(&cache->entries);
res = ovl_dir_read_merged(dentry, &cache->entries);
if (res) {
ovl_cache_free(&cache->entries);
kfree(cache);
return ERR_PTR(res);
}
cache->version = ovl_dentry_version_get(dentry);
ovl_set_dir_cache(dentry, cache);
return cache;
}
static int ovl_iterate(struct file *file, struct dir_context *ctx)
{
struct ovl_dir_file *od = file->private_data;
struct dentry *dentry = file->f_path.dentry;
struct ovl_cache_entry *p;
if (!ctx->pos)
ovl_dir_reset(file);
if (od->is_real)
return iterate_dir(od->realfile, ctx);
if (!od->cache) {
struct ovl_dir_cache *cache;
cache = ovl_cache_get(dentry);
if (IS_ERR(cache))
return PTR_ERR(cache);
od->cache = cache;
ovl_seek_cursor(od, ctx->pos);
}
while (od->cursor != &od->cache->entries) {
p = list_entry(od->cursor, struct ovl_cache_entry, l_node);
if (!p->is_whiteout)
if (!dir_emit(ctx, p->name, p->len, p->ino, p->type))
break;
od->cursor = p->l_node.next;
ctx->pos++;
}
return 0;
}
static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin)
{
loff_t res;
struct ovl_dir_file *od = file->private_data;
inode_lock(file_inode(file));
if (!file->f_pos)
ovl_dir_reset(file);
if (od->is_real) {
res = vfs_llseek(od->realfile, offset, origin);
file->f_pos = od->realfile->f_pos;
} else {
res = -EINVAL;
switch (origin) {
case SEEK_CUR:
offset += file->f_pos;
break;
case SEEK_SET:
break;
default:
goto out_unlock;
}
if (offset < 0)
goto out_unlock;
if (offset != file->f_pos) {
file->f_pos = offset;
if (od->cache)
ovl_seek_cursor(od, offset);
}
res = offset;
}
out_unlock:
inode_unlock(file_inode(file));
return res;
}
static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
int datasync)
{
struct ovl_dir_file *od = file->private_data;
struct dentry *dentry = file->f_path.dentry;
struct file *realfile = od->realfile;
/*
* Need to check if we started out being a lower dir, but got copied up
*/
if (!od->is_upper && OVL_TYPE_UPPER(ovl_path_type(dentry))) {
struct inode *inode = file_inode(file);
realfile = lockless_dereference(od->upperfile);
if (!realfile) {
struct path upperpath;
ovl_path_upper(dentry, &upperpath);
realfile = ovl_path_open(&upperpath, O_RDONLY);
smp_mb__before_spinlock();
inode_lock(inode);
if (!od->upperfile) {
if (IS_ERR(realfile)) {
inode_unlock(inode);
return PTR_ERR(realfile);
}
od->upperfile = realfile;
} else {
/* somebody has beaten us to it */
if (!IS_ERR(realfile))
fput(realfile);
realfile = od->upperfile;
}
inode_unlock(inode);
}
}
return vfs_fsync_range(realfile, start, end, datasync);
}
static int ovl_dir_release(struct inode *inode, struct file *file)
{
struct ovl_dir_file *od = file->private_data;
if (od->cache) {
inode_lock(inode);
ovl_cache_put(od, file->f_path.dentry);
inode_unlock(inode);
}
fput(od->realfile);
if (od->upperfile)
fput(od->upperfile);
kfree(od);
return 0;
}
static int ovl_dir_open(struct inode *inode, struct file *file)
{
struct path realpath;
struct file *realfile;
struct ovl_dir_file *od;
enum ovl_path_type type;
od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL);
if (!od)
return -ENOMEM;
type = ovl_path_real(file->f_path.dentry, &realpath);
realfile = ovl_path_open(&realpath, file->f_flags);
if (IS_ERR(realfile)) {
kfree(od);
return PTR_ERR(realfile);
}
od->realfile = realfile;
od->is_real = !OVL_TYPE_MERGE(type);
od->is_upper = OVL_TYPE_UPPER(type);
file->private_data = od;
return 0;
}
const struct file_operations ovl_dir_operations = {
.read = generic_read_dir,
.open = ovl_dir_open,
.iterate = ovl_iterate,
.llseek = ovl_dir_llseek,
.fsync = ovl_dir_fsync,
.release = ovl_dir_release,
};
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list)
{
int err;
struct ovl_cache_entry *p;
err = ovl_dir_read_merged(dentry, list);
if (err)
return err;
err = 0;
list_for_each_entry(p, list, l_node) {
if (p->is_whiteout)
continue;
if (p->name[0] == '.') {
if (p->len == 1)
continue;
if (p->len == 2 && p->name[1] == '.')
continue;
}
err = -ENOTEMPTY;
break;
}
return err;
}
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
{
struct ovl_cache_entry *p;
inode_lock_nested(upper->d_inode, I_MUTEX_CHILD);
list_for_each_entry(p, list, l_node) {
struct dentry *dentry;
if (!p->is_whiteout)
continue;
dentry = lookup_one_len(p->name, upper, p->len);
if (IS_ERR(dentry)) {
pr_err("overlayfs: lookup '%s/%.*s' failed (%i)\n",
upper->d_name.name, p->len, p->name,
(int) PTR_ERR(dentry));
continue;
}
if (dentry->d_inode)
ovl_cleanup(upper->d_inode, dentry);
dput(dentry);
}
inode_unlock(upper->d_inode);
}
static int ovl_check_d_type(struct dir_context *ctx, const char *name,
int namelen, loff_t offset, u64 ino,
unsigned int d_type)
{
struct ovl_readdir_data *rdd =
container_of(ctx, struct ovl_readdir_data, ctx);
/* Even if d_type is not supported, DT_DIR is returned for . and .. */
if (!strncmp(name, ".", namelen) || !strncmp(name, "..", namelen))
return 0;
if (d_type != DT_UNKNOWN)
rdd->d_type_supported = true;
return 0;
}
/*
* Returns 1 if d_type is supported, 0 not supported/unknown. Negative values
* if error is encountered.
*/
int ovl_check_d_type_supported(struct path *realpath)
{
int err;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_check_d_type,
.d_type_supported = false,
};
err = ovl_dir_read(realpath, &rdd);
if (err)
return err;
return rdd.d_type_supported;
}

File diff suppressed because it is too large Load Diff

View File

@ -17,6 +17,20 @@
#include <sys/socket.h>
#include <arpa/inet.h>
/* From ihk/linux/include/ihk/ihk_host_user.h */
#define PHYS_CHUNKS_DESC_SIZE 8192
struct dump_mem_chunk {
unsigned long addr;
unsigned long size;
};
typedef struct dump_mem_chunks_s {
int nr_chunks;
struct dump_mem_chunk chunks[];
} dump_mem_chunks_t;
/* ---------- */
#define CPU_TID_BASE 1000000
struct options {
@ -53,6 +67,7 @@ static volatile int f_done = 0;
static bfd *symbfd = NULL;
static bfd *dumpbfd = NULL;
static asection *dumpscn = NULL;
static dump_mem_chunks_t *mem_chunks;
static int num_processors = -1;
static asymbol **symtab = NULL;
static ssize_t nsyms;
@ -91,25 +106,35 @@ static uintptr_t virt_to_phys(uintptr_t va) {
static int read_physmem(uintptr_t pa, void *buf, size_t size) {
off_t off;
bfd_boolean ok;
int i;
if (pa < dumpscn->vma) {
printf("read_physmem(%lx,%p,%lx):too small pa. vma %lx\n", pa, buf, size, dumpscn->vma);
return 1;
}
off = pa - dumpscn->vma;
if (off >= dumpscn->size) {
printf("read_physmem(%lx,%p,%lx):too large pa. vma %lx size %lx\n", pa, buf, size, dumpscn->vma, dumpscn->size);
return 1;
}
if ((dumpscn->size - off) < size) {
printf("read_physmem(%lx,%p,%lx):too large size. vma %lx size %lx\n", pa, buf, size, dumpscn->vma, dumpscn->size);
off = 0;
/* Check if pa is valid in any chunks and figure
* out the global offset in dump section */
for (i = 0; i < mem_chunks->nr_chunks; ++i) {
if (mem_chunks->chunks[i].addr <= pa &&
((pa + size) <= (mem_chunks->chunks[i].addr +
mem_chunks->chunks[i].size))) {
off += (pa - mem_chunks->chunks[i].addr);
break;
}
off += mem_chunks->chunks[i].size;
}
if (i == mem_chunks->nr_chunks) {
printf("read_physmem: invalid addr 0x%lx\n", pa);
return 1;
}
ok = bfd_get_section_contents(dumpbfd, dumpscn, buf, off, size);
if (!ok) {
bfd_perror("read_physmem:bfd_get_section_contents");
return 1;
}
return 0;
} /* read_physmem() */
@ -508,6 +533,25 @@ static int setup_dump(char *fname) {
return 1;
}
mem_chunks = malloc(PHYS_CHUNKS_DESC_SIZE);
if (!mem_chunks) {
perror("allocating mem chunks descriptor: ");
return 1;
}
dumpscn = bfd_get_section_by_name(dumpbfd, "physchunks");
if (!dumpscn) {
bfd_perror("bfd_get_section_by_name");
return 1;
}
ok = bfd_get_section_contents(dumpbfd, dumpscn, mem_chunks,
0, PHYS_CHUNKS_DESC_SIZE);
if (!ok) {
bfd_perror("read_physmem:bfd_get_section_contents");
return 1;
}
dumpscn = bfd_get_section_by_name(dumpbfd, "physmem");
if (!dumpscn) {
bfd_perror("bfd_get_section_by_name");

View File

@ -41,6 +41,7 @@
#include <sys/mman.h>
#include <asm/unistd.h>
#include <sched.h>
#include <dirent.h>
#include <termios.h>
#include <sys/ioctl.h>
@ -107,6 +108,9 @@ char **__glob_argv = 0;
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,0,0) && LINUX_VERSION_CODE < KERNEL_VERSION(4,1,0)
#define ENABLE_MCOVERLAYFS 1
#endif // LINUX_VERSION_CODE == 4.0
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,6,0) && LINUX_VERSION_CODE < KERNEL_VERSION(4,7,0)
#define ENABLE_MCOVERLAYFS 1
#endif // LINUX_VERSION_CODE == 4.6
#else
#if RHEL_RELEASE_CODE == RHEL_RELEASE_VERSION(7,2)
#define ENABLE_MCOVERLAYFS 1
@ -1148,75 +1152,41 @@ void init_worker_threads(int fd)
#ifdef ENABLE_MCOVERLAYFS
#define READ_BUFSIZE 1024
static int isunshare(void)
static int find_mount_prefix(char *prefix)
{
int err = 0;
int ret;
int fd;
FILE *fp;
char *line = NULL;
size_t len = 0;
ssize_t read;
char proc_path[PATH_MAX];
ssize_t len_read;
char buf_read[READ_BUFSIZE + 1];
char *buf_read_off;
char *buf_find;
char buf_cmp[READ_BUFSIZE + 1];
char *buf_cmp_off;
ssize_t len_copy;
int ret = 0;
snprintf(proc_path, sizeof(proc_path), "/proc/%d/mounts", getpid());
fd = open(proc_path, O_RDONLY);
if (fd < 0) {
fprintf(stderr, "Error: Failed to open %s.\n", proc_path);
fp = fopen(proc_path, "r");
if (fp == NULL) {
return -1;
}
buf_cmp_off = buf_cmp;
while (1) {
len_read = read(fd, buf_read, READ_BUFSIZE);
if (len_read == -1) {
fprintf(stderr, "Error: Failed to read.\n");
err = -1;
break;
}
while ((read = getline(&line, &len, fp)) != -1) {
if (strlen(line) < strlen(prefix))
continue;
buf_read_off = buf_read;
while (1) {
if ((len_read - (buf_read_off - buf_read)) <= 0) {
break;
}
buf_find = memchr(buf_read_off, '\n',
len_read - (buf_read_off - buf_read));
if (buf_find) {
len_copy = buf_find - buf_read_off;
} else {
len_copy = len_read - (buf_read_off - buf_read);
}
memcpy(buf_cmp_off, buf_read_off, len_copy);
*(buf_cmp_off + len_copy) = '\0';
if (buf_find) {
buf_read_off = buf_read_off + len_copy + 1;
buf_cmp_off = buf_cmp;
ret = strncmp(buf_cmp, "mcoverlay /proc ", 16);
if (!ret) {
err = 1;
break;
}
} else {
buf_read_off = buf_read_off + len_copy;
buf_cmp_off = buf_cmp_off + len_copy;
break;
}
}
if (err == 1 || len_read == 0) {
if (!strncmp(line, prefix, strlen(prefix))) {
ret = 1;
break;
}
}
close(fd);
if (line)
free(line);
__dprintf("err=%d\n", err);
return err;
return ret;
}
static int isunshare(void)
{
return find_mount_prefix("mcoverlay /proc ");
}
#endif // ENABLE_MCOVERLAYFS
@ -1415,6 +1385,7 @@ int main(int argc, char **argv)
if (error == 0) {
struct sys_unshare_desc unshare_desc;
struct sys_mount_desc mount_desc;
struct sys_umount_desc umount_desc;
memset(&unshare_desc, '\0', sizeof unshare_desc);
memset(&mount_desc, '\0', sizeof mount_desc);
@ -1426,6 +1397,53 @@ int main(int argc, char **argv)
return 1;
}
/*
* Umount cgroup filesystems that may expose invalid NUMA
* information
*/
if (find_mount_prefix("cgroup /sys/fs/cgroup/cpu,cpuacct")) {
umount_desc.dir_name = "/sys/fs/cgroup/cpu,cpuacct";
if (ioctl(fd, MCEXEC_UP_SYS_UMOUNT,
(unsigned long)&umount_desc) != 0) {
fprintf(stderr,
"WARNING: Failed to umount cgroup/cpu,cpuacct. (%s)\n",
strerror(errno));
}
}
else if (find_mount_prefix("cgroup /sys/fs/cgroup/cpu")) {
umount_desc.dir_name = "/sys/fs/cgroup/cpu";
if (ioctl(fd, MCEXEC_UP_SYS_UMOUNT,
(unsigned long)&umount_desc) != 0) {
fprintf(stderr,
"WARNING: Failed to umount cgroup/cpu. (%s)\n",
strerror(errno));
}
}
if (find_mount_prefix("cgroup /sys/fs/cgroup/cpuset")) {
umount_desc.dir_name = "/sys/fs/cgroup/cpuset";
if (ioctl(fd, MCEXEC_UP_SYS_UMOUNT,
(unsigned long)&umount_desc) != 0) {
fprintf(stderr,
"WARNING: Failed to umount cgroup/cpuset. (%s)\n",
strerror(errno));
}
}
if (find_mount_prefix("cgroup /sys/fs/cgroup/memory")) {
umount_desc.dir_name = "/sys/fs/cgroup/memory/";
if (ioctl(fd, MCEXEC_UP_SYS_UMOUNT,
(unsigned long)&umount_desc) != 0) {
fprintf(stderr,
"WARNING: Failed to umount cgroup/memory. (%s)\n",
strerror(errno));
}
}
sprintf(mcos_procdir, "/tmp/mcos/mcos%d_proc", mcosid);
mount_desc.dev_name = mcos_procdir;
mount_desc.dir_name = "/proc";
@ -1533,9 +1551,6 @@ int main(int argc, char **argv)
}
n_threads = ncpu;
if (ncpu > 16) {
n_threads = 16;
}
/*
* XXX: keep thread_data ncpu sized despite that there are only
@ -1546,6 +1561,10 @@ int main(int argc, char **argv)
* TODO: implement dynaic thread pool resizing.
*/
thread_data = (struct thread_data_s *)malloc(sizeof(struct thread_data_s) * (ncpu + 1));
if (!thread_data) {
fprintf(stderr, "error: allocating thread pool data\n");
return 1;
}
memset(thread_data, '\0', sizeof(struct thread_data_s) * (ncpu + 1));
#if 0
@ -1686,6 +1705,97 @@ do_generic_syscall(
ret = -errno;
}
/* Overlayfs /sys/X directory lseek() problem work around */
if (w->sr.number == __NR_lseek && ret == -EINVAL) {
char proc_path[512];
char path[512];
struct stat sb;
sprintf(proc_path, "/proc/self/fd/%d", (int)w->sr.args[0]);
/* Get filename */
if (readlink(proc_path, path, sizeof(path)) < 0) {
fprintf(stderr, "%s: error: readlink() failed for %s\n",
__FUNCTION__, proc_path);
goto out;
}
/* Not in /sys? */
if (strncmp(path, "/sys/", 5))
goto out;
/* Stat */
if (stat(path, &sb) < 0) {
fprintf(stderr, "%s: error stat() failed for %s\n",
__FUNCTION__, path);
goto out;
}
/* Not dir? */
if ((sb.st_mode & S_IFMT) != S_IFDIR)
goto out;
ret = 0;
}
/* Fake that nodeX in /sys/devices/system/node do not exist,
* where X >= number of LWK NUMA nodes */
else if (w->sr.number == __NR_getdents && ret > 0) {
struct linux_dirent {
long d_ino;
off_t d_off;
unsigned short d_reclen;
char d_name[];
};
struct linux_dirent *d;
char *buf = (char *)w->sr.args[1];
int bpos = 0;
int nodes,len;
char proc_path[PATH_MAX];
char path[PATH_MAX];
sprintf(proc_path, "/proc/self/fd/%d", (int)w->sr.args[0]);
/* Get filename */
if ((len = readlink(proc_path, path, sizeof(path))) < 0) {
fprintf(stderr, "%s: error: readlink() failed for %s\n",
__FUNCTION__, proc_path);
goto out;
}
path[len] = 0;
/* Not /sys/devices/system/node ? */
if (strcmp(path, "/sys/devices/system/node"))
goto out;
nodes = ioctl(fd, MCEXEC_UP_GET_NODES, 0);
if (nodes == -1) {
goto out;
}
d = (struct linux_dirent *) (buf + bpos);
for (bpos = 0; bpos < ret; ) {
int nodeid, tmp_reclen;
d = (struct linux_dirent *) (buf + bpos);
if (sscanf(d->d_name, "node%d", &nodeid) != 1) {
bpos += d->d_reclen;
continue;
}
if (nodeid >= nodes) {
tmp_reclen = d->d_reclen;
memmove(buf + bpos,
buf + bpos + tmp_reclen,
ret - bpos - tmp_reclen);
ret -= tmp_reclen;
continue;
}
bpos += d->d_reclen;
}
}
out:
__dprintf("do_generic_syscall(%ld):%ld (%#lx)\n", w->sr.number, ret, ret);
return ret;
}
@ -1695,7 +1805,7 @@ kill_thread(unsigned long tid)
{
int i;
for (i = 0; i < n_threads; ++i) {
for (i = 0; i <= n_threads; ++i) {
if(thread_data[i].remote_tid == tid){
pthread_kill(thread_data[i].thread_id, LOCALSIG);
break;
@ -1954,21 +2064,6 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock)
#endif
case __NR_gettid:{
int mode = w.sr.args[0];
int remote_pid = w.sr.args[1];
int newcpuid = w.sr.args[2];
int oldcpuid = w.sr.args[3];
int wtid = thread_data[newcpuid].remote_tid;
if(mode == 0){
thread_data[ncpu].remote_tid = wtid;
thread_data[newcpuid].remote_tid = remote_pid;
}
else if(mode == 2){
thread_data[newcpuid].remote_tid = thread_data[oldcpuid].remote_tid;
thread_data[oldcpuid].remote_tid = wtid;
}
/*
* Number of TIDs and the remote physical address where TIDs are
* expected are passed in arg 4 and 5, respectively.
@ -2002,7 +2097,7 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock)
free(tids);
}
gettid_out:
do_syscall_return(fd, cpu, thread_data[newcpuid].remote_tid, 0, 0, 0, 0);
do_syscall_return(fd, cpu, 0, 0, 0, 0, 0);
break;
}
@ -2328,6 +2423,23 @@ return_execve1:
ret = 0;
return_execve2:
#ifdef ENABLE_MCOVERLAYFS
{
struct sys_mount_desc mount_desc;
mount_desc.dev_name = NULL;
mount_desc.dir_name = "/proc";
mount_desc.type = NULL;
mount_desc.flags = MS_REMOUNT;
mount_desc.data = NULL;
if (ioctl(fd, MCEXEC_UP_SYS_MOUNT,
(unsigned long)&mount_desc) != 0) {
fprintf(stderr,
"WARNING: failed to remount /proc (%s)\n",
strerror(errno));
}
}
#endif
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;

View File

@ -6,12 +6,12 @@ OBJS += process.o copy.o waitq.o futex.o timer.o plist.o fileobj.o shmobj.o
OBJS += zeroobj.o procfs.o devobj.o sysfs.o
DEPSRCS=$(wildcard $(SRC)/*.c)
CFLAGS += -I$(SRC)/include -D__KERNEL__ -g
CFLAGS += -I$(SRC)/include -D__KERNEL__ -g -fno-omit-frame-pointer -fno-inline -fno-inline-small-functions
LDFLAGS += -e arch_start
IHKOBJ = ihk/ihk.o
include $(SRC)/config/config.$(TARGET)
include $(IHKBASE)/Makefile.common
include @abs_builddir@/../../ihk/cokernel/Makefile.common
# CFLAGS += -I$(SRC)/../arch/$(IHKARCH)/kernel/include -I$(SRC)/../lib/include

View File

@ -9,7 +9,7 @@ V ?= $(VERBOSE)
KERNEL = kernel.img
KERNELS = $(addsuffix /$(KERNEL),$(addprefix $(O)/,$(BUILD_TARGET)))
SUBCMD_OPTS = V='$(V)'
SUBCMD_OPTS = V='$(V)' BUILD_IHK_COKERNEL=@abs_builddir@/../../ihk/cokernel
$(if $(O),,$(error Specify the compilation target directory))
#$(if $(shell ls $(IHKBASE)/Makefile),,\

View File

@ -65,7 +65,7 @@ void ap_init(void)
{
struct ihk_mc_cpu_info *cpu_info;
int i;
int bsp_hw_id;
int bsp_hw_id, bsp_cpu_id;
ihk_mc_init_ap();
init_delay();
@ -78,13 +78,23 @@ void ap_init(void)
return;
}
kprintf("BSP HW ID = %d\n", bsp_hw_id);
bsp_cpu_id = 0;
for (i = 0; i < cpu_info->ncpus; ++i) {
if (cpu_info->hw_ids[i] == bsp_hw_id) {
bsp_cpu_id = i;
break;
}
}
kprintf("BSP: %d (HW ID: %d @ NUMA %d)\n", bsp_cpu_id,
bsp_hw_id, cpu_info->nodes[0]);
for (i = 0; i < cpu_info->ncpus; i++) {
if (cpu_info->hw_ids[i] == bsp_hw_id) {
continue;
}
kprintf("AP Booting: %d (HW ID: %d)\n", i, cpu_info->hw_ids[i]);
kprintf("AP Booting: %d (HW ID: %d @ NUMA %d)\n", i,
cpu_info->hw_ids[i], cpu_info->nodes[i]);
ihk_mc_boot_cpu(cpu_info->hw_ids[i], (unsigned long)ap_wait);
num_processors++;
@ -199,7 +209,7 @@ cpu_sysfs_setup(void)
/* setup table */
info = kmalloc(sizeof(*info) * num_processors, IHK_MC_AP_CRITICAL);
for (cpu = 0; cpu < num_processors; ++cpu) {
info[cpu].online = 10+cpu;
info[cpu].online = 1;
}
fake_cpu_infos = info;

View File

@ -32,7 +32,7 @@ void cpu_local_var_init(void)
z = sizeof(struct cpu_local_var) * num_processors;
z = (z + PAGE_SIZE - 1) >> PAGE_SHIFT;
clv = allocate_pages(z, IHK_MC_AP_CRITICAL);
clv = ihk_mc_alloc_pages(z, IHK_MC_AP_CRITICAL);
memset(clv, 0, z * PAGE_SIZE);
cpu_local_var_initialized = 1;
}

View File

@ -37,6 +37,8 @@ static void kprintf_wait(int len, unsigned long *flags_head, int *slide) {
if (head < tail) head += buf_len;
if (tail + len > buf_len) adj = buf_len - tail;
if (head > tail && head <= tail + len + adj) {
/* When proceeding tail (producer pointer) by len would
cross head (consumer pointer) in ring-buffer */
if (mode != 1) {
*slide = 1;
break;
@ -70,6 +72,9 @@ void kputs(char *buf)
memcpy(kmsg_buf.str + kmsg_buf.tail, buf, len);
kmsg_buf.tail += len;
/* When proceeding tail (producer pointer) by len would
cross head (consumer pointer) in ring-buffer, give up
[head, tail] because the range is overwritten */
if (slide == 1) {
kmsg_buf.head = kmsg_buf.tail + 1;
if (kmsg_buf.head >= kmsg_buf.len) kmsg_buf.head = 0;
@ -170,6 +175,17 @@ int kprintf(const char *format, ...)
return len;
}
/* mode:
0: mcklogd is not running.
When kmsg buffer is full, writer doesn't block
and overwrites the buffer.
1: mcklogd periodically retrieves kmsg.
When kmsg buffer is full, writer blocks until
someone retrieves kmsg.
2: mcklogd periodically retrieves kmsg.
When kmsg buffer is full, writer doesn't block
and overwrites the buffer.
*/
void kmsg_init(int mode)
{
ihk_mc_spinlock_init(&kmsg_lock);

View File

@ -258,13 +258,24 @@ static void fileobj_release(struct memobj *memobj)
/* zap page_list */
for (;;) {
struct page *page;
int count;
void *page_va;
page = page_list_first(obj);
if (!page) {
break;
}
page_list_remove(obj, page);
page_va = phys_to_virt(page_to_phys(page));
if (ihk_atomic_read(&page->count) != 1) {
kprintf("%s: WARNING: page count for phys 0x%lx is invalid\n",
__FUNCTION__, page->phys);
}
if (page_unmap(page)) {
ihk_mc_free_pages(page_va, 1);
}
#if 0
count = ihk_atomic_sub_return(1, &page->count);
if (!((page->mode == PM_WILL_PAGEIO)
@ -281,7 +292,7 @@ static void fileobj_release(struct memobj *memobj)
}
page->mode = PM_NONE;
free_pages(phys_to_virt(page_to_phys(page)), 1);
#endif
}
obj_list_remove(free_obj);
ihk_mc_spinlock_unlock_noirq(&fileobj_list_lock);
@ -430,7 +441,7 @@ static int fileobj_get_page(struct memobj *memobj, off_t off, int p2align, uintp
goto out;
}
phys = virt_to_phys(virt);
page = phys_to_page(phys);
page = phys_to_page_insert_hash(phys);
if (page->mode != PM_NONE) {
panic("fileobj_get_page:invalid new page");
}
@ -502,10 +513,10 @@ static uintptr_t fileobj_copy_page(
memobj_lock(memobj);
for (;;) {
if (orgpage->mode != PM_MAPPED) {
if (!orgpage || orgpage->mode != PM_MAPPED) {
kprintf("fileobj_copy_page(%p,%lx,%d):"
"invalid cow page. %x\n",
memobj, orgpa, p2align, orgpage->mode);
memobj, orgpa, p2align, orgpage ? orgpage->mode : 0);
panic("fileobj_copy_page:invalid cow page");
}
count = ihk_atomic_read(&orgpage->count);
@ -527,7 +538,9 @@ static uintptr_t fileobj_copy_page(
memcpy(newkva, orgkva, pgsize);
ihk_atomic_dec(&orgpage->count);
newpa = virt_to_phys(newkva);
page_map(phys_to_page(newpa));
if (phys_to_page(newpa)) {
page_map(phys_to_page(newpa));
}
newkva = NULL; /* avoid ihk_mc_free_pages() */
break;
}
@ -563,6 +576,11 @@ static int fileobj_flush_page(struct memobj *memobj, uintptr_t phys,
ssize_t ss;
page = phys_to_page(phys);
if (!page) {
kprintf("%s: warning: tried to flush non-existing page for phys addr: 0x%lx\n",
__FUNCTION__, phys);
return 0;
}
memobj_unlock(&obj->memobj);
ihk_mc_syscall_arg0(&ctx) = PAGER_REQ_WRITE;

View File

@ -534,31 +534,6 @@ extern void process_procfs_request(unsigned long rarg);
extern void terminate_host(int pid);
extern void debug_log(long);
static void req_get_cpu_mapping(long req_rpa)
{
size_t mapsize;
size_t size;
int npages;
long phys;
struct get_cpu_mapping_req *req;
struct cpu_mapping *buf;
size = sizeof(*req);
mapsize = size + (req_rpa & (PAGE_SIZE - 1));
npages = (mapsize + PAGE_SIZE - 1) >> PAGE_SHIFT;
phys = ihk_mc_map_memory(NULL, req_rpa, size);
req = ihk_mc_map_virtual(phys, npages, PTATTR_WRITABLE);
req->error = arch_get_cpu_mapping(&buf, &req->buf_elems);
if (!req->error) {
req->buf_rpa = virt_to_phys(buf);
}
ihk_mc_unmap_virtual(req, npages, 0);
ihk_mc_unmap_memory(NULL, phys, size);
return;
} /* req_get_cpu_mapping() */
static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
void *__packet, void *ihk_os)
{
@ -613,8 +588,7 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
dkprintf("SCD_MSG_SCHEDULE_PROCESS: %lx\n", packet->arg);
thread = (struct thread *)packet->arg;
proc = thread->proc;
settid(thread, 0, cpuid, -1, 0, NULL);
thread->tid = proc->pid;
proc->status = PS_RUNNING;
thread->status = PS_RUNNING;
chain_thread(thread);
@ -688,15 +662,6 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
ret = 0;
break;
case SCD_MSG_GET_CPU_MAPPING:
req_get_cpu_mapping(packet->arg);
pckt.msg = SCD_MSG_REPLY_GET_CPU_MAPPING;
pckt.arg = packet->arg;
syscall_channel_send(c, &pckt);
ret = 0;
break;
default:
kprintf("syscall_pakcet_handler:unknown message "
"(%d.%d.%d.%d.%d.%#lx)\n",

View File

@ -32,4 +32,6 @@ extern void cpu_sysfs_setup(void);
extern char *find_command_line(char *name);
extern int num_processors;
#endif

View File

@ -17,8 +17,9 @@
struct page {
struct list_head list;
struct list_head hash;
uint8_t mode;
uint8_t padding[3];
uint64_t phys;
ihk_atomic_t count;
off_t offset;
};
@ -38,9 +39,8 @@ enum page_mode {
struct page *phys_to_page(uintptr_t phys);
uintptr_t page_to_phys(struct page *page);
int page_unmap(struct page *page);
struct page *phys_to_page_insert_hash(uint64_t phys);
void *allocate_pages(int npages, enum ihk_mc_ap_flag flag);
void free_pages(void *va, int npages);
void begin_free_pages_pending(void);
void finish_free_pages_pending(void);

View File

@ -22,6 +22,7 @@
#include <memobj.h>
#include <affinity.h>
#include <syscall.h>
#include <bitops.h>
#define VR_NONE 0x0
#define VR_STACK 0x1
@ -165,9 +166,74 @@
#define NOPHYS ((uintptr_t)-1)
#define PROCESS_NUMA_MASK_BITS 64
/*
* Both the MPOL_* mempolicy mode and the MPOL_F_* optional mode flags are
* passed by the user to either set_mempolicy() or mbind() in an 'int' actual.
* The MPOL_MODE_FLAGS macro determines the legal set of optional mode flags.
*/
/* Policies */
enum {
MPOL_DEFAULT,
MPOL_PREFERRED,
MPOL_BIND,
MPOL_INTERLEAVE,
MPOL_LOCAL,
MPOL_MAX, /* always last member of enum */
};
enum mpol_rebind_step {
MPOL_REBIND_ONCE, /* do rebind work at once(not by two step) */
MPOL_REBIND_STEP1, /* first step(set all the newly nodes) */
MPOL_REBIND_STEP2, /* second step(clean all the disallowed nodes)*/
MPOL_REBIND_NSTEP,
};
/* Flags for set_mempolicy */
#define MPOL_F_STATIC_NODES (1 << 15)
#define MPOL_F_RELATIVE_NODES (1 << 14)
/*
* MPOL_MODE_FLAGS is the union of all possible optional mode flags passed to
* either set_mempolicy() or mbind().
*/
#define MPOL_MODE_FLAGS (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES)
/* Flags for get_mempolicy */
#define MPOL_F_NODE (1<<0) /* return next IL mode instead of node mask */
#define MPOL_F_ADDR (1<<1) /* look up vma using address */
#define MPOL_F_MEMS_ALLOWED (1<<2) /* return allowed memories */
/* Flags for mbind */
#define MPOL_MF_STRICT (1<<0) /* Verify existing pages in the mapping */
#define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform
to policy */
#define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to policy */
#define MPOL_MF_LAZY (1<<3) /* Modifies '_MOVE: lazy migrate on fault */
#define MPOL_MF_INTERNAL (1<<4) /* Internal flags start here */
#define MPOL_MF_VALID (MPOL_MF_STRICT | \
MPOL_MF_MOVE | \
MPOL_MF_MOVE_ALL)
/*
* Internal flags that share the struct mempolicy flags word with
* "mode flags". These flags are allocated from bit 0 up, as they
* are never OR'ed into the mode in mempolicy API arguments.
*/
#define MPOL_F_SHARED (1 << 0) /* identify shared policies */
#define MPOL_F_LOCAL (1 << 1) /* preferred local allocation */
#define MPOL_F_REBINDING (1 << 2) /* identify policies in rebinding */
#define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */
#define MPOL_F_MORON (1 << 4) /* Migrate On pte_numa Reference On Node */
#include <waitq.h>
#include <futex.h>
//#define TRACK_SYSCALLS
struct resource_set;
struct process_hash;
struct thread_hash;
@ -305,6 +371,13 @@ struct vm_range {
int padding;
};
struct vm_range_numa_policy {
struct list_head list;
unsigned long start, end;
DECLARE_BITMAP(numa_mask, PROCESS_NUMA_MASK_BITS);
int numa_mem_policy;
};
struct vm_regions {
unsigned long vm_start, vm_end;
unsigned long text_start, text_end;
@ -334,7 +407,7 @@ struct mckfd {
#define SFD_NONBLOCK 04000
struct sig_common {
ihk_spinlock_t lock;
mcs_rwlock_lock_t lock;
ihk_atomic_t use;
struct k_sigaction action[_NSIG];
struct list_head sigpending;
@ -395,7 +468,7 @@ struct process {
// V +---- |
// PS_STOPPED -----+
// (PS_TRACED)
int exit_status;
int exit_status; // only for zombie
/* Store exit_status for a group of threads when stopped by SIGSTOP.
exit_status can't be used because values of exit_status of threads
@ -507,6 +580,7 @@ struct thread {
// PS_TRACED
// PS_INTERRPUTIBLE
// PS_UNINTERRUPTIBLE
int exit_status;
// process vm
struct process_vm *vm;
@ -537,12 +611,20 @@ struct thread {
fp_regs_struct *fp_regs;
int in_syscall_offload;
#ifdef TRACK_SYSCALLS
int socc_enabled;
uint64_t *syscall_times;
uint32_t *syscall_cnts;
uint64_t *offload_times;
uint32_t *offload_cnts;
#endif // TRACK_SYSCALLS
// signal
struct sig_common *sigcommon;
sigset_t sigmask;
stack_t sigstack;
struct list_head sigpending;
ihk_spinlock_t sigpendinglock;
mcs_rwlock_lock_t sigpendinglock;
volatile int sigevent;
// gpio
@ -572,6 +654,8 @@ struct thread {
struct waitq scd_wq;
};
#define VM_RANGE_CACHE_SIZE 4
struct process_vm {
struct address_space *address_space;
struct list_head vm_range_list;
@ -594,6 +678,12 @@ struct process_vm {
int exiting;
long currss;
DECLARE_BITMAP(numa_mask, PROCESS_NUMA_MASK_BITS);
int numa_mem_policy;
/* Protected by memory_range_lock */
struct list_head vm_range_numa_policy_list;
struct vm_range *range_cache[VM_RANGE_CACHE_SIZE];
int range_cache_ind;
};
static inline int has_cap_ipc_lock(struct thread *th)
@ -690,7 +780,5 @@ void chain_thread(struct thread *);
void proc_init();
void set_timer();
struct sig_pending *hassigpending(struct thread *thread);
void settid(struct thread *thread, int mode, int newcpuid, int oldcpuid,
int nr_tids, int *tids);
#endif

View File

@ -331,7 +331,7 @@ void delete_proc_procfs_files(int pid);
void create_os_procfs_files(void);
void delete_os_procfs_files(void);
#define PROCFS_NAME_MAX 1000
#define PROCFS_NAME_MAX 768
struct procfs_read {
unsigned long pbuf; /* physical address of the host buffer (request) */

View File

@ -320,7 +320,7 @@ static void setup_remote_snooping_samples(void)
static void populate_sysfs(void)
{
cpu_sysfs_setup();
setup_remote_snooping_samples();
//setup_remote_snooping_samples();
} /* populate_sysfs() */
int host_ikc_inited = 0;

File diff suppressed because it is too large Load Diff

View File

@ -74,7 +74,6 @@ init_process(struct process *proc, struct process *parent)
{
/* These will be filled out when changing status */
proc->pid = -1;
proc->exit_status = -1;
proc->status = PS_RUNNING;
if(parent){
@ -204,15 +203,33 @@ detach_address_space(struct address_space *asp, int pid)
static int
init_process_vm(struct process *owner, struct address_space *asp, struct process_vm *vm)
{
int i;
ihk_mc_spinlock_init(&vm->memory_range_lock);
ihk_mc_spinlock_init(&vm->page_table_lock);
ihk_atomic_set(&vm->refcount, 1);
INIT_LIST_HEAD(&vm->vm_range_list);
INIT_LIST_HEAD(&vm->vm_range_numa_policy_list);
vm->address_space = asp;
vm->proc = owner;
vm->exiting = 0;
memset(&vm->numa_mask, 0, sizeof(vm->numa_mask));
for (i = 0; i < ihk_mc_get_nr_numa_nodes(); ++i) {
if (i >= PROCESS_NUMA_MASK_BITS) {
kprintf("%s: error: NUMA id is larger than mask size!\n",
__FUNCTION__);
break;
}
set_bit(i, &vm->numa_mask[0]);
}
vm->numa_mem_policy = MPOL_DEFAULT;
for (i = 0; i < VM_RANGE_CACHE_SIZE; ++i) {
vm->range_cache[i] = NULL;
}
vm->range_cache_ind = 0;
return 0;
}
@ -260,10 +277,10 @@ create_thread(unsigned long user_pc)
dkprintf("fork(): sigshared\n");
ihk_atomic_set(&thread->sigcommon->use, 1);
ihk_mc_spinlock_init(&thread->sigcommon->lock);
mcs_rwlock_init(&thread->sigcommon->lock);
INIT_LIST_HEAD(&thread->sigcommon->sigpending);
ihk_mc_spinlock_init(&thread->sigpendinglock);
mcs_rwlock_init(&thread->sigpendinglock);
INIT_LIST_HEAD(&thread->sigpending);
thread->sigstack.ss_sp = NULL;
@ -280,6 +297,7 @@ create_thread(unsigned long user_pc)
if(init_process_vm(proc, asp, vm) != 0){
goto err;
}
thread->exit_status = -1;
cpu_set(ihk_mc_get_processor_id(), &thread->vm->address_space->cpu_set,
&thread->vm->address_space->cpu_set_lock);
@ -371,6 +389,11 @@ clone_thread(struct thread *org, unsigned long pc, unsigned long sp,
kfree(proc);
goto err_free_proc;
}
memcpy(&proc->vm->numa_mask, &org->vm->numa_mask,
sizeof(proc->vm->numa_mask));
proc->vm->numa_mem_policy =
org->vm->numa_mem_policy;
thread->proc = proc;
thread->vm = proc->vm;
@ -418,11 +441,11 @@ clone_thread(struct thread *org, unsigned long pc, unsigned long sp,
memcpy(thread->sigcommon->action, org->sigcommon->action,
sizeof(struct k_sigaction) * _NSIG);
ihk_atomic_set(&thread->sigcommon->use, 1);
ihk_mc_spinlock_init(&thread->sigcommon->lock);
mcs_rwlock_init(&thread->sigcommon->lock);
INIT_LIST_HEAD(&thread->sigcommon->sigpending);
// TODO: copy signalfd
}
ihk_mc_spinlock_init(&thread->sigpendinglock);
mcs_rwlock_init(&thread->sigpendinglock);
INIT_LIST_HEAD(&thread->sigpending);
thread->sigmask = org->sigmask;
@ -716,6 +739,7 @@ int join_process_memory_range(struct process_vm *vm,
struct vm_range *surviving, struct vm_range *merging)
{
int error;
int i;
dkprintf("join_process_memory_range(%p,%lx-%lx,%lx-%lx)\n",
vm, surviving->start, surviving->end,
@ -744,6 +768,10 @@ int join_process_memory_range(struct process_vm *vm,
memobj_release(merging->memobj);
}
list_del(&merging->list);
for (i = 0; i < VM_RANGE_CACHE_SIZE; ++i) {
if (vm->range_cache[i] == merging)
vm->range_cache[i] = surviving;
}
kfree(merging);
error = 0;
@ -757,7 +785,7 @@ int free_process_memory_range(struct process_vm *vm, struct vm_range *range)
{
const intptr_t start0 = range->start;
const intptr_t end0 = range->end;
int error;
int error, i;
intptr_t start;
intptr_t end;
struct vm_range *neighbor;
@ -842,6 +870,10 @@ int free_process_memory_range(struct process_vm *vm, struct vm_range *range)
}
list_del(&range->list);
for (i = 0; i < VM_RANGE_CACHE_SIZE; ++i) {
if (vm->range_cache[i] == range)
vm->range_cache[i] = NULL;
}
kfree(range);
dkprintf("free_process_memory_range(%p,%lx-%lx): 0\n",
@ -1058,6 +1090,7 @@ int add_process_memory_range(struct process_vm *vm,
struct vm_range *lookup_process_memory_range(
struct process_vm *vm, uintptr_t start, uintptr_t end)
{
int i;
struct vm_range *range = NULL;
dkprintf("lookup_process_memory_range(%p,%lx,%lx)\n", vm, start, end);
@ -1066,6 +1099,16 @@ struct vm_range *lookup_process_memory_range(
goto out;
}
for (i = 0; i < VM_RANGE_CACHE_SIZE; ++i) {
int c_i = (i + vm->range_cache_ind) % VM_RANGE_CACHE_SIZE;
if (!vm->range_cache[c_i])
continue;
if (vm->range_cache[c_i]->start <= start &&
vm->range_cache[c_i]->end >= end)
return vm->range_cache[c_i];
}
list_for_each_entry(range, &vm->vm_range_list, list) {
if (end <= range->start) {
break;
@ -1077,6 +1120,12 @@ struct vm_range *lookup_process_memory_range(
range = NULL;
out:
if (range) {
vm->range_cache_ind = (vm->range_cache_ind - 1 + VM_RANGE_CACHE_SIZE)
% VM_RANGE_CACHE_SIZE;
vm->range_cache[vm->range_cache_ind] = range;
}
dkprintf("lookup_process_memory_range(%p,%lx,%lx): %p %lx-%lx\n",
vm, start, end, range,
range? range->start: 0, range? range->end: 0);
@ -1517,7 +1566,7 @@ static int page_fault_process_memory_range(struct process_vm *vm, struct vm_rang
}
}
if (phys == NOPHYS) {
void *virt;
void *virt = NULL;
size_t npages;
retry:
@ -1542,7 +1591,9 @@ retry:
__FUNCTION__, pgaddr, pgsize);
memset(virt, 0, pgsize);
phys = virt_to_phys(virt);
page_map(phys_to_page(phys));
if (phys_to_page(phys)) {
page_map(phys_to_page(phys));
}
}
}
else {
@ -1554,10 +1605,12 @@ retry:
attr = arch_vrflag_to_ptattr(range->flag | memobj_flag, reason, ptep);
/*****/
if (((range->flag & VR_PRIVATE)
|| ((reason & PF_PATCH)
&& !(range->flag & VR_PROT_WRITE)))
&& (!page || page_is_in_memobj(page) || page_is_multi_mapped(page))) {
if (((range->flag & VR_PRIVATE) ||
((reason & PF_PATCH) && !(range->flag & VR_PROT_WRITE)))
&& ((!page && phys == NOPHYS) || (page &&
(page_is_in_memobj(page) ||
page_is_multi_mapped(page))))) {
if (!(attr & PTATTR_DIRTY)) {
attr &= ~PTATTR_WRITABLE;
}
@ -1637,10 +1690,9 @@ static int do_page_fault_process_vm(struct process_vm *vm, void *fault_addr0, ui
range = lookup_process_memory_range(vm, fault_addr, fault_addr+1);
if (range == NULL) {
error = -EFAULT;
dkprintf("[%d]do_page_fault_process_vm(%p,%lx,%lx):"
"out of range. %d\n",
ihk_mc_get_processor_id(), vm,
fault_addr0, reason, error);
dkprintf("do_page_fault_process_vm(): vm: %p, addr: %p, reason: %lx):"
"out of range: %d\n",
vm, fault_addr0, reason, error);
goto out;
}
@ -1670,10 +1722,18 @@ static int do_page_fault_process_vm(struct process_vm *vm, void *fault_addr0, ui
kprintf("if (((range->flag & VR_PROT_MASK) == VR_PROT_NONE))\n");
if (((reason & PF_WRITE) && !(reason & PF_PATCH)))
kprintf("if (((reason & PF_WRITE) && !(reason & PF_PATCH)))\n");
if (!(range->flag & VR_PROT_WRITE))
if (!(range->flag & VR_PROT_WRITE)) {
kprintf("if (!(range->flag & VR_PROT_WRITE))\n");
if ((reason & PF_INSTR) && !(range->flag & VR_PROT_EXEC))
//kprintf("setting VR_PROT_WRITE\n");
//range->flag |= VR_PROT_WRITE;
//goto cont;
}
if ((reason & PF_INSTR) && !(range->flag & VR_PROT_EXEC)) {
kprintf("if ((reason & PF_INSTR) && !(range->flag & VR_PROT_EXEC))\n");
//kprintf("setting VR_PROT_EXEC\n");
//range->flag |= VR_PROT_EXEC;
//goto cont;
}
goto out;
}
@ -2462,6 +2522,7 @@ void sched_init(void)
ihk_mc_init_context(&idle_thread->ctx, NULL, idle);
ihk_mc_spinlock_init(&idle_thread->vm->memory_range_lock);
INIT_LIST_HEAD(&idle_thread->vm->vm_range_list);
INIT_LIST_HEAD(&idle_thread->vm->vm_range_numa_policy_list);
idle_thread->proc->pid = 0;
idle_thread->tid = ihk_mc_get_processor_id();
@ -2551,7 +2612,6 @@ static void do_migrate(void)
v->flags |= CPU_FLAG_NEED_RESCHED;
ihk_mc_interrupt_cpu(get_x86_cpu_local_variable(cpu_id)->apic_id, 0xd1);
double_rq_unlock(cur_v, v, irqstate);
//settid(req->thread, 2, cpu_id, old_cpu_id, 0, NULL);
ack:
waitq_wakeup(&req->wq);
@ -2664,7 +2724,9 @@ redo:
restore_fp_regs(next);
}
ihk_mc_load_page_table(next->vm->address_space->page_table);
if (prev && prev->vm->address_space->page_table !=
next->vm->address_space->page_table)
ihk_mc_load_page_table(next->vm->address_space->page_table);
dkprintf("[%d] schedule: tlsblock_base: 0x%lX\n",
ihk_mc_get_processor_id(), next->tlsblock_base);
@ -2896,6 +2958,7 @@ find_thread(int pid, int tid, struct mcs_rwlock_node_irqsave *lock)
if(tid <= 0)
return NULL;
mcs_rwlock_reader_lock(&thash->lock[hash], lock);
retry:
list_for_each_entry(thread, &thash->list[hash], hash_list){
if(thread->tid == tid){
if(pid <= 0)
@ -2904,6 +2967,13 @@ find_thread(int pid, int tid, struct mcs_rwlock_node_irqsave *lock)
return thread;
}
}
/* If no thread with pid == tid was found, then we may be looking for a
* specific thread (not the main thread of the process), try to find it
* based on tid only */
if (pid > 0 && pid == tid) {
pid = 0;
goto retry;
}
mcs_rwlock_reader_unlock(&thash->lock[hash], lock);
return NULL;
}

View File

@ -23,6 +23,8 @@
#include <process.h>
#include <page.h>
#include <mman.h>
#include <bitmap.h>
#include <init.h>
//#define DEBUG_PRINT_PROCFS
@ -35,6 +37,7 @@
extern int snprintf(char * buf, size_t size, const char *fmt, ...);
extern int sprintf(char * buf, const char *fmt, ...);
extern int sscanf(const char * buf, const char * fmt, ...);
extern int scnprintf(char * buf, size_t size, const char *fmt, ...);
extern int osnum;
@ -404,12 +407,34 @@ process_procfs_request(unsigned long rarg)
/*
* mcos%d/PID/status
*/
#define BITMASKS_BUF_SIZE 2048
if (strcmp(p, "status") == 0) {
extern int num_processors; /* kernel/ap.c */
struct vm_range *range;
unsigned long lockedsize = 0;
char tmp[1024];
char *tmp;
char *bitmasks;
int bitmasks_offset = 0;
char *cpu_bitmask, *cpu_list, *numa_bitmask, *numa_list;
int len;
tmp = kmalloc(8192, IHK_MC_AP_CRITICAL);
if (!tmp) {
kprintf("%s: error allocating /proc/self/status buffer\n",
__FUNCTION__);
ans = 0;
goto end;
}
bitmasks = kmalloc(BITMASKS_BUF_SIZE, IHK_MC_AP_CRITICAL);
if (!tmp) {
kprintf("%s: error allocating /proc/self/status bitmaks buffer\n",
__FUNCTION__);
kfree(tmp);
ans = 0;
goto end;
}
ihk_mc_spinlock_lock_noirq(&proc->vm->memory_range_lock);
list_for_each_entry(range, &proc->vm->vm_range_list, list) {
if(range->flag & VR_LOCKED)
@ -417,13 +442,42 @@ process_procfs_request(unsigned long rarg)
}
ihk_mc_spinlock_unlock_noirq(&proc->vm->memory_range_lock);
cpu_bitmask = &bitmasks[bitmasks_offset];
bitmasks_offset += bitmap_scnprintf(cpu_bitmask,
BITMASKS_BUF_SIZE - bitmasks_offset,
thread->cpu_set.__bits, num_processors);
bitmasks_offset++;
cpu_list = &bitmasks[bitmasks_offset];
bitmasks_offset += bitmap_scnlistprintf(cpu_list,
BITMASKS_BUF_SIZE - bitmasks_offset,
thread->cpu_set.__bits, __CPU_SETSIZE);
bitmasks_offset++;
numa_bitmask = &bitmasks[bitmasks_offset];
bitmasks_offset += bitmap_scnprintf(numa_bitmask,
BITMASKS_BUF_SIZE - bitmasks_offset,
proc->vm->numa_mask, PROCESS_NUMA_MASK_BITS);
bitmasks_offset++;
numa_list = &bitmasks[bitmasks_offset];
bitmasks_offset += bitmap_scnlistprintf(numa_list,
BITMASKS_BUF_SIZE - bitmasks_offset,
proc->vm->numa_mask, PROCESS_NUMA_MASK_BITS);
bitmasks_offset++;
sprintf(tmp,
"Uid:\t%d\t%d\t%d\t%d\n"
"Gid:\t%d\t%d\t%d\t%d\n"
"VmLck:\t%9lu kB\n",
"VmLck:\t%9lu kB\n"
"Cpus_allowed:\t%s\n"
"Cpus_allowed_list:\t%s\n"
"Mems_allowed:\t%s\n"
"Mems_allowed_list:\t%s\n",
proc->ruid, proc->euid, proc->suid, proc->fsuid,
proc->rgid, proc->egid, proc->sgid, proc->fsgid,
(lockedsize + 1023) >> 10);
(lockedsize + 1023) >> 10,
cpu_bitmask, cpu_list, numa_bitmask, numa_list);
len = strlen(tmp);
if (r->offset < len) {
if (r->offset + r->count < len) {
@ -437,6 +491,8 @@ process_procfs_request(unsigned long rarg)
ans = 0;
eof = 1;
}
kfree(tmp);
kfree(bitmasks);
goto end;
}

View File

@ -240,14 +240,24 @@ void shmobj_destroy(struct shmobj *obj)
npages = (size_t)1 << (obj->pgshift - PAGE_SHIFT);
for (;;) {
struct page *page;
int count;
void *page_va;
page = page_list_first(obj);
if (!page) {
break;
}
page_list_remove(obj, page);
page_va = phys_to_virt(page_to_phys(page));
if (ihk_atomic_read(&page->count) != 1) {
kprintf("%s: WARNING: page count for phys 0x%lx is invalid\n",
__FUNCTION__, page->phys);
}
if (page_unmap(page)) {
ihk_mc_free_pages(page_va, npages);
}
#if 0
dkprintf("shmobj_destroy(%p):"
"release page. %p %#lx %d %d",
obj, page, page_to_phys(page),
@ -265,7 +275,8 @@ void shmobj_destroy(struct shmobj *obj)
}
page->mode = PM_NONE;
free_pages(phys_to_virt(page_to_phys(page)), npages);
ihk_mc_free_pages(phys_to_virt(page_to_phys(page)), npages);
#endif
}
if (obj->index < 0) {
kfree(obj);
@ -404,7 +415,7 @@ static int shmobj_get_page(struct memobj *memobj, off_t off, int p2align,
goto out;
}
phys = virt_to_phys(virt);
page = phys_to_page(phys);
page = phys_to_page_insert_hash(phys);
if (page->mode != PM_NONE) {
fkprintf("shmobj_get_page(%p,%#lx,%d,%p):"
"page %p %#lx %d %d %#lx\n",

File diff suppressed because it is too large Load Diff

View File

@ -112,7 +112,7 @@ static int alloc_zeroobj(void)
goto out;
}
phys = virt_to_phys(virt);
page = phys_to_page(phys);
page = phys_to_page_insert_hash(phys);
if (page->mode != PM_NONE) {
fkprintf("alloc_zeroobj():"

1179
lib/bitmap.c Normal file

File diff suppressed because it is too large Load Diff

View File

@ -140,3 +140,58 @@ found:
return result + ffz(tmp);
}
/**
* hweightN - returns the hamming weight of a N-bit word
* @x: the word to weigh
*
* The Hamming Weight of a number is the total number of bits set in it.
*/
unsigned int __sw_hweight32(unsigned int w)
{
#ifdef ARCH_HAS_FAST_MULTIPLIER
w -= (w >> 1) & 0x55555555;
w = (w & 0x33333333) + ((w >> 2) & 0x33333333);
w = (w + (w >> 4)) & 0x0f0f0f0f;
return (w * 0x01010101) >> 24;
#else
unsigned int res = w - ((w >> 1) & 0x55555555);
res = (res & 0x33333333) + ((res >> 2) & 0x33333333);
res = (res + (res >> 4)) & 0x0F0F0F0F;
res = res + (res >> 8);
return (res + (res >> 16)) & 0x000000FF;
#endif
}
unsigned int __sw_hweight16(unsigned int w)
{
unsigned int res = w - ((w >> 1) & 0x5555);
res = (res & 0x3333) + ((res >> 2) & 0x3333);
res = (res + (res >> 4)) & 0x0F0F;
return (res + (res >> 8)) & 0x00FF;
}
unsigned int __sw_hweight8(unsigned int w)
{
unsigned int res = w - ((w >> 1) & 0x55);
res = (res & 0x33) + ((res >> 2) & 0x33);
return (res + (res >> 4)) & 0x0F;
}
unsigned long __sw_hweight64(uint64_t w)
{
#ifdef ARCH_HAS_FAST_MULTIPLIER
w -= (w >> 1) & 0x5555555555555555ul;
w = (w & 0x3333333333333333ul) + ((w >> 2) & 0x3333333333333333ul);
w = (w + (w >> 4)) & 0x0f0f0f0f0f0f0f0ful;
return (w * 0x0101010101010101ul) >> 56;
#else
uint64_t res = w - ((w >> 1) & 0x5555555555555555ul);
res = (res & 0x3333333333333333ul) + ((res >> 2) & 0x3333333333333333ul);
res = (res + (res >> 4)) & 0x0F0F0F0F0F0F0F0Ful;
res = res + (res >> 8);
res = res + (res >> 16);
return (res + (res >> 32)) & 0x00000000000000FFul;
#endif
}

307
lib/include/bitmap.h Normal file
View File

@ -0,0 +1,307 @@
#ifndef __LINUX_BITMAP_H
#define __LINUX_BITMAP_H
#include <types.h>
#include <bitops.h>
#include <string.h>
/*
* bitmaps provide bit arrays that consume one or more unsigned
* longs. The bitmap interface and available operations are listed
* here, in bitmap.h
*
* Function implementations generic to all architectures are in
* lib/bitmap.c. Functions implementations that are architecture
* specific are in various include/asm-<arch>/bitops.h headers
* and other arch/<arch> specific files.
*
* See lib/bitmap.c for more details.
*/
/*
* The available bitmap operations and their rough meaning in the
* case that the bitmap is a single unsigned long are thus:
*
* Note that nbits should be always a compile time evaluable constant.
* Otherwise many inlines will generate horrible code.
*
* bitmap_zero(dst, nbits) *dst = 0UL
* bitmap_fill(dst, nbits) *dst = ~0UL
* bitmap_copy(dst, src, nbits) *dst = *src
* bitmap_and(dst, src1, src2, nbits) *dst = *src1 & *src2
* bitmap_or(dst, src1, src2, nbits) *dst = *src1 | *src2
* bitmap_xor(dst, src1, src2, nbits) *dst = *src1 ^ *src2
* bitmap_andnot(dst, src1, src2, nbits) *dst = *src1 & ~(*src2)
* bitmap_complement(dst, src, nbits) *dst = ~(*src)
* bitmap_equal(src1, src2, nbits) Are *src1 and *src2 equal?
* bitmap_intersects(src1, src2, nbits) Do *src1 and *src2 overlap?
* bitmap_subset(src1, src2, nbits) Is *src1 a subset of *src2?
* bitmap_empty(src, nbits) Are all bits zero in *src?
* bitmap_full(src, nbits) Are all bits set in *src?
* bitmap_weight(src, nbits) Hamming Weight: number set bits
* bitmap_set(dst, pos, nbits) Set specified bit area
* bitmap_clear(dst, pos, nbits) Clear specified bit area
* bitmap_find_next_zero_area(buf, len, pos, n, mask) Find bit free area
* bitmap_shift_right(dst, src, n, nbits) *dst = *src >> n
* bitmap_shift_left(dst, src, n, nbits) *dst = *src << n
* bitmap_remap(dst, src, old, new, nbits) *dst = map(old, new)(src)
* bitmap_bitremap(oldbit, old, new, nbits) newbit = map(old, new)(oldbit)
* bitmap_onto(dst, orig, relmap, nbits) *dst = orig relative to relmap
* bitmap_fold(dst, orig, sz, nbits) dst bits = orig bits mod sz
* bitmap_scnprintf(buf, len, src, nbits) Print bitmap src to buf
* bitmap_parse(buf, buflen, dst, nbits) Parse bitmap dst from kernel buf
* bitmap_parse_user(ubuf, ulen, dst, nbits) Parse bitmap dst from user buf
* bitmap_scnlistprintf(buf, len, src, nbits) Print bitmap src as list to buf
* bitmap_parselist(buf, dst, nbits) Parse bitmap dst from kernel buf
* bitmap_parselist_user(buf, dst, nbits) Parse bitmap dst from user buf
* bitmap_find_free_region(bitmap, bits, order) Find and allocate bit region
* bitmap_release_region(bitmap, pos, order) Free specified bit region
* bitmap_allocate_region(bitmap, pos, order) Allocate specified bit region
*/
/*
* Also the following operations in asm/bitops.h apply to bitmaps.
*
* set_bit(bit, addr) *addr |= bit
* clear_bit(bit, addr) *addr &= ~bit
* change_bit(bit, addr) *addr ^= bit
* test_bit(bit, addr) Is bit set in *addr?
* test_and_set_bit(bit, addr) Set bit and return old value
* test_and_clear_bit(bit, addr) Clear bit and return old value
* test_and_change_bit(bit, addr) Change bit and return old value
* find_first_zero_bit(addr, nbits) Position first zero bit in *addr
* find_first_bit(addr, nbits) Position first set bit in *addr
* find_next_zero_bit(addr, nbits, bit) Position next zero bit in *addr >= bit
* find_next_bit(addr, nbits, bit) Position next set bit in *addr >= bit
*/
/*
* The DECLARE_BITMAP(name,bits) macro, in linux/types.h, can be used
* to declare an array named 'name' of just enough unsigned longs to
* contain all bit positions from 0 to 'bits' - 1.
*/
/*
* lib/bitmap.c provides these functions:
*/
#define __user
#define __force
#define u32 uint32_t
extern int __bitmap_empty(const unsigned long *bitmap, int bits);
extern int __bitmap_full(const unsigned long *bitmap, int bits);
extern int __bitmap_equal(const unsigned long *bitmap1,
const unsigned long *bitmap2, int bits);
extern void __bitmap_complement(unsigned long *dst, const unsigned long *src,
int bits);
extern void __bitmap_shift_right(unsigned long *dst,
const unsigned long *src, int shift, int bits);
extern void __bitmap_shift_left(unsigned long *dst,
const unsigned long *src, int shift, int bits);
extern int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
const unsigned long *bitmap2, int bits);
extern void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1,
const unsigned long *bitmap2, int bits);
extern void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1,
const unsigned long *bitmap2, int bits);
extern int __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1,
const unsigned long *bitmap2, int bits);
extern int __bitmap_intersects(const unsigned long *bitmap1,
const unsigned long *bitmap2, int bits);
extern int __bitmap_subset(const unsigned long *bitmap1,
const unsigned long *bitmap2, int bits);
extern int __bitmap_weight(const unsigned long *bitmap, int bits);
extern void bitmap_set(unsigned long *map, int i, int len);
extern void bitmap_clear(unsigned long *map, int start, int nr);
extern unsigned long bitmap_find_next_zero_area(unsigned long *map,
unsigned long size,
unsigned long start,
unsigned int nr,
unsigned long align_mask);
extern int bitmap_scnprintf(char *buf, unsigned int len,
const unsigned long *src, int nbits);
extern int __bitmap_parse(const char *buf, unsigned int buflen, int is_user,
unsigned long *dst, int nbits);
extern int bitmap_parse_user(const char __user *ubuf, unsigned int ulen,
unsigned long *dst, int nbits);
extern int bitmap_scnlistprintf(char *buf, unsigned int len,
const unsigned long *src, int nbits);
extern int bitmap_parselist(const char *buf, unsigned long *maskp,
int nmaskbits);
extern int bitmap_parselist_user(const char __user *ubuf, unsigned int ulen,
unsigned long *dst, int nbits);
extern void bitmap_remap(unsigned long *dst, const unsigned long *src,
const unsigned long *old, const unsigned long *new, int bits);
extern int bitmap_bitremap(int oldbit,
const unsigned long *old, const unsigned long *new, int bits);
extern void bitmap_onto(unsigned long *dst, const unsigned long *orig,
const unsigned long *relmap, int bits);
extern void bitmap_fold(unsigned long *dst, const unsigned long *orig,
int sz, int bits);
extern int bitmap_find_free_region(unsigned long *bitmap, int bits, int order);
extern void bitmap_release_region(unsigned long *bitmap, int pos, int order);
extern int bitmap_allocate_region(unsigned long *bitmap, int pos, int order);
extern int bitmap_ord_to_pos(const unsigned long *bitmap, int n, int bits);
#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) % BITS_PER_LONG))
#define BITMAP_LAST_WORD_MASK(nbits) \
( \
((nbits) % BITS_PER_LONG) ? \
(1UL<<((nbits) % BITS_PER_LONG))-1 : ~0UL \
)
#define small_const_nbits(nbits) \
(__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG)
static inline void bitmap_zero(unsigned long *dst, int nbits)
{
if (small_const_nbits(nbits))
*dst = 0UL;
else {
int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long);
memset(dst, 0, len);
}
}
static inline void bitmap_fill(unsigned long *dst, int nbits)
{
size_t nlongs = BITS_TO_LONGS(nbits);
if (!small_const_nbits(nbits)) {
int len = (nlongs - 1) * sizeof(unsigned long);
memset(dst, 0xff, len);
}
dst[nlongs - 1] = BITMAP_LAST_WORD_MASK(nbits);
}
static inline void bitmap_copy(unsigned long *dst, const unsigned long *src,
int nbits)
{
if (small_const_nbits(nbits))
*dst = *src;
else {
int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long);
memcpy(dst, src, len);
}
}
static inline int bitmap_and(unsigned long *dst, const unsigned long *src1,
const unsigned long *src2, int nbits)
{
if (small_const_nbits(nbits))
return (*dst = *src1 & *src2) != 0;
return __bitmap_and(dst, src1, src2, nbits);
}
static inline void bitmap_or(unsigned long *dst, const unsigned long *src1,
const unsigned long *src2, int nbits)
{
if (small_const_nbits(nbits))
*dst = *src1 | *src2;
else
__bitmap_or(dst, src1, src2, nbits);
}
static inline void bitmap_xor(unsigned long *dst, const unsigned long *src1,
const unsigned long *src2, int nbits)
{
if (small_const_nbits(nbits))
*dst = *src1 ^ *src2;
else
__bitmap_xor(dst, src1, src2, nbits);
}
static inline int bitmap_andnot(unsigned long *dst, const unsigned long *src1,
const unsigned long *src2, int nbits)
{
if (small_const_nbits(nbits))
return (*dst = *src1 & ~(*src2)) != 0;
return __bitmap_andnot(dst, src1, src2, nbits);
}
static inline void bitmap_complement(unsigned long *dst, const unsigned long *src,
int nbits)
{
if (small_const_nbits(nbits))
*dst = ~(*src) & BITMAP_LAST_WORD_MASK(nbits);
else
__bitmap_complement(dst, src, nbits);
}
static inline int bitmap_equal(const unsigned long *src1,
const unsigned long *src2, int nbits)
{
if (small_const_nbits(nbits))
return ! ((*src1 ^ *src2) & BITMAP_LAST_WORD_MASK(nbits));
else
return __bitmap_equal(src1, src2, nbits);
}
static inline int bitmap_intersects(const unsigned long *src1,
const unsigned long *src2, int nbits)
{
if (small_const_nbits(nbits))
return ((*src1 & *src2) & BITMAP_LAST_WORD_MASK(nbits)) != 0;
else
return __bitmap_intersects(src1, src2, nbits);
}
static inline int bitmap_subset(const unsigned long *src1,
const unsigned long *src2, int nbits)
{
if (small_const_nbits(nbits))
return ! ((*src1 & ~(*src2)) & BITMAP_LAST_WORD_MASK(nbits));
else
return __bitmap_subset(src1, src2, nbits);
}
static inline int bitmap_empty(const unsigned long *src, int nbits)
{
if (small_const_nbits(nbits))
return ! (*src & BITMAP_LAST_WORD_MASK(nbits));
else
return __bitmap_empty(src, nbits);
}
static inline int bitmap_full(const unsigned long *src, int nbits)
{
if (small_const_nbits(nbits))
return ! (~(*src) & BITMAP_LAST_WORD_MASK(nbits));
else
return __bitmap_full(src, nbits);
}
static inline int bitmap_weight(const unsigned long *src, int nbits)
{
if (small_const_nbits(nbits))
return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits));
return __bitmap_weight(src, nbits);
}
static inline void bitmap_shift_right(unsigned long *dst,
const unsigned long *src, int n, int nbits)
{
if (small_const_nbits(nbits))
*dst = *src >> n;
else
__bitmap_shift_right(dst, src, n, nbits);
}
static inline void bitmap_shift_left(unsigned long *dst,
const unsigned long *src, int n, int nbits)
{
if (small_const_nbits(nbits))
*dst = (*src << n) & BITMAP_LAST_WORD_MASK(nbits);
else
__bitmap_shift_left(dst, src, n, nbits);
}
static inline int bitmap_parse(const char *buf, unsigned int buflen,
unsigned long *maskp, int nmaskbits)
{
return __bitmap_parse(buf, buflen, 0, maskp, nmaskbits);
}
#endif /* __LINUX_BITMAP_H */

View File

@ -27,6 +27,31 @@ unsigned long find_first_bit(const unsigned long *addr,
unsigned long find_first_zero_bit(const unsigned long *addr,
unsigned long size);
static inline int test_bit(int nr, const void *addr)
{
const uint32_t *p = (const uint32_t *)addr;
return ((1UL << (nr & 31)) & (p[nr >> 5])) != 0;
}
extern unsigned int __sw_hweight32(unsigned int w);
extern unsigned int __sw_hweight16(unsigned int w);
extern unsigned int __sw_hweight8(unsigned int w);
extern unsigned long __sw_hweight64(uint64_t w);
static inline unsigned long hweight_long(unsigned long w)
{
return sizeof(w) == 4 ? __sw_hweight32(w) : __sw_hweight64(w);
}
#define BIT(nr) (1UL << (nr))
#define BIT_WORD(nr) ((nr) / BITS_PER_LONG)
#define BITS_PER_BYTE 8
#define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask))
#define __ALIGN_MASK(x, mask) __ALIGN_KERNEL_MASK((x), (mask))
#define __ALIGN_KERNEL(x, a) __ALIGN_KERNEL_MASK(x, (typeof(x))(a) - 1)
#define ALIGN(x, a) __ALIGN_KERNEL((x), (a))
#endif /*__ASSEMBLY__*/
#include <arch-bitops.h>

View File

@ -55,6 +55,7 @@ struct ihk_mc_cpu_info *ihk_mc_get_cpu_info(void);
void ihk_mc_boot_cpu(int cpuid, unsigned long pc);
int ihk_mc_get_processor_id(void);
int ihk_mc_get_hardware_processor_id(void);
int ihk_mc_get_numa_id(void);
void ihk_mc_delay_us(int us);
void ihk_mc_set_syscall_handler(long (*handler)(int, ihk_mc_user_context_t *));

View File

@ -72,8 +72,11 @@ struct ihk_mc_memory_node {
unsigned long ihk_mc_get_memory_address(enum ihk_mc_gma_type, int);
void ihk_mc_reserve_arch_pages(unsigned long start, unsigned long end,
void (*cb)(unsigned long, unsigned long, int));
struct ihk_page_allocator_desc;
void ihk_mc_reserve_arch_pages(struct ihk_page_allocator_desc *pa_allocator,
unsigned long start, unsigned long end,
void (*cb)(struct ihk_page_allocator_desc *,
unsigned long, unsigned long, int));
struct ihk_mc_pa_ops {
void *(*alloc_page)(int, int, enum ihk_mc_ap_flag);
@ -100,14 +103,28 @@ void ihk_mc_map_micpa(unsigned long host_pa, unsigned long* mic_pa);
int ihk_mc_free_micpa(unsigned long mic_pa);
void ihk_mc_clean_micpa(void);
void *ihk_mc_alloc_aligned_pages(int npages, int p2align, enum ihk_mc_ap_flag flag);
void *ihk_mc_alloc_pages(int npages, enum ihk_mc_ap_flag flag);
void ihk_mc_free_pages(void *p, int npages);
void *_ihk_mc_alloc_aligned_pages(int npages, int p2align,
enum ihk_mc_ap_flag flag, char *file, int line);
#define ihk_mc_alloc_aligned_pages(npages, p2align, flag) ({\
void *r = _ihk_mc_alloc_aligned_pages(npages, p2align, flag, __FILE__, __LINE__);\
r;\
})
void *_ihk_mc_alloc_pages(int npages, enum ihk_mc_ap_flag flag,
char *file, int line);
#define ihk_mc_alloc_pages(npages, flag) ({\
void *r = _ihk_mc_alloc_pages(npages, flag, __FILE__, __LINE__);\
r;\
})
void _ihk_mc_free_pages(void *ptr, int npages, char *file, int line);
#define ihk_mc_free_pages(p, npages) ({\
_ihk_mc_free_pages(p, npages, __FILE__, __LINE__);\
})
void *ihk_mc_allocate(int size, int flag);
void ihk_mc_free(void *p);
void *arch_alloc_page(enum ihk_mc_ap_flag flag);
void arch_free_page(void *ptr);
int arch_get_smaller_page_size(void *args, size_t origsize, size_t *sizep, int *p2alignp);
typedef void *page_table_t;
@ -147,10 +164,24 @@ struct page_table *ihk_mc_pt_create(enum ihk_mc_ap_flag ap_flag);
/* XXX: proper use of struct page_table and page_table_t is unknown */
void ihk_mc_pt_destroy(struct page_table *pt);
void ihk_mc_load_page_table(struct page_table *pt);
int ihk_mc_pt_virt_to_phys_size(struct page_table *pt,
const void *virt,
unsigned long *phys,
unsigned long *size);
int ihk_mc_pt_virt_to_phys(struct page_table *pt,
const void *virt, unsigned long *phys);
uint64_t ihk_mc_pt_virt_to_pagemap(struct page_table *pt, unsigned long virt);
int ihk_mc_get_nr_numa_nodes(void);
struct smp_coreset;
int ihk_mc_get_numa_node(int id, int *linux_numa_id, int *type);
int ihk_mc_get_numa_distance(int i, int j);
int ihk_mc_get_nr_memory_chunks(void);
int ihk_mc_get_memory_chunk(int id,
unsigned long *start,
unsigned long *end,
int *numa_id);
void remote_flush_tlb_cpumask(struct process_vm *vm,
unsigned long addr, int cpu_id);

View File

@ -14,14 +14,24 @@
#ifndef __HEADER_GENERIC_IHK_PAGE_ALLOC
#define __HEADER_GENERIC_IHK_PAGE_ALLOC
#include <list.h>
/* XXX: Physical memory management shouldn't be part of IHK */
struct ihk_mc_numa_node {
int id;
int linux_numa_id;
int type;
struct list_head allocators;
};
struct ihk_page_allocator_desc {
unsigned long start;
unsigned long start, end;
unsigned int last;
unsigned int count;
unsigned int flag;
unsigned int shift;
ihk_spinlock_t lock;
unsigned int pad;
struct list_head list;
unsigned long map[0];
};

View File

@ -14,6 +14,7 @@
#define __STRING_H
#include <types.h>
#include <arch-string.h>
size_t strlen(const char *p);
size_t strnlen(const char *p, size_t maxlen);
@ -29,6 +30,17 @@ void *memcpy_long(void *dest, const void *src, size_t n);
int memcmp(const void *s1, const void *s2, size_t n);
void *memset(void *s, int n, size_t l);
#ifdef ARCH_FAST_MEMCPY
#define fast_memcpy __inline_memcpy
#else
#define fast_memcpy memcpy
#endif
extern int snprintf(char * buf, size_t size, const char *fmt, ...);
extern int sprintf(char * buf, const char *fmt, ...);
extern int sscanf(const char * buf, const char * fmt, ...);
extern int scnprintf(char * buf, size_t size, const char *fmt, ...);
unsigned long strtol(const char *cp, char **endp, unsigned int base);
int flatten_strings(int nr_strings, char *first, char **strings, char **flat);
int flatten_strings_from_user(int nr_strings, char *first, char **strings, char **flat);

View File

@ -52,7 +52,7 @@ void *__ihk_pagealloc_init(unsigned long start, unsigned long size,
desc = initial;
*pdescsize = descsize;
} else {
desc = (void *)allocate_pages(descsize, IHK_MC_AP_CRITICAL);
desc = (void *)ihk_mc_alloc_pages(descsize, IHK_MC_AP_CRITICAL);
}
if (!desc) {
kprintf("IHK: failed to allocate page-allocator-desc "\
@ -64,13 +64,14 @@ void *__ihk_pagealloc_init(unsigned long start, unsigned long size,
memset(desc, 0, descsize * PAGE_SIZE);
desc->start = start;
desc->end = start + size;
desc->last = 0;
desc->count = mapaligned >> 3;
desc->shift = page_shift;
desc->flag = flag;
kprintf("Page allocator: %lx - %lx (%d)\n", start, start + size,
page_shift);
//kprintf("page allocator @ %lx - %lx (%d)\n", start, start + size,
// page_shift);
ihk_mc_spinlock_init(&desc->lock);
@ -92,7 +93,7 @@ void ihk_pagealloc_destroy(void *__desc)
{
struct ihk_page_allocator_desc *desc = __desc;
free_pages(desc, desc->flag);
ihk_mc_free_pages(desc, desc->flag);
}
static unsigned long __ihk_pagealloc_large(struct ihk_page_allocator_desc *desc,