Compare commits

...

158 Commits
1.1.0 ... 1.2.2

Author SHA1 Message Date
ddc33821cf sched_yield(): avoid schedule for single thread 2016-12-05 18:10:20 +09:00
0ab7d02994 disable syscall tracker and eliminate interrupt_syscall debug msg 2016-12-05 18:10:20 +09:00
a8c4ab221b use MCS locks in signal handling code 2016-12-05 18:10:20 +09:00
87d36a7752 mcreboot-smp-x86: -t to enable turbo boost 2016-12-05 18:10:20 +09:00
998ded414c mcreboot-smp-x86: shorter sleep while waiting for /proc 2016-12-05 18:10:20 +09:00
f78d031e64 syscall and offload tracking (disabled by default) 2016-12-05 18:10:20 +09:00
4ab37dd34a schedule(): only load page table during context switch if it's different 2016-12-05 18:10:20 +09:00
8129dec2f7 Fix out-of-tree build
<build>/ihk/cokernel/Makefile.common is not found when
<build>/mckernel/kernel/Makefile tries to perform
"make -C <build>/ihk/{cokernel,ikc}" from mckernel/kernel
2016-12-01 16:44:01 +09:00
a1035a1878 fix out of tree build 2016-12-01 12:55:34 +09:00
db169c5f90 add gcc options (-ffreestanding -fno-tree-loop-distribute-patterns)
refs #299
2016-11-29 16:28:18 +09:00
bbb55ef261 sched_setparam: thread lock is necessary when update other thread data 2016-11-28 14:04:44 +09:00
1130cafe41 ptrace: fix handling for threads 2016-11-28 11:19:30 +09:00
a1cf27e232 sched_getaffinity(): fix error code for special invalid input 2016-11-28 05:50:01 +09:00
5a1ce99d87 mcexec: fix number of threads not to exceed thread_data array 2016-11-27 07:31:52 +09:00
c7db296e1b getcpu(): expose correct NUMA id 2016-11-26 09:29:09 +09:00
f634a750c5 sched_{set/get}affinity(): fix error codes (also fixes KMP_AFFINITY behavior) 2016-11-24 21:25:16 +09:00
d07a196c8e mcexec: enable the same number of threads as CPU cores 2016-11-24 16:40:52 +09:00
8c56c75d2c process_vm_read_writev(): fix base address check for EFAULT 2016-11-24 10:40:41 +09:00
e54895efde set_mempolicy(): debug msg 2016-11-23 08:53:26 +09:00
2f8cca2d6d memcpy(): faster version using ASM rep; movsl 2016-11-23 08:51:22 +09:00
64607152ee VM: introduction of range lookup cache 2016-11-23 08:48:44 +09:00
20383ad3d0 do_process_vm_read_writev(): page size awareness optimization 2016-11-23 08:47:32 +09:00
787d34f650 introduction of ihk_mc_pt_virt_to_phys_size() 2016-11-23 08:40:33 +09:00
ae618a0c68 mcexec: remount /proc in mcexec's file NS after exec() 2016-11-22 13:22:59 +09:00
f480376153 mcoverlayfs: support Linux kernel 4.6
add mcoverlayfs(linux-4.6.7 base)
2016-11-17 18:09:27 +09:00
e4b3a88fc6 mcexec_sys_umount(): remove debug print 2016-11-10 15:05:45 +09:00
69a5c53074 NUMA: hide non-existing nodes from /sys/devices/system/node listing 2016-11-05 16:12:08 +09:00
259583e936 mcreboot-smp-x86.sh: more white out of invalid NUMA info 2016-11-05 13:35:53 +09:00
0f826290d0 NUMA: get_mempolicy(), set_mempolicy() and mbind() implementation 2016-11-05 13:32:02 +09:00
e46f027894 mcexec/mcctrl: unmount cgroups (privately) which expose invalid NUMA info 2016-11-04 17:02:48 +09:00
3e093f6a40 sysfs: fix /sys/devices/system/node/online value 2016-11-03 16:10:29 +09:00
00996b551f mcreboot: white out non-existing NUMA information 2016-11-03 16:09:27 +09:00
24d8697cef mcexec: workaround for overlayed /sys FS directory lseek() bug
lseek() on directories under the /sys filesystem that are part of an
overlayed filesystem behaves differently than in the original /sys.
This causes a segfault in libnuma when discovering topology
information. The patch fakes the return value as it is supposed to be,
which also fixes the Intel MPI 2017 MPI_Init() crash.
2016-11-03 13:41:25 +09:00
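For illustration, a minimal user-space probe of the behavior described above; this is not McKernel code, and the path and the expected return value are assumptions:

/* Sketch: how topology-discovery code might seek on a /sys directory.
 * On plain /sys the lseek() succeeds with the expected offset; on the
 * overlayed mount it behaved differently, crashing libnuma, so the
 * workaround fakes the expected value. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/sys/devices/system/node", O_RDONLY | O_DIRECTORY);
	off_t off;

	if (fd < 0)
		return 1;
	off = lseek(fd, 0, SEEK_SET);
	printf("lseek on /sys directory returned %ld\n", (long)off);
	close(fd);
	return 0;
}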
be4f6741f9 sysfs: fix /sys/devices/system/cpu/cpuXX/online value 2016-11-03 13:39:21 +09:00
7a2f67f5f0 sysfs: eliminate unnecessary new line from /sys/devices/system/node/nodeX/distance 2016-11-03 13:37:53 +09:00
bba0425267 sysfs: fix /sys/devices/system/cpu/online value 2016-11-03 13:36:29 +09:00
beaf96b375 mcreboot/mcstop: proper error handling (revert previous state) 2016-10-28 14:29:10 +09:00
f1af1ffb8f NUMA: expose correct NUMA distances in sysfs 2016-10-27 14:29:15 +09:00
059fab2cc0 mcctrl: fix NULL pointer dereference for unbooted OS instance shutdown 2016-10-26 14:50:07 +09:00
f284a80656 Defrag memory in mcreboot.sh
Merge free physical pages to create large, physically contiguous
blocks with the following command.

    echo 1 > /proc/sys/vm/compact_memory
2016-10-25 16:35:43 +09:00
5f973ab51e IKC2: adjust master channel message queue size dynamically
Determine master channel's message queue size based on the number of
LWK CPUs so that all cores can communicate simultaneously during
syscall channel initialization.
2016-10-24 20:49:00 +09:00
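The sizing reduces to one packet slot per LWK CPU, rounded up to whole pages, as the ihk_mc_ikc_init_first_local() diff further down shows; a minimal sketch, where the constant values are assumptions standing in for McKernel's definitions:

/* Sketch of the dynamic master-channel queue sizing. */
#define PAGE_SIZE           4096
#define MASTER_IKCQ_PKTSIZE 128	/* assumed value, for illustration */

static unsigned long mikc_queue_pages(int num_processors)
{
	/* One packet per CPU, rounded up to whole pages */
	return ((unsigned long)num_processors * MASTER_IKCQ_PKTSIZE
		+ (PAGE_SIZE - 1)) / PAGE_SIZE;
}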
60b6713957 IKC2: eliminate unused structures/fields of old IKC code 2016-10-24 15:41:27 +09:00
ebcf9a0d6d mcctrl: fix a bunch of -Wframe-larger-than warnings 2016-10-21 04:54:38 -04:00
942b7f8b78 mcreboot-smp-x86: eliminate unnecessary resource queries 2016-10-21 03:38:21 -04:00
0b0aa6c0e0 Start mcklogd before McKernel to avoid deadlock
McKernel blocks forever waiting for mcklogd to retrieve kmsg when the
kmsg buffer is full with the boot log and mcklogd isn't running.
2016-10-19 16:40:32 +09:00
9705a80c82 get/set_mempolicy(): support for query/set process level policy 2016-10-16 14:01:14 +09:00
99a02e2941 get_mempolicy(): store policy in per-process VM structure 2016-10-16 09:10:36 +09:00
b88d75720f __NR_gettid: use regular offloading channel (fixes unknown PID bug) 2016-10-15 11:46:01 +09:00
d2b677b6da get_mempolicy(): initial implementation 2016-10-14 21:34:32 +09:00
083645f203 mcreboot: purge Linux caches before reserving IHK resources 2016-10-14 21:34:32 +09:00
994b9a19ac NUMA: expose CPU and memory info in /proc/self/status 2016-10-14 21:34:32 +09:00
faa929e717 NUMA: add NUMA mask to process VM structure 2016-10-14 21:34:31 +09:00
3ee3a9df6d sysfs: fix bitmask and bitmask list-view display bug 2016-10-14 21:34:31 +09:00
73e1a4f1f9 NUMA: fill in /sys/devices/system/cpu/nodeX properly and sync with boot script 2016-10-14 21:34:31 +09:00
b068fde9cd NUMA: use IHK CPU and NUMA mappings for sysfs entries 2016-10-14 21:34:31 +09:00
167ea67dee NUMA: receive CPU info in array format 2016-10-14 21:34:31 +09:00
f33d85a27a eclair: support for multiple physical memory chunks 2016-10-14 21:34:31 +09:00
1e8239d72a kmalloc/pagealloc tracker: fix race condition bug 2016-10-14 21:34:31 +09:00
a51a0a6f13 page allocation tracker: support tracking partial deallocations 2016-10-14 21:34:31 +09:00
cc3f6e1a4f page_fault_process_memory_range(): fix double allocation leak 2016-10-14 21:34:31 +09:00
5db6c311f4 page alloc tracker: count freed pages in addr tracker objects 2016-10-14 21:34:31 +09:00
f4df713846 munmap(): fix memory leak in non page backed mappings 2016-10-14 21:34:31 +09:00
7176bb2a47 allow partial deallocation in page level allocation tracker 2016-10-14 21:34:30 +09:00
a6bd98cc02 MM: memory leak tracker for page level allocator 2016-10-14 21:34:30 +09:00
0f7462ae1c mm.h: eliminate global pa_allocator 2016-10-14 21:34:30 +09:00
0d8d915d82 fix KMALLOC_MIN_SIZE macro 2016-10-14 21:34:30 +09:00
8f4f68b877 eliminate arch_alloc_page() and move ihk_mc_alloc_pages() to arch independent code 2016-10-14 21:34:30 +09:00
8c0a5a5e61 page_hash_count_pages(): report page hash size in memory stat 2016-10-14 21:34:30 +09:00
ffd3f53785 page_unmap(): proper locking of hash table 2016-10-14 21:34:30 +09:00
f39fa54c39 NUMA: default policy: allocate from CPU's NUMA node 2016-10-14 21:34:30 +09:00
11125b0d68 fileobj and shmemobj: delete unused variables 2016-10-14 21:34:30 +09:00
3ae69d1290 NUMA: process CPU NUMA information 2016-10-14 21:34:30 +09:00
2929fbb803 NUMA: support multiple physical allocators 2016-10-14 21:34:30 +09:00
f4db8b96de fileobj/shmobj: release pages correctly according to dynamic page frame management 2016-10-14 21:34:30 +09:00
8eb3bf3559 physical page management: eliminate static page frame array and
maintain page structures dynamically covering only file mappings.
use hash table for address <-> page structure conversion.
2016-10-14 21:34:29 +09:00
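A minimal sketch of the hash-based address-to-page conversion this describes; the bucket count and structure layout are illustrative, not McKernel's actual definitions:

/* Sketch: look up the page structure for a physical address. Only
 * pages backing file mappings are tracked; anything else returns NULL. */
#include <stddef.h>

#define PAGE_HASH_BUCKETS 1024	/* assumed size, for illustration */

struct page {
	unsigned long phys;
	struct page *next;	/* chained bucket list */
};

static struct page *page_hash[PAGE_HASH_BUCKETS];

static struct page *phys_to_page_sketch(unsigned long phys)
{
	struct page *p = page_hash[(phys >> 12) % PAGE_HASH_BUCKETS];

	while (p && p->phys != phys)
		p = p->next;
	return p;	/* NULL for non-file-backed memory */
}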
326a4fcee4 mem_init(): parse NUMA information 2016-10-14 21:34:29 +09:00
9b82f1a52c use ihk_mc_alloc/free_pages() and eliminate direct calls to low level routines 2016-10-14 21:34:29 +09:00
f3da381752 ihk_mc_unmap_virtual: add flush_tlb_single
refs #778
2016-10-11 14:44:23 +09:00
8aa589a40c A signal could sometimes fail to reach a thread. 2016-10-04 14:35:25 +09:00
e03f377326 interrupt_syscall: interrupt valid thread 2016-10-03 00:49:56 +09:00
8d21846562 mcoverlayfs: support Linux kernel 4.0 and RHEL kernel 3.10.0-327
add mcoverlayfs(linux-3.10.0-327.36.1.el7 base)
2016-09-30 14:55:36 +09:00
3e1367caa1 mcoverlayfs: move mcoverlayfs(linux-4.0.9 base) to executer/kernel/mcoverlayfs/linux-4.0.9 2016-09-30 13:48:55 +09:00
02536b7724 Merge remote-tracking branch 'remotes/origin/ikc2'
Conflicts:
	executer/kernel/mcctrl/syscall.c
The conflict has been resolved.
2016-09-27 11:48:12 +09:00
e28725884f fix debug print 2016-09-19 17:29:41 +09:00
c2b3fb7236 Modify interrupt load balancing policy on reboot/stop
* Fix the timing of stopping irqbalance when booting McKernel
2016-09-16 19:07:07 +09:00
2f95f7cda8 Modify interrupt load balancing policy on reboot/stop
When rebooting:
1. Stop irqbalance
2. Modify /proc/irq/*/smp_affinity so that McKernel cores are not
   included
3. Start irqbalance with McKernel cores and IHK IRQ banned from
   load balancing

When stopping:
1. Stop irqbalance
2. Restore /proc/irq/*/smp_affinity
3. Restart irqbalance with the system default settings

refs #760
2016-09-16 13:04:24 +09:00
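Step 2 of the reboot sequence above amounts to clearing the McKernel CPU bits from every IRQ's affinity mask. A hedged sketch in C; the real implementation lives in the boot scripts, and comma-separated masks on systems with more than 32 CPUs are not handled here:

/* Sketch: ban McKernel CPUs from IRQ balancing via /proc. */
#include <dirent.h>
#include <stdio.h>

static void ban_cpus_from_irqs(unsigned long mck_cpu_mask)
{
	DIR *d = opendir("/proc/irq");
	struct dirent *e;
	char path[64];
	unsigned long mask;
	FILE *f;

	if (!d)
		return;
	while ((e = readdir(d)) != NULL) {
		if (e->d_name[0] < '0' || e->d_name[0] > '9')
			continue;	/* skip non-IRQ entries */
		snprintf(path, sizeof(path),
			 "/proc/irq/%s/smp_affinity", e->d_name);
		f = fopen(path, "r+");
		if (!f)
			continue;
		if (fscanf(f, "%lx", &mask) == 1) {
			rewind(f);	/* required between read and write */
			fprintf(f, "%lx\n", mask & ~mck_cpu_mask);
		}
		fclose(f);
	}
	closedir(d);
}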
e551aa17ed execve: do not search command PATH 2016-09-14 22:22:18 +09:00
e6d4c160cd mcexec: fix command lookup
refs #754
2016-09-13 15:56:58 +09:00
9390fe5d2c signal: send signal to thread using thread ID, not CPU ID 2016-09-12 15:43:29 +09:00
419f5e495b set*[ug]id: propagate credentials to thread pool 2016-09-12 15:40:33 +09:00
673deadf37 fix syscall return type 2016-09-12 15:40:06 +09:00
20ea65b38c fix some vDSO bugs:
- vDSO sometimes becomes invalid.
- vDSO is not inherited by the child process.
- vDSO becomes invalid after execve.
refs #744
2016-09-04 23:13:00 +09:00
84665ff699 do_page_fault_process_vm(): fix error msg format that could cause another PF 2016-09-04 10:59:50 +09:00
bfbc94dfb0 mcctrl+mcexec: fix per-proc data allocation for fork() 2016-09-02 15:08:00 +09:00
f74dcfc2a1 Modify mcreboot.sh for job scheduler
1. Don't complain when the logname command doesn't exist
2016-09-01 19:27:18 +09:00
7c562d0539 support madvise(MADV_DONTFORK) 2016-09-01 11:22:53 +09:00
b5e4459a34 support AVX-512 registers 2016-08-30 18:39:33 +09:00
782122b681 mcctrl: fix rus_vm_fault() call by kworker process 2016-08-22 13:00:28 +09:00
d550bced78 kmalloc(): use macros to define size alignment 2016-08-19 12:51:28 +09:00
a7ee3f531b sched_setaffinity(): error handling for invalid input 2016-08-19 11:52:44 +09:00
b9439947a7 kmalloc(): re-implementation of memory leak tracking 2016-08-19 11:52:00 +09:00
3b60a95f13 kmalloc()/kfree() re-implementation 2016-08-18 21:51:36 +09:00
82ae6d7458 query_free_mem_interrupt_handler(): report number of free pages as kmsg 2016-08-18 14:52:05 +09:00
7ebc34ddcc do_fork(): fix tids memory leak; additional sanity checks 2016-08-18 14:31:52 +09:00
bd6a2c2311 sys_mmap(): correct initial address check 2016-08-18 07:32:31 +09:00
5fd68eae54 PF handler: fix up various error msgs 2016-08-18 07:31:25 +09:00
f5857cfc9e MM: use ihk_mc_{alloc/free}_pages() everywhere and fix free_pages() on kmalloc()ed object bug 2016-08-17 18:02:05 +09:00
1ce1b17a85 Specify facility used by mcklogd via option
1. You can specify the facility through the -f option of mcreboot.sh.
   Example:
   mcreboot.sh -k 1 -f LOG_LOCAL6
   Note that you need to specify "-k 1" or "-k 2" to start mcklogd.
2. Kill mcklogd if needed in mcreboot.sh and mcstop+release.sh.
2016-08-17 17:52:44 +09:00
a2456c3ed2 Modify mcstop+release.sh for job scheduler
1. Remove ihk.ko
2. Output message to stderr and return 1 on error
2016-08-17 17:32:06 +09:00
01d2ea1605 do_munmap(): do TLB flush per address in remote_tlb_flush_cpu_mask() 2016-08-17 15:08:30 +09:00
15783f09a0 Modify mcreboot.sh for job scheduler
1. Add an option to specify owner of device files
2. Output message to stderr and return 1 on error
2016-08-17 15:07:13 +09:00
9efd568e07 do_mmap(): simplify demand paging flags; avoid zeroobj and allocate pages directly 2016-08-17 14:00:05 +09:00
1a207e19c2 clean up a couple of debug messages 2016-08-17 13:55:36 +09:00
73cf93727b clone(): use CAS for TID allocation 2016-08-16 14:18:58 +09:00
4410e702d9 devobj: fix memory leak for device file mapping 2016-08-16 14:17:59 +09:00
f584e2ec25 increase kernel stack size and eliminate unused waitq declaration in do_syscall() 2016-08-16 09:20:55 +09:00
3aa06444f4 do_syscall(): allow descheduling threads in offloaded syscalls if CPU core oversubscribed 2016-08-16 08:58:22 +09:00
c897a56c34 __notify_syscall_requester(): use CAS or IKC to notify syscall completion 2016-08-16 08:56:05 +09:00
5e9957da0f syscall_response: introduction of req_thread_status field 2016-08-16 08:53:41 +09:00
6ff2d4abe7 mcctrl: store per-process data in hash table 2016-08-15 13:47:57 +09:00
e4239f1885 mcexec: use 16 threads initially in offload handler pool 2016-08-14 14:29:10 +09:00
fbbaaf5b54 mcctrl: use GFP_ATOMIC in atomic context 2016-08-14 14:28:21 +09:00
3fa3920bb3 fix a couple of debug msgs 2016-08-14 11:30:17 +09:00
45e51fcc07 mcctrl: fix padding for 128bytes SCD message 2016-08-14 11:29:02 +09:00
0884e3d543 IHK-IKC: map queue in McKernel as cacheable 2016-08-14 11:16:40 +09:00
e3c7c9b890 mcctrl: separate waiting threads and pending requests 2016-08-12 21:52:13 +09:00
f4155cc9e8 mcstop+release-smp-x86.sh: fix OS instance discovery bug 2016-08-12 12:27:04 +09:00
a01ae91051 mcctrl: use IKC packet pools 2016-08-12 12:26:14 +09:00
daca522d25 mcctrl: move kmalloc/kfree of wait queue head out of fast path 2016-08-12 10:18:58 +09:00
ec521feb15 do_syscall(): remove invalid reference 2016-08-09 17:16:47 +09:00
d7bc947a02 mcctrl: redesign mcctrl_channels for IKC packet based syscall offloading 2016-08-09 16:49:42 +09:00
fb84d4ef11 mcctrl: thread pool based system call offload handling 2016-08-08 19:43:05 +09:00
5fbeee953a mcctrl: clean up syscall offload wait code 2016-08-07 20:55:36 +09:00
4cefb4333f mcctrl: use atomic malloc in IRQ context 2016-08-06 08:54:55 +09:00
689da07ac6 ihk_mc_ikc_init_first_local(): hold ref to master channel 2016-08-06 08:52:14 +09:00
76981bcc18 mcctrl: move procfs TID processing into dedicated work queue 2016-08-04 15:22:40 +09:00
6aae35cb3d process: transfer TIDs in bulk and reuse them locally 2016-08-02 16:59:04 +09:00
dac6f2883e mcctrl procfs: use semaphores instead of spinlocks to avoid sleeping in GFP_KERNEL kmalloc() in atomic context 2016-08-01 20:33:51 +09:00
c484f766fa schedule(): schedule a sleeping process if it has pending signals 2016-07-28 11:42:00 +09:00
57690479bd read/patch_process_vm(): map non-LWK physical addresses properly 2016-07-22 20:48:54 +09:00
d0539a9cac eclair: make idle threads visible 2016-07-22 18:06:11 +09:00
4c8f583c0c split_large_page(): avoid panic when splitting "non-mapped" large pages 2016-07-14 17:11:52 +09:00
6118faffa9 pager_req_pfn(): use FAULT_FLAG_USER only if defined 2016-07-13 18:05:31 +09:00
dad6470c60 clone_thread: fork(2) copy sigstack infos from parent 2016-07-13 16:15:01 +09:00
46c37fc8f3 setfsgid: fix case where fsgid was not changed 2016-07-13 15:54:52 +09:00
f6908f21a8 do_kill: wake PS_INTERRUPTIBLE process when sending SIGKILL
sched_wakeup_thread: don't change process status if it is PS_EXITED
2016-07-13 14:06:32 +09:00
01d9d9a5ba devobj: allow arbitrary size device file mappings 2016-07-12 17:02:19 +09:00
c43d993a4d mcstop+release-smp-x86.sh.in: unload mcctrl after OS shutdown 2016-07-11 16:40:06 +09:00
7d9bbecd7a mcctrl: use IHK OS notifiers to establish/tear down syscall channels
This patch eliminates the need to rmmod/insmod the mcctrl module
every time an OS instance is rebooted.
2016-07-11 16:22:50 +09:00
d135731398 do_syscall(): allow schedule for another thread (Intel MPI+OpenMP issue) 2016-07-05 18:54:51 +09:00
5c190beb04 save fpregs when calling the signal handler
refs #50
2016-07-05 15:26:00 +09:00
fc66556f9f mcexec: error handling and propagation 2016-06-24 15:35:38 -07:00
648bacc90f device file mappings: communicate map flags and fault missing translations 2016-06-24 12:44:59 -07:00
dd37443fc7 PAPI support: handle performance counter overflow
and support mckfd fcntl.
2016-06-24 13:50:12 +09:00
e34322702a x86_init_perfctr: discover perf counters dynamically from MSRs 2016-06-22 10:47:57 -07:00
e12997e6a9 mcreboot: support for CPU cores (-c) and memory (-m) arguments 2016-06-21 09:10:06 -07:00
fabaa806d3 Revert "Make executor code include executer/config.h": breaks out-of-tree compile
This reverts commit d90900b6e6.
2016-06-21 08:51:45 +09:00
a83ad620c8 devobj: allow read only device file mappings (OFED 3.3 support) 2016-06-21 06:57:59 +09:00
86 changed files with 15546 additions and 2510 deletions

View File

@ -1,5 +1,6 @@
TARGET = @TARGET@
SBINDIR = @SBINDIR@
ETCDIR = @ETCDIR@
MANDIR = @MANDIR@
all::
@ -48,6 +49,9 @@ install::
mkdir -p -m 755 $(SBINDIR); \
install -m 755 arch/x86/tools/mcreboot-smp-x86.sh $(SBINDIR)/mcreboot.sh; \
install -m 755 arch/x86/tools/mcstop+release-smp-x86.sh $(SBINDIR)/mcstop+release.sh; \
mkdir -p -m 755 $(ETCDIR); \
install -m 644 arch/x86/tools/irqbalance_mck.service $(ETCDIR)/irqbalance_mck.service; \
install -m 644 arch/x86/tools/irqbalance_mck.in $(ETCDIR)/irqbalance_mck.in; \
mkdir -p -m 755 $(MANDIR)/man1; \
install -m 644 arch/x86/tools/mcreboot.1 $(MANDIR)/man1/mcreboot.1; \
;; \

View File

@ -148,7 +148,7 @@ extern char page_fault[], general_protection_exception[];
extern char debug_exception[], int3_exception[];
uint64_t boot_pat_state = 0;
int no_turbo = 0; /* May be updated by early parsing of kargs */
int no_turbo = 1; /* May be updated by early parsing of kargs */
extern int num_processors; /* kernel/ap.c */
struct pvclock_vsyscall_time_info *pvti = NULL;
@ -181,6 +181,8 @@ static void init_idt(void)
}
static int xsave_available = 0;
static int xsave_size = 0;
static uint64_t xsave_mask = 0x0;
void init_fpu(void)
{
@ -224,6 +226,26 @@ void init_fpu(void)
xsetbv(0, reg);
dkprintf("init_fpu(): AVX init: XCR0 = 0x%016lX\n", reg);
}
if(xsave_available){
unsigned long eax;
unsigned long ebx;
unsigned long ecx;
unsigned long edx;
asm volatile("cpuid" : "=a"(eax),"=b"(ebx),"=c"(ecx),"=d"(edx) : "a" (0x0d), "c" (0x00));
xsave_size = ecx;
dkprintf("init_fpu(): xsave_size = %d\n", xsave_size);
if ((eax & (1 << 5)) && (eax & (1 << 6)) && (eax & (1 << 7))) {
/* Set xcr0[7:5] to enable avx-512 ops */
reg = xgetbv(0);
reg |= 0xe6;
xsetbv(0, reg);
dkprintf("init_fpu(): AVX-512 init: XCR0 = 0x%016lX\n", reg);
}
}
xsave_mask = xgetbv(0);
dkprintf("init_fpu(): xsave_mask = 0x%016lX\n", xsave_mask);
/* TODO: set MSR_IA32_XSS to enable xsaves/xrstors */
@ -234,6 +256,17 @@ void init_fpu(void)
asm volatile("finit");
}
int
get_xsave_size()
{
return xsave_size;
}
uint64_t get_xsave_mask()
{
return xsave_mask;
}
void reload_gdt(struct x86_desc_ptr *gdt_ptr)
{
asm volatile("pushq %1\n"
@ -883,13 +916,36 @@ void handle_interrupt(int vector, struct x86_user_context *regs)
dkprintf("timer[%lu]: CPU_FLAG_NEED_RESCHED \n", rdtsc());
}
else if (vector == LOCAL_PERF_VECTOR) {
struct siginfo info;
unsigned long value;
struct thread *thread = cpu_local_var(current);
struct process *proc = thread->proc;
long irqstate;
struct mckfd *fdp;
lapic_write(LAPIC_LVTPC, LOCAL_PERF_VECTOR);
value = rdmsr(MSR_PERF_GLOBAL_STATUS);
wrmsr(MSR_PERF_GLOBAL_OVF_CTRL, value);
wrmsr(MSR_PERF_GLOBAL_OVF_CTRL, 0);
//TODO: counter overflow signal
//set_signal(0x1d, regs, NULL); // SIGIO
irqstate = ihk_mc_spinlock_lock(&proc->mckfd_lock);
for(fdp = proc->mckfd; fdp; fdp = fdp->next) {
if(fdp->sig_no > 0)
break;
}
ihk_mc_spinlock_unlock(&proc->mckfd_lock, irqstate);
if(fdp) {
memset(&info, '\0', sizeof info);
info.si_signo = fdp->sig_no;
info._sifields._sigfault.si_addr = (void *)regs->gpr.rip;
info._sifields._sigpoll.si_fd = fdp->fd;
set_signal(fdp->sig_no, regs, &info);
}
else {
set_signal(SIGIO, regs, NULL);
}
}
else if (vector >= IHK_TLB_FLUSH_IRQ_VECTOR_START &&
vector < IHK_TLB_FLUSH_IRQ_VECTOR_END) {
@ -998,9 +1054,8 @@ unhandled_page_fault(struct thread *thread, void *fault_addr, void *regs)
unsigned long error = ((struct x86_user_context *)regs)->gpr.error;
irqflags = kprintf_lock();
dkprintf("[%d] Page fault for 0x%lX\n",
ihk_mc_get_processor_id(), address);
dkprintf("%s for %s access in %s mode (reserved bit %s set), "
__kprintf("Page fault for 0x%lx\n", address);
__kprintf("%s for %s access in %s mode (reserved bit %s set), "
"it %s an instruction fetch\n",
(error & PF_PROT ? "protection fault" : "no page found"),
(error & PF_WRITE ? "write" : "read"),
@ -1012,18 +1067,22 @@ unhandled_page_fault(struct thread *thread, void *fault_addr, void *regs)
list_for_each_entry(range, &vm->vm_range_list, list) {
if (range->start <= address && range->end > address) {
found = 1;
dkprintf("address is in range, flag: 0x%X! \n",
__kprintf("address is in range, flag: 0x%lx\n",
range->flag);
ihk_mc_pt_print_pte(vm->address_space->page_table, (void*)address);
break;
}
}
if (!found) {
dkprintf("address is out of range! \n");
__kprintf("address is out of range! \n");
}
kprintf_unlock(irqflags);
if (!(error & PF_USER)) {
panic("panic: kernel mode PF");
}
/* TODO */
ihk_mc_debug_show_interrupt_context(regs);
@ -1494,7 +1553,8 @@ release_fp_regs(struct thread *thread)
if (thread && !thread->fp_regs)
return;
pages = (sizeof(fp_regs_struct) + 4095) >> 12;
pages = (xsave_size + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
dkprintf("release_fp_regs: pages=%d\n", pages);
ihk_mc_free_pages(thread->fp_regs, pages);
thread->fp_regs = NULL;
}
@ -1508,7 +1568,8 @@ save_fp_regs(struct thread *thread)
int pages;
if (!thread->fp_regs) {
pages = (sizeof(fp_regs_struct) + 4095) >> 12;
pages = (xsave_size + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
dkprintf("save_fp_regs: pages=%d\n", pages);
thread->fp_regs = ihk_mc_alloc_pages(pages, IHK_MC_AP_NOWAIT);
if (!thread->fp_regs) {
@ -1517,14 +1578,15 @@ save_fp_regs(struct thread *thread)
}
memset(thread->fp_regs, 0, sizeof(fp_regs_struct));
memset(thread->fp_regs, 0, pages * PAGE_SIZE);
}
if (xsave_available) {
unsigned int low, high;
/* Request full save of x87, SSE and AVX states */
low = 0x7;
high = 0;
/* Request full save of x87, SSE, AVX and AVX-512 states */
low = (unsigned int)xsave_mask;
high = (unsigned int)(xsave_mask >> 32);
asm volatile("xsave %0" : : "m" (*thread->fp_regs), "a" (low), "d" (high)
: "memory");
@ -1546,9 +1608,9 @@ restore_fp_regs(struct thread *thread)
if (xsave_available) {
unsigned int low, high;
/* Request full restore of x87, SSE and AVX states */
low = 0x7;
high = 0;
/* Request full restore of x87, SSE, AVX and AVX-512 states */
low = (unsigned int)xsave_mask;
high = (unsigned int)(xsave_mask >> 32);
asm volatile("xrstor %0" : : "m" (*thread->fp_regs),
"a" (low), "d" (high));
@ -1678,7 +1740,7 @@ int arch_setup_pvclock(void)
npages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
pvti_npages = npages;
pvti = allocate_pages(npages, IHK_MC_AP_NOWAIT);
pvti = ihk_mc_alloc_pages(npages, IHK_MC_AP_NOWAIT);
if (!pvti) {
ekprintf("arch_setup_pvclock: allocate_pages failed.\n");
return -ENOMEM;
@ -1708,44 +1770,6 @@ void arch_start_pvclock(void)
return;
} /* arch_start_pvclock() */
static struct cpu_mapping *cpu_mapping = NULL;
int arch_get_cpu_mapping(struct cpu_mapping **buf, int *nelemsp)
{
int error;
size_t size;
int npages;
struct cpu_mapping *mapping;
int cpu;
struct x86_cpu_local_variables *v;
if (!cpu_mapping) {
size = sizeof(*mapping) * num_processors;
npages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
mapping = allocate_pages(npages, IHK_MC_AP_NOWAIT);
if (!mapping) {
error = -ENOMEM;
ekprintf("arch_get_cpu_mapping:allocate_pages failed. %d\n", error);
goto out;
}
for (cpu = 0; cpu < num_processors; ++cpu) {
v = get_x86_cpu_local_variable(cpu);
mapping[cpu].cpu_number = cpu;
mapping[cpu].hw_id = v->apic_id;
}
cpu_mapping = mapping;
}
error = 0;
*buf = cpu_mapping;
*nelemsp = num_processors;
out:
return error;
} /* arch_get_cpu_mapping() */
#define KVM_CPUID_SIGNATURE 0x40000000
int running_on_kvm(void) {

View File

@ -13,6 +13,8 @@
#ifndef HEADER_X86_COMMON_ARCH_BITOPS_H
#define HEADER_X86_COMMON_ARCH_BITOPS_H
#define ARCH_HAS_FAST_MULTIPLIER 1
static inline int fls(int x)
{
int r;

View File

@ -306,7 +306,7 @@ struct page_table;
void set_pte(pte_t *ppte, unsigned long phys, enum ihk_mc_pt_attribute attr);
pte_t *get_pte(struct page_table *pt, void *virt, enum ihk_mc_pt_attribute attr);
void *early_alloc_page(void);
void *early_alloc_pages(int nr_pages);
void *get_last_early_heap(void);
void flush_tlb(void);
void flush_tlb_single(unsigned long addr);
@ -318,5 +318,5 @@ extern unsigned long ap_trampoline;
#define AP_TRAMPOLINE_SIZE 0x2000
/* Local is cachable */
#define IHK_IKC_QUEUE_PT_ATTR (PTATTR_NO_EXECUTE | PTATTR_WRITABLE | PTATTR_UNCACHABLE)
#define IHK_IKC_QUEUE_PT_ATTR (PTATTR_NO_EXECUTE | PTATTR_WRITABLE)
#endif

View File

@ -0,0 +1,23 @@
#ifndef _ASM_X86_STRING_H
#define _ASM_X86_STRING_H
#define ARCH_FAST_MEMCPY
static inline void *__inline_memcpy(void *to, const void *from, size_t n)
{
unsigned long d0, d1, d2;
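/* Copy n/4 32-bit words with rep movsl, then use bits 1 and 0 of n
 * to copy the remaining 2-byte and 1-byte tails. */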
asm volatile("rep ; movsl\n\t"
"testb $2,%b4\n\t"
"je 1f\n\t"
"movsw\n"
"1:\ttestb $1,%b4\n\t"
"je 2f\n\t"
"movsb\n"
"2:"
: "=&c" (d0), "=&D" (d1), "=&S" (d2)
: "0" (n / 4), "q" (n), "1" ((long)to), "2" ((long)from)
: "memory");
return to;
}
#endif

View File

@ -66,7 +66,7 @@ SYSCALL_DELEGATED(65, semop)
SYSCALL_HANDLED(67, shmdt)
SYSCALL_DELEGATED(69, msgsnd)
SYSCALL_DELEGATED(70, msgrcv)
SYSCALL_DELEGATED(72, fcntl)
SYSCALL_HANDLED(72, fcntl)
SYSCALL_DELEGATED(79, getcwd)
SYSCALL_DELEGATED(89, readlink)
SYSCALL_HANDLED(96, gettimeofday)
@ -150,5 +150,8 @@ SYSCALL_HANDLED(602, pmc_start)
SYSCALL_HANDLED(603, pmc_stop)
SYSCALL_HANDLED(604, pmc_reset)
SYSCALL_HANDLED(700, get_cpu_id)
#ifdef TRACK_SYSCALLS
SYSCALL_HANDLED(701, syscall_offload_clr_cntrs)
#endif // TRACK_SYSCALLS
/**** End of File ****/

View File

@ -23,6 +23,7 @@
#include <process.h>
#include <page.h>
#include <cls.h>
#include <kmalloc.h>
#define dkprintf(...) do { if (0) kprintf(__VA_ARGS__); } while (0)
#define ekprintf(...) kprintf(__VA_ARGS__)
@ -30,11 +31,10 @@
static char *last_page;
extern char _head[], _end[];
static struct ihk_mc_pa_ops *pa_ops;
extern unsigned long x86_kernel_phys_base;
void *early_alloc_page(void)
/* Arch specific early allocation routine */
void *early_alloc_pages(int nr_pages)
{
void *p;
@ -47,57 +47,32 @@ void *early_alloc_page(void)
panic("Early allocator is already finalized. Do not use it.\n");
}
p = last_page;
last_page += PAGE_SIZE;
last_page += (nr_pages * PAGE_SIZE);
return p;
}
void *arch_alloc_page(enum ihk_mc_ap_flag flag)
void early_alloc_invalidate(void)
{
if (pa_ops)
return pa_ops->alloc_page(1, PAGE_P2ALIGN, flag);
else
return early_alloc_page();
}
void arch_free_page(void *ptr)
{
if (pa_ops)
pa_ops->free_page(ptr, 1);
last_page = (void *)-1;
}
void *ihk_mc_alloc_aligned_pages(int npages, int p2align, enum ihk_mc_ap_flag flag)
void *ihk_mc_allocate(int size, int flag)
{
if (pa_ops)
return pa_ops->alloc_page(npages, p2align, flag);
else
if (!cpu_local_var(kmalloc_initialized)) {
kprintf("%s: error, kmalloc not yet initialized\n", __FUNCTION__);
return NULL;
}
void *ihk_mc_alloc_pages(int npages, enum ihk_mc_ap_flag flag)
{
return ihk_mc_alloc_aligned_pages(npages, PAGE_P2ALIGN, flag);
}
void ihk_mc_free_pages(void *p, int npages)
{
if (pa_ops)
pa_ops->free_page(p, npages);
}
void *ihk_mc_allocate(int size, enum ihk_mc_ap_flag flag)
{
if (pa_ops && pa_ops->alloc)
return pa_ops->alloc(size, flag);
else
return ihk_mc_alloc_pages(1, flag);
}
return kmalloc(size, IHK_MC_AP_NOWAIT);
}
void ihk_mc_free(void *p)
{
if (pa_ops && pa_ops->free)
return pa_ops->free(p);
else
return ihk_mc_free_pages(p, 1);
if (!cpu_local_var(kmalloc_initialized)) {
kprintf("%s: error, kmalloc not yet initialized\n", __FUNCTION__);
return;
}
kfree(p);
}
void *get_last_early_heap(void)
@ -172,7 +147,7 @@ static unsigned long setup_l3(struct page_table *pt,
pt->entry[i] = 0;
continue;
}
pt_phys = setup_l2(arch_alloc_page(IHK_MC_AP_CRITICAL), phys, start, end);
pt_phys = setup_l2(ihk_mc_alloc_pages(1, IHK_MC_AP_CRITICAL), phys, start, end);
pt->entry[i] = pt_phys | PFL3_PDIR_ATTR;
}
@ -196,7 +171,7 @@ static void init_normal_area(struct page_table *pt)
for (phys = (map_start & ~(PTL4_SIZE - 1)); phys < map_end;
phys += PTL4_SIZE) {
pt_phys = setup_l3(arch_alloc_page(IHK_MC_AP_CRITICAL), phys,
pt_phys = setup_l3(ihk_mc_alloc_pages(1, IHK_MC_AP_CRITICAL), phys,
map_start, map_end);
pt->entry[ident_index++] = pt_phys | PFL4_PDIR_ATTR;
@ -206,7 +181,7 @@ static void init_normal_area(struct page_table *pt)
static struct page_table *__alloc_new_pt(enum ihk_mc_ap_flag ap_flag)
{
struct page_table *newpt = arch_alloc_page(ap_flag);
struct page_table *newpt = ihk_mc_alloc_pages(1, ap_flag);
if(newpt)
memset(newpt, 0, sizeof(struct page_table));
@ -515,8 +490,10 @@ uint64_t ihk_mc_pt_virt_to_pagemap(struct page_table *pt, unsigned long virt)
return pagemap;
}
int ihk_mc_pt_virt_to_phys(struct page_table *pt,
const void *virt, unsigned long *phys)
int ihk_mc_pt_virt_to_phys_size(struct page_table *pt,
const void *virt,
unsigned long *phys,
unsigned long *size)
{
int l4idx, l3idx, l2idx, l1idx;
unsigned long v = (unsigned long)virt;
@ -538,6 +515,7 @@ int ihk_mc_pt_virt_to_phys(struct page_table *pt,
if ((pt->entry[l3idx] & PFL3_SIZE)) {
*phys = pte_get_phys(&pt->entry[l3idx])
| (v & (PTL3_SIZE - 1));
if (size) *size = PTL3_SIZE;
return 0;
}
pt = phys_to_virt(pte_get_phys(&pt->entry[l3idx]));
@ -548,6 +526,7 @@ int ihk_mc_pt_virt_to_phys(struct page_table *pt,
if ((pt->entry[l2idx] & PFL2_SIZE)) {
*phys = pte_get_phys(&pt->entry[l2idx])
| (v & (PTL2_SIZE - 1));
if (size) *size = PTL2_SIZE;
return 0;
}
pt = phys_to_virt(pte_get_phys(&pt->entry[l2idx]));
@ -557,9 +536,17 @@ int ihk_mc_pt_virt_to_phys(struct page_table *pt,
}
*phys = pte_get_phys(&pt->entry[l1idx]) | (v & (PTL1_SIZE - 1));
if (size) *size = PTL1_SIZE;
return 0;
}
int ihk_mc_pt_virt_to_phys(struct page_table *pt,
const void *virt, unsigned long *phys)
{
return ihk_mc_pt_virt_to_phys_size(pt, virt, phys, NULL);
}
int ihk_mc_pt_print_pte(struct page_table *pt, void *virt)
{
int l4idx, l3idx, l2idx, l1idx;
@ -715,7 +702,7 @@ static void destroy_page_table(int level, struct page_table *pt)
}
}
arch_free_page(pt);
ihk_mc_free_pages(pt, 1);
return;
}
@ -910,11 +897,17 @@ static int split_large_page(pte_t *ptep, size_t pgsize)
*ptep = (virt_to_phys(pt) & PT_PHYSMASK) | PFL2_PDIR_ATTR;
if (phys_base != NOPHYS) {
page = phys_to_page(phys_base);
if (page && page_unmap(page)) {
kprintf("split_large_page:page_unmap:%p\n", page);
panic("split_large_page:page_unmap\n");
/* Do not do this check for large pages as they don't come from the zeroobj
* and are not actually mapped.
* TODO: clean up zeroobj as we don't really need it, anonymous mappings
* should be allocated for real */
if (pgsize != PTL2_SIZE) {
if (phys_base != NOPHYS) {
page = phys_to_page(phys_base);
if (pgsize != PTL2_SIZE && page && page_unmap(page)) {
kprintf("split_large_page:page_unmap:%p\n", page);
panic("split_large_page:page_unmap\n");
}
}
}
return 0;
@ -1103,8 +1096,9 @@ static int clear_range_l1(void *args0, pte_t *ptep, uint64_t base,
}
if (!(old & PFL1_FILEOFF) && args->free_physical) {
if (page && page_unmap(page)) {
if (!page || (page && page_unmap(page))) {
ihk_mc_free_pages(phys_to_virt(phys), 1);
dkprintf("%s: freeing regular page at 0x%lx\n", __FUNCTION__, base);
}
args->vm->currss -= PTL1_SIZE;
}
@ -1151,8 +1145,9 @@ static int clear_range_l2(void *args0, pte_t *ptep, uint64_t base,
}
if (!(old & PFL2_FILEOFF) && args->free_physical) {
if (page && page_unmap(page)) {
if (!page || (page && page_unmap(page))) {
ihk_mc_free_pages(phys_to_virt(phys), PTL2_SIZE/PTL1_SIZE);
dkprintf("%s: freeing large page at 0x%lx\n", __FUNCTION__, base);
}
args->vm->currss -= PTL2_SIZE;
}
@ -1170,7 +1165,7 @@ static int clear_range_l2(void *args0, pte_t *ptep, uint64_t base,
*ptep = PTE_NULL;
remote_flush_tlb_cpumask(args->vm, base,
ihk_mc_get_processor_id());
arch_free_page(pt);
ihk_mc_free_pages(pt, 1);
}
return 0;
@ -1215,7 +1210,7 @@ static int clear_range_l3(void *args0, pte_t *ptep, uint64_t base,
}
if (!(old & PFL3_FILEOFF) && args->free_physical) {
if (page && page_unmap(page)) {
if (!page || (page && page_unmap(page))) {
ihk_mc_free_pages(phys_to_virt(phys), PTL3_SIZE/PTL1_SIZE);
}
args->vm->currss -= PTL3_SIZE;
@ -1234,7 +1229,7 @@ static int clear_range_l3(void *args0, pte_t *ptep, uint64_t base,
*ptep = PTE_NULL;
remote_flush_tlb_cpumask(args->vm, base,
ihk_mc_get_processor_id());
arch_free_page(pt);
ihk_mc_free_pages(pt, 1);
}
return 0;
@ -1585,7 +1580,7 @@ retry:
error = 0;
out:
if (newpt) {
arch_free_page(newpt);
ihk_mc_free_pages(newpt, 1);
}
dkprintf("set_range_l2(%lx,%lx,%lx): %d %lx\n",
base, start, end, error, *ptep);
@ -1668,7 +1663,7 @@ retry:
error = 0;
out:
if (newpt) {
arch_free_page(newpt);
ihk_mc_free_pages(newpt, 1);
}
dkprintf("set_range_l3(%lx,%lx,%lx): %d\n",
base, start, end, error, *ptep);
@ -1726,7 +1721,7 @@ retry:
error = 0;
out:
if (newpt) {
arch_free_page(newpt);
ihk_mc_free_pages(newpt, 1);
}
dkprintf("set_range_l4(%lx,%lx,%lx): %d %lx\n",
base, start, end, error, *ptep);
@ -2083,7 +2078,7 @@ static void init_vsyscall_area(struct page_table *pt)
void init_page_table(void)
{
check_available_page_size();
init_pt = arch_alloc_page(IHK_MC_AP_CRITICAL);
init_pt = ihk_mc_alloc_pages(1, IHK_MC_AP_CRITICAL);
ihk_mc_spinlock_init(&init_pt_lock);
memset(init_pt, 0, sizeof(PAGE_SIZE));
@ -2100,27 +2095,27 @@ void init_page_table(void)
}
extern void __reserve_arch_pages(unsigned long, unsigned long,
void (*)(unsigned long, unsigned long, int));
void (*)(struct ihk_page_allocator_desc *,
unsigned long, unsigned long, int));
void ihk_mc_reserve_arch_pages(unsigned long start, unsigned long end,
void (*cb)(unsigned long, unsigned long, int))
void ihk_mc_reserve_arch_pages(struct ihk_page_allocator_desc *pa_allocator,
unsigned long start, unsigned long end,
void (*cb)(struct ihk_page_allocator_desc *,
unsigned long, unsigned long, int))
{
/* Reserve Text + temporal heap */
cb(virt_to_phys(_head), virt_to_phys(get_last_early_heap()), 0);
cb(pa_allocator, virt_to_phys(_head), virt_to_phys(get_last_early_heap()), 0);
/* Reserve trampoline area to boot the second ap */
cb(ap_trampoline, ap_trampoline + AP_TRAMPOLINE_SIZE, 0);
cb(pa_allocator, ap_trampoline, ap_trampoline + AP_TRAMPOLINE_SIZE, 0);
/* Reserve the null page */
cb(0, PAGE_SIZE, 0);
/* Micro-arch specific */
cb(pa_allocator, 0, PAGE_SIZE, 0);
/*
* Micro-arch specific
* TODO: this does nothing in SMP mode, update it for KNC if necessary
*/
__reserve_arch_pages(start, end, cb);
}
void ihk_mc_set_page_allocator(struct ihk_mc_pa_ops *ops)
{
last_page = (void *)-1;
pa_ops = ops;
}
unsigned long virt_to_phys(void *v)
{
unsigned long va = (unsigned long)v;
@ -2261,13 +2256,18 @@ int read_process_vm(struct process_vm *vm, void *kdst, const void *usrc, size_t
if ((ustart < vm->region.user_start)
|| (vm->region.user_end <= ustart)
|| ((vm->region.user_end - ustart) < siz)) {
kprintf("%s: error: out of user range\n", __FUNCTION__);
return -EFAULT;
}
reason = PF_USER; /* page not present */
for (addr = ustart & PAGE_MASK; addr < uend; addr += PAGE_SIZE) {
if (!addr)
return -EINVAL;
error = page_fault_process_vm(vm, (void *)addr, reason);
if (error) {
kprintf("%s: error: PF for %p failed\n", __FUNCTION__, addr);
return error;
}
}
@ -2283,11 +2283,22 @@ int read_process_vm(struct process_vm *vm, void *kdst, const void *usrc, size_t
error = ihk_mc_pt_virt_to_phys(vm->address_space->page_table, from, &pa);
if (error) {
kprintf("%s: error: resolving physical address or %p\n", __FUNCTION__, from);
return error;
}
va = phys_to_virt(pa);
memcpy(to, va, cpsize);
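/* Physical addresses outside the LWK-managed range (i.e., Linux
 * memory) are not covered by McKernel's straight mapping, so map
 * them temporarily instead of using phys_to_virt(). */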
if (pa < ihk_mc_get_memory_address(IHK_MC_GMA_MAP_START, 0) ||
pa >= ihk_mc_get_memory_address(IHK_MC_GMA_MAP_END, 0)) {
dkprintf("%s: pa is outside of LWK memory, to: %p, pa: %p,"
"cpsize: %d\n", __FUNCTION__, to, pa, cpsize);
va = ihk_mc_map_virtual(pa, 1, PTATTR_ACTIVE);
memcpy(to, va, cpsize);
ihk_mc_unmap_virtual(va, 1, 1);
}
else {
va = phys_to_virt(pa);
memcpy(to, va, cpsize);
}
from += cpsize;
to += cpsize;
@ -2356,8 +2367,18 @@ int write_process_vm(struct process_vm *vm, void *udst, const void *ksrc, size_t
return error;
}
va = phys_to_virt(pa);
memcpy(va, from, cpsize);
if (pa < ihk_mc_get_memory_address(IHK_MC_GMA_MAP_START, 0) ||
pa >= ihk_mc_get_memory_address(IHK_MC_GMA_MAP_END, 0)) {
dkprintf("%s: pa is outside of LWK memory, from: %p,"
"pa: %p, cpsize: %d\n", __FUNCTION__, from, pa, cpsize);
va = ihk_mc_map_virtual(pa, 1, PTATTR_ACTIVE);
memcpy(va, from, cpsize);
ihk_mc_unmap_virtual(va, 1, 1);
}
else {
va = phys_to_virt(pa);
memcpy(va, from, cpsize);
}
from += cpsize;
to += cpsize;
@ -2413,8 +2434,18 @@ int patch_process_vm(struct process_vm *vm, void *udst, const void *ksrc, size_t
return error;
}
va = phys_to_virt(pa);
memcpy(va, from, cpsize);
if (pa < ihk_mc_get_memory_address(IHK_MC_GMA_MAP_START, 0) ||
pa >= ihk_mc_get_memory_address(IHK_MC_GMA_MAP_END, 0)) {
dkprintf("%s: pa is outside of LWK memory, from: %p,"
"pa: %p, cpsize: %d\n", __FUNCTION__, from, pa, cpsize);
va = ihk_mc_map_virtual(pa, 1, PTATTR_ACTIVE);
memcpy(va, from, cpsize);
ihk_mc_unmap_virtual(va, 1, 1);
}
else {
va = phys_to_virt(pa);
memcpy(va, from, cpsize);
}
from += cpsize;
to += cpsize;

View File

@ -16,6 +16,7 @@
#include <memory.h>
#include <string.h>
extern int num_processors;
extern void arch_set_mikc_queue(void *r, void *w);
ihk_ikc_ph_t arch_master_channel_packet_handler;
@ -23,22 +24,28 @@ int ihk_mc_ikc_init_first_local(struct ihk_ikc_channel_desc *channel,
ihk_ikc_ph_t packet_handler)
{
struct ihk_ikc_queue_head *rq, *wq;
size_t mikc_queue_pages;
ihk_ikc_system_init(NULL);
memset(channel, 0, sizeof(struct ihk_ikc_channel_desc));
/* Place both sides in this side */
rq = arch_alloc_page(IHK_MC_AP_CRITICAL);
wq = arch_alloc_page(IHK_MC_AP_CRITICAL);
mikc_queue_pages = ((num_processors * MASTER_IKCQ_PKTSIZE)
+ (PAGE_SIZE - 1)) / PAGE_SIZE;
ihk_ikc_init_queue(rq, 0, 0, PAGE_SIZE, MASTER_IKCQ_PKTSIZE);
ihk_ikc_init_queue(wq, 0, 0, PAGE_SIZE, MASTER_IKCQ_PKTSIZE);
/* Place both sides in this side */
rq = ihk_mc_alloc_pages(mikc_queue_pages, IHK_MC_AP_CRITICAL);
wq = ihk_mc_alloc_pages(mikc_queue_pages, IHK_MC_AP_CRITICAL);
ihk_ikc_init_queue(rq, 0, 0,
mikc_queue_pages * PAGE_SIZE, MASTER_IKCQ_PKTSIZE);
ihk_ikc_init_queue(wq, 0, 0,
mikc_queue_pages * PAGE_SIZE, MASTER_IKCQ_PKTSIZE);
arch_master_channel_packet_handler = packet_handler;
ihk_ikc_init_desc(channel, IKC_OS_HOST, 0, rq, wq,
ihk_ikc_master_channel_packet_handler);
ihk_ikc_master_channel_packet_handler, channel);
ihk_ikc_enable_channel(channel);
/* Set boot parameter */

View File

@ -12,16 +12,29 @@
#include <errno.h>
#include <ihk/debug.h>
#include <registers.h>
#include <mc_perf_event.h>
extern unsigned int *x86_march_perfmap;
extern int running_on_kvm(void);
#define X86_CR4_PCE 0x00000100
int perf_counters_discovered = 0;
int X86_IA32_NUM_PERF_COUNTERS = 0;
unsigned long X86_IA32_PERF_COUNTERS_MASK = 0;
int X86_IA32_NUM_FIXED_PERF_COUNTERS = 0;
unsigned long X86_IA32_FIXED_PERF_COUNTERS_MASK = 0;
void x86_init_perfctr(void)
{
int i = 0;
unsigned long reg;
unsigned long value = 0;
uint64_t op;
uint64_t eax;
uint64_t ebx;
uint64_t ecx;
uint64_t edx;
/* Do not do it on KVM */
if (running_on_kvm()) return;
@ -30,12 +43,41 @@ void x86_init_perfctr(void)
asm volatile("movq %%cr4, %0" : "=r"(reg));
reg |= X86_CR4_PCE;
asm volatile("movq %0, %%cr4" : : "r"(reg));
/* Detect number of supported performance counters */
if (!perf_counters_discovered) {
/* See Table 35.2 - Architectural MSRs in Vol 3C */
op = 0x0a;
asm volatile("cpuid" : "=a"(eax),"=b"(ebx),"=c"(ecx),"=d"(edx):"a"(op));
X86_IA32_NUM_PERF_COUNTERS = ((eax & 0xFF00) >> 8);
X86_IA32_PERF_COUNTERS_MASK = (1 << X86_IA32_NUM_PERF_COUNTERS) - 1;
X86_IA32_NUM_FIXED_PERF_COUNTERS = (edx & 0x0F);
X86_IA32_FIXED_PERF_COUNTERS_MASK =
((1UL << X86_IA32_NUM_FIXED_PERF_COUNTERS) - 1) <<
X86_IA32_BASE_FIXED_PERF_COUNTERS;
perf_counters_discovered = 1;
kprintf("X86_IA32_NUM_PERF_COUNTERS: %d, X86_IA32_NUM_FIXED_PERF_COUNTERS: %d\n",
X86_IA32_NUM_PERF_COUNTERS, X86_IA32_NUM_FIXED_PERF_COUNTERS);
}
/* Clear Fixed Counter Control */
value = rdmsr(MSR_PERF_FIXED_CTRL);
value &= 0xfffffffffffff000L;
wrmsr(MSR_PERF_FIXED_CTRL, value);
/* Clear Generic Counter Control */
for(i = 0; i < X86_IA32_NUM_PERF_COUNTERS; i++) {
wrmsr(MSR_IA32_PERFEVTSEL0 + i, 0);
}
/* Enable PMC Control */
value = rdmsr(MSR_PERF_GLOBAL_CTRL);
value |= X86_IA32_PERF_COUNTERS_MASK;
value |= X86_IA32_FIXED_PERF_COUNTERS_MASK;
wrmsr(MSR_PERF_GLOBAL_CTRL, value);
value = rdmsr(MSR_PERF_GLOBAL_CTRL);
value |= X86_IA32_PERF_COUNTERS_MASK;
value |= X86_IA32_FIXED_PERF_COUNTERS_MASK;
wrmsr(MSR_PERF_GLOBAL_CTRL, value);
}
static int set_perfctr_x86_direct(int counter, int mode, unsigned int value)
@ -63,12 +105,12 @@ static int set_perfctr_x86_direct(int counter, int mode, unsigned int value)
wrmsr(MSR_IA32_PERFEVTSEL0 + counter, value);
//kprintf("wrmsr: %d <= %x\n", MSR_PERF_GLOBAL_CTRL, 0);
kprintf("wrmsr: %d <= %x\n", MSR_IA32_PERFEVTSEL0 + counter, value);
//kprintf("wrmsr: %d <= %x\n", MSR_IA32_PERFEVTSEL0 + counter, value);
return 0;
}
static int set_pmc_x86_direct(int counter, unsigned long val)
static int set_pmc_x86_direct(int counter, long val)
{
unsigned long cnt_bit = 0;
@ -76,6 +118,8 @@ static int set_pmc_x86_direct(int counter, unsigned long val)
return -EINVAL;
}
val &= 0x000000ffffffffff; // 40bit Mask
cnt_bit = 1UL << counter;
if ( cnt_bit & X86_IA32_PERF_COUNTERS_MASK ) {
// set generic pmc
@ -102,7 +146,7 @@ static int set_perfctr_x86(int counter, int event, int mask, int inv, int count,
static int set_fixed_counter(int counter, int mode)
{
unsigned long value = 0;
unsigned int ctr_mask = 0x7;
unsigned int ctr_mask = 0xf;
int counter_idx = counter - X86_IA32_BASE_FIXED_PERF_COUNTERS ;
unsigned int set_val = 0;
@ -183,6 +227,24 @@ int ihk_mc_perfctr_stop(unsigned long counter_mask)
value &= ~counter_mask;
wrmsr(MSR_PERF_GLOBAL_CTRL, value);
if(counter_mask >> 32 & 0x1) {
value = rdmsr(MSR_PERF_FIXED_CTRL);
value &= ~(0xf);
wrmsr(MSR_PERF_FIXED_CTRL, value);
}
if(counter_mask >> 32 & 0x2) {
value = rdmsr(MSR_PERF_FIXED_CTRL);
value &= ~(0xf << 4);
wrmsr(MSR_PERF_FIXED_CTRL, value);
}
if(counter_mask >> 32 & 0x4) {
value = rdmsr(MSR_PERF_FIXED_CTRL);
value &= ~(0xf << 8);
wrmsr(MSR_PERF_FIXED_CTRL, value);
}
return 0;
}
@ -190,7 +252,7 @@ int ihk_mc_perfctr_stop(unsigned long counter_mask)
int ihk_mc_perfctr_fixed_init(int counter, int mode)
{
unsigned long value = 0;
unsigned int ctr_mask = 0x7;
unsigned int ctr_mask = 0xf;
int counter_idx = counter - X86_IA32_BASE_FIXED_PERF_COUNTERS ;
unsigned int set_val = 0;
@ -210,6 +272,9 @@ int ihk_mc_perfctr_fixed_init(int counter, int mode)
set_val |= 1;
}
// enable PMI on overflow
set_val |= 1 << 3;
set_val <<= counter_idx * 4;
value |= set_val;
@ -223,7 +288,7 @@ int ihk_mc_perfctr_reset(int counter)
return set_pmc_x86_direct(counter, 0);
}
int ihk_mc_perfctr_set(int counter, unsigned long val)
int ihk_mc_perfctr_set(int counter, long val)
{
return set_pmc_x86_direct(counter, val);
}
@ -297,23 +362,33 @@ unsigned long ihk_mc_perfctr_read_msr(int counter)
return retval;
}
int ihk_mc_perfctr_alloc_counter(unsigned long pmc_status)
int ihk_mc_perfctr_alloc_counter(unsigned int *type, unsigned long *config, unsigned long pmc_status)
{
int ret = -1;
int i = 0;
int ret = -1;
// find avail generic counter
for(i = 0; i < X86_IA32_NUM_PERF_COUNTERS; i++) {
if(*type == PERF_TYPE_HARDWARE) {
switch(*config){
case PERF_COUNT_HW_INSTRUCTIONS :
*type = PERF_TYPE_RAW;
*config = 0x5300c0;
break;
default :
// Unexpected config
return -1;
}
}
else if(*type != PERF_TYPE_RAW) {
return -1;
}
// find avail generic counter
for(i = 0; i < X86_IA32_NUM_PERF_COUNTERS; i++) {
if(!(pmc_status & (1 << i))) {
ret = i;
pmc_status |= (1 << i);
break;
}
}
if(ret < 0){
return ret;
}
return ret;
return ret;
}
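For reference, the counter discovery added to x86_init_perfctr() above decodes CPUID leaf 0AH; a standalone sketch using the GCC cpuid helper (field positions per the Intel SDM):

#include <cpuid.h>	/* GCC/Clang helper, for illustration */
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(0x0a, &eax, &ebx, &ecx, &edx))
		return 1;
	/* EAX[15:8]: number of general-purpose counters per logical CPU;
	 * EDX[4:0]: number of fixed-function counters. */
	printf("generic: %u, fixed: %u\n",
	       (eax >> 8) & 0xff, edx & 0x1f);
	return 0;
}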

View File

@ -38,6 +38,8 @@ void set_signal(int sig, void *regs0, siginfo_t *info);
void check_signal(unsigned long rc, void *regs0, int num);
extern unsigned long do_fork(int, unsigned long, unsigned long, unsigned long,
unsigned long, unsigned long, unsigned long);
extern int get_xsave_size();
extern uint64_t get_xsave_mask();
//#define DEBUG_PRINT_SC
@ -54,6 +56,7 @@ uintptr_t debug_constants[] = {
offsetof(struct cpu_local_var, current),
offsetof(struct cpu_local_var, runq),
offsetof(struct cpu_local_var, status),
offsetof(struct cpu_local_var, idle),
offsetof(struct thread, ctx),
offsetof(struct thread, sched_list),
offsetof(struct thread, proc),
@ -219,6 +222,7 @@ SYSCALL_DECLARE(rt_sigreturn)
struct x86_user_context *regs;
struct sigsp ksigsp;
struct sigsp *sigsp;
int xsavesize = get_xsave_size();
asm ("movq %%gs:(%1),%0"
: "=r"(regs)
@ -265,12 +269,31 @@ SYSCALL_DECLARE(rt_sigreturn)
check_signal(0, regs, 0);
check_need_resched();
}
if(ksigsp.fpregs && xsavesize){
void *fpregs = kmalloc(xsavesize + 64, IHK_MC_AP_NOWAIT);
if(fpregs){
uint64_t xsave_mask = get_xsave_mask();
unsigned int low = (unsigned int)xsave_mask;
unsigned int high = (unsigned int)(xsave_mask >> 32);
struct xsave_struct *kfpregs;
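/* XSAVE/XRSTOR require a 64-byte aligned buffer: the area was
 * over-allocated by 64 bytes so the pointer can be rounded up. */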
kfpregs = (void *)((((unsigned long)fpregs) + 63) & ~63);
if(copy_from_user(kfpregs, ksigsp.fpregs, xsavesize))
return -EFAULT;
asm volatile("xrstor %0" : : "m"(*kfpregs), "a"(low), "d"(high) : "memory");
kfree(fpregs);
}
}
return sigsp->sigrc;
}
extern struct cpu_local_var *clv;
extern unsigned long do_kill(struct thread *thread, int pid, int tid, int sig, struct siginfo *info, int ptracecont);
extern void interrupt_syscall(int all, int pid);
extern void interrupt_syscall(int pid, int tid);
extern int num_processors;
#define RFLAGS_MASK (RFLAGS_CF | RFLAGS_PF | RFLAGS_AF | RFLAGS_ZF | \
@ -521,14 +544,14 @@ void ptrace_report_signal(struct thread *thread, int sig)
int parent_pid;
struct siginfo info;
dkprintf("ptrace_report_signal,pid=%d\n", thread->proc->pid);
dkprintf("ptrace_report_signal, tid=%d, pid=%d\n", thread->tid, thread->proc->pid);
mcs_rwlock_writer_lock(&proc->update_lock, &lock);
if(!(proc->ptrace & PT_TRACED)){
mcs_rwlock_writer_unlock(&proc->update_lock, &lock);
return;
}
proc->exit_status = sig;
thread->exit_status = sig;
/* Transition thread state */
proc->status = PS_TRACED;
thread->status = PS_TRACED;
@ -546,8 +569,8 @@ void ptrace_report_signal(struct thread *thread, int sig)
memset(&info, '\0', sizeof info);
info.si_signo = SIGCHLD;
info.si_code = CLD_TRAPPED;
info._sifields._sigchld.si_pid = thread->proc->pid;
info._sifields._sigchld.si_status = thread->proc->exit_status;
info._sifields._sigchld.si_pid = thread->tid;
info._sifields._sigchld.si_status = thread->exit_status;
do_kill(cpu_local_var(current), parent_pid, -1, SIGCHLD, &info, 0);
/* Wake parent (if sleeping in wait4()) */
waitq_wakeup(&proc->parent->waitpid_q);
@ -672,10 +695,10 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
int orgsig;
int ptraceflag = 0;
struct mcs_rwlock_node_irqsave lock;
unsigned long irqstate;
struct mcs_rwlock_node_irqsave mcs_rw_node;
for(w = pending->sigmask.__val[0], sig = 0; w; sig++, w >>= 1);
dkprintf("do_signal,pid=%d,sig=%d\n", proc->pid, sig);
dkprintf("do_signal(): tid=%d, pid=%d, sig=%d\n", thread->tid, proc->pid, sig);
orgsig = sig;
if((proc->ptrace & PT_TRACED) &&
@ -695,18 +718,20 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
rc = regs->gpr.rax;
}
irqstate = ihk_mc_spinlock_lock(&thread->sigcommon->lock);
mcs_rwlock_writer_lock(&thread->sigcommon->lock, &mcs_rw_node);
k = thread->sigcommon->action + sig - 1;
if(k->sa.sa_handler == SIG_IGN){
kfree(pending);
ihk_mc_spinlock_unlock(&thread->sigcommon->lock, irqstate);
mcs_rwlock_writer_unlock(&thread->sigcommon->lock, &mcs_rw_node);
return;
}
else if(k->sa.sa_handler){
unsigned long *usp; /* user stack */
struct sigsp ksigsp;
struct sigsp *sigsp;
int xsavesize = get_xsave_size();
unsigned long fpregs;
if((k->sa.sa_flags & SA_ONSTACK) &&
!(thread->sigstack.ss_flags & SS_DISABLE) &&
@ -719,7 +744,8 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
else{
usp = (unsigned long *)regs->gpr.rsp;
}
sigsp = ((struct sigsp *)usp) - 1;
fpregs = (unsigned long)usp - xsavesize;
sigsp = ((struct sigsp *)fpregs) - 1;
sigsp = (struct sigsp *)((unsigned long)sigsp & 0xfffffffffffffff0UL);
memset(&ksigsp, '\0', sizeof ksigsp);
@ -751,19 +777,43 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
ksigsp.restart = isrestart(num, rc, sig, k->sa.sa_flags & SA_RESTART);
if(num != 0 && rc == -EINTR && sig == SIGCHLD)
ksigsp.restart = 1;
if(xsavesize){
uint64_t xsave_mask = get_xsave_mask();
unsigned int low = (unsigned int)xsave_mask;
unsigned int high = (unsigned int)(xsave_mask >> 32);
void *_kfpregs = kmalloc(xsavesize + 64, IHK_MC_AP_NOWAIT);
struct xsave_struct *kfpregs;
if(!_kfpregs){
kfree(pending);
kfree(_kfpregs);
kprintf("do_signal,no space available\n");
terminate(0, sig);
return;
}
kfpregs = (void *)((((unsigned long)_kfpregs) + 63) & ~63);
memset(kfpregs, '\0', xsavesize);
asm volatile("xsave %0" : : "m"(*kfpregs), "a"(low), "d"(high) : "memory");
if(copy_to_user((void *)fpregs, kfpregs, xsavesize)){
kfree(pending);
kfree(_kfpregs);
kprintf("do_signal,write_process_vm failed\n");
terminate(0, sig);
return;
}
ksigsp.fpregs = (void *)fpregs;
kfree(_kfpregs);
}
memcpy(&ksigsp.info, &pending->info, sizeof(siginfo_t));
if(copy_to_user(sigsp, &ksigsp, sizeof ksigsp)){
kfree(pending);
ihk_mc_spinlock_unlock(&thread->sigcommon->lock, irqstate);
mcs_rwlock_writer_unlock(&thread->sigcommon->lock, &mcs_rw_node);
kprintf("do_signal,write_process_vm failed\n");
terminate(0, sig);
return;
}
usp = (unsigned long *)sigsp;
usp--;
*usp = (unsigned long)k->sa.sa_restorer;
@ -777,7 +827,7 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
if(!(k->sa.sa_flags & SA_NODEFER))
thread->sigmask.__val[0] |= pending->sigmask.__val[0];
kfree(pending);
ihk_mc_spinlock_unlock(&thread->sigcommon->lock, irqstate);
mcs_rwlock_writer_unlock(&thread->sigcommon->lock, &mcs_rw_node);
if(regs->gpr.rflags & RFLAGS_TF){
struct siginfo info;
@ -803,7 +853,7 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
}
else
kfree(pending);
ihk_mc_spinlock_unlock(&thread->sigcommon->lock, irqstate);
mcs_rwlock_writer_unlock(&thread->sigcommon->lock, &mcs_rw_node);
switch (sig) {
case SIGSTOP:
case SIGTSTP:
@ -835,7 +885,8 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
/* Wake up the parent who tried wait4 and sleeping */
waitq_wakeup(&proc->parent->waitpid_q);
dkprintf("do_signal,SIGSTOP,sleeping\n");
dkprintf("do_signal(): pid: %d, tid: %d SIGSTOP, sleeping\n",
proc->pid, thread->tid);
/* Sleep */
schedule();
dkprintf("SIGSTOP(): woken up\n");
@ -849,7 +900,7 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
/* Update thread state in fork tree */
mcs_rwlock_writer_lock(&proc->update_lock, &lock);
proc->exit_status = SIGTRAP;
thread->exit_status = SIGTRAP;
proc->status = PS_TRACED;
thread->status = PS_TRACED;
mcs_rwlock_writer_unlock(&proc->update_lock, &lock);
@ -903,11 +954,11 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
static struct sig_pending *
getsigpending(struct thread *thread, int delflag){
struct list_head *head;
ihk_spinlock_t *lock;
mcs_rwlock_lock_t *lock;
struct mcs_rwlock_node_irqsave mcs_rw_node;
struct sig_pending *next;
struct sig_pending *pending;
__sigset_t w;
int irqstate;
__sigset_t x;
int sig;
struct k_sigaction *k;
@ -916,8 +967,12 @@ getsigpending(struct thread *thread, int delflag){
lock = &thread->sigcommon->lock;
head = &thread->sigcommon->sigpending;
for(;;){
irqstate = ihk_mc_spinlock_lock(lock);
for(;;) {
if (delflag)
mcs_rwlock_writer_lock(lock, &mcs_rw_node);
else
mcs_rwlock_reader_lock(lock, &mcs_rw_node);
list_for_each_entry_safe(pending, next, head, list){
for(x = pending->sigmask.__val[0], sig = 0; x; sig++, x >>= 1);
k = thread->sigcommon->action + sig - 1;
@ -926,17 +981,26 @@ getsigpending(struct thread *thread, int delflag){
(k->sa.sa_handler != (void *)1 &&
k->sa.sa_handler != NULL)){
if(!(pending->sigmask.__val[0] & w)){
if(delflag)
if(delflag)
list_del(&pending->list);
ihk_mc_spinlock_unlock(lock, irqstate);
if (delflag)
mcs_rwlock_writer_unlock(lock, &mcs_rw_node);
else
mcs_rwlock_reader_unlock(lock, &mcs_rw_node);
return pending;
}
}
}
ihk_mc_spinlock_unlock(lock, irqstate);
if (delflag)
mcs_rwlock_writer_unlock(lock, &mcs_rw_node);
else
mcs_rwlock_reader_unlock(lock, &mcs_rw_node);
if(lock == &thread->sigpendinglock)
return NULL;
lock = &thread->sigpendinglock;
head = &thread->sigpending;
}
@ -984,22 +1048,25 @@ check_signal(unsigned long rc, void *regs0, int num)
}
}
ihk_mc_spinlock_unlock(&(cpu_local_var(runq_lock)), irqstate);
return;
goto out;
}
if(regs != NULL && !interrupt_from_user(regs)) {
return;
goto out;
}
for(;;){
pending = getsigpending(thread, 1);
if(!pending) {
dkprintf("check_signal,queue is empty\n");
return;
goto out;
}
do_signal(rc, regs, thread, pending, num);
}
out:
return;
}
unsigned long
@ -1013,7 +1080,8 @@ do_kill(struct thread *thread, int pid, int tid, int sig, siginfo_t *info,
struct thread *tthread = NULL;
int i;
__sigset_t mask;
ihk_spinlock_t *savelock = NULL;
mcs_rwlock_lock_t *savelock = NULL;
struct mcs_rwlock_node mcs_rw_node;
struct list_head *head = NULL;
int rc;
unsigned long irqstate = 0;
@ -1197,7 +1265,7 @@ done:
doint = 0;
ihk_mc_spinlock_lock_noirq(savelock);
mcs_rwlock_writer_lock_noirq(savelock, &mcs_rw_node);
/* Put signal event even when handler is SIG_IGN or SIG_DFL
because target ptraced thread must call ptrace_report_signal
@ -1236,11 +1304,11 @@ done:
}
}
}
ihk_mc_spinlock_unlock_noirq(savelock);
mcs_rwlock_writer_unlock_noirq(savelock, &mcs_rw_node);
cpu_restore_interrupt(irqstate);
if (doint && !(mask & tthread->sigmask.__val[0])) {
int cpuid = tthread->cpu_id;
int tid = tthread->tid;
int pid = tproc->pid;
int status = tthread->status;
@ -1251,12 +1319,12 @@ done:
}
if(!tthread->proc->nohost)
interrupt_syscall(pid, cpuid);
interrupt_syscall(pid, tid);
if (status != PS_RUNNING) {
if(sig == SIGKILL){
/* Wake up the target only when stopped by ptrace-reporting */
sched_wakeup_thread(tthread, PS_TRACED | PS_STOPPED);
sched_wakeup_thread(tthread, PS_TRACED | PS_STOPPED | PS_INTERRUPTIBLE);
}
else if(sig == SIGCONT || ptracecont == 1){
/* Wake up the target only when stopped by SIGSTOP */
@ -1387,9 +1455,8 @@ SYSCALL_DECLARE(mmap)
goto out;
}
if ((addr < region->user_start)
|| (region->user_end <= addr)
|| ((region->user_end - addr) < len)) {
if ((flags & MAP_FIXED) && ((addr < region->user_start)
|| (region->user_end <= addr))) {
ekprintf("sys_mmap(%lx,%lx,%x,%x,%x,%lx):ENOMEM\n",
addr0, len0, prot, flags0, fd, off0);
error = -ENOMEM;
@ -1513,6 +1580,7 @@ static int vdso_get_vdso_info(void)
struct ihk_ikc_channel_desc *ch = cpu_local_var(syscall_channel);
dkprintf("vdso_get_vdso_info()\n");
memset(&vdso, '\0', sizeof vdso);
vdso.busy = 1;
vdso.vdso_npages = 0;

View File

@ -0,0 +1,28 @@
# irqbalance is a daemon process that distributes interrupts across
# CPUs on SMP systems. The default is to rebalance once every 10
# seconds. This is the environment file that is specified to systemd via the
# EnvironmentFile key in the service unit file (or via whatever method the init
# system you're using has).
#
# ONESHOT=yes
# after starting, wait for a minute, then look at the interrupt
# load and balance it once; after balancing exit and do not change
# it again.
#IRQBALANCE_ONESHOT=
#
# IRQBALANCE_BANNED_CPUS
# 64 bit bitmask which allows you to indicate which CPUs should
# be skipped when rebalancing IRQs. CPU numbers which have their
# corresponding bits set to one in this mask will not have any
# IRQs assigned to them on rebalance
#
IRQBALANCE_BANNED_CPUS=%mask%
#
# IRQBALANCE_ARGS
# append any args here to the irqbalance daemon as documented in the man page
#
IRQBALANCE_ARGS=--banirq=%banirq%

View File

@ -0,0 +1,10 @@
[Unit]
Description=irqbalance daemon
After=syslog.target
[Service]
EnvironmentFile=@ETCDIR@/irqbalance_mck
ExecStart=/usr/sbin/irqbalance --foreground $IRQBALANCE_ARGS
[Install]
WantedBy=multi-user.target

View File

@ -3,37 +3,61 @@
# IHK SMP-x86 example boot script.
# author: Balazs Gerofi <bgerofi@riken.jp>
# Copyright (C) 2014 RIKEN AICS
#
# This is an example script for loading IHK, configuring a partition and
# booting McKernel on it.
# The script reserves half of the CPU cores and 512MB of RAM from NUMA node 0
# when IHK is loaded for the first time, otherwise it destroys the current
# McKernel instance and reboots it using the same set of resources as it used
# previously.
#
# This is an example script for loading IHK, configuring a partition and
# booting McKernel on it. Unless specific CPUs and memory are requested,
# the script reserves half of the CPU cores and 512MB of RAM from
# NUMA node 0 when IHK is loaded for the first time.
# Otherwise, it destroys the current McKernel instance and reboots it using
# the same set of resources as it used previously.
# Note that the script does not output anything unless an error occurs.
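#
# Example invocation (illustrative values): reserve CPUs 2-7 and 4G of RAM
# from NUMA node 0, enable turbo boost and kmsg forwarding to syslog:
#   mcreboot-smp-x86.sh -c 2-7 -m 4G@0 -t -k 1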
prefix="@prefix@"
BINDIR="@BINDIR@"
SBINDIR="@SBINDIR@"
KMODDIR="@KMODDIR@"
KERNDIR="@KERNDIR@"
BINDIR="${prefix}/bin"
SBINDIR="${prefix}/sbin"
ETCDIR=@ETCDIR@
KMODDIR="${prefix}/kmod"
KERNDIR="${prefix}/@TARGET@/kernel"
ENABLE_MCOVERLAYFS="@ENABLE_MCOVERLAYFS@"
mem="512M@0"
cpus=""
if [ "${BASH_VERSINFO[0]}" -lt 4 ]; then
echo "You need at least bash-4.0 to run this script." >&2
exit 1
fi
INTERVAL=1
LOGMODE=0
while getopts :i:k: OPT
facility="LOG_LOCAL6"
chown_option=`logname 2> /dev/null`
if [ "`systemctl status irqbalance_mck.service 2> /dev/null |grep -E 'Active: active'`" != "" -o "`systemctl status irqbalance.service 2> /dev/null |grep -E 'Active: active'`" != "" ]; then
irqbalance_used="yes"
else
irqbalance_used="no"
fi
turbo=""
while getopts :ti:k:c:m:o:f: OPT
do
case ${OPT} in
f) facility=${OPTARG}
;;
o) chown_option=${OPTARG}
;;
i) INTERVAL=${OPTARG}
expr "${INTERVAL}" + 1 > /dev/null 2>&1
if [ $? -ge 2 ]
then
echo "invalid -i value"
echo "invalid -i value" >&2
exit 1
fi
if [ ${INTERVAL} -le 0 ]
then
echo "invalid -i value"
echo "invalid -i value" >&2
exit 1
fi
;;
@ -41,22 +65,123 @@ do
expr "${LOGMODE}" + 1 > /dev/null 2>&1
if [ $? -ge 2 ]
then
echo "invalid -k value"
echo "invalid -k value" >&2
exit 1
fi
if [ ${LOGMODE} -lt 0 -o ${LOGMODE} -gt 2 ]
then
echo "invalid -k value"
echo "invalid -k value" >&2
exit 1
fi
;;
*) echo "invalid option -${OPT}"
c) cpus=${OPTARG}
;;
m) mem=${OPTARG}
;;
t) turbo="turbo"
;;
*) echo "invalid option -${OPT}" >&2
exit 1
esac
done
mem="512M@0"
cpus=""
#
# Revert any state that has been initialized before the error occurred.
#
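# Each case below ends with ";&", bash 4's fall-through terminator, so
# entering the ladder at any given state also runs every revert step that
# follows it (hence the bash >= 4.0 check at the top of this script).
#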
error_exit() {
local status=$1
case $status in
mcos_sys_mounted)
if [ "$enable_mcoverlay" == "yes" ]; then
umount /tmp/mcos/mcos0_sys
fi
;&
mcos_proc_mounted)
if [ "$enable_mcoverlay" == "yes" ]; then
umount /tmp/mcos/mcos0_proc
fi
;&
mcoverlayfs_loaded)
if [ "$enable_mcoverlay" == "yes" ]; then
rmmod mcoverlay
fi
;&
linux_proc_bind_mounted)
if [ "$enable_mcoverlay" == "yes" ]; then
umount /tmp/mcos/linux_proc
fi
;&
tmp_mcos_mounted)
if [ "$enable_mcoverlay" == "yes" ]; then
umount /tmp/mcos
fi
;&
tmp_mcos_created)
if [ "$enable_mcoverlay" == "yes" ]; then
rm -rf /tmp/mcos
fi
;&
os_created)
# Destroy all LWK instances
if ls /dev/mcos* 1>/dev/null 2>&1; then
for i in /dev/mcos*; do
ind=`echo $i|cut -c10-`;
if ! ${SBINDIR}/ihkconfig 0 destroy $ind; then
echo "warning: failed to destroy LWK instance $ind" >&2
fi
done
fi
;&
mcctrl_loaded)
rmmod mcctrl || echo "warning: failed to remove mcctrl" >&2
;&
mem_reserved)
mem=`${SBINDIR}/ihkconfig 0 query mem`
if [ "${mem}" != "" ]; then
if ! ${SBINDIR}/ihkconfig 0 release mem $mem > /dev/null; then
echo "warning: failed to release memory" >&2
fi
fi
;&
cpus_reserved)
cpus=`${SBINDIR}/ihkconfig 0 query cpu`
if [ "${cpus}" != "" ]; then
if ! ${SBINDIR}/ihkconfig 0 release cpu $cpus > /dev/null; then
echo "warning: failed to release CPUs" >&2
fi
fi
;&
ihk_smp_loaded)
rmmod ihk_smp_x86 || echo "warning: failed to remove ihk_smp_x86" >&2
;&
ihk_loaded)
rmmod ihk || echo "warning: failed to remove ihk" >&2
;&
irqbalance_stopped)
if [ "`systemctl status irqbalance_mck.service 2> /dev/null |grep -E 'Active: active'`" != "" ]; then
if ! systemctl stop irqbalance_mck.service 2>/dev/null; then
echo "warning: failed to stop irqbalance_mck" >&2
fi
if ! systemctl disable irqbalance_mck.service >/dev/null 2>/dev/null; then
echo "warning: failed to disable irqbalance_mck" >&2
fi
if ! etcdir=@ETCDIR@ perl -e '$etcdir=$ENV{'etcdir'}; @files = grep { -f } glob "$etcdir/proc/irq/*/smp_affinity"; foreach $file (@files) { $dest = substr($file, length($etcdir)); if(0) {print "cp $file $dest\n";} system("cp $file $dest 2>/dev/null"); }'; then
echo "warning: failed to restore /proc/irq/*/smp_affinity" >&2
fi
if ! systemctl start irqbalance.service; then
echo "warning: failed to start irqbalance" >&2;
fi
fi
;&
initial)
# Nothing more to revert
;;
esac
exit 1
}
ihk_ikc_irq_core=0
release=`uname -r`
@ -65,13 +190,28 @@ minor=`echo ${release} | sed -e 's/^[0-9]*.\([0-9]*\).*/\1/'`
patch=`echo ${release} | sed -e 's/^[0-9]*.[0-9]*.\([0-9]*\).*/\1/'`
linux_version_code=`expr \( ${major} \* 65536 \) + \( ${minor} \* 256 \) + ${patch}`
rhel_release=`echo ${release} | sed -e 's/^[0-9]*.[0-9]*.[0-9]*-\([0-9]*\).*/\1/'`
if [ "${release}" == "${rhel_release}" ]; then rhel_release=""; fi
if [ "${ENABLE_MCOVERLAYFS}" == "yes" ]; then
enable_mcoverlay=`if ( [ ${linux_version_code} -ge 262144 ] && [ ${linux_version_code} -lt 262400 ] ); then echo "yes"; else echo "no"; fi`
else
enable_mcoverlay=no
if [ "${release}" == "${rhel_release}" ]; then
rhel_release="";
fi
enable_mcoverlay="no"
if [ "${ENABLE_MCOVERLAYFS}" == "yes" ]; then
if [ "${rhel_release}" == "" ]; then
if [ ${linux_version_code} -ge 262144 -a ${linux_version_code} -lt 262400 ]; then
enable_mcoverlay="yes"
fi
if [ ${linux_version_code} -ge 263680 -a ${linux_version_code} -lt 263936 ]; then
enable_mcoverlay="yes"
fi
else
if [ ${linux_version_code} -eq 199168 -a ${rhel_release} -ge 327 ]; then
enable_mcoverlay="yes"
fi
fi
fi
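# For reference, a version code decodes as major*65536 + minor*256 + patch,
# so 262144..262400 covers 4.0.x, 263680..263936 covers 4.6.x, and 199168 is
# 3.10.0 (the RHEL 7 base kernel, release >= 327).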
# Figure out CPUs if not requested by user
if [ "$cpus" == "" ]; then
# Get the number of CPUs on NUMA node 0
nr_cpus=`lscpu --parse | awk -F"," '{if ($4 == 0) print $4}' | wc -l`
@ -79,12 +219,10 @@ if [ "$cpus" == "" ]; then
# Use the second half of the cores
let nr_cpus="$nr_cpus / 2"
cpus=`lscpu --parse | awk -F"," '{if ($4 == 0) print $1}' | tail -n $nr_cpus | xargs echo -n | sed 's/ /,/g'`
if [ "$cpus" == "" ]; then echo "error: no available CPUs on NUMA node 0?"; exit; fi
fi
# Remove delegator if loaded
if [ "`lsmod | grep mcctrl`" != "" ]; then
if ! rmmod mcctrl; then echo "error: removing mcctrl"; exit; fi
if [ "$cpus" == "" ]; then
echo "error: no available CPUs on NUMA node 0?" >&2
exit 1
fi
fi
# Remove mcoverlay if loaded
@ -95,105 +233,243 @@ if [ "$enable_mcoverlay" == "yes" ]; then
if [ "`cat /proc/mounts | grep /tmp/mcos/linux_proc`" != "" ]; then umount -l /tmp/mcos/linux_proc; fi
if [ "`cat /proc/mounts | grep /tmp/mcos`" != "" ]; then umount -l /tmp/mcos; fi
if [ -e /tmp/mcos ]; then rm -rf /tmp/mcos; fi
if ! rmmod mcoverlay; then echo "error: removing mcoverlay"; exit; fi
if ! rmmod mcoverlay; then
echo "error: removing mcoverlay" >&2
error_exit "initial"
fi
fi
fi
# Stop irqbalance
if [ "${irqbalance_used}" == "yes" ]; then
systemctl stop irqbalance_mck.service 2>/dev/null
if ! systemctl stop irqbalance.service 2>/dev/null ; then
echo "error: stopping irqbalance" >&2
error_exit "initial"
fi;
fi
# Start mcklogd. Note that McKernel blocks when kmsg buffer is full
# with '-k 1' until mcklogd unblocks it, so starting mcklogd must precede
# booting McKernel.
if [ ${LOGMODE} -ne 0 ]; then
# Stop mcklogd which has survived McKernel shutdown because
# mcstop+release.sh is not used
pkill mcklogd
SBINDIR=${SBINDIR} ${SBINDIR}/mcklogd -i ${INTERVAL} -f ${facility}
fi
# Load IHK if not loaded
if [ "`lsmod | grep ihk`" == "" ]; then
if ! insmod ${KMODDIR}/ihk.ko; then echo "error: loading ihk"; exit; fi;
if ! insmod ${KMODDIR}/ihk.ko; then
echo "error: loading ihk" >&2
error_exit "irqbalance_stopped"
fi
fi
# Drop Linux caches to free memory
sync && echo 3 > /proc/sys/vm/drop_caches
# Merge free memory areas into large, physically contiguous ones
echo 1 > /proc/sys/vm/compact_memory 2>/dev/null
# Load IHK-SMP if not loaded and reserve CPUs and memory
if [ "`lsmod | grep ihk_smp_x86`" == "" ]; then
ihk_irq=""
for i in `seq 64 255`; do
if [ ! -d /proc/irq/$i ] && [ "`cat /proc/interrupts | grep ":" | awk '{print $1}' | grep -o '[0-9]*' | grep -e "^$i$"`" == "" ]; then
ihk_irq=$i
break
fi
done
if [ "$ihk_irq" == "" ]; then echo "error: no IRQ available"; exit; fi
if ! insmod ${KMODDIR}/ihk-smp-x86.ko ihk_start_irq=$ihk_irq ihk_ikc_irq_core=$ihk_ikc_irq_core; then echo "error: loading ihk-smp-x86"; exit; fi;
if ! ${SBINDIR}/ihkconfig 0 reserve cpu ${cpus}; then echo "error: reserving CPUs"; exit; fi
if ! ${SBINDIR}/ihkconfig 0 reserve mem ${mem}; then echo "error: reserving memory"; exit; fi
# If loaded, but no resources allocated, get CPUs and memory
else
if ! ${SBINDIR}/ihkconfig 0 query cpu > /dev/null; then echo "error: querying cpus"; exit; fi
cpus_allocated=`${SBINDIR}/ihkosctl 0 query cpu`
if [ "$cpus_allocated" == "" ]; then
if ! ${SBINDIR}/ihkconfig 0 reserve cpu ${cpus}; then echo "error: reserving CPUs"; exit; fi
fi
if ! ${SBINDIR}/ihkosctl 0 query mem > /dev/null; then echo "error: querying memory"; exit; fi
mem_allocated=`${SBINDIR}/ihkosctl 0 query mem`
if [ "$mem_allocated" == "" ]; then
if ! ${SBINDIR}/ihkconfig 0 reserve mem ${mem}; then echo "error: reserving memory"; exit; fi
ihk_irq=""
for i in `seq 64 255`; do
if [ ! -d /proc/irq/$i ] && [ "`cat /proc/interrupts | grep ":" | awk '{print $1}' | grep -o '[0-9]*' | grep -e "^$i$"`" == "" ]; then
ihk_irq=$i
break
fi
done
if [ "$ihk_irq" == "" ]; then
echo "error: no IRQ available" >&2
error_exit "ihk_loaded"
fi
if ! insmod ${KMODDIR}/ihk-smp-x86.ko ihk_start_irq=$ihk_irq ihk_ikc_irq_core=$ihk_ikc_irq_core; then
echo "error: loading ihk-smp-x86" >&2
error_exit "ihk_loaded"
fi
if ! ${SBINDIR}/ihkconfig 0 reserve cpu ${cpus}; then
echo "error: reserving CPUs" >&2;
error_exit "ihk_smp_loaded"
fi
if ! ${SBINDIR}/ihkconfig 0 reserve mem ${mem}; then
echo "error: reserving memory" >&2
error_exit "cpus_reserved"
fi
fi
# Check for existing OS instance and destroy
if [ -c /dev/mcos0 ]; then
# Query CPU cores and memory of OS instance so that the same values are used as previously
if ! ${SBINDIR}/ihkosctl 0 query cpu > /dev/null; then echo "error: querying cpus"; exit; fi
cpus=`${SBINDIR}/ihkosctl 0 query cpu`
if ! ${SBINDIR}/ihkosctl 0 query mem > /dev/null; then echo "error: querying memory"; exit; fi
mem=`${SBINDIR}/ihkosctl 0 query mem`
if ! ${SBINDIR}/ihkconfig 0 destroy 0; then echo "warning: destroy failed"; fi
else
# Otherwise query IHK-SMP for resources
if ! ${SBINDIR}/ihkconfig 0 query cpu > /dev/null; then echo "error: querying cpus"; exit; fi
cpus=`${SBINDIR}/ihkconfig 0 query cpu`
if ! ${SBINDIR}/ihkconfig 0 query mem > /dev/null; then echo "error: querying memory"; exit; fi
mem=`${SBINDIR}/ihkconfig 0 query mem`
# Load mcctrl if not loaded
if [ "`lsmod | grep mcctrl`" == "" ]; then
if ! insmod ${KMODDIR}/mcctrl.ko; then
echo "error: inserting mcctrl.ko" >&2
error_exit "mem_reserved"
fi
fi
if ! ${SBINDIR}/ihkconfig 0 create; then echo "error: create"; exit; fi
if ! ${SBINDIR}/ihkosctl 0 assign cpu ${cpus}; then echo "error: assign CPUs"; exit; fi
if ! ${SBINDIR}/ihkosctl 0 assign mem ${mem}; then echo "error: assign memory"; exit; fi
if ! ${SBINDIR}/ihkosctl 0 load ${KERNDIR}/mckernel.img; then echo "error: loading kernel image"; exit; fi
if ! ${SBINDIR}/ihkosctl 0 kargs "hidos ksyslogd=${LOGMODE}"; then echo "error: setting kernel arguments"; exit; fi
if ! ${SBINDIR}/ihkosctl 0 boot; then echo "error: booting"; exit; fi
if ! insmod ${KMODDIR}/mcctrl.ko; then echo "error: inserting mcctrl.ko"; exit; fi
if ! chown `logname` /dev/mcd* /dev/mcos*; then echo "error: chowning device files"; exit; fi
# Destroy all LWK instances
if ls /dev/mcos* 1>/dev/null 2>&1; then
for i in /dev/mcos*; do
ind=`echo $i|cut -c10-`;
if ! ${SBINDIR}/ihkconfig 0 destroy $ind; then
echo "error: destroying LWK instance $ind failed" >&2
error_exit "mcctrl_loaded"
fi
done
fi
# Create OS instance
if ! ${SBINDIR}/ihkconfig 0 create; then
echo "error: creating OS instance" >&2
error_exit "mcctrl_loaded"
fi
# Assign CPUs
if ! ${SBINDIR}/ihkosctl 0 assign cpu ${cpus}; then
echo "error: assign CPUs" >&2
error_exit "os_created"
fi
# Assign memory
if ! ${SBINDIR}/ihkosctl 0 assign mem ${mem}; then
echo "error: assign memory" >&2
error_exit "os_created"
fi
# Load kernel image
if ! ${SBINDIR}/ihkosctl 0 load ${KERNDIR}/mckernel.img; then
echo "error: loading kernel image: ${KERNDIR}/mckernel.img" >&2
error_exit "os_created"
fi
# Set kernel arguments
if ! ${SBINDIR}/ihkosctl 0 kargs "hidos ksyslogd=${LOGMODE} $turbo"; then
echo "error: setting kernel arguments" >&2
error_exit "os_created"
fi
# Boot OS instance
if ! ${SBINDIR}/ihkosctl 0 boot; then
echo "error: booting" >&2
error_exit "os_created"
fi
# Set device file ownership
if ! chown ${chown_option} /dev/mcd* /dev/mcos*; then
echo "warning: failed to chown device files" >&2
fi
# Overlay /proc, /sys with McKernel specific contents
if [ "$enable_mcoverlay" == "yes" ]; then
if [ ! -e /tmp/mcos ]; then mkdir -p /tmp/mcos; fi
if ! mount -t tmpfs tmpfs /tmp/mcos; then echo "error: mount /tmp/mcos"; exit; fi
if ! mount -t tmpfs tmpfs /tmp/mcos; then
echo "error: mount /tmp/mcos" >&2
error_exit "tmp_mcos_created"
fi
if [ ! -e /tmp/mcos/linux_proc ]; then mkdir -p /tmp/mcos/linux_proc; fi
if ! mount --bind /proc /tmp/mcos/linux_proc; then echo "error: mount /tmp/mcos/linux_proc"; exit; fi
if ! insmod ${KMODDIR}/mcoverlay.ko; then echo "error: inserting mcoverlay.ko"; exit; fi
if ! mount --bind /proc /tmp/mcos/linux_proc; then
echo "error: mount /tmp/mcos/linux_proc" >&2
error_exit "tmp_mcos_mounted"
fi
if ! insmod ${KMODDIR}/mcoverlay.ko; then
echo "error: inserting mcoverlay.ko" >&2
error_exit "linux_proc_bind_mounted"
fi
while [ ! -e /proc/mcos0 ]
do
sleep 1
sleep 0.1
done
if [ ! -e /tmp/mcos/mcos0_proc ]; then mkdir -p /tmp/mcos/mcos0_proc; fi
if [ ! -e /tmp/mcos/mcos0_proc_upper ]; then mkdir -p /tmp/mcos/mcos0_proc_upper; fi
if [ ! -e /tmp/mcos/mcos0_proc_work ]; then mkdir -p /tmp/mcos/mcos0_proc_work; fi
if ! mount -t mcoverlay mcoverlay -o lowerdir=/proc/mcos0:/proc,upperdir=/tmp/mcos/mcos0_proc_upper,workdir=/tmp/mcos/mcos0_proc_work,nocopyupw,nofscheck /tmp/mcos/mcos0_proc; then echo "error: mount /tmp/mcos/mcos0_proc"; exit; fi
if ! mount -t mcoverlay mcoverlay -o lowerdir=/proc/mcos0:/proc,upperdir=/tmp/mcos/mcos0_proc_upper,workdir=/tmp/mcos/mcos0_proc_work,nocopyupw,nofscheck /tmp/mcos/mcos0_proc; then
echo "error: mounting /tmp/mcos/mcos0_proc" >&2
error_exit "mcoverlayfs_loaded"
fi
# TODO: How do we revert this in case of failure?
mount --make-rprivate /proc
while [ ! -e /sys/devices/virtual/mcos/mcos0/sys ]
while [ ! -e /sys/devices/virtual/mcos/mcos0/sys/setup_complete ]
do
sleep 1
sleep 0.1
done
if [ ! -e /tmp/mcos/mcos0_sys ]; then mkdir -p /tmp/mcos/mcos0_sys; fi
if [ ! -e /tmp/mcos/mcos0_sys_upper ]; then mkdir -p /tmp/mcos/mcos0_sys_upper; fi
if [ ! -e /tmp/mcos/mcos0_sys_work ]; then mkdir -p /tmp/mcos/mcos0_sys_work; fi
if ! mount -t mcoverlay mcoverlay -o lowerdir=/sys/devices/virtual/mcos/mcos0/sys:/sys,upperdir=/tmp/mcos/mcos0_sys_upper,workdir=/tmp/mcos/mcos0_sys_work,nocopyupw,nofscheck /tmp/mcos/mcos0_sys; then echo "error: mount /tmp/mcos/mcos0_sys"; exit; fi
if ! mount -t mcoverlay mcoverlay -o lowerdir=/sys/devices/virtual/mcos/mcos0/sys:/sys,upperdir=/tmp/mcos/mcos0_sys_upper,workdir=/tmp/mcos/mcos0_sys_work,nocopyupw,nofscheck /tmp/mcos/mcos0_sys; then
echo "error: mount /tmp/mcos/mcos0_sys" >&2
error_exit "mcos_proc_mounted"
fi
# TODO: How do we revert this in case of failure?
mount --make-rprivate /sys
rm -rf /tmp/mcos/mcos0_sys/setup_complete
# Hide NUMA related files which are outside the LWK partition
for cpuid in `find /sys/devices/system/cpu/* -maxdepth 0 -name "cpu[0123456789]*" -printf "%f "`; do
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/devices/system/cpu/$cpuid" ]; then
rm -rf /tmp/mcos/mcos0_sys/devices/system/cpu/$cpuid
rm -rf /tmp/mcos/mcos0_sys/bus/cpu/devices/$cpuid
rm -rf /tmp/mcos/mcos0_sys/bus/cpu/drivers/processor/$cpuid
else
for nodeid in `find /sys/devices/system/cpu/$cpuid/* -maxdepth 0 -name "node[0123456789]*" -printf "%f "`; do
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/devices/system/cpu/$cpuid/$nodeid" ]; then
rm -f /tmp/mcos/mcos0_sys/devices/system/cpu/$cpuid/$nodeid
fi
done
fi
done
for nodeid in `find /sys/devices/system/node/* -maxdepth 0 -name "node[0123456789]*" -printf "%f "`; do
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/devices/system/node/$nodeid" ]; then
rm -rf /tmp/mcos/mcos0_sys/devices/system/node/$nodeid/*
rm -rf /tmp/mcos/mcos0_sys/bus/node/devices/$nodeid
else
# Delete non-existent symlinks
for cpuid in `find /sys/devices/system/node/$nodeid/* -maxdepth 0 -name "cpu[0123456789]*" -printf "%f "`; do
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/devices/system/node/$nodeid/$cpuid" ]; then
rm -f /tmp/mcos/mcos0_sys/devices/system/node/$nodeid/$cpuid
fi
done
rm -f /tmp/mcos/mcos0_sys/devices/system/node/$nodeid/memory*
fi
done
rm -f /tmp/mcos/mcos0_sys/devices/system/node/has_*
for cpuid in `find /sys/bus/cpu/devices/* -maxdepth 0 -name "cpu[0123456789]*" -printf "%f "`; do
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/bus/cpu/devices/$cpuid" ]; then
rm -rf /tmp/mcos/mcos0_sys/bus/cpu/devices/$cpuid
fi
done
fi
if [ ${LOGMODE} -ne 0 ]
then
SBINDIR=${SBINDIR} ${SBINDIR}/mcklogd -i ${INTERVAL}
# Start irqbalance with CPUs and IRQ for McKernel banned
if [ "${irqbalance_used}" == "yes" ]; then
if ! etcdir=@ETCDIR@ perl -e 'use File::Copy qw(copy); $etcdir=$ENV{'etcdir'}; @files = grep { -f } glob "/proc/irq/*/smp_affinity"; foreach $file (@files) { $rel = substr($file, 1); $dir=substr($rel, 0, length($rel)-length("/smp_affinity")); if(0) { print "cp $file $etcdir/$rel\n";} if(system("mkdir -p $etcdir/$dir")){ exit 1;} if(!copy($file,"$etcdir/$rel")){ exit 1;} }'; then
echo "error: saving /proc/irq/*/smp_affinity" >&2
error_exit "mcos_sys_mounted"
fi;
ncpus=`lscpu | grep -E '^CPU\(s\):' | awk '{print $2}'`
smp_affinity_mask=`echo $cpus | ncpus=$ncpus perl -e 'while(<>){@tokens = split /,/;foreach $token (@tokens) {@nums = split /-/,$token; for($num = $nums[0]; $num <= $nums[$#nums]; $num++) {$ndx=int($num/32); $mask[$ndx] |= (1<<($num % 32))}}} $nint32s = int(($ENV{'ncpus'}+31)/32); for($j = $nint32s - 1; $j >= 0; $j--) { if($j != $nint32s - 1){print ",";} $nblks = $j == $nint32s - 1 ? int(($ENV{'ncpus'} % 32)/4) : 8; for($i = $nblks - 1;$i >= 0;$i--){ printf("%01x",($mask[$j] >> ($i*4)) & 0xf);}}'`
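# Illustrative example (not part of the original script): with ncpus=8 and
# cpus="2,3", the perl above sets bits 2 and 3, so smp_affinity_mask is "0c".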
if ! ncpus=$ncpus smp_affinity_mask=$smp_affinity_mask perl -e '@dirs = grep { -d } glob "/proc/irq/*"; foreach $dir (@dirs) { $hit = 0; $affinity_str = `cat $dir/smp_affinity`; chomp $affinity_str; @int32strs = split /,/, $affinity_str; @int32strs_mask=split /,/, $ENV{'smp_affinity_mask'}; for($i=0;$i <= $#int32strs_mask; $i++) { $int32strs_inv[$i] = sprintf("%08x",hex($int32strs_mask[$i])^0xffffffff); if($i == 0) { $len = int((($ENV{'ncpus'}%32)+3)/4); $int32strs_inv[$i] = substr($int32strs_inv[$i], -$len, $len); } } $inv = join(",", @int32strs_inv); $nint32s = int(($ENV{'ncpus'}+31)/32); for($j = $nint32s - 1; $j >= 0; $j--) { if(hex($int32strs[$nint32s - 1 - $j]) & hex($int32strs_mask[$nint32s - 1 - $j])) { $hit = 1; }} if($hit == 1) { $cmd = "echo $inv > $dir/smp_affinity 2>/dev/null"; system $cmd;}}'; then
echo "error: modifying /proc/irq/*/smp_affinity" >&2
error_exit "mcos_sys_mounted"
fi
banirq=`cat /proc/interrupts| perl -e 'while(<>) { if(/^\s*(\d+).*IHK\-SMP\s*$/) {print $1;}}'`
sed "s/%mask%/$smp_affinity_mask/g" $ETCDIR/irqbalance_mck.in | sed "s/%banirq%/$banirq/g" > $ETCDIR/irqbalance_mck
if ! systemctl link $ETCDIR/irqbalance_mck.service >/dev/null 2>/dev/null; then
echo "error: linking irqbalance_mck" >&2
error_exit "mcos_sys_mounted"
fi
if ! systemctl start irqbalance_mck.service 2>/dev/null ; then
echo "error: starting irqbalance_mck" >&2
error_exit "mcos_sys_mounted"
fi
# echo cpus=$cpus mask=$smp_affinity_mask banirq=$banirq
fi


@ -10,6 +10,7 @@
prefix="@prefix@"
BINDIR="@BINDIR@"
SBINDIR="@SBINDIR@"
ETCDIR=@ETCDIR@
KMODDIR="@KMODDIR@"
KERNDIR="@KERNDIR@"
@ -17,31 +18,98 @@ mem=""
cpus=""
# No SMP module? Exit.
if [ "`lsmod | grep ihk_smp_x86`" == "" ]; then exit; fi
if [ "`lsmod | grep ihk_smp_x86`" == "" ]; then exit 0; fi
# Destroy all LWK instances
if ls /dev/mcos* 1>/dev/null 2>&1; then
for i in /dev/mcos*; do
ind=`echo $i|cut -c10-`;
if ! ${SBINDIR}/ihkconfig 0 destroy $ind; then
echo "error: destroying LWK instance $ind failed" >&2
exit 1
fi
done
fi
# Query IHK-SMP resources and release them
if ! ${SBINDIR}/ihkconfig 0 query cpu > /dev/null; then
echo "error: querying cpus" >&2
exit 1
fi
cpus=`${SBINDIR}/ihkconfig 0 query cpu`
if [ "${cpus}" != "" ]; then
if ! ${SBINDIR}/ihkconfig 0 release cpu $cpus > /dev/null; then
echo "error: releasing CPUs" >&2
exit 1
fi
fi
if ! ${SBINDIR}/ihkconfig 0 query mem > /dev/null; then
echo "error: querying memory" >&2
exit 1
fi
mem=`${SBINDIR}/ihkconfig 0 query mem`
if [ "${mem}" != "" ]; then
if ! ${SBINDIR}/ihkconfig 0 release mem $mem > /dev/null; then
echo "error: releasing memory" >&2
exit 1
fi
fi
# Remove delegator if loaded
if [ "`lsmod | grep mcctrl`" != "" ]; then
if ! rmmod mcctrl; then echo "error: removing mcctrl"; exit; fi
if ! rmmod mcctrl; then
echo "error: removing mcctrl" >&2
exit 1
fi
fi
# Destroy all LWK instances
for i in /dev/mcos*; do
ind=`echo $i|cut -c10-`;
if ! ${SBINDIR}/ihkconfig 0 destroy $ind; then echo "error: destroying LWK instance $ind failed"; exit; fi
done
# Query IHK-SMP resources and release them
if ! ${SBINDIR}/ihkconfig 0 query cpu > /dev/null; then echo "error: querying cpus"; exit; fi
cpus=`${SBINDIR}/ihkconfig 0 query cpu`
if ! ${SBINDIR}/ihkconfig 0 release cpu $cpus > /dev/null; then echo "error: releasing CPUs"; exit; fi
if ! ${SBINDIR}/ihkconfig 0 query mem > /dev/null; then echo "error: querying memory"; exit; fi
mem=`${SBINDIR}/ihkconfig 0 query mem`
if ! ${SBINDIR}/ihkconfig 0 release mem $mem > /dev/null; then echo "error: releasing memory"; exit; fi
# Remove mcoverlay if loaded
if [ "`lsmod | grep mcoverlay`" != "" ]; then
if [ "`cat /proc/mounts | grep /tmp/mcos/mcos0_sys`" != "" ]; then umount -l /tmp/mcos/mcos0_sys; fi
if [ "`cat /proc/mounts | grep /tmp/mcos/mcos0_proc`" != "" ]; then umount -l /tmp/mcos/mcos0_proc; fi
if [ "`cat /proc/mounts | grep /tmp/mcos/linux_proc`" != "" ]; then umount -l /tmp/mcos/linux_proc; fi
if [ "`cat /proc/mounts | grep /tmp/mcos`" != "" ]; then umount -l /tmp/mcos; fi
if [ -e /tmp/mcos ]; then rm -rf /tmp/mcos; fi
if ! rmmod mcoverlay; then
echo "warning: failed to remove mcoverlay" >&2
fi
fi
# Remove SMP module
if [ "`lsmod | grep ihk_smp_x86`" != "" ]; then
if ! rmmod ihk_smp_x86; then echo "error: removing ihk_smp_x86"; exit; fi
if ! rmmod ihk_smp_x86; then
echo "error: removing ihk_smp_x86" >&2
exit 1
fi
fi
# Remove core module
if [ "`lsmod | grep -E 'ihk\s' | awk '{print $1}'`" != "" ]; then
if ! rmmod ihk; then
echo "error: removing ihk" >&2
exit 1
fi
fi
# Stop mcklogd
pkill mcklogd
# Start irqbalance with the original settings
if [ "`systemctl status irqbalance_mck.service 2> /dev/null |grep -E 'Active: active'`" != "" ]; then
if ! systemctl stop irqbalance_mck.service 2>/dev/null; then
echo "warning: failed to stop irqbalance_mck" >&2
fi
if ! systemctl disable irqbalance_mck.service >/dev/null 2>/dev/null; then
echo "warning: failed to disable irqbalance_mck" >&2
fi
if ! etcdir=@ETCDIR@ perl -e '$etcdir=$ENV{'etcdir'}; @files = grep { -f } glob "$etcdir/proc/irq/*/smp_affinity"; foreach $file (@files) { $dest = substr($file, length($etcdir)); if(0) {print "cp $file $dest\n";} system("cp $file $dest 2>/dev/null"); }'; then
echo "warning: failed to restore /proc/irq/*/smp_affinity" >&2
fi
if ! systemctl start irqbalance.service; then
echo "warning: failed to start irqbalance" >&2;
fi
fi

configure

@ -632,6 +632,7 @@ ENABLE_MCOVERLAYFS
MANDIR
KERNDIR
KMODDIR
ETCDIR
SBINDIR
BINDIR
TARGET
@ -2921,6 +2922,7 @@ ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $
ac_compiler_gnu=$ac_cv_c_compiler_gnu
XCC=$CC
CFLAGS="$CFLAGS -ffreestanding -fno-tree-loop-distribute-patterns"
;;
builtin-mic)
ARCH=k1om
@ -3031,6 +3033,9 @@ case $WITH_TARGET in
if test "X$SBINDIR" = X; then
SBINDIR="$prefix/sbin"
fi
if test "X$ETCDIR" = X; then
ETCDIR="$prefix/etc"
fi
if test "X$KMODDIR" = X; then
KMODDIR="$prefix/kmod"
fi
@ -3113,6 +3118,31 @@ _ACEOF
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking System.map for symbol sys_umount" >&5
$as_echo_n "checking System.map for symbol sys_umount... " >&6; }
mcctrl_addr=`eval $MCCTRL_LINUX_SYMTAB_CMD | grep " sys_umount\$" | cut -d\ -f1`
if test -z $mcctrl_addr; then
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: not found" >&5
$as_echo "not found" >&6; }
else
mcctrl_result=$mcctrl_addr
mcctrl_addr="0x$mcctrl_addr"
if `eval $MCCTRL_LINUX_SYMTAB_CMD | grep " __ksymtab_sys_umount\$" >/dev/null`; then
mcctrl_result="exported"
mcctrl_addr="0"
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $mcctrl_result" >&5
$as_echo "$mcctrl_result" >&6; }
cat >>confdefs.h <<_ACEOF
#define MCCTRL_KSYM_sys_umount $mcctrl_addr
_ACEOF
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking System.map for symbol sys_unshare" >&5
$as_echo_n "checking System.map for symbol sys_unshare... " >&6; }
mcctrl_addr=`eval $MCCTRL_LINUX_SYMTAB_CMD | grep " sys_unshare\$" | cut -d\ -f1`
@ -3881,12 +3911,14 @@ fi
ac_config_headers="$ac_config_headers executer/config.h"
ac_config_files="$ac_config_files Makefile executer/user/Makefile executer/kernel/mcctrl/Makefile executer/kernel/mcctrl/arch/x86_64/Makefile executer/kernel/mcoverlayfs/Makefile kernel/Makefile kernel/Makefile.build arch/x86/tools/mcreboot-attached-mic.sh arch/x86/tools/mcshutdown-attached-mic.sh arch/x86/tools/mcreboot-builtin-x86.sh arch/x86/tools/mcreboot-smp-x86.sh arch/x86/tools/mcstop+release-smp-x86.sh arch/x86/tools/mcshutdown-builtin-x86.sh arch/x86/tools/mcreboot.1:arch/x86/tools/mcreboot.1in"
ac_config_files="$ac_config_files Makefile executer/user/Makefile executer/kernel/mcctrl/Makefile executer/kernel/mcctrl/arch/x86_64/Makefile executer/kernel/mcoverlayfs/Makefile executer/kernel/mcoverlayfs/linux-3.10.0-327.36.1.el7/Makefile executer/kernel/mcoverlayfs/linux-4.0.9/Makefile executer/kernel/mcoverlayfs/linux-4.6.7/Makefile kernel/Makefile kernel/Makefile.build arch/x86/tools/mcreboot-attached-mic.sh arch/x86/tools/mcshutdown-attached-mic.sh arch/x86/tools/mcreboot-builtin-x86.sh arch/x86/tools/mcreboot-smp-x86.sh arch/x86/tools/mcstop+release-smp-x86.sh arch/x86/tools/mcshutdown-builtin-x86.sh arch/x86/tools/mcreboot.1:arch/x86/tools/mcreboot.1in arch/x86/tools/irqbalance_mck.service arch/x86/tools/irqbalance_mck.in"
if test "x$enable_dcfa" = xyes; then :
@ -4590,6 +4622,9 @@ do
"executer/kernel/mcctrl/Makefile") CONFIG_FILES="$CONFIG_FILES executer/kernel/mcctrl/Makefile" ;;
"executer/kernel/mcctrl/arch/x86_64/Makefile") CONFIG_FILES="$CONFIG_FILES executer/kernel/mcctrl/arch/x86_64/Makefile" ;;
"executer/kernel/mcoverlayfs/Makefile") CONFIG_FILES="$CONFIG_FILES executer/kernel/mcoverlayfs/Makefile" ;;
"executer/kernel/mcoverlayfs/linux-3.10.0-327.36.1.el7/Makefile") CONFIG_FILES="$CONFIG_FILES executer/kernel/mcoverlayfs/linux-3.10.0-327.36.1.el7/Makefile" ;;
"executer/kernel/mcoverlayfs/linux-4.0.9/Makefile") CONFIG_FILES="$CONFIG_FILES executer/kernel/mcoverlayfs/linux-4.0.9/Makefile" ;;
"executer/kernel/mcoverlayfs/linux-4.6.7/Makefile") CONFIG_FILES="$CONFIG_FILES executer/kernel/mcoverlayfs/linux-4.6.7/Makefile" ;;
"kernel/Makefile") CONFIG_FILES="$CONFIG_FILES kernel/Makefile" ;;
"kernel/Makefile.build") CONFIG_FILES="$CONFIG_FILES kernel/Makefile.build" ;;
"arch/x86/tools/mcreboot-attached-mic.sh") CONFIG_FILES="$CONFIG_FILES arch/x86/tools/mcreboot-attached-mic.sh" ;;
@ -4599,6 +4634,8 @@ do
"arch/x86/tools/mcstop+release-smp-x86.sh") CONFIG_FILES="$CONFIG_FILES arch/x86/tools/mcstop+release-smp-x86.sh" ;;
"arch/x86/tools/mcshutdown-builtin-x86.sh") CONFIG_FILES="$CONFIG_FILES arch/x86/tools/mcshutdown-builtin-x86.sh" ;;
"arch/x86/tools/mcreboot.1") CONFIG_FILES="$CONFIG_FILES arch/x86/tools/mcreboot.1:arch/x86/tools/mcreboot.1in" ;;
"arch/x86/tools/irqbalance_mck.service") CONFIG_FILES="$CONFIG_FILES arch/x86/tools/irqbalance_mck.service" ;;
"arch/x86/tools/irqbalance_mck.in") CONFIG_FILES="$CONFIG_FILES arch/x86/tools/irqbalance_mck.in" ;;
"kernel/Makefile.dcfa") CONFIG_FILES="$CONFIG_FILES kernel/Makefile.dcfa" ;;
*) as_fn_error $? "invalid argument: \`$ac_config_target'" "$LINENO" 5;;


@ -70,6 +70,7 @@ case $WITH_TARGET in
ARCH=`uname -m`
AC_PROG_CC
XCC=$CC
CFLAGS="$CFLAGS -ffreestanding -fno-tree-loop-distribute-patterns"
;;
builtin-mic)
ARCH=k1om
@ -146,6 +147,9 @@ case $WITH_TARGET in
if test "X$SBINDIR" = X; then
SBINDIR="$prefix/sbin"
fi
if test "X$ETCDIR" = X; then
ETCDIR="$prefix/etc"
fi
if test "X$KMODDIR" = X; then
KMODDIR="$prefix/kmod"
fi
@ -218,6 +222,7 @@ AC_DEFUN([MCCTRL_FIND_KSYM],[
])
MCCTRL_FIND_KSYM([sys_mount])
MCCTRL_FIND_KSYM([sys_umount])
MCCTRL_FIND_KSYM([sys_unshare])
MCCTRL_FIND_KSYM([zap_page_range])
MCCTRL_FIND_KSYM([vdso_image_64])
@ -278,9 +283,11 @@ AC_SUBST(KDIR)
AC_SUBST(TARGET)
AC_SUBST(BINDIR)
AC_SUBST(SBINDIR)
AC_SUBST(ETCDIR)
AC_SUBST(KMODDIR)
AC_SUBST(KERNDIR)
AC_SUBST(MANDIR)
AC_SUBST(CFLAGS)
AC_SUBST(ENABLE_MCOVERLAYFS)
AC_SUBST(IHK_VERSION)
@ -298,6 +305,9 @@ AC_CONFIG_FILES([
executer/kernel/mcctrl/Makefile
executer/kernel/mcctrl/arch/x86_64/Makefile
executer/kernel/mcoverlayfs/Makefile
executer/kernel/mcoverlayfs/linux-3.10.0-327.36.1.el7/Makefile
executer/kernel/mcoverlayfs/linux-4.0.9/Makefile
executer/kernel/mcoverlayfs/linux-4.6.7/Makefile
kernel/Makefile
kernel/Makefile.build
arch/x86/tools/mcreboot-attached-mic.sh
@ -307,6 +317,8 @@ AC_CONFIG_FILES([
arch/x86/tools/mcstop+release-smp-x86.sh
arch/x86/tools/mcshutdown-builtin-x86.sh
arch/x86/tools/mcreboot.1:arch/x86/tools/mcreboot.1in
arch/x86/tools/irqbalance_mck.service
arch/x86/tools/irqbalance_mck.in
])
AS_IF([test "x$enable_dcfa" = xyes], [


@ -51,6 +51,9 @@
/* Define to address of kernel symbol sys_readlink, or 0 if exported */
#undef MCCTRL_KSYM_sys_readlink
/* Define to address of kernel symbol sys_umount, or 0 if exported */
#undef MCCTRL_KSYM_sys_umount
/* Define to address of kernel symbol sys_unshare, or 0 if exported */
#undef MCCTRL_KSYM_sys_unshare


@ -41,6 +41,7 @@
#define MCEXEC_UP_NEW_PROCESS 0x30a02909
#define MCEXEC_UP_GET_CRED 0x30a0290a
#define MCEXEC_UP_GET_CREDV 0x30a0290b
#define MCEXEC_UP_GET_NODES 0x30a0290c
#define MCEXEC_UP_PREPARE_DMA 0x30a02910
#define MCEXEC_UP_FREE_DMA 0x30a02911
@ -49,7 +50,8 @@
#define MCEXEC_UP_CLOSE_EXEC 0x30a02913
#define MCEXEC_UP_SYS_MOUNT 0x30a02914
#define MCEXEC_UP_SYS_UNSHARE 0x30a02915
#define MCEXEC_UP_SYS_UMOUNT 0x30a02915
#define MCEXEC_UP_SYS_UNSHARE 0x30a02916
#define MCEXEC_UP_DEBUG_LOG 0x40000000
@ -110,6 +112,13 @@ struct program_load_desc {
};
struct syscall_request {
/* TID of requesting thread */
int rtid;
/*
* TID of the target thread. A remote page fault response needs to designate
* the thread that must serve the request; 0 indicates any thread from the pool
*/
int ttid;
unsigned long valid;
unsigned long number;
unsigned long args[6];
@ -128,8 +137,17 @@ struct syscall_load_desc {
unsigned long size;
};
#define IHK_SCD_REQ_THREAD_SPINNING 0
#define IHK_SCD_REQ_THREAD_TO_BE_WOKEN 1
#define IHK_SCD_REQ_THREAD_DESCHEDULED 2
struct syscall_response {
/* TID of the thread that requested the service */
int ttid;
/* TID of the mcexec thread that is serving or has served the request */
int stid;
unsigned long status;
unsigned long req_thread_status;
long ret;
unsigned long fault_address;
unsigned long fault_reason;
@ -180,6 +198,10 @@ struct sys_mount_desc {
void *data;
};
struct sys_umount_desc {
char *dir_name;
};
struct sys_unshare_desc {
unsigned long unshare_flags;
};


@ -1,5 +1,5 @@
#include <linux/version.h>
#include "../../../../config.h"
#include "../../config.h"
#include "../../mcctrl.h"
#ifdef MCCTRL_KSYM_vdso_image_64
@ -100,8 +100,6 @@ void get_vdso_info(ihk_os_t os, long vdso_rpa)
vdso_pa = ihk_device_map_memory(dev, vdso_rpa, sizeof(*vdso));
vdso = ihk_device_map_virtual(dev, vdso_pa, sizeof(*vdso), NULL, 0);
memset(vdso, 0, sizeof(*vdso));
/* VDSO pages */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,16,0)
size = vdso_image->size;


@ -75,7 +75,7 @@ static int load_elf(struct linux_binprm *bprm
char buf[32];
int l;
int pass;
char pbuf[1024];
char *pbuf;
const char *path;
if(bprm->envc == 0)
@ -88,6 +88,11 @@ static int load_elf(struct linux_binprm *bprm
if(elf_ex->e_ident[EI_CLASS] != ELFCLASS64)
return -ENOEXEC;
pbuf = kmalloc(1024, GFP_ATOMIC);
if (!pbuf) {
printk("%s: error: allocating pbuf\n", __FUNCTION__);
return -ENOMEM;
}
path = d_path(&bprm->file->f_path, pbuf, 1024);
if(!path || IS_ERR(path))
path = bprm->interp;
@ -96,8 +101,10 @@ static int load_elf(struct linux_binprm *bprm
if(!cp ||
!strcmp(cp, "/mcexec") ||
!strcmp(cp, "/ihkosctl") ||
!strcmp(cp, "/ihkconfig"))
!strcmp(cp, "/ihkconfig")) {
kfree(pbuf);
return -ENOEXEC;
}
cnt[0] = bprm->argc;
cnt[1] = bprm->envc;
@ -124,8 +131,10 @@ static int load_elf(struct linux_binprm *bprm
bprm->p, 1, 0, 1,
&page, NULL);
#endif
if(rc <= 0)
if(rc <= 0) {
kfree(pbuf);
return -EFAULT;
}
addr = kmap_atomic(page
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,4,0)
, KM_USER0
@ -199,21 +208,27 @@ static int load_elf(struct linux_binprm *bprm
for(ep = env; ep->name; ep++)
if(ep->val)
kfree(ep->val);
if(rc)
if(rc) {
kfree(pbuf);
return -ENOEXEC;
}
file = open_exec(MCEXEC_PATH);
if (IS_ERR(file))
if (IS_ERR(file)) {
kfree(pbuf);
return -ENOEXEC;
}
rc = remove_arg_zero(bprm);
if (rc){
fput(file);
kfree(pbuf);
return rc;
}
rc = copy_strings_kernel(1, &bprm->interp, bprm);
if (rc < 0){
fput(file);
kfree(pbuf);
return rc;
}
bprm->argc++;
@ -221,12 +236,14 @@ static int load_elf(struct linux_binprm *bprm
rc = copy_strings_kernel(1, &wp, bprm);
if (rc){
fput(file);
kfree(pbuf);
return rc;
}
bprm->argc++;
rc = bprm_change_interp(MCEXEC_PATH, bprm);
if (rc < 0){
fput(file);
kfree(pbuf);
return rc;
}
@ -236,8 +253,12 @@ static int load_elf(struct linux_binprm *bprm
rc = prepare_binprm(bprm);
if (rc < 0){
kfree(pbuf);
return rc;
}
kfree(pbuf);
return search_binary_handler(bprm
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,8,0)
, regs
@ -255,7 +276,7 @@ void __init binfmt_mcexec_init(void)
insert_binfmt(&mcexec_format);
}
void __exit binfmt_mcexec_exit(void)
void binfmt_mcexec_exit(void)
{
unregister_binfmt(&mcexec_format);
}


@ -32,6 +32,8 @@
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/version.h>
#include <linux/semaphore.h>
#include <linux/interrupt.h>
#include <asm/uaccess.h>
#include <asm/delay.h>
#include <asm/io.h>
@ -64,7 +66,18 @@ int (*mcctrl_sys_mount)(char *dev_name,char *dir_name, char *type, unsigned long
(int_star_fn_char_char_char_ulong_void_t)
MCCTRL_KSYM_sys_mount;
#else // exported
int (*mcctrl_sys_mount)(char *dev_name,char *dir_name, char *type, unsigned long flags, void *data) = NULL;
int (*mcctrl_sys_mount)(char *dev_name,char *dir_name, char *type, unsigned long flags, void *data) = sys_mount;
#endif
#endif
#ifdef MCCTRL_KSYM_sys_umount
#if MCCTRL_KSYM_sys_umount
typedef int (*int_fn_char_star_int_t)(char *, int);
int (*mcctrl_sys_umount)(char *dir_name, int flags) =
(int_fn_char_star_int_t)
MCCTRL_KSYM_sys_umount;
#else // exported
int (*mcctrl_sys_umount)(char *dir_name, int flags) = sys_umount;
#endif
#endif
@ -75,33 +88,49 @@ int mcctrl_ikc_set_recv_cpu(ihk_os_t os, int cpu);
static long mcexec_prepare_image(ihk_os_t os,
struct program_load_desc * __user udesc)
{
struct program_load_desc desc, *pdesc;
struct program_load_desc *desc, *pdesc;
struct ikc_scd_packet isp;
void *args, *envs;
long ret = 0;
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
unsigned long flags;
struct mcctrl_per_proc_data *ppd = NULL;
int num_sections;
if (copy_from_user(&desc, udesc,
desc = kmalloc(sizeof(*desc), GFP_KERNEL);
if (!desc) {
printk("%s: error: allocating program_load_desc\n",
__FUNCTION__);
return -ENOMEM;
}
if (copy_from_user(desc, udesc,
sizeof(struct program_load_desc))) {
printk("%s: error: copying program_load_desc\n",
__FUNCTION__);
kfree(desc);
return -EFAULT;
}
if (desc.num_sections <= 0 || desc.num_sections > 16) {
printk("# of sections: %d\n", desc.num_sections);
num_sections = desc->num_sections;
if (num_sections <= 0 || num_sections > 16) {
printk("# of sections: %d\n", num_sections);
kfree(desc);
return -EINVAL;
}
pdesc = kmalloc(sizeof(struct program_load_desc) +
sizeof(struct program_image_section)
* desc.num_sections, GFP_KERNEL);
memcpy(pdesc, &desc, sizeof(struct program_load_desc));
* num_sections, GFP_KERNEL);
memcpy(pdesc, desc, sizeof(struct program_load_desc));
if (copy_from_user(pdesc->sections, udesc->sections,
sizeof(struct program_image_section)
* desc.num_sections)) {
* num_sections)) {
kfree(desc);
kfree(pdesc);
return -EFAULT;
}
kfree(desc);
pdesc->pid = task_tgid_vnr(current);
if (reserve_user_space(usrdata, &pdesc->user_start, &pdesc->user_end)) {
@ -123,52 +152,48 @@ static long mcexec_prepare_image(ihk_os_t os,
}
pdesc->args = (void*)virt_to_phys(args);
printk("args: 0x%lX\n", (unsigned long)pdesc->args);
printk("argc: %ld\n", *(long *)args);
dprintk("args: 0x%lX\n", (unsigned long)pdesc->args);
dprintk("argc: %ld\n", *(long *)args);
pdesc->envs = (void*)virt_to_phys(envs);
printk("envs: 0x%lX\n", (unsigned long)pdesc->envs);
printk("envc: %ld\n", *(long *)envs);
dprintk("envs: 0x%lX\n", (unsigned long)pdesc->envs);
dprintk("envc: %ld\n", *(long *)envs);
isp.msg = SCD_MSG_PREPARE_PROCESS;
isp.ref = pdesc->cpu;
isp.arg = virt_to_phys(pdesc);
printk("# of sections: %d\n", pdesc->num_sections);
printk("%p (%lx)\n", pdesc, isp.arg);
dprintk("# of sections: %d\n", pdesc->num_sections);
dprintk("%p (%lx)\n", pdesc, isp.arg);
pdesc->status = 0;
mcctrl_ikc_send(os, pdesc->cpu, &isp);
wait_event_interruptible(usrdata->wq_prepare, pdesc->status);
while (wait_event_interruptible(usrdata->wq_prepare, pdesc->status) != 0);
if(pdesc->err < 0){
ret = pdesc->err;
goto free_out;
}
ppd = kmalloc(sizeof(*ppd), GFP_ATOMIC);
ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(current));
if (!ppd) {
printk("ERROR: allocating per process data\n");
ret = -ENOMEM;
printk("ERROR: no per process data for PID %d\n", task_tgid_vnr(current));
ret = -EINVAL;
goto free_out;
}
ppd->pid = pdesc->pid;
/* Update rpgtable */
ppd->rpgtable = pdesc->rpgtable;
flags = ihk_ikc_spinlock_lock(&usrdata->per_proc_list_lock);
list_add_tail(&ppd->list, &usrdata->per_proc_list);
ihk_ikc_spinlock_unlock(&usrdata->per_proc_list_lock, flags);
dprintk("pid %d, rpgtable: 0x%lx added\n",
ppd->pid, ppd->rpgtable);
if (copy_to_user(udesc, pdesc, sizeof(struct program_load_desc) +
sizeof(struct program_image_section) * desc.num_sections)) {
sizeof(struct program_image_section) * num_sections)) {
ret = -EFAULT;
goto free_out;
}
dprintk("%s: pid %d, rpgtable: 0x%lx added\n",
__FUNCTION__, ppd->pid, ppd->rpgtable);
ret = 0;
free_out:
@ -318,33 +343,42 @@ static long mcexec_start_image(ihk_os_t os,
struct program_load_desc * __user udesc,
struct file *file)
{
struct program_load_desc desc;
struct program_load_desc *desc;
struct ikc_scd_packet isp;
struct mcctrl_channel *c;
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
struct handlerinfo *info;
if (copy_from_user(&desc, udesc,
desc = kmalloc(sizeof(*desc), GFP_KERNEL);
if (!desc) {
printk("%s: error: allocating program_load_desc\n",
__FUNCTION__);
return -ENOMEM;
}
if (copy_from_user(desc, udesc,
sizeof(struct program_load_desc))) {
kfree(desc);
return -EFAULT;
}
info = kmalloc(sizeof(struct handlerinfo), GFP_KERNEL);
info->pid = desc.pid;
info->pid = desc->pid;
ihk_os_register_release_handler(file, release_handler, info);
c = usrdata->channels + desc.cpu;
c = usrdata->channels + desc->cpu;
mcctrl_ikc_set_recv_cpu(os, desc.cpu);
mcctrl_ikc_set_recv_cpu(os, desc->cpu);
usrdata->last_thread_exec = desc.cpu;
usrdata->last_thread_exec = desc->cpu;
isp.msg = SCD_MSG_SCHEDULE_PROCESS;
isp.ref = desc.cpu;
isp.arg = desc.rprocess;
isp.ref = desc->cpu;
isp.arg = desc->rprocess;
mcctrl_ikc_send(os, desc.cpu, &isp);
mcctrl_ikc_send(os, desc->cpu, &isp);
kfree(desc);
return 0;
}
@ -416,19 +450,211 @@ static long mcexec_get_cpu(ihk_os_t os)
return info->n_cpus;
}
int mcexec_syscall(struct mcctrl_channel *c, int pid, unsigned long arg)
static long mcexec_get_nodes(ihk_os_t os)
{
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
if (!usrdata || !usrdata->mem_info)
return -EINVAL;
return usrdata->mem_info->n_numa_nodes;
}
int mcctrl_add_per_proc_data(struct mcctrl_usrdata *ud, int pid,
struct mcctrl_per_proc_data *ppd)
{
struct mcctrl_per_proc_data *ppd_iter;
int hash = (pid & MCCTRL_PER_PROC_DATA_HASH_MASK);
int ret = 0;
unsigned long flags;
/* Check if data for this process exists and add if not */
write_lock_irqsave(&ud->per_proc_data_hash_lock[hash], flags);
list_for_each_entry(ppd_iter, &ud->per_proc_data_hash[hash], hash) {
if (ppd_iter->pid == pid) {
ret = -EBUSY;
goto out;
}
}
list_add_tail(&ppd->hash, &ud->per_proc_data_hash[hash]);
out:
write_unlock_irqrestore(&ud->per_proc_data_hash_lock[hash], flags);
return ret;
}
int mcctrl_delete_per_proc_data(struct mcctrl_usrdata *ud, int pid)
{
struct mcctrl_per_proc_data *ppd_iter, *ppd = NULL;
int hash = (pid & MCCTRL_PER_PROC_DATA_HASH_MASK);
int ret = 0;
unsigned long flags;
write_lock_irqsave(&ud->per_proc_data_hash_lock[hash], flags);
list_for_each_entry(ppd_iter, &ud->per_proc_data_hash[hash], hash) {
if (ppd_iter->pid == pid) {
ppd = ppd_iter;
break;
}
}
if (!ppd) {
ret = -EINVAL;
goto out;
}
list_del(&ppd->hash);
out:
write_unlock_irqrestore(&ud->per_proc_data_hash_lock[hash], flags);
return ret;
}
inline struct mcctrl_per_proc_data *mcctrl_get_per_proc_data(
struct mcctrl_usrdata *ud, int pid)
{
struct mcctrl_per_proc_data *ppd_iter, *ppd = NULL;
int hash = (pid & MCCTRL_PER_PROC_DATA_HASH_MASK);
unsigned long flags;
/* Check if data for this process exists and return it */
read_lock_irqsave(&ud->per_proc_data_hash_lock[hash], flags);
list_for_each_entry(ppd_iter, &ud->per_proc_data_hash[hash], hash) {
if (ppd_iter->pid == pid) {
ppd = ppd_iter;
break;
}
}
read_unlock_irqrestore(&ud->per_proc_data_hash_lock[hash], flags);
return ppd;
}
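/*
 * Note on the structures above: per-process data lives in a hash table keyed
 * by pid & MCCTRL_PER_PROC_DATA_HASH_MASK, with one rwlock per bucket, so
 * concurrent lookups only take the read lock while add/delete serialize on
 * the write lock.
 */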
/*
* Called indirectly from the IKC message handler.
*/
int mcexec_syscall(struct mcctrl_usrdata *ud, struct ikc_scd_packet *packet)
{
struct wait_queue_head_list_node *wqhln = NULL;
struct wait_queue_head_list_node *wqhln_iter;
struct wait_queue_head_list_node *wqhln_alloc = NULL;
int pid = packet->pid;
unsigned long flags;
struct mcctrl_per_proc_data *ppd;
/* Look up per-process structure */
ppd = mcctrl_get_per_proc_data(ud, pid);
if (unlikely(!ppd)) {
kprintf("%s: ERROR: no per-process structure for PID %d, "
"syscall nr: %lu\n",
__FUNCTION__, pid, packet->req.number);
return -1;
}
dprintk("%s: (packet_handler) rtid: %d, ttid: %d, sys nr: %d\n",
__FUNCTION__,
packet->req.rtid,
packet->req.ttid,
packet->req.number);
/*
* Three scenarios are possible:
* - Find the designated thread if req->ttid is specified.
* - Find any available thread if req->ttid is zero.
* - Add a request element if no threads are available.
*/
flags = ihk_ikc_spinlock_lock(&ppd->wq_list_lock);
/* Is this a request for a specific thread? See if it's waiting */
if (unlikely(packet->req.ttid)) {
list_for_each_entry(wqhln_iter, &ppd->wq_list_exact, list) {
if (packet->req.ttid != task_pid_vnr(wqhln_iter->task))
continue;
/* Look up per-process wait queue head with pid */
flags = ihk_ikc_spinlock_lock(&c->wq_list_lock);
list_for_each_entry(wqhln_iter, &c->wq_list, list) {
if (wqhln_iter->pid == pid) {
wqhln = wqhln_iter;
break;
}
if (!wqhln) {
printk("%s: WARNING: no target thread found for exact request??\n",
__FUNCTION__);
}
}
/* Is there any thread available? */
else {
list_for_each_entry(wqhln_iter, &ppd->wq_list, list) {
if (wqhln_iter->task && !wqhln_iter->req) {
wqhln = wqhln_iter;
break;
}
}
}
/* If no match found, add request to pending request list */
if (unlikely(!wqhln)) {
retry_alloc:
wqhln_alloc = kmalloc(sizeof(*wqhln), GFP_ATOMIC);
if (!wqhln_alloc) {
printk("WARNING: coudln't alloc wait queue head, retrying..\n");
goto retry_alloc;
}
wqhln = wqhln_alloc;
wqhln->req = 0;
wqhln->task = NULL;
init_waitqueue_head(&wqhln->wq_syscall);
list_add_tail(&wqhln->list, &ppd->wq_req_list);
}
wqhln->packet = packet;
wqhln->req = 1;
wake_up(&wqhln->wq_syscall);
ihk_ikc_spinlock_unlock(&ppd->wq_list_lock, flags);
return 0;
}
/*
* Called from an mcexec thread via ioctl().
*/
int mcexec_wait_syscall(ihk_os_t os, struct syscall_wait_desc *__user req)
{
struct ikc_scd_packet *packet;
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
struct wait_queue_head_list_node *wqhln = NULL;
struct wait_queue_head_list_node *wqhln_iter;
int ret = 0;
unsigned long irqflags;
struct mcctrl_per_proc_data *ppd;
/* Look up per-process structure */
ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(current));
if (unlikely(!ppd)) {
kprintf("%s: ERROR: no per-process structure for PID %d??\n",
__FUNCTION__, task_tgid_vnr(current));
return -EINVAL;
}
packet = (struct ikc_scd_packet *)mcctrl_get_per_thread_data(ppd, current);
if (packet) {
printk("%s: ERROR: packet %p is already registered for thread %d\n",
__FUNCTION__, packet, task_pid_vnr(current));
return -EBUSY;
}
retry:
/* Prepare per-thread wait queue head or find a valid request */
irqflags = ihk_ikc_spinlock_lock(&ppd->wq_list_lock);
/* First see if there is a valid request already that is not yet taken */
list_for_each_entry(wqhln_iter, &ppd->wq_req_list, list) {
if (wqhln_iter->task == NULL && wqhln_iter->req) {
wqhln = wqhln_iter;
wqhln->task = current;
list_del(&wqhln->list);
break;
}
}
if (!wqhln) {
@ -439,180 +665,86 @@ retry_alloc:
goto retry_alloc;
}
wqhln->pid = pid;
wqhln->task = current;
wqhln->req = 0;
init_waitqueue_head(&wqhln->wq_syscall);
list_add_tail(&wqhln->list, &c->wq_list);
/* Wait for a request.. */
list_add(&wqhln->list, &ppd->wq_list);
ihk_ikc_spinlock_unlock(&ppd->wq_list_lock, irqflags);
ret = wait_event_interruptible(wqhln->wq_syscall, wqhln->req);
/* Remove per-thread wait queue head */
irqflags = ihk_ikc_spinlock_lock(&ppd->wq_list_lock);
list_del(&wqhln->list);
}
ihk_ikc_spinlock_unlock(&ppd->wq_list_lock, irqflags);
wqhln->req = 1;
wake_up(&wqhln->wq_syscall);
ihk_ikc_spinlock_unlock(&c->wq_list_lock, flags);
return 0;
}
#ifndef DO_USER_MODE
// static int remaining_job, base_cpu, job_pos;
#endif
// extern int num_channels;
// extern int mcctrl_dma_abort;
int mcexec_wait_syscall(ihk_os_t os, struct syscall_wait_desc *__user req)
{
struct syscall_wait_desc swd;
struct mcctrl_channel *c;
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
struct wait_queue_head_list_node *wqhln;
struct wait_queue_head_list_node *wqhln_iter;
int ret = 0;
unsigned long irqflags;
#ifndef DO_USER_MODE
unsigned long s, w, d;
#endif
//printk("mcexec_wait_syscall swd=%p req=%p size=%d\n", &swd, req, sizeof(swd.cpu));
if (copy_from_user(&swd, req, sizeof(swd))) {
return -EFAULT;
}
if (swd.cpu >= usrdata->num_channels)
return -EINVAL;
c = get_peer_channel(usrdata, current);
if (c) {
printk("mcexec_wait_syscall:already registered. task %p ch %p\n",
current, c);
return -EBUSY;
}
c = usrdata->channels + swd.cpu;
#ifdef DO_USER_MODE
retry:
/* Prepare per-process wait queue head */
retry_alloc:
wqhln = kmalloc(sizeof(*wqhln), GFP_KERNEL);
if (!wqhln) {
printk("WARNING: coudln't alloc wait queue head, retrying..\n");
goto retry_alloc;
}
wqhln->pid = swd.pid;
wqhln->req = 0;
init_waitqueue_head(&wqhln->wq_syscall);
irqflags = ihk_ikc_spinlock_lock(&c->wq_list_lock);
/* First see if there is one wait queue already */
list_for_each_entry(wqhln_iter, &c->wq_list, list) {
if (wqhln_iter->pid == task_tgid_vnr(current)) {
kfree(wqhln);
wqhln = wqhln_iter;
list_del(&wqhln->list);
break;
}
}
list_add_tail(&wqhln->list, &c->wq_list);
ihk_ikc_spinlock_unlock(&c->wq_list_lock, irqflags);
ret = wait_event_interruptible(wqhln->wq_syscall, wqhln->req);
/* Remove per-process wait queue head */
irqflags = ihk_ikc_spinlock_lock(&c->wq_list_lock);
list_del(&wqhln->list);
ihk_ikc_spinlock_unlock(&c->wq_list_lock, irqflags);
if (ret && !wqhln->req) {
kfree(wqhln);
wqhln = NULL;
return -EINTR;
}
packet = wqhln->packet;
kfree(wqhln);
wqhln = NULL;
if (c->param.request_va->number == 61 &&
c->param.request_va->args[0] == swd.pid) {
dprintk("%s: tid: %d request from CPU %d\n",
__FUNCTION__, task_pid_vnr(current), packet->ref);
dprintk("pid: %d, tid: %d: SC %d, swd.cpu: %d, WARNING: wait4() for self?\n",
task_tgid_vnr(current),
task_pid_vnr(current),
c->param.request_va->number,
swd.cpu);
return -EINTR;
}
#if 1
mb();
if (!c->param.request_va->valid) {
printk("mcexec_wait_syscall:stray wakeup\n");
if (!packet->req.valid) {
printk("%s: ERROR: stray wakeup pid: %d, tid: %d: SC %lu\n",
__FUNCTION__,
task_tgid_vnr(current),
task_pid_vnr(current),
packet->req.number);
ihk_ikc_release_packet((struct ihk_ikc_free_packet *)packet,
(usrdata->channels + packet->ref)->c);
goto retry;
}
#endif
#else
while (1) {
c = usrdata->channels + swd.cpu;
ihk_get_tsc(s);
if (!usrdata->remaining_job) {
while (!(*c->param.doorbell_va)) {
mb();
cpu_relax();
ihk_get_tsc(w);
if (w > s + 1024UL * 1024 * 1024 * 10) {
return -EINTR;
}
}
d = (*c->param.doorbell_va) - 1;
*c->param.doorbell_va = 0;
if (d < 0 || d >= usrdata->num_channels) {
d = 0;
}
usrdata->base_cpu = d;
usrdata->job_pos = 0;
usrdata->remaining_job = 1;
} else {
usrdata->job_pos++;
}
for (; usrdata->job_pos < usrdata->num_channels; usrdata->job_pos++) {
if (base_cpu + job_pos >= num_channels) {
c = usrdata->channels +
(usrdata->base_cpu + usrdata->job_pos - usrdata->num_channels);
} else {
c = usrdata->channels + usrdata->base_cpu + usrdata->job_pos;
}
if (!c) {
continue;
}
if (c->param.request_va &&
c->param.request_va->valid) {
#endif
c->param.request_va->valid = 0; /* ack */
dprintk("SC #%lx, %lx\n",
c->param.request_va->number,
c->param.request_va->args[0]);
register_peer_channel(usrdata, current, c);
if (__do_in_kernel_syscall(os, c, c->param.request_va)) {
if (copy_to_user(&req->sr, c->param.request_va,
sizeof(struct syscall_request))) {
deregister_peer_channel(usrdata, current, c);
return -EFAULT;
}
return 0;
}
deregister_peer_channel(usrdata, current, c);
#ifdef DO_USER_MODE
goto retry;
#endif
#ifndef DO_USER_MODE
if (usrdata->mcctrl_dma_abort) {
return -2;
}
}
}
usrdata->remaining_job = 0;
packet->req.valid = 0; /* ack */
dprintk("%s: system call: %d, args[0]: %lu, args[1]: %lu, args[2]: %lu, "
"args[3]: %lu, args[4]: %lu, args[5]: %lu\n",
__FUNCTION__,
packet->req.number,
packet->req.args[0],
packet->req.args[1],
packet->req.args[2],
packet->req.args[3],
packet->req.args[4],
packet->req.args[5]);
if (mcctrl_add_per_thread_data(ppd, current, packet) < 0) {
kprintf("%s: error adding per-thread data\n", __FUNCTION__);
return -EINVAL;
}
#endif
return 0;
if (__do_in_kernel_syscall(os, packet)) {
if (copy_to_user(&req->sr, &packet->req,
sizeof(struct syscall_request))) {
if (mcctrl_delete_per_thread_data(ppd, current) < 0) {
kprintf("%s: error deleting per-thread data\n", __FUNCTION__);
return -EINVAL;
}
return -EFAULT;
}
return 0;
}
ihk_ikc_release_packet((struct ihk_ikc_free_packet *)packet,
(usrdata->channels + packet->ref)->c);
if (mcctrl_delete_per_thread_data(ppd, current) < 0) {
kprintf("%s: error deleting per-thread data\n", __FUNCTION__);
return -EINVAL;
}
goto retry;
}
long mcexec_pin_region(ihk_os_t os, unsigned long *__user arg)
@ -695,33 +827,6 @@ long mcexec_load_syscall(ihk_os_t os, struct syscall_load_desc *__user arg)
#endif
ihk_device_unmap_memory(ihk_os_to_dev(os), phys, desc.size);
/*
ihk_dma_channel_t channel;
struct ihk_dma_request request;
unsigned long dma_status = 0;
channel = ihk_device_get_dma_channel(ihk_os_to_dev(os), 0);
if (!channel) {
return -EINVAL;
}
memset(&request, 0, sizeof(request));
request.src_os = os;
request.src_phys = desc.src;
request.dest_os = NULL;
request.dest_phys = desc.dest;
request.size = desc.size;
request.notify = (void *)virt_to_phys(&dma_status);
request.priv = (void *)1;
ihk_dma_request(channel, &request);
while (!dma_status) {
mb();
udelay(1);
}
*/
return 0;
}
@ -729,80 +834,66 @@ long mcexec_load_syscall(ihk_os_t os, struct syscall_load_desc *__user arg)
long mcexec_ret_syscall(ihk_os_t os, struct syscall_ret_desc *__user arg)
{
struct syscall_ret_desc ret;
struct mcctrl_channel *mc;
struct ikc_scd_packet *packet;
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
#if 0
ihk_dma_channel_t channel;
struct ihk_dma_request request;
channel = ihk_device_get_dma_channel(ihk_os_to_dev(os), 0);
if (!channel) {
return -EINVAL;
}
#endif
struct mcctrl_per_proc_data *ppd;
if (copy_from_user(&ret, arg, sizeof(struct syscall_ret_desc))) {
return -EFAULT;
}
mc = usrdata->channels + ret.cpu;
if (!mc) {
/* Look up per-process structure */
ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(current));
if (!ppd) {
kprintf("%s: ERROR: no per-process structure for PID %d??\n",
__FUNCTION__, task_tgid_vnr(current));
return -EINVAL;
}
deregister_peer_channel(usrdata, current, mc);
mc->param.response_va->ret = ret.ret;
packet = (struct ikc_scd_packet *)mcctrl_get_per_thread_data(ppd, current);
if (!packet) {
kprintf("%s: ERROR: no packet registered for TID %d\n",
__FUNCTION__, task_pid_vnr(current));
return -EINVAL;
}
mcctrl_delete_per_thread_data(ppd, current);
if (ret.size > 0) {
/* Host => Accel. Write is fast. */
unsigned long phys;
void *rpm;
phys = ihk_device_map_memory(ihk_os_to_dev(os), ret.dest,
ret.size);
phys = ihk_device_map_memory(ihk_os_to_dev(os), ret.dest, ret.size);
#ifdef CONFIG_MIC
rpm = ioremap_wc(phys, ret.size);
#else
rpm = ihk_device_map_virtual(ihk_os_to_dev(os), phys,
ret.size, NULL, 0);
#endif
if (copy_from_user(rpm, (void *__user)ret.src, ret.size)) {
return -EFAULT;
}
mb();
mc->param.response_va->status = 1;
#ifdef CONFIG_MIC
iounmap(rpm);
#else
ihk_device_unmap_virtual(ihk_os_to_dev(os), rpm, ret.size);
#endif
ihk_device_unmap_memory(ihk_os_to_dev(os), phys, ret.size);
}
/*
memset(&request, 0, sizeof(request));
request.src_os = NULL;
request.src_phys = ret.src;
request.dest_os = os;
request.dest_phys = ret.dest;
request.size = ret.size;
request.notify_os = os;
request.notify = (void *)mc->param.response_rpa;
request.priv = (void *)1;
ihk_dma_request(channel, &request);
*/
} else {
mb();
mc->param.response_va->status = 1;
}
__return_syscall(os, packet, ret.ret, task_pid_vnr(current));
/* Free packet */
ihk_ikc_release_packet((struct ihk_ikc_free_packet *)packet,
(usrdata->channels + packet->ref)->c);
return 0;
}
LIST_HEAD(mckernel_exec_files);
DEFINE_SPINLOCK(mckernel_exec_file_lock);
DEFINE_SEMAPHORE(mckernel_exec_file_lock);
struct mckernel_exec_file {
@ -861,14 +952,53 @@ int mcexec_open_exec(ihk_os_t os, char * __user filename)
int retval;
int os_ind = ihk_host_os_get_index(os);
char *pathbuf, *fullpath;
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
struct mcctrl_per_proc_data *ppd = NULL;
int i;
if (os_ind < 0) {
return EINVAL;
}
ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(current));
if (!ppd) {
ppd = kmalloc(sizeof(*ppd), GFP_KERNEL);
if (!ppd) {
printk("ERROR: allocating per process data\n");
return -ENOMEM;
}
ppd->pid = task_tgid_vnr(current);
/*
* XXX: rpgtable will be updated in __do_in_kernel_syscall()
* under case __NR_munmap
*/
INIT_LIST_HEAD(&ppd->wq_list);
INIT_LIST_HEAD(&ppd->wq_req_list);
INIT_LIST_HEAD(&ppd->wq_list_exact);
spin_lock_init(&ppd->wq_list_lock);
for (i = 0; i < MCCTRL_PER_THREAD_DATA_HASH_SIZE; ++i) {
INIT_LIST_HEAD(&ppd->per_thread_data_hash[i]);
rwlock_init(&ppd->per_thread_data_hash_lock[i]);
}
if (mcctrl_add_per_proc_data(usrdata, ppd->pid, ppd) < 0) {
printk("%s: error adding per process data\n", __FUNCTION__);
retval = EINVAL;
goto out_free_ppd;
}
}
else {
/* Only deallocate in case of an error if we added it above */
ppd = NULL;
}
pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY);
if (!pathbuf) {
return ENOMEM;
retval = ENOMEM;
goto out_error_drop_ppd;
}
file = open_exec(filename);
@ -889,7 +1019,7 @@ int mcexec_open_exec(ihk_os_t os, char * __user filename)
goto out_put_file;
}
spin_lock_irq(&mckernel_exec_file_lock);
down(&mckernel_exec_file_lock);
/* Find previous file (if exists) and drop it */
list_for_each_entry(mcef_iter, &mckernel_exec_files, list) {
if (mcef_iter->os == os && mcef_iter->pid == task_tgid_vnr(current)) {
@ -900,7 +1030,7 @@ int mcexec_open_exec(ihk_os_t os, char * __user filename)
break;
}
}
/* Add new exec file to the list */
mcef->os = os;
mcef->pid = task_tgid_vnr(current);
@ -910,19 +1040,22 @@ int mcexec_open_exec(ihk_os_t os, char * __user filename)
/* Create /proc/self/exe entry */
add_pid_entry(os_ind, task_tgid_vnr(current));
proc_exe_link(os_ind, task_tgid_vnr(current), fullpath);
-spin_unlock(&mckernel_exec_file_lock);
+up(&mckernel_exec_file_lock);
dprintk("%d open_exec and holding file: %s\n", (int)task_tgid_vnr(current), filename);
kfree(pathbuf);
return 0;
out_put_file:
fput(file);
out_error_free:
kfree(pathbuf);
out_error_drop_ppd:
if (ppd) mcctrl_delete_per_proc_data(usrdata, ppd->pid);
out_free_ppd:
if (ppd) kfree(ppd);
return -retval;
}
@ -932,12 +1065,29 @@ int mcexec_close_exec(ihk_os_t os)
struct mckernel_exec_file *mcef = NULL;
int found = 0;
int os_ind = ihk_host_os_get_index(os);
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
struct mcctrl_per_proc_data *ppd = NULL;
ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(current));
if (ppd) {
mcctrl_delete_per_proc_data(usrdata, ppd->pid);
dprintk("pid: %d, tid: %d: rpgtable for %d (0x%lx) removed\n",
task_tgid_vnr(current), current->pid, ppd->pid, ppd->rpgtable);
kfree(ppd);
}
else {
printk("WARNING: no per process data for pid %d ?\n",
task_tgid_vnr(current));
}
if (os_ind < 0) {
return EINVAL;
}
-spin_lock_irq(&mckernel_exec_file_lock);
+down(&mckernel_exec_file_lock);
list_for_each_entry(mcef, &mckernel_exec_files, list) {
if (mcef->os == os && mcef->pid == task_tgid_vnr(current)) {
allow_write_access(mcef->fp);
@ -950,7 +1100,7 @@ int mcexec_close_exec(ihk_os_t os)
}
}
-spin_unlock(&mckernel_exec_file_lock);
+up(&mckernel_exec_file_lock);
return (found ? 0 : EINVAL);
}
@ -1025,7 +1175,7 @@ long mcexec_sys_mount(struct sys_mount_desc *__user arg)
cap_raise(promoted->cap_effective, CAP_SYS_ADMIN);
original = override_creds(promoted);
-#if MCCTRL_KSYM_sys_mount
+#ifdef MCCTRL_KSYM_sys_mount
ret = mcctrl_sys_mount(desc.dev_name, desc.dir_name, desc.type,
desc.flags, desc.data);
#else
@ -1038,6 +1188,36 @@ long mcexec_sys_mount(struct sys_mount_desc *__user arg)
return ret;
}
long mcexec_sys_umount(struct sys_mount_desc *__user arg)
{
struct sys_umount_desc desc;
struct cred *promoted;
const struct cred *original;
int ret;
if (copy_from_user(&desc, arg, sizeof(desc))) {
return -EFAULT;
}
promoted = prepare_creds();
if (!promoted) {
return -ENOMEM;
}
cap_raise(promoted->cap_effective, CAP_SYS_ADMIN);
original = override_creds(promoted);
#ifdef MCCTRL_KSYM_sys_umount
ret = mcctrl_sys_umount(desc.dir_name, MNT_FORCE);
#else
ret = -EFAULT;
#endif
revert_creds(original);
put_cred(promoted);
return ret;
}
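/*
 * Illustrative sketch (assumed, not part of this change set): user space
 * reaches mcexec_sys_umount() through the MCEXEC_UP_SYS_UMOUNT ioctl
 * dispatched by __mcctrl_control() below, roughly:
 *
 *	struct sys_umount_desc desc = { .dir_name = "/tmp/mcos0_sys" };
 *	ioctl(mcos_fd, MCEXEC_UP_SYS_UMOUNT, &desc);
 *
 * The mount point and the mcos_fd descriptor name are hypothetical.
 */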
long mcexec_sys_unshare(struct sys_unshare_desc *__user arg)
{
struct sys_unshare_desc desc;
@ -1096,6 +1276,9 @@ long __mcctrl_control(ihk_os_t os, unsigned int req, unsigned long arg,
case MCEXEC_UP_GET_CPU:
return mcexec_get_cpu(os);
case MCEXEC_UP_GET_NODES:
return mcexec_get_nodes(os);
case MCEXEC_UP_STRNCPY_FROM_USER:
return mcexec_strncpy_from_user(os,
(struct strncpy_from_user_desc *)arg);
@ -1125,6 +1308,9 @@ long __mcctrl_control(ihk_os_t os, unsigned int req, unsigned long arg,
case MCEXEC_UP_SYS_MOUNT:
return mcexec_sys_mount((struct sys_mount_desc *)arg);
case MCEXEC_UP_SYS_UMOUNT:
return mcexec_sys_umount((struct sys_mount_desc *)arg);
case MCEXEC_UP_SYS_UNSHARE:
return mcexec_sys_unshare((struct sys_unshare_desc *)arg);

View File

@ -60,6 +60,7 @@ static struct ihk_os_user_call_handler mcctrl_uchs[] = {
{ .request = MCEXEC_UP_LOAD_SYSCALL, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_SEND_SIGNAL, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_GET_CPU, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_GET_NODES, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_STRNCPY_FROM_USER, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_NEW_PROCESS, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_PREPARE_DMA, .func = mcctrl_ioctl },
@ -69,6 +70,7 @@ static struct ihk_os_user_call_handler mcctrl_uchs[] = {
{ .request = MCEXEC_UP_GET_CRED, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_GET_CREDV, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_SYS_MOUNT, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_SYS_UMOUNT, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_SYS_UNSHARE, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_DEBUG_LOG, .func = mcctrl_ioctl },
};
@ -82,79 +84,118 @@ static struct ihk_os_user_call mcctrl_uc[OS_MAX_MINOR];
static ihk_os_t os[OS_MAX_MINOR];
-ihk_os_t
-osnum_to_os(int n)
+ihk_os_t osnum_to_os(int n)
{
return os[n];
}
-static int __init mcctrl_init(void)
+/* OS event notifier implementation */
+int mcctrl_os_boot_notifier(int os_index)
{
-int i;
int rc;
-rc = -ENOENT;
-for(i = 0; i < OS_MAX_MINOR; i++){
-os[i] = ihk_host_find_os(i, NULL);
-if (os[i]) {
-printk("OS #%d found.\n", i);
-rc = 0;
-}
-}
-if(rc){
-printk("OS not found.\n");
-return rc;
+os[os_index] = ihk_host_find_os(os_index, NULL);
+if (!os[os_index]) {
+printk("mcctrl: error: OS ID %d couldn't be found\n", os_index);
+return -EINVAL;
}
-for(i = 0; i < OS_MAX_MINOR; i++){
-if (os[i]) {
-if (prepare_ikc_channels(os[i]) != 0) {
-printk("Preparing syscall channels failed.\n");
-os[i] = NULL;
-}
-}
+if (prepare_ikc_channels(os[os_index]) != 0) {
+printk("mcctrl: error: preparing IKC channels for OS %d\n", os_index);
+os[os_index] = NULL;
+return -EFAULT;
}
memcpy(mcctrl_uc + os_index, &mcctrl_uc_proto, sizeof mcctrl_uc_proto);
rc = ihk_os_register_user_call_handlers(os[os_index], mcctrl_uc + os_index);
if (rc < 0) {
printk("mcctrl: error: registering callbacks for OS %d\n", os_index);
goto error_cleanup_channels;
}
procfs_init(os_index);
printk("mcctrl: OS ID %d boot event handled\n", os_index);
return 0;
error_cleanup_channels:
destroy_ikc_channels(os[os_index]);
os[os_index] = NULL;
return rc;
}
int mcctrl_os_shutdown_notifier(int os_index)
{
if (os[os_index]) {
sysfsm_cleanup(os[os_index]);
free_topology_info(os[os_index]);
ihk_os_unregister_user_call_handlers(os[os_index], mcctrl_uc + os_index);
destroy_ikc_channels(os[os_index]);
procfs_exit(os_index);
}
os[os_index] = NULL;
printk("mcctrl: OS ID %d shutdown event handled\n", os_index);
return 0;
}
static struct ihk_os_notifier_ops mcctrl_os_notifier_ops = {
.boot = mcctrl_os_boot_notifier,
.shutdown = mcctrl_os_shutdown_notifier,
};
static struct ihk_os_notifier mcctrl_os_notifier = {
.ops = &mcctrl_os_notifier_ops,
};
static int __init mcctrl_init(void)
{
int ret = 0;
int i;
#ifndef DO_USER_MODE
mcctrl_syscall_init();
#endif
-rus_page_hash_init();
-for(i = 0; i < OS_MAX_MINOR; i++){
-if (os[i]) {
-memcpy(mcctrl_uc + i, &mcctrl_uc_proto, sizeof mcctrl_uc_proto);
-rc = ihk_os_register_user_call_handlers(os[i], mcctrl_uc + i);
-if(rc < 0){
-destroy_ikc_channels(os[i]);
-os[i] = NULL;
-}
-procfs_init(i);
-}
+for (i = 0; i < OS_MAX_MINOR; ++i) {
+os[i] = NULL;
}
+rus_page_hash_init();
binfmt_mcexec_init();
-return 0;
if ((ret = ihk_host_register_os_notifier(&mcctrl_os_notifier)) != 0) {
printk("mcctrl: error: registering OS notifier\n");
goto error;
}
printk("mcctrl: initialized successfully.\n");
return ret;
error:
binfmt_mcexec_exit();
rus_page_hash_put_pages();
return ret;
}
static void __exit mcctrl_exit(void)
{
-int i;
-binfmt_mcexec_exit();
-printk("mcctrl: unregistered.\n");
-for(i = 0; i < OS_MAX_MINOR; i++){
-if(os[i]){
-sysfsm_cleanup(os[i]);
-free_topology_info(os[i]);
-ihk_os_unregister_user_call_handlers(os[i], mcctrl_uc + i);
-destroy_ikc_channels(os[i]);
-procfs_exit(i);
-}
+if (ihk_host_deregister_os_notifier(&mcctrl_os_notifier) != 0) {
+printk("mcctrl: warning: failed to deregister OS notifier??\n");
+}
+binfmt_mcexec_exit();
+rus_page_hash_put_pages();
+printk("mcctrl: unregistered.\n");
}
MODULE_LICENSE("GPL v2");

View File

@ -27,6 +27,7 @@
#include <linux/miscdevice.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/interrupt.h>
#include "mcctrl.h"
#ifdef ATTACHED_MIC
#include <sysdeps/mic/mic/micconst.h>
@ -34,22 +35,34 @@
#define REQUEST_SHIFT 16
//#define DEBUG_IKC
#ifdef DEBUG_IKC
#define dkprintf(...) kprintf(__VA_ARGS__)
#define ekprintf(...) kprintf(__VA_ARGS__)
#else
#define dkprintf(...) do { if (0) printk(__VA_ARGS__); } while (0)
#define ekprintf(...) printk(__VA_ARGS__)
#endif
//int num_channels;
//struct mcctrl_channel *channels;
void mcexec_prepare_ack(ihk_os_t os, unsigned long arg, int err);
static void mcctrl_ikc_init(ihk_os_t os, int cpu, unsigned long rphys, struct ihk_ikc_channel_desc *c);
-int mcexec_syscall(struct mcctrl_channel *c, int pid, unsigned long arg);
+int mcexec_syscall(struct mcctrl_usrdata *ud, struct ikc_scd_packet *packet);
void sig_done(unsigned long arg, int err);
/* XXX: this runs in atomic context! */
static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
void *__packet, void *__os)
{
struct ikc_scd_packet *pisp = __packet;
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(__os);
int msg = pisp->msg;
-switch (pisp->msg) {
+switch (msg) {
case SCD_MSG_INIT_CHANNEL:
mcctrl_ikc_init(__os, pisp->ref, pisp->arg, c);
break;
@ -63,7 +76,7 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
break;
case SCD_MSG_SYSCALL_ONESIDE:
-mcexec_syscall(usrdata->channels + pisp->ref, pisp->pid, pisp->arg);
+mcexec_syscall(usrdata, pisp);
break;
case SCD_MSG_PROCFS_ANSWER:
@ -88,21 +101,14 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
break;
case SCD_MSG_PROCFS_TID_CREATE:
-add_tid_entry(ihk_host_os_get_index(__os), pisp->pid, pisp->arg);
-break;
case SCD_MSG_PROCFS_TID_DELETE:
-delete_tid_entry(ihk_host_os_get_index(__os), pisp->pid, pisp->arg);
+procfsm_packet_handler(__os, pisp->msg, pisp->pid, pisp->arg);
break;
case SCD_MSG_GET_VDSO_INFO:
get_vdso_info(__os, pisp->arg);
break;
-case SCD_MSG_REPLY_GET_CPU_MAPPING:
-reply_get_cpu_mapping(pisp->arg);
-break;
default:
printk(KERN_ERR "mcctrl:syscall_packet_handler:"
"unknown message (%d.%d.%d.%d.%d.%#lx)\n",
@ -110,6 +116,14 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
pisp->err, pisp->arg);
break;
}
/*
* SCD_MSG_SYSCALL_ONESIDE holds on to the packet and frees it in
* mcexec_ret_syscall(); for the rest, free it here.
*/
if (msg != SCD_MSG_SYSCALL_ONESIDE) {
ihk_ikc_release_packet((struct ihk_ikc_free_packet *)__packet, c);
}
return 0;
}
@ -146,8 +160,6 @@ int mcctrl_ikc_set_recv_cpu(ihk_os_t os, int cpu)
ihk_ikc_channel_set_cpu(usrdata->channels[cpu].c,
ihk_ikc_get_processor_id());
kprintf("Setting the target to %d\n",
ihk_ikc_get_processor_id());
return 0;
}
@ -162,91 +174,26 @@ int mcctrl_ikc_is_valid_thread(ihk_os_t os, int cpu)
}
}
//unsigned long *mcctrl_doorbell_va;
//unsigned long mcctrl_doorbell_pa;
static void mcctrl_ikc_init(ihk_os_t os, int cpu, unsigned long rphys, struct ihk_ikc_channel_desc *c)
{
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
struct ikc_scd_packet packet;
struct mcctrl_channel *pmc = usrdata->channels + cpu;
unsigned long phys;
struct ikc_scd_init_param *rpm;
-if(c->port == 502)
+if (c->port == 502) {
pmc = usrdata->channels + usrdata->num_channels - 1;
if (!pmc) {
return;
}
printk("IKC init: cpu=%d port=%d\n", cpu, c->port);
phys = ihk_device_map_memory(ihk_os_to_dev(os), rphys,
sizeof(struct ikc_scd_init_param));
#ifdef CONFIG_MIC
rpm = ioremap_wc(phys, sizeof(struct ikc_scd_init_param));
#else
rpm = ihk_device_map_virtual(ihk_os_to_dev(os), phys,
sizeof(struct ikc_scd_init_param),
NULL, 0);
#endif
pmc->param.request_va =
(void *)__get_free_pages(GFP_KERNEL,
REQUEST_SHIFT - PAGE_SHIFT);
pmc->param.request_pa = virt_to_phys(pmc->param.request_va);
pmc->param.doorbell_va = usrdata->mcctrl_doorbell_va;
pmc->param.doorbell_pa = usrdata->mcctrl_doorbell_pa;
pmc->param.post_va = (void *)__get_free_page(GFP_KERNEL);
pmc->param.post_pa = virt_to_phys(pmc->param.post_va);
memset(pmc->param.doorbell_va, 0, PAGE_SIZE);
memset(pmc->param.request_va, 0, PAGE_SIZE);
memset(pmc->param.post_va, 0, PAGE_SIZE);
pmc->param.response_rpa = rpm->response_page;
pmc->param.response_pa
= ihk_device_map_memory(ihk_os_to_dev(os),
pmc->param.response_rpa,
PAGE_SIZE);
#ifdef CONFIG_MIC
pmc->param.response_va = ioremap_cache(pmc->param.response_pa,
PAGE_SIZE);
#else
pmc->param.response_va = ihk_device_map_virtual(ihk_os_to_dev(os),
pmc->param.response_pa,
PAGE_SIZE, NULL, 0);
#endif
pmc->dma_buf = (void *)__get_free_pages(GFP_KERNEL,
DMA_PIN_SHIFT - PAGE_SHIFT);
rpm->request_page = pmc->param.request_pa;
rpm->doorbell_page = pmc->param.doorbell_pa;
rpm->post_page = pmc->param.post_pa;
if (!pmc) {
kprintf("%s: error: no channel found?\n", __FUNCTION__);
return;
}
packet.msg = SCD_MSG_INIT_CHANNEL_ACKED;
packet.ref = cpu;
packet.arg = rphys;
printk("Request: %lx, Response: %lx, Doorbell: %lx\n",
pmc->param.request_pa, pmc->param.response_rpa,
pmc->param.doorbell_pa);
printk("Request: %p, Response: %p, Doorbell: %p\n",
pmc->param.request_va, pmc->param.response_va,
pmc->param.doorbell_va);
ihk_ikc_send(pmc->c, &packet, 0);
#ifdef CONFIG_MIC
iounmap(rpm);
#else
ihk_device_unmap_virtual(ihk_os_to_dev(os), rpm,
sizeof(struct ikc_scd_init_param));
#endif
ihk_device_unmap_memory(ihk_os_to_dev(os), phys,
sizeof(struct ikc_scd_init_param));
}
static int connect_handler(struct ihk_ikc_channel_info *param)
@ -265,11 +212,8 @@ static int connect_handler(struct ihk_ikc_channel_info *param)
}
param->packet_handler = syscall_packet_handler;
-INIT_LIST_HEAD(&usrdata->channels[cpu].wq_list);
-spin_lock_init(&usrdata->channels[cpu].wq_list_lock);
usrdata->channels[cpu].c = c;
-kprintf("syscall: MC CPU %d connected. c=%p\n", cpu, c);
+dkprintf("syscall: MC CPU %d connected. c=%p\n", cpu, c);
return 0;
}
@ -286,11 +230,8 @@ static int connect_handler2(struct ihk_ikc_channel_info *param)
param->packet_handler = syscall_packet_handler;
-INIT_LIST_HEAD(&usrdata->channels[cpu].wq_list);
-spin_lock_init(&usrdata->channels[cpu].wq_list_lock);
usrdata->channels[cpu].c = c;
-kprintf("syscall: MC CPU %d connected. c=%p\n", cpu, c);
+dkprintf("syscall: MC CPU %d connected. c=%p\n", cpu, c);
return 0;
}
@ -313,27 +254,29 @@ static struct ihk_ikc_listen_param listen_param2 = {
int prepare_ikc_channels(ihk_os_t os)
{
-struct ihk_cpu_info *info;
-struct mcctrl_usrdata *usrdata;
int error;
+struct mcctrl_usrdata *usrdata;
+int i;
usrdata = kzalloc(sizeof(struct mcctrl_usrdata), GFP_KERNEL);
-usrdata->mcctrl_doorbell_va = (void *)__get_free_page(GFP_KERNEL);
-usrdata->mcctrl_doorbell_pa = virt_to_phys(usrdata->mcctrl_doorbell_va);
-info = ihk_os_get_cpu_info(os);
-if (!info) {
-printk("Error: cannot retrieve CPU info.\n");
+usrdata->cpu_info = ihk_os_get_cpu_info(os);
+usrdata->mem_info = ihk_os_get_memory_info(os);
+if (!usrdata->cpu_info || !usrdata->mem_info) {
+printk("Error: cannot obtain OS CPU and memory information.\n");
return -EINVAL;
}
-if (info->n_cpus < 1) {
+if (usrdata->cpu_info->n_cpus < 1) {
printk("Error: # of cpu is invalid.\n");
return -EINVAL;
}
-usrdata->num_channels = info->n_cpus + 1;
-usrdata->channels = kzalloc(sizeof(struct mcctrl_channel) * usrdata->num_channels,
-GFP_KERNEL);
+usrdata->num_channels = usrdata->cpu_info->n_cpus + 1;
+usrdata->channels = kzalloc(sizeof(struct mcctrl_channel) *
+usrdata->num_channels,
+GFP_KERNEL);
if (!usrdata->channels) {
printk("Error: cannot allocate channels.\n");
return -ENOMEM;
@ -347,36 +290,20 @@ int prepare_ikc_channels(ihk_os_t os)
memcpy(&usrdata->listen_param2, &listen_param2, sizeof listen_param2);
ihk_ikc_listen_port(os, &usrdata->listen_param2);
-INIT_LIST_HEAD(&usrdata->per_proc_list);
-spin_lock_init(&usrdata->per_proc_list_lock);
+for (i = 0; i < MCCTRL_PER_PROC_DATA_HASH_SIZE; ++i) {
+INIT_LIST_HEAD(&usrdata->per_proc_data_hash[i]);
+rwlock_init(&usrdata->per_proc_data_hash_lock[i]);
+}
INIT_LIST_HEAD(&usrdata->cpu_topology_list);
INIT_LIST_HEAD(&usrdata->node_topology_list);
error = init_peer_channel_registry(usrdata);
if (error) {
return error;
}
return 0;
}
void __destroy_ikc_channel(ihk_os_t os, struct mcctrl_channel *pmc)
{
-free_pages((unsigned long)pmc->param.request_va,
-REQUEST_SHIFT - PAGE_SHIFT);
-free_page((unsigned long)pmc->param.post_va);
-#ifdef CONFIG_MIC
-iounmap(pmc->param.response_va);
-#else
-ihk_device_unmap_virtual(ihk_os_to_dev(os), pmc->param.response_va,
-PAGE_SIZE);
-#endif
-ihk_device_unmap_memory(ihk_os_to_dev(os),
-pmc->param.response_pa, PAGE_SIZE);
-free_pages((unsigned long)pmc->dma_buf,
-DMA_PIN_SHIFT - PAGE_SHIFT);
return;
}
void destroy_ikc_channels(ihk_os_t os)
@ -384,6 +311,11 @@ void destroy_ikc_channels(ihk_os_t os)
int i;
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
if (!usrdata) {
printk("%s: WARNING: no mcctrl_usrdata found\n", __FUNCTION__);
return;
}
ihk_host_os_set_usrdata(os, NULL);
for (i = 0; i < usrdata->num_channels; i++) {
@ -394,9 +326,7 @@ void destroy_ikc_channels(ihk_os_t os)
printk("Channel #%d freed.\n", i);
}
}
-free_page((unsigned long)usrdata->mcctrl_doorbell_va);
+destroy_peer_channel_registry(usrdata);
kfree(usrdata->channels);
kfree(usrdata);
}

View File

@ -41,6 +41,7 @@
#include <ikc/master.h>
#include <ihk/msr.h>
#include <linux/semaphore.h>
#include <linux/rwlock.h>
#include <linux/threads.h>
#include "sysfs.h"
@ -48,6 +49,7 @@
#define SCD_MSG_PREPARE_PROCESS_ACKED 0x2
#define SCD_MSG_PREPARE_PROCESS_NACKED 0x7
#define SCD_MSG_SCHEDULE_PROCESS 0x3
#define SCD_MSG_WAKE_UP_SYSCALL_THREAD 0x14
#define SCD_MSG_INIT_CHANNEL 0x5
#define SCD_MSG_INIT_CHANNEL_ACKED 0x6
@ -57,8 +59,8 @@
#define SCD_MSG_CLEANUP_PROCESS 0x9
#define SCD_MSG_GET_VDSO_INFO 0xa
-#define SCD_MSG_GET_CPU_MAPPING 0xc
-#define SCD_MSG_REPLY_GET_CPU_MAPPING 0xd
+//#define SCD_MSG_GET_CPU_MAPPING 0xc
+//#define SCD_MSG_REPLY_GET_CPU_MAPPING 0xd
#define SCD_MSG_PROCFS_CREATE 0x10
#define SCD_MSG_PROCFS_DELETE 0x11
@ -110,8 +112,9 @@ struct ikc_scd_packet {
int ref;
int osnum;
int pid;
-int padding;
unsigned long arg;
+struct syscall_request req;
+unsigned long resp_pa;
};
/* for SCD_MSG_SYSFS_* */
@ -120,7 +123,13 @@ struct ikc_scd_packet {
long sysfs_arg2;
long sysfs_arg3;
};
/* SCD_MSG_SCHEDULE_THREAD */
struct {
int ttid;
};
};
char padding[12];
};
struct mcctrl_priv {
@ -154,24 +163,41 @@ struct syscall_params {
struct wait_queue_head_list_node {
struct list_head list;
wait_queue_head_t wq_syscall;
-int pid;
+struct task_struct *task;
+/* Denotes an exclusive wait for requester TID rtid */
+int rtid;
int req;
+struct ikc_scd_packet *packet;
};
struct mcctrl_channel {
struct ihk_ikc_channel_desc *c;
struct syscall_params param;
struct ikc_scd_init_param init;
void *dma_buf;
-struct list_head wq_list;
-ihk_spinlock_t wq_list_lock;
};
struct mcctrl_per_thread_data {
struct list_head hash;
struct task_struct *task;
void *data;
};
#define MCCTRL_PER_THREAD_DATA_HASH_SHIFT 8
#define MCCTRL_PER_THREAD_DATA_HASH_SIZE (1 << MCCTRL_PER_THREAD_DATA_HASH_SHIFT)
#define MCCTRL_PER_THREAD_DATA_HASH_MASK (MCCTRL_PER_THREAD_DATA_HASH_SIZE - 1)
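/*
 * Illustrative sketch only (assumed, not part of this header): a lookup
 * presumably buckets per-thread entries by hashing the task_struct
 * pointer, along these lines:
 *
 *	int bucket = ((unsigned long)task >> 4) &
 *		MCCTRL_PER_THREAD_DATA_HASH_MASK;
 *	struct mcctrl_per_thread_data *ptd;
 *
 *	read_lock(&ppd->per_thread_data_hash_lock[bucket]);
 *	list_for_each_entry(ptd, &ppd->per_thread_data_hash[bucket], hash)
 *		if (ptd->task == task)
 *			break;
 *	read_unlock(&ppd->per_thread_data_hash_lock[bucket]);
 *
 * The real hash function in mcctrl may differ; this only shows how the
 * hash buckets and their per-bucket rwlocks fit together.
 */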
struct mcctrl_per_proc_data {
-struct list_head list;
+struct list_head hash;
int pid;
unsigned long rpgtable; /* per process, not per OS */
struct list_head wq_list;
struct list_head wq_req_list;
struct list_head wq_list_exact;
ihk_spinlock_t wq_list_lock;
struct list_head per_thread_data_hash[MCCTRL_PER_THREAD_DATA_HASH_SIZE];
rwlock_t per_thread_data_hash_lock[MCCTRL_PER_THREAD_DATA_HASH_SIZE];
};
struct sysfsm_req {
@ -199,11 +225,6 @@ static inline int sysfs_inited(struct sysfsm_data *sdp)
return !!(sdp->sysfs_buf);
} /* sysfs_inited() */
-struct cpu_mapping {
-int cpu_number;
-int hw_id;
-};
struct cache_topology {
struct ihk_cache_topology *saved;
cpumask_t shared_cpu_map;
@ -212,8 +233,9 @@ struct cache_topology {
};
struct cpu_topology {
-struct cpu_mapping *cpu_mapping;
//struct mcctrl_usrdata *udp;
struct ihk_cpu_topology *saved;
+int mckernel_cpu_id;
cpumask_t core_siblings;
cpumask_t thread_siblings;
@ -221,8 +243,12 @@ struct cpu_topology {
struct list_head cache_list;
};
#define NODE_DISTANCE_S_SIZE 1024
struct node_topology {
struct ihk_node_topology *saved;
int mckernel_numa_id;
char mckernel_numa_distance_s[NODE_DISTANCE_S_SIZE];
cpumask_t cpumap;
struct list_head chain;
@ -230,14 +256,16 @@ struct node_topology {
#define CPU_LONGS (((NR_CPUS) + (BITS_PER_LONG) - 1) / (BITS_PER_LONG))
#define MCCTRL_PER_PROC_DATA_HASH_SHIFT 7
#define MCCTRL_PER_PROC_DATA_HASH_SIZE (1 << MCCTRL_PER_PROC_DATA_HASH_SHIFT)
#define MCCTRL_PER_PROC_DATA_HASH_MASK (MCCTRL_PER_PROC_DATA_HASH_SIZE - 1)
struct mcctrl_usrdata {
struct ihk_ikc_listen_param listen_param;
struct ihk_ikc_listen_param listen_param2;
ihk_os_t os;
int num_channels;
-struct mcctrl_channel *channels;
-unsigned long *mcctrl_doorbell_va;
-unsigned long mcctrl_doorbell_pa;
+struct mcctrl_channel *channels;
int remaining_job;
int base_cpu;
int job_pos;
@ -245,15 +273,15 @@ struct mcctrl_usrdata {
unsigned long last_thread_exec;
wait_queue_head_t wq_prepare;
-struct list_head per_proc_list;
-ihk_spinlock_t per_proc_list_lock;
+struct list_head per_proc_data_hash[MCCTRL_PER_PROC_DATA_HASH_SIZE];
+rwlock_t per_proc_data_hash_lock[MCCTRL_PER_PROC_DATA_HASH_SIZE];
void **keys;
struct sysfsm_data sysfsm_data;
unsigned long cpu_online[CPU_LONGS];
-int cpu_mapping_elems;
-int padding;
-struct cpu_mapping *cpu_mapping;
-long cpu_mapping_pa;
+struct ihk_cpu_info *cpu_info;
+struct ihk_mem_info *mem_info;
+nodemask_t numa_online;
struct list_head cpu_topology_list;
struct list_head node_topology_list;
};
@ -273,14 +301,24 @@ int mcctrl_ikc_is_valid_thread(ihk_os_t os, int cpu);
ihk_os_t osnum_to_os(int n);
/* syscall.c */
int init_peer_channel_registry(struct mcctrl_usrdata *ud);
void destroy_peer_channel_registry(struct mcctrl_usrdata *ud);
int register_peer_channel(struct mcctrl_usrdata *ud, void *key, struct mcctrl_channel *ch);
int deregister_peer_channel(struct mcctrl_usrdata *ud, void *key, struct mcctrl_channel *ch);
struct mcctrl_channel *get_peer_channel(struct mcctrl_usrdata *ud, void *key);
-int __do_in_kernel_syscall(ihk_os_t os, struct mcctrl_channel *c, struct syscall_request *sc);
+int __do_in_kernel_syscall(ihk_os_t os, struct ikc_scd_packet *packet);
int mcctrl_add_per_proc_data(struct mcctrl_usrdata *ud, int pid,
struct mcctrl_per_proc_data *ppd);
int mcctrl_delete_per_proc_data(struct mcctrl_usrdata *ud, int pid);
inline struct mcctrl_per_proc_data *mcctrl_get_per_proc_data(
struct mcctrl_usrdata *ud, int pid);
-#define PROCFS_NAME_MAX 1000
int mcctrl_add_per_thread_data(struct mcctrl_per_proc_data* ppd,
struct task_struct *task, void *data);
int mcctrl_delete_per_thread_data(struct mcctrl_per_proc_data* ppd,
struct task_struct *task);
inline struct mcctrl_per_thread_data *mcctrl_get_per_thread_data(
struct mcctrl_per_proc_data *ppd, struct task_struct *task);
void __return_syscall(ihk_os_t os, struct ikc_scd_packet *packet,
long ret, int stid);
+#define PROCFS_NAME_MAX 768
struct procfs_read {
unsigned long pbuf; /* physical address of the host buffer (request) */
@ -301,6 +339,7 @@ struct procfs_file {
};
void procfs_answer(unsigned int arg, int err);
int procfsm_packet_handler(void *os, int msg, int pid, unsigned long arg);
void add_tid_entry(int osnum, int pid, int tid);
void add_pid_entry(int osnum, int pid);
void delete_tid_entry(int osnum, int pid, int tid);

View File

@ -17,8 +17,10 @@
#include <linux/uaccess.h>
#include <linux/fs.h>
#include <linux/resource.h>
#include <linux/interrupt.h>
#include "mcctrl.h"
#include <linux/version.h>
#include <linux/semaphore.h>
//#define PROCFS_DEBUG
@ -81,7 +83,7 @@ struct procfs_list_entry {
* file.
*/
LIST_HEAD(procfs_file_list);
-static ihk_spinlock_t procfs_file_list_lock;
+DEFINE_SEMAPHORE(procfs_file_list_lock);
static char *
getpath(struct procfs_list_entry *e, char *buf, int bufsize)
@ -375,67 +377,62 @@ _add_tid_entry(int osnum, int pid, int tid, const struct cred *cred)
void
add_tid_entry(int osnum, int pid, int tid)
{
-unsigned long irqflag;
const struct cred *cred = get_pid_cred(pid);
if(!cred)
return;
-irqflag = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
+down(&procfs_file_list_lock);
_add_tid_entry(osnum, pid, tid, cred);
-ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflag);
+up(&procfs_file_list_lock);
}
void
add_pid_entry(int osnum, int pid)
{
struct procfs_list_entry *parent;
-unsigned long irqflag;
const struct cred *cred = get_pid_cred(pid);
if(!cred)
return;
-irqflag = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
+down(&procfs_file_list_lock);
parent = get_pid_entry(osnum, pid);
add_procfs_entries(parent, pid_entry_stuff, cred->uid, cred->gid);
_add_tid_entry(osnum, pid, pid, cred);
-ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflag);
+up(&procfs_file_list_lock);
}
void
delete_tid_entry(int osnum, int pid, int tid)
{
-unsigned long irqflag;
struct procfs_list_entry *e;
-irqflag = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
+down(&procfs_file_list_lock);
e = find_tid_entry(osnum, pid, tid);
if(e)
delete_procfs_entries(e);
-ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflag);
+up(&procfs_file_list_lock);
}
void
delete_pid_entry(int osnum, int pid)
{
-unsigned long irqflag;
struct procfs_list_entry *e;
-irqflag = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
+down(&procfs_file_list_lock);
e = find_pid_entry(osnum, pid);
if(e)
delete_procfs_entries(e);
-ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflag);
+up(&procfs_file_list_lock);
}
void
proc_exe_link(int osnum, int pid, const char *path)
{
struct procfs_list_entry *parent;
-unsigned long irqflag;
kuid_t uid = KUIDT_INIT(0);
kgid_t gid = KGIDT_INIT(0);
-irqflag = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
+down(&procfs_file_list_lock);
parent = find_pid_entry(osnum, pid);
if(parent){
struct procfs_list_entry *task;
@ -451,7 +448,7 @@ proc_exe_link(int osnum, int pid, const char *path)
uid, gid, path);
}
}
-ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflag);
+up(&procfs_file_list_lock);
}
/**
@ -463,14 +460,13 @@ void
procfs_init(int osnum)
{
struct procfs_list_entry *parent;
-unsigned long irqflag;
kuid_t uid = KUIDT_INIT(0);
kgid_t gid = KGIDT_INIT(0);
-irqflag = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
+down(&procfs_file_list_lock);
parent = get_base_entry(osnum);
add_procfs_entries(parent, base_entry_stuff, uid, gid);
-ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflag);
+up(&procfs_file_list_lock);
}
/**
@ -481,14 +477,14 @@ procfs_init(int osnum)
void
procfs_exit(int osnum)
{
-unsigned long irqflag;
struct procfs_list_entry *e;
-irqflag = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
+down(&procfs_file_list_lock);
e = find_base_entry(osnum);
-if(e)
+if (e) {
delete_procfs_entries(e);
-ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflag);
+}
+up(&procfs_file_list_lock);
}
/**
@ -501,7 +497,7 @@ static ssize_t
mckernel_procfs_read(struct file *file, char __user *buf, size_t nbytes,
loff_t *ppos)
{
-struct inode * inode = file->f_path.dentry->d_inode;
+struct inode * inode = file->f_inode;
char *kern_buffer = NULL;
int order = 0;
volatile struct procfs_read *r = NULL;
@ -604,7 +600,7 @@ static ssize_t
mckernel_procfs_write(struct file *file, const char __user *buf, size_t nbytes,
loff_t *ppos)
{
-struct inode * inode = file->f_path.dentry->d_inode;
+struct inode * inode = file->f_inode;
char *kern_buffer = NULL;
int order = 0;
volatile struct procfs_read *r = NULL;
@ -719,6 +715,57 @@ mckernel_procfs_lseek(struct file *file, loff_t offset, int orig)
return file->f_pos;
}
struct procfs_work {
void *os;
int msg;
int pid;
unsigned long arg;
struct work_struct work;
};
static void procfsm_work_main(struct work_struct *work0)
{
struct procfs_work *work = container_of(work0, struct procfs_work, work);
switch (work->msg) {
case SCD_MSG_PROCFS_TID_CREATE:
add_tid_entry(ihk_host_os_get_index(work->os), work->pid, work->arg);
break;
case SCD_MSG_PROCFS_TID_DELETE:
delete_tid_entry(ihk_host_os_get_index(work->os), work->pid, work->arg);
break;
default:
printk("%s: unknown work: msg: %d, pid: %d, arg: %lu)\n",
__FUNCTION__, work->msg, work->pid, work->arg);
break;
}
kfree(work);
return;
}
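/*
 * Deferral rationale (inferred from the code): the IKC packet handler
 * that calls procfsm_packet_handler() runs in atomic context (see the
 * note in ikc.c), while add_tid_entry()/delete_tid_entry() take a
 * semaphore and may sleep. The handler below therefore only queues a
 * work item, allocated with GFP_ATOMIC, and the actual procfs update
 * runs later in process context via schedule_work().
 */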
int procfsm_packet_handler(void *os, int msg, int pid, unsigned long arg)
{
struct procfs_work *work = NULL;
work = kzalloc(sizeof(*work), GFP_ATOMIC);
if (!work) {
printk("%s: kzalloc failed\n", __FUNCTION__);
return -1;
}
work->os = os;
work->msg = msg;
work->pid = pid;
work->arg = arg;
INIT_WORK(&work->work, &procfsm_work_main);
schedule_work(&work->work);
return 0;
}
static const struct file_operations mckernel_forward_ro = {
.llseek = mckernel_procfs_lseek,
.read = mckernel_procfs_read,

File diff suppressed because it is too large

View File

@ -14,10 +14,11 @@
#include <linux/slab.h>
#include <linux/device.h>
#include <linux/version.h>
#include <linux/interrupt.h>
#include "mcctrl.h"
#include "sysfs_msg.h"
-#define dprintk(...) do { if (0) printk(KERN_DEBUG __VA_ARGS__); } while (0)
+#define dprintk(...) do { if (0) printk(__VA_ARGS__); } while (0)
#define wprintk(...) do { if (1) printk(KERN_WARNING __VA_ARGS__); } while (0)
#define eprintk(...) do { if (1) printk(KERN_ERR __VA_ARGS__); } while (0)
@ -1231,9 +1232,16 @@ sysfsm_cleanup(ihk_os_t os)
int error;
ihk_device_t dev = ihk_os_to_dev(os);
struct mcctrl_usrdata *udp = ihk_host_os_get_usrdata(os);
-struct sysfsm_data *sdp = &udp->sysfsm_data;
+struct sysfsm_data *sdp;
struct sysfsm_node *np;
if (!udp) {
printk("%s: WARNING: no mcctrl_usrdata found\n", __FUNCTION__);
return;
}
sdp = &udp->sysfsm_data;
dprintk("mcctrl:sysfsm_cleanup(%p)\n", os);
if (sdp->sysfs_buf) {
@ -2094,9 +2102,16 @@ struct sysfsm_ops snooping_local_ops_s = {
/**** local list ****/
static ssize_t snooping_local_show_pbl(struct sysfsm_ops *ops, void *instance, void *buf, size_t bufsize)
{
size_t ret;
const struct sysfsm_bitmap_param *p = instance;
-return bitmap_scnlistprintf(buf, bufsize, p->ptr, p->nbits);
+ret = bitmap_scnlistprintf(buf, bufsize, p->ptr, p->nbits);
if (ret < bufsize - 1) {
sprintf(buf + ret, "\n");
return ret + 1;
}
return 0;
} /* snooping_local_show_pbl() */
struct sysfsm_ops snooping_local_ops_pbl = {
@ -2107,9 +2122,16 @@ struct sysfsm_ops snooping_local_ops_pbl = {
/**** local map ****/
static ssize_t snooping_local_show_pb(struct sysfsm_ops *ops, void *instance, void *buf, size_t bufsize)
{
size_t ret;
const struct sysfsm_bitmap_param *p = instance;
-return bitmap_scnprintf(buf, bufsize, p->ptr, p->nbits);
+ret = bitmap_scnprintf(buf, bufsize, p->ptr, p->nbits);
if (ret < bufsize - 1) {
sprintf(buf + ret, "\n");
return ret + 1;
}
return 0;
} /* snooping_local_show_pb() */
struct sysfsm_ops snooping_local_ops_pb = {

View File

@ -18,7 +18,7 @@
#include "mcctrl.h"
#include "sysfs_msg.h"
-#define dprintk(...) do { if (0) printk(KERN_DEBUG __VA_ARGS__); } while (0)
+#define dprintk(...) do { if (0) printk(__VA_ARGS__); } while (0)
#define wprintk(...) do { if (1) printk(KERN_WARNING __VA_ARGS__); } while (0)
#define eprintk(...) do { if (1) printk(KERN_ERR __VA_ARGS__); } while (0)
@ -92,27 +92,19 @@ void setup_local_snooping_samples(ihk_os_t os)
void setup_local_snooping_files(ihk_os_t os)
{
-struct ihk_cpu_info *info;
struct mcctrl_usrdata *udp = ihk_host_os_get_usrdata(os);
struct sysfsm_bitmap_param param;
static unsigned long cpu_offline = 0x0;
int i;
int error;
-info = ihk_os_get_cpu_info(os);
-if (!info) {
-eprintk("mcctrl:ihk_os_get_cpu_info failed.\n");
-return;
-}
memset(udp->cpu_online, 0, sizeof(udp->cpu_online));
-for (i = 0; i < info->n_cpus; i++) {
-udp->cpu_online[i / BITS_PER_LONG] =
-udp->cpu_online[i / BITS_PER_LONG] | (1 << (i % BITS_PER_LONG));
+for (i = 0; i < udp->cpu_info->n_cpus; i++) {
+set_bit(i, udp->cpu_online);
}
param.nbits = CPU_LONGS * BITS_PER_LONG;
-param.ptr = udp->cpu_online;
+param.ptr = &udp->cpu_online;
dprintk("mcctrl:setup_local_snooping_files: CPU_LONGS=%d, BITS_PER_LONG=%d\n",
CPU_LONGS, BITS_PER_LONG);
@ -187,141 +179,122 @@ static void free_cpu_topology(struct mcctrl_usrdata *udp)
return;
} /* free_cpu_topology() */
-static void free_cpu_mapping(struct mcctrl_usrdata *udp)
-{
-ihk_device_t dev = ihk_os_to_dev(udp->os);
-size_t size;
-size = udp->cpu_mapping_elems * sizeof(struct cpu_mapping);
-ihk_device_unmap_virtual(dev, udp->cpu_mapping, size);
-ihk_device_unmap_memory(dev, udp->cpu_mapping_pa, size);
-return;
-} /* free_cpu_mapping() */
void free_topology_info(ihk_os_t os)
{
struct mcctrl_usrdata *udp = ihk_host_os_get_usrdata(os);
if (!udp) {
printk("%s: WARNING: no mcctrl_usrdata found\n", __FUNCTION__);
return;
}
free_node_topology(udp);
free_cpu_topology(udp);
-free_cpu_mapping(udp);
return;
} /* free_topology_info() */
-void reply_get_cpu_mapping(long req_pa)
+/*
+* CPU and NUMA node mapping conversion functions.
+*/
+static int mckernel_cpu_2_linux_cpu(struct mcctrl_usrdata *udp, int cpu_id)
{
-struct get_cpu_mapping_req *req = phys_to_virt(req_pa);
+return (cpu_id < udp->cpu_info->n_cpus) ?
+udp->cpu_info->mapping[cpu_id] : -1;
+}
-req->busy = 0;
-wake_up(&req->wq);
-return;
-} /* reply_get_cpu_mapping() */
-static int get_cpu_mapping(struct mcctrl_usrdata *udp)
+static int mckernel_cpu_2_hw_id(struct mcctrl_usrdata *udp, int cpu_id)
{
-int error;
-ihk_device_t dev = ihk_os_to_dev(udp->os);
-struct get_cpu_mapping_req *req = NULL;
-struct ikc_scd_packet packet;
-size_t size;
+return (cpu_id < udp->cpu_info->n_cpus) ?
+udp->cpu_info->hw_ids[cpu_id] : -1;
+}
dprintk("get_cpu_mapping(%p)\n", udp);
req = kmalloc(sizeof(*req), GFP_KERNEL);
if (!req) {
error = -ENOMEM;
eprintk("mcctrl:get_cpu_mapping:kmalloc failed. %d\n", error);
goto out;
}
req->busy = 1;
req->error = -1;
init_waitqueue_head(&req->wq);
packet.msg = SCD_MSG_GET_CPU_MAPPING;
packet.arg = virt_to_phys(req);
#define GET_CPU_MAPPING_CPU 0
error = mcctrl_ikc_send(udp->os, GET_CPU_MAPPING_CPU, &packet);
if (error) {
eprintk("mcctrl:get_cpu_mapping:"
"mcctrl_ikc_send failed. %d\n", error);
goto out;
}
error = wait_event_interruptible(req->wq, !req->busy);
if (error) {
eprintk("mcctrl:get_cpu_mapping:"
"wait_event_interruptible failed. %d\n", error);
req = NULL; /* XXX */
goto out;
}
if (req->error) {
error = req->error;
eprintk("mcctrl:get_cpu_mapping:"
"SCD_MSG_GET_CPU_MAPPING failed. %d\n", error);
goto out;
}
size = req->buf_elems * sizeof(struct cpu_mapping);
udp->cpu_mapping_elems = req->buf_elems;
udp->cpu_mapping_pa = ihk_device_map_memory(dev, req->buf_rpa, size);
udp->cpu_mapping = ihk_device_map_virtual(
dev, udp->cpu_mapping_pa, size, NULL, 0);
error = 0;
out:
dprintk("get_cpu_mapping(%p): %d\n", udp, error);
kfree(req);
return error;
} /* get_cpu_mapping() */
-static int hwid_to_cpu(struct mcctrl_usrdata *udp, int hw_id)
+static int linux_cpu_2_mckernel_cpu(struct mcctrl_usrdata *udp, int cpu_id)
{
int i;
-for (i = 0; i < udp->cpu_mapping_elems; ++i) {
-if (udp->cpu_mapping[i].hw_id == hw_id) {
-return udp->cpu_mapping[i].cpu_number;
+for (i = 0; i < udp->cpu_info->n_cpus; ++i) {
+if (udp->cpu_info->mapping[i] == cpu_id)
+return i;
}
return -1;
}
#if 0
static int hw_id_2_mckernel_cpu(struct mcctrl_usrdata *udp, int hw_id)
{
int i;
for (i = 0; i < udp->cpu_info->n_cpus; ++i) {
if (udp->cpu_info->hw_ids[i] == hw_id) {
return i;
}
}
return -1;
}
static int hw_id_2_linux_cpu(struct mcctrl_usrdata *udp, int hw_id)
{
int i;
for (i = 0; i < udp->cpu_info->n_cpus; ++i) {
if (udp->cpu_info->hw_ids[i] == hw_id) {
return mckernel_cpu_2_linux_cpu(udp, i);
}
}
return -1;
}
static int linux_cpu_2_hw_id(struct mcctrl_usrdata *udp, int cpu)
{
int mckernel_cpu = linux_cpu_2_mckernel_cpu(udp, cpu);
return (mckernel_cpu >= 0 && mckernel_cpu < udp->cpu_info->n_cpus) ?
udp->cpu_info->hw_ids[mckernel_cpu] : -1;
}
#endif
static int mckernel_numa_2_linux_numa(struct mcctrl_usrdata *udp, int numa_id)
{
return (numa_id < udp->mem_info->n_numa_nodes) ?
udp->mem_info->numa_mapping[numa_id] : -1;
}
static int linux_numa_2_mckernel_numa(struct mcctrl_usrdata *udp, int numa_id)
{
int i;
for (i = 0; i < udp->mem_info->n_numa_nodes; ++i) {
if (udp->mem_info->numa_mapping[i] == numa_id)
return i;
}
return -1;
}
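/*
 * Example with hypothetical numbers: if McKernel was assigned memory
 * from Linux NUMA nodes 1 and 3, numa_mapping[] is {1, 3}, so
 * mckernel_numa_2_linux_numa(udp, 0) returns 1 and
 * linux_numa_2_mckernel_numa(udp, 3) returns 1.
 */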
static int translate_cpumap(struct mcctrl_usrdata *udp,
cpumask_t *linmap, cpumask_t *mckmap)
{
int error;
-ihk_device_t dev = ihk_os_to_dev(udp->os);
int lincpu;
-int hw_id;
int mckcpu;
dprintk("translate_cpumap(%p,%p,%p)\n", udp, linmap, mckmap);
cpumask_clear(mckmap);
for_each_cpu(lincpu, linmap) {
-hw_id = ihk_device_linux_cpu_to_hw_id(dev, lincpu);
-if (hw_id < 0) {
-error = hw_id;
-eprintk("mcctrl:translate_cpumap:"
-"ihk_device_linux_cpu_to_hw_id failed."
-" %d\n", error);
-goto out;
-}
-mckcpu = hwid_to_cpu(udp, hw_id);
+mckcpu = linux_cpu_2_mckernel_cpu(udp, lincpu);
if (mckcpu >= 0) {
cpumask_set_cpu(mckcpu, mckmap);
}
}
error = 0;
out:
dprintk("translate_cpumap(%p,%p,%p): %d\n", udp, linmap, mckmap, error);
return error;
} /* translate_cpumap() */
@ -361,7 +334,7 @@ out:
return (error)? ERR_PTR(error): topo;
} /* get_cache_topology() */
-static struct cpu_topology *get_cpu_topology_one(struct mcctrl_usrdata *udp,
+static struct cpu_topology *get_one_cpu_topology(struct mcctrl_usrdata *udp,
int index)
{
int error;
@ -370,41 +343,43 @@ static struct cpu_topology *get_cpu_topology_one(struct mcctrl_usrdata *udp,
struct cache_topology *cache;
struct ihk_cache_topology *saved_cache;
dprintk("get_cpu_topology_one(%p,%d)\n", udp, index);
dprintk("get_one_cpu_topology(%p,%d)\n", udp, index);
topology = kmalloc(sizeof(*topology), GFP_KERNEL);
if (!topology) {
error = -ENOMEM;
eprintk("mcctrl:get_cpu_topology_one:"
eprintk("mcctrl:get_one_cpu_topology:"
"kmalloc failed. %d\n", error);
goto out;
}
INIT_LIST_HEAD(&topology->cache_list);
-topology->cpu_mapping = &udp->cpu_mapping[index];
-topology->saved = ihk_device_get_cpu_topology(
-dev, topology->cpu_mapping->hw_id);
+topology->mckernel_cpu_id = index;
+topology->saved = ihk_device_get_cpu_topology(dev,
+mckernel_cpu_2_hw_id(udp, index));
if (IS_ERR(topology->saved)) {
error = PTR_ERR(topology->saved);
eprintk("mcctrl:get_cpu_topology_one:"
eprintk("mcctrl:get_one_cpu_topology:"
"ihk_device_get_cpu_topology failed. %d\n",
error);
goto out;
}
-error = translate_cpumap(udp, &topology->saved->core_siblings,
+error = translate_cpumap(udp,
+&topology->saved->core_siblings,
&topology->core_siblings);
if (error) {
eprintk("mcctrl:get_cpu_topology_one:"
eprintk("mcctrl:get_one_cpu_topology:"
"translate_cpumap(core_siblings) failed."
" %d\n", error);
goto out;
}
-error = translate_cpumap(udp, &topology->saved->thread_siblings,
+error = translate_cpumap(udp,
+&topology->saved->thread_siblings,
&topology->thread_siblings);
if (error) {
eprintk("mcctrl:get_cpu_topology_one:"
eprintk("mcctrl:get_one_cpu_topology:"
"translate_cpumap(thread_siblings) failed."
" %d\n", error);
goto out;
@ -415,7 +390,7 @@ static struct cpu_topology *get_cpu_topology_one(struct mcctrl_usrdata *udp,
cache = get_cache_topology(udp, topology, saved_cache);
if (IS_ERR(cache)) {
error = PTR_ERR(cache);
eprintk("mcctrl:get_cpu_topology_one:"
eprintk("mcctrl:get_one_cpu_topology:"
"get_cache_topology failed. %d\n",
error);
goto out;
@ -429,10 +404,10 @@ out:
if (error && !IS_ERR_OR_NULL(topology)) {
free_cpu_topology_one(udp, topology);
}
dprintk("get_cpu_topology_one(%p,%d): %d %p\n",
dprintk("get_one_cpu_topology(%p,%d): %d %p\n",
udp, index, error, topology);
return (error)? ERR_PTR(error): topology;
-} /* get_cpu_topology_one() */
+} /* get_one_cpu_topology() */
static int get_cpu_topology(struct mcctrl_usrdata *udp)
{
@ -441,12 +416,12 @@ static int get_cpu_topology(struct mcctrl_usrdata *udp)
struct cpu_topology *topology;
dprintk("get_cpu_topology(%p)\n", udp);
-for (index = 0; index < udp->cpu_mapping_elems; ++index) {
-topology = get_cpu_topology_one(udp, index);
+for (index = 0; index < udp->cpu_info->n_cpus; ++index) {
+topology = get_one_cpu_topology(udp, index);
if (IS_ERR(topology)) {
error = PTR_ERR(topology);
eprintk("mcctrl:get_cpu_topology:"
"get_cpu_topology_one failed. %d\n",
eprintk("mcctrl:get_cpu_topology: "
"get_one_cpu_topology failed. %d\n",
error);
goto out;
}
@ -460,15 +435,15 @@ out:
return error;
} /* get_cpu_topology() */
-static void setup_one_cache_files(struct mcctrl_usrdata *udp,
+static void setup_cpu_sysfs_cache_files(struct mcctrl_usrdata *udp,
struct cpu_topology *cpu, struct cache_topology *cache)
{
char *prefix = "/sys/devices/system/cpu";
-int cpu_number = cpu->cpu_mapping->cpu_number;
+int cpu_number = cpu->mckernel_cpu_id;
int index = cache->saved->index;
struct sysfsm_bitmap_param param;
dprintk("setup_one_cache_files(%p,%p,%p)\n", udp, cpu, cache);
dprintk("setup_cpu_sysfs_cache_files(%p,%p,%p)\n", udp, cpu, cache);
sysfsm_createf(udp->os, SYSFS_SNOOPING_OPS_d64,
&cache->saved->level, 0444,
@ -509,19 +484,19 @@ static void setup_one_cache_files(struct mcctrl_usrdata *udp,
"%s/cpu%d/cache/index%d/shared_cpu_list",
prefix, cpu_number, index);
dprintk("setup_one_cache_files(%p,%p,%p):\n", udp, cpu, cache);
dprintk("setup_cpu_sysfs_cache_files(%p,%p,%p):\n", udp, cpu, cache);
return;
-} /* setup_one_cache_files() */
+} /* setup_cpu_sysfs_cache_files() */
-static void setup_one_cpu_files(struct mcctrl_usrdata *udp,
+static void setup_cpu_sysfs_files(struct mcctrl_usrdata *udp,
struct cpu_topology *cpu)
{
char *prefix = "/sys/devices/system/cpu";
-int cpu_number = cpu->cpu_mapping->cpu_number;
+int cpu_number = cpu->mckernel_cpu_id;
struct sysfsm_bitmap_param param;
struct cache_topology *cache;
dprintk("setup_one_cpu_files(%p,%p)\n", udp, cpu);
dprintk("setup_cpu_sysfs_files(%p,%p)\n", udp, cpu);
sysfsm_createf(udp->os, SYSFS_SNOOPING_OPS_d32,
&cpu->saved->physical_package_id, 0444,
@ -553,41 +528,61 @@ static void setup_one_cpu_files(struct mcctrl_usrdata *udp,
prefix, cpu_number);
list_for_each_entry(cache, &cpu->cache_list, chain) {
-setup_one_cache_files(udp, cpu, cache);
+setup_cpu_sysfs_cache_files(udp, cpu, cache);
}
dprintk("setup_one_cpu_files(%p,%p):\n", udp, cpu);
dprintk("setup_cpu_sysfs_files(%p,%p):\n", udp, cpu);
return;
-} /* setup_one_cpu_files() */
+} /* setup_cpu_sysfs_files() */
-static void setup_cpu_files(struct mcctrl_usrdata *udp)
+static void setup_cpus_sysfs_files_node_link(struct mcctrl_usrdata *udp)
{
int error;
int cpu;
struct sysfs_handle handle;
for (cpu = 0; cpu < udp->cpu_info->n_cpus; ++cpu) {
int node = linux_numa_2_mckernel_numa(udp,
cpu_to_node(mckernel_cpu_2_linux_cpu(udp, cpu)));
error = sysfsm_lookupf(udp->os, &handle,
"/sys/devices/system/node/node%d", node);
if (error) {
panic("sysfsm_lookupf: node for CPU");
}
error = sysfsm_symlinkf(udp->os, handle,
"/sys/devices/system/cpu/cpu%d/node%d",
cpu, node);
if (error) {
panic("sysfsm_symlinkf(CPU in node)");
}
}
error = 0;
return;
}
static void setup_cpus_sysfs_files(struct mcctrl_usrdata *udp)
{
int error;
struct cpu_topology *cpu;
dprintk("setup_cpu_file(%p)\n", udp);
-error = get_cpu_mapping(udp);
-if (error) {
-eprintk("mcctrl:setup_cpu_files:"
-"get_cpu_mapping failed. %d\n", error);
-goto out;
-}
error = get_cpu_topology(udp);
if (error) {
eprintk("mcctrl:setup_cpu_files:"
eprintk("mcctrl:setup_cpus_sysfs_files:"
"get_cpu_topology failed. %d\n", error);
goto out;
}
list_for_each_entry(cpu, &udp->cpu_topology_list, chain) {
-setup_one_cpu_files(udp, cpu);
+setup_cpu_sysfs_files(udp, cpu);
}
error = 0;
out:
dprintk("setup_cpu_file(%p):\n", udp);
return;
-} /* setup_cpu_files() */
+} /* setup_cpus_sysfs_files() */
static struct node_topology *get_one_node_topology(struct mcctrl_usrdata *udp,
struct ihk_node_topology *saved)
@ -629,8 +624,10 @@ static int get_node_topology(struct mcctrl_usrdata *udp)
struct node_topology *topology;
dprintk("get_node_topology(%p)\n", udp);
-for (node = 0; ; ++node) {
-saved = ihk_device_get_node_topology(dev, node);
+for (node = 0; node < udp->mem_info->n_numa_nodes; ++node) {
+saved = ihk_device_get_node_topology(dev,
+mckernel_numa_2_linux_numa(udp, node));
if (IS_ERR(saved)) {
break;
}
@ -647,6 +644,8 @@ static int get_node_topology(struct mcctrl_usrdata *udp)
goto out;
}
topology->mckernel_numa_id = node;
list_add(&topology->chain, &udp->node_topology_list);
}
@ -659,6 +658,7 @@ out:
static int setup_node_files(struct mcctrl_usrdata *udp)
{
int error;
int node;
struct node_topology *p;
struct sysfsm_bitmap_param param;
@ -670,16 +670,71 @@ static int setup_node_files(struct mcctrl_usrdata *udp)
goto out;
}
memset(&udp->numa_online, 0, sizeof(udp->numa_online));
for (node = 0; node < udp->mem_info->n_numa_nodes; ++node) {
node_set(node, udp->numa_online);
}
param.nbits = MAX_NUMNODES;
param.ptr = &udp->numa_online;
sysfsm_createf(udp->os, SYSFS_SNOOPING_OPS_pbl, &param, 0444,
"/sys/devices/system/node/online");
sysfsm_createf(udp->os, SYSFS_SNOOPING_OPS_pbl, &param, 0444,
"/sys/devices/system/node/possible");
list_for_each_entry(p, &udp->node_topology_list, chain) {
struct sysfs_handle handle;
int cpu;
size_t offset = 0;
param.nbits = nr_cpumask_bits;
param.ptr = &p->cpumap;
for (node = 0; node < udp->mem_info->n_numa_nodes; ++node) {
if (node > 0) {
offset += snprintf(&p->mckernel_numa_distance_s[offset],
NODE_DISTANCE_S_SIZE - offset, "%s", " ");
}
offset += snprintf(&p->mckernel_numa_distance_s[offset],
NODE_DISTANCE_S_SIZE - offset, "%d",
node_distance(
mckernel_numa_2_linux_numa(udp, p->mckernel_numa_id),
mckernel_numa_2_linux_numa(udp, node)
));
}
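/*
 * Example with hypothetical values: for two McKernel NUMA nodes whose
 * Linux node_distance() values are 10 (local) and 21 (remote), the
 * loop above produces "10 21" for node 0, mirroring the space
 * separated format of /sys/devices/system/node/nodeN/distance.
 */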
sysfsm_createf(udp->os, SYSFS_SNOOPING_OPS_s,
p->mckernel_numa_distance_s, 0444,
"/sys/devices/system/node/node%d/distance",
p->mckernel_numa_id);
sysfsm_createf(udp->os, SYSFS_SNOOPING_OPS_pb, &param, 0444,
"/sys/devices/system/node/node%d/cpumap",
-p->saved->node_number);
+p->mckernel_numa_id);
sysfsm_createf(udp->os, SYSFS_SNOOPING_OPS_pbl, &param, 0444,
"/sys/devices/system/node/node%d/cpulist",
-p->saved->node_number);
+p->mckernel_numa_id);
/* Add CPU symlinks for this node */
for (cpu = 0; cpu < udp->cpu_info->n_cpus; ++cpu) {
if (linux_numa_2_mckernel_numa(udp,
cpu_to_node(mckernel_cpu_2_linux_cpu(udp, cpu)))
!= p->mckernel_numa_id) {
continue;
}
error = sysfsm_lookupf(udp->os, &handle,
"/sys/devices/system/cpu/cpu%d", cpu);
if (error) {
panic("sysfsm_lookupf(CPU in node)");
}
error = sysfsm_symlinkf(udp->os, handle,
"/sys/devices/system/node/node%d/cpu%d",
p->mckernel_numa_id, cpu);
if (error) {
panic("sysfsm_symlinkf(CPU in node)");
}
}
}
error = 0;
@ -1026,11 +1081,18 @@ void setup_sysfs_files(ihk_os_t os)
panic("sysfsm_unlinkf");
}
-setup_local_snooping_samples(os);
+//setup_local_snooping_samples(os);
setup_local_snooping_files(os);
-setup_cpu_files(udp);
+setup_cpus_sysfs_files(udp);
setup_node_files(udp);
-setup_pci_files(udp);
+setup_cpus_sysfs_files_node_link(udp);
+//setup_pci_files(udp);
/* Indicate sysfs files setup completion for boot script */
error = sysfsm_mkdirf(os, NULL, "/sys/setup_complete");
if (error) {
panic("sysfsm_mkdir(complete)");
}
return;
} /* setup_files() */

View File

@ -1,7 +1,3 @@
-KDIR ?= @KDIR@
-ARCH ?= @ARCH@
-KMODDIR=@KMODDIR@
-src = @abs_srcdir@
ENABLE_MCOVERLAYFS=@ENABLE_MCOVERLAYFS@
RELEASE=$(shell uname -r)
@ -9,31 +5,40 @@ MAJOR=$(shell echo ${RELEASE} | sed -e 's/^\([0-9]*\).*/\1/')
MINOR=$(shell echo ${RELEASE} | sed -e 's/^[0-9]*.\([0-9]*\).*/\1/')
PATCH=$(shell echo ${RELEASE} | sed -e 's/^[0-9]*.[0-9]*.\([0-9]*\).*/\1/')
LINUX_VERSION_CODE=$(shell expr \( ${MAJOR} \* 65536 \) + \( ${MINOR} \* 256 \) + ${PATCH})
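# Worked example (illustrative): for RELEASE=4.0.9, MAJOR=4, MINOR=0,
# PATCH=9, so LINUX_VERSION_CODE = 4*65536 + 0*256 + 9 = 262153, which
# falls in [262144, 262400) and selects BUILD_MODULE=linux-4.0.9 below.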
-RHEL_RELEASE=$(shell echo ${RELEASE} | sed -e 's/^[0-9]*.[0-9]*.[0-9]*-\([0-9]*\).*/\1/')
-RHEL_RELEASE=$(shell if [ "${RELEASE}" == "${RHEL_RELEASE}" ]; then echo ""; else echo ${RHEL_RELEASE}; fi)
+RHEL_RELEASE_TMP=$(shell echo ${RELEASE} | sed -e 's/^[0-9]*.[0-9]*.[0-9]*-\([0-9]*\).*/\1/')
+RHEL_RELEASE=$(shell if [ "${RELEASE}" == "${RHEL_RELEASE_TMP}" ]; then echo ""; else echo ${RHEL_RELEASE_TMP}; fi)
BUILD_MODULE_TMP=$(shell if [ "${RHEL_RELEASE}" == "" ]; then echo "org"; else echo "rhel"; fi)
BUILD_MODULE=none
ifeq ($(ENABLE_MCOVERLAYFS),yes)
-ENABLE_BUILD=$(shell if ( [ ${LINUX_VERSION_CODE} -ge 262144 ] && [ ${LINUX_VERSION_CODE} -lt 262400 ] ); then echo "yes"; else echo "no"; fi)
-else
-ENABLE_BUILD=no
+ifeq ($(BUILD_MODULE_TMP),org)
+ifeq ($(BUILD_MODULE),none)
+BUILD_MODULE=$(shell if [ ${LINUX_VERSION_CODE} -ge 262144 -a ${LINUX_VERSION_CODE} -lt 262400 ]; then echo "linux-4.0.9"; else echo "none"; fi)
+endif
+ifeq ($(BUILD_MODULE),none)
+BUILD_MODULE=$(shell if [ ${LINUX_VERSION_CODE} -ge 243680 -a ${LINUX_VERSION_CODE} -lt 263936 ]; then echo "linux-4.6.7"; else echo "none"; fi)
+endif
+endif
+ifeq ($(BUILD_MODULE_TMP),rhel)
+ifeq ($(BUILD_MODULE),none)
+BUILD_MODULE=$(shell if [ ${LINUX_VERSION_CODE} -eq 199168 -a ${RHEL_RELEASE} -eq 327 ]; then echo "linux-3.10.0-327.36.1.el7"; else echo "none"; fi)
+endif
+endif
endif
obj-m += mcoverlay.o
mcoverlay-y := copy_up.o dir.o inode.o readdir.o super.o
.PHONY: clean install modules
modules:
-ifeq ($(ENABLE_BUILD),yes)
-$(MAKE) -C $(KDIR) M=$(PWD) SUBDIRS=$(PWD) ARCH=$(ARCH) modules
+ifneq ($(BUILD_MODULE),none)
+@(cd $(BUILD_MODULE); make modules)
endif
clean:
$(RM) .*.cmd *.mod.c *.o *.ko* Module.symvers modules.order -r .tmp*
@(cd linux-3.10.0-327.36.1.el7; make clean)
@(cd linux-4.0.9; make clean)
@(cd linux-4.6.7; make clean)
install:
-ifeq ($(ENABLE_BUILD),yes)
-mkdir -p -m 755 $(KMODDIR)
-install -m 644 mcoverlay.ko $(KMODDIR)
+ifneq ($(BUILD_MODULE),none)
+@(cd $(BUILD_MODULE); make install)
endif

View File

@ -0,0 +1,21 @@
KDIR ?= @KDIR@
ARCH ?= @ARCH@
KMODDIR = @KMODDIR@
src = @abs_srcdir@
obj-m += mcoverlay.o
mcoverlay-y := copy_up.o dir.o inode.o readdir.o super.o
.PHONY: clean install modules
modules:
$(MAKE) -C $(KDIR) M=$(PWD) SUBDIRS=$(PWD) ARCH=$(ARCH) modules
clean:
$(RM) .*.cmd *.mod.c *.o *.ko* Module.symvers modules.order -r .tmp*
install:
mkdir -p -m 755 $(KMODDIR)
install -m 644 mcoverlay.ko $(KMODDIR)

View File

@ -0,0 +1,461 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/splice.h>
#include <linux/xattr.h>
#include <linux/security.h>
#include <linux/uaccess.h>
#include <linux/sched.h>
#include <linux/namei.h>
#include <linux/fdtable.h>
#include <linux/ratelimit.h>
#include "overlayfs.h"
#define OVL_COPY_UP_CHUNK_SIZE (1 << 20)
static unsigned ovl_check_copy_up = 1;
module_param_named(check_copy_up, ovl_check_copy_up, uint,
S_IWUSR | S_IRUGO);
MODULE_PARM_DESC(ovl_check_copy_up,
"Warn on copy-up when causing process also has a R/O fd open");
static int ovl_check_fd(const void *data, struct file *f, unsigned fd)
{
const struct dentry *dentry = data;
if (f->f_path.dentry == dentry)
pr_warn_ratelimited("overlayfs: Warning: Copying up %pD, but open R/O on fd %u which will cease to be coherent [pid=%d %s]\n",
f, fd, current->pid, current->comm);
return 0;
}
/*
* Check the fds open by this process and warn if something like the following
* scenario is about to occur:
*
* fd1 = open("foo", O_RDONLY);
* fd2 = open("foo", O_RDWR);
*/
static void ovl_do_check_copy_up(struct dentry *dentry)
{
if (ovl_check_copy_up)
iterate_fd(current->files, 0, ovl_check_fd, dentry);
}
int ovl_copy_xattr(struct dentry *old, struct dentry *new)
{
ssize_t list_size, size, value_size = 0;
char *buf, *name, *value = NULL;
int uninitialized_var(error);
if (!old->d_inode->i_op->getxattr ||
!new->d_inode->i_op->getxattr)
return 0;
list_size = vfs_listxattr(old, NULL, 0);
if (list_size <= 0) {
if (list_size == -EOPNOTSUPP)
return 0;
return list_size;
}
buf = kzalloc(list_size, GFP_KERNEL);
if (!buf)
return -ENOMEM;
list_size = vfs_listxattr(old, buf, list_size);
if (list_size <= 0) {
error = list_size;
goto out;
}
for (name = buf; name < (buf + list_size); name += strlen(name) + 1) {
retry:
size = vfs_getxattr(old, name, value, value_size);
if (size == -ERANGE)
size = vfs_getxattr(old, name, NULL, 0);
if (size < 0) {
error = size;
break;
}
if (size > value_size) {
void *new;
new = krealloc(value, size, GFP_KERNEL);
if (!new) {
error = -ENOMEM;
break;
}
value = new;
value_size = size;
goto retry;
}
error = vfs_setxattr(new, name, value, size, 0);
if (error)
break;
}
kfree(value);
out:
kfree(buf);
return error;
}
static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
{
struct file *old_file;
struct file *new_file;
loff_t old_pos = 0;
loff_t new_pos = 0;
int error = 0;
if (len == 0)
return 0;
old_file = ovl_path_open(old, O_RDONLY);
if (IS_ERR(old_file))
return PTR_ERR(old_file);
new_file = ovl_path_open(new, O_WRONLY);
if (IS_ERR(new_file)) {
error = PTR_ERR(new_file);
goto out_fput;
}
/* FIXME: copy up sparse files efficiently */
while (len) {
size_t this_len = OVL_COPY_UP_CHUNK_SIZE;
long bytes;
if (len < this_len)
this_len = len;
if (signal_pending_state(TASK_KILLABLE, current)) {
error = -EINTR;
break;
}
bytes = do_splice_direct(old_file, &old_pos,
new_file, &new_pos,
this_len, SPLICE_F_MOVE);
if (bytes <= 0) {
error = bytes;
break;
}
WARN_ON(old_pos != new_pos);
len -= bytes;
}
fput(new_file);
out_fput:
fput(old_file);
return error;
}
static char *ovl_read_symlink(struct dentry *realdentry)
{
int res;
char *buf;
struct inode *inode = realdentry->d_inode;
mm_segment_t old_fs;
res = -EINVAL;
if (!inode->i_op->readlink)
goto err;
res = -ENOMEM;
buf = (char *) __get_free_page(GFP_KERNEL);
if (!buf)
goto err;
old_fs = get_fs();
set_fs(get_ds());
/* The cast to a user pointer is valid due to the set_fs() */
res = inode->i_op->readlink(realdentry,
(char __user *)buf, PAGE_SIZE - 1);
set_fs(old_fs);
if (res < 0) {
free_page((unsigned long) buf);
goto err;
}
buf[res] = '\0';
return buf;
err:
return ERR_PTR(res);
}
static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
{
struct iattr attr = {
.ia_valid =
ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET,
.ia_atime = stat->atime,
.ia_mtime = stat->mtime,
};
return notify_change(upperdentry, &attr, NULL);
}
int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
{
int err = 0;
if (!S_ISLNK(stat->mode)) {
struct iattr attr = {
.ia_valid = ATTR_MODE,
.ia_mode = stat->mode,
};
err = notify_change(upperdentry, &attr, NULL);
}
if (!err) {
struct iattr attr = {
.ia_valid = ATTR_UID | ATTR_GID,
.ia_uid = stat->uid,
.ia_gid = stat->gid,
};
err = notify_change(upperdentry, &attr, NULL);
}
if (!err)
ovl_set_timestamps(upperdentry, stat);
return err;
}
static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
struct dentry *dentry, struct path *lowerpath,
struct kstat *stat, struct iattr *attr,
const char *link)
{
struct inode *wdir = workdir->d_inode;
struct inode *udir = upperdir->d_inode;
struct dentry *newdentry = NULL;
struct dentry *upper = NULL;
umode_t mode = stat->mode;
int err;
newdentry = ovl_lookup_temp(workdir, dentry);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out;
upper = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto out1;
/* Can't properly set mode on creation because of the umask */
stat->mode &= S_IFMT;
err = ovl_create_real(wdir, newdentry, stat, link, NULL, true);
stat->mode = mode;
if (err)
goto out2;
if (S_ISREG(stat->mode)) {
struct path upperpath;
ovl_path_upper(dentry, &upperpath);
BUG_ON(upperpath.dentry != NULL);
upperpath.dentry = newdentry;
err = ovl_copy_up_data(lowerpath, &upperpath, stat->size);
if (err)
goto out_cleanup;
}
err = ovl_copy_xattr(lowerpath->dentry, newdentry);
if (err)
goto out_cleanup;
mutex_lock(&newdentry->d_inode->i_mutex);
err = ovl_set_attr(newdentry, stat);
if (!err && attr)
err = notify_change(newdentry, attr, NULL);
mutex_unlock(&newdentry->d_inode->i_mutex);
if (err)
goto out_cleanup;
err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
if (err)
goto out_cleanup;
ovl_dentry_update(dentry, newdentry);
newdentry = NULL;
/*
* Non-directories become opaque when copied up.
*/
if (!S_ISDIR(stat->mode))
ovl_dentry_set_opaque(dentry, true);
out2:
dput(upper);
out1:
dput(newdentry);
out:
return err;
out_cleanup:
ovl_cleanup(wdir, newdentry);
goto out;
}
/*
* Copy up a single dentry
*
* Directory renames only allowed on "pure upper" (already created on
* upper filesystem, never copied up). Directories which are on lower or
* are merged may not be renamed. For these -EXDEV is returned and
* userspace has to deal with it. This means, when copying up a
* directory we can rely on it and ancestors being stable.
*
* Non-directory renames start with copy up of source if necessary. The
* actual rename will only proceed once the copy up was successful. Copy
* up uses upper parent i_mutex for exclusion. Since rename can change
* d_parent it is possible that the copy up will lock the old parent. At
* that point the file will have already been copied up anyway.
*/
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
struct path *lowerpath, struct kstat *stat,
struct iattr *attr)
{
struct dentry *workdir = ovl_workdir(dentry);
int err;
struct kstat pstat;
struct path parentpath;
struct dentry *upperdir;
struct dentry *upperdentry;
const struct cred *old_cred;
struct cred *override_cred;
char *link = NULL;
if (WARN_ON(!workdir))
return -EROFS;
ovl_do_check_copy_up(lowerpath->dentry);
ovl_path_upper(parent, &parentpath);
upperdir = parentpath.dentry;
err = vfs_getattr(&parentpath, &pstat);
if (err)
return err;
if (S_ISLNK(stat->mode)) {
link = ovl_read_symlink(lowerpath->dentry);
if (IS_ERR(link))
return PTR_ERR(link);
}
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_free_link;
override_cred->fsuid = stat->uid;
override_cred->fsgid = stat->gid;
/*
* CAP_SYS_ADMIN for copying up extended attributes
* CAP_DAC_OVERRIDE for create
* CAP_FOWNER for chmod, timestamp update
* CAP_FSETID for chmod
* CAP_CHOWN for chown
* CAP_MKNOD for mknod
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
cap_raise(override_cred->cap_effective, CAP_FSETID);
cap_raise(override_cred->cap_effective, CAP_CHOWN);
cap_raise(override_cred->cap_effective, CAP_MKNOD);
old_cred = override_creds(override_cred);
err = -EIO;
if (lock_rename(workdir, upperdir) != NULL) {
pr_err("overlayfs: failed to lock workdir+upperdir\n");
goto out_unlock;
}
upperdentry = ovl_dentry_upper(dentry);
if (upperdentry) {
unlock_rename(workdir, upperdir);
err = 0;
/* Raced with another copy-up? Do the setattr here */
if (attr) {
mutex_lock(&upperdentry->d_inode->i_mutex);
err = notify_change(upperdentry, attr, NULL);
mutex_unlock(&upperdentry->d_inode->i_mutex);
}
goto out_put_cred;
}
err = ovl_copy_up_locked(workdir, upperdir, dentry, lowerpath,
stat, attr, link);
if (!err) {
/* Restore timestamps on parent (best effort) */
ovl_set_timestamps(upperdir, &pstat);
}
out_unlock:
unlock_rename(workdir, upperdir);
out_put_cred:
revert_creds(old_cred);
put_cred(override_cred);
out_free_link:
if (link)
free_page((unsigned long) link);
return err;
}
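/*
* Illustration (not part of this module): the -EXDEV contract described
* above, as seen from userspace. Paths are hypothetical; rename(2) of a
* merged or lower directory fails and the caller (mv(1), for example)
* must fall back to copy-plus-unlink:
*
*	if (rename("/mnt/overlay/merged_dir", "/mnt/overlay/newname") == -1
*	    && errno == EXDEV) {
*		... recursively copy the tree, then remove the source ...
*	}
*/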
int ovl_copy_up(struct dentry *dentry)
{
int err;
err = 0;
while (!err) {
struct dentry *next;
struct dentry *parent;
struct path lowerpath;
struct kstat stat;
enum ovl_path_type type = ovl_path_type(dentry);
if (OVL_TYPE_UPPER(type))
break;
next = dget(dentry);
/* find the topmost dentry not yet copied up */
for (;;) {
parent = dget_parent(next);
type = ovl_path_type(parent);
if (OVL_TYPE_UPPER(type))
break;
dput(next);
next = parent;
}
ovl_path_lower(next, &lowerpath);
err = vfs_getattr(&lowerpath, &stat);
if (!err)
err = ovl_copy_up_one(parent, next, &lowerpath, &stat, NULL);
dput(parent);
dput(next);
}
return err;
}


@@ -0,0 +1,972 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/xattr.h>
#include <linux/security.h>
#include <linux/cred.h>
#include "overlayfs.h"
void ovl_cleanup(struct inode *wdir, struct dentry *wdentry)
{
int err;
dget(wdentry);
if (S_ISDIR(wdentry->d_inode->i_mode))
err = ovl_do_rmdir(wdir, wdentry);
else
err = ovl_do_unlink(wdir, wdentry);
dput(wdentry);
if (err) {
pr_err("overlayfs: cleanup of '%pd2' failed (%i)\n",
wdentry, err);
}
}
struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry)
{
struct dentry *temp;
char name[20];
snprintf(name, sizeof(name), "#%lx", (unsigned long) dentry);
temp = lookup_one_len(name, workdir, strlen(name));
if (!IS_ERR(temp) && temp->d_inode) {
pr_err("overlayfs: workdir/%s already exists\n", name);
dput(temp);
temp = ERR_PTR(-EIO);
}
return temp;
}
/* caller holds i_mutex on workdir */
static struct dentry *ovl_whiteout(struct dentry *workdir,
struct dentry *dentry)
{
int err;
struct dentry *whiteout;
struct inode *wdir = workdir->d_inode;
whiteout = ovl_lookup_temp(workdir, dentry);
if (IS_ERR(whiteout))
return whiteout;
err = ovl_do_whiteout(wdir, whiteout);
if (err) {
dput(whiteout);
whiteout = ERR_PTR(err);
}
return whiteout;
}
int ovl_create_real(struct inode *dir, struct dentry *newdentry,
struct kstat *stat, const char *link,
struct dentry *hardlink, bool debug)
{
int err;
if (newdentry->d_inode)
return -ESTALE;
if (hardlink) {
err = ovl_do_link(hardlink, dir, newdentry, debug);
} else {
switch (stat->mode & S_IFMT) {
case S_IFREG:
err = ovl_do_create(dir, newdentry, stat->mode, debug);
break;
case S_IFDIR:
err = ovl_do_mkdir(dir, newdentry, stat->mode, debug);
break;
case S_IFCHR:
case S_IFBLK:
case S_IFIFO:
case S_IFSOCK:
err = ovl_do_mknod(dir, newdentry,
stat->mode, stat->rdev, debug);
break;
case S_IFLNK:
err = ovl_do_symlink(dir, newdentry, link, debug);
break;
default:
err = -EPERM;
}
}
if (!err && WARN_ON(!newdentry->d_inode)) {
/*
* Not quite sure if non-instantiated dentry is legal or not.
* VFS doesn't seem to care so check and warn here.
*/
err = -ENOENT;
}
return err;
}
static int ovl_set_opaque(struct dentry *upperdentry)
{
return ovl_do_setxattr(upperdentry, OVL_XATTR_OPAQUE, "y", 1, 0);
}
static void ovl_remove_opaque(struct dentry *upperdentry)
{
int err;
err = ovl_do_removexattr(upperdentry, OVL_XATTR_OPAQUE);
if (err) {
pr_warn("overlayfs: failed to remove opaque from '%s' (%i)\n",
upperdentry->d_name.name, err);
}
}
static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
int err;
enum ovl_path_type type;
struct path realpath;
type = ovl_path_real(dentry, &realpath);
err = vfs_getattr(&realpath, stat);
if (err)
return err;
stat->dev = dentry->d_sb->s_dev;
stat->ino = dentry->d_inode->i_ino;
/*
* It's probably not worth it to count subdirs to get the
* correct link count. nlink=1 seems to pacify 'find' and
* other utilities.
*/
if (OVL_TYPE_MERGE(type))
stat->nlink = 1;
return 0;
}
static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
struct kstat *stat, const char *link,
struct dentry *hardlink)
{
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *udir = upperdir->d_inode;
struct dentry *newdentry;
int err;
mutex_lock_nested(&udir->i_mutex, I_MUTEX_PARENT);
newdentry = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out_unlock;
err = ovl_create_real(udir, newdentry, stat, link, hardlink, false);
if (err)
goto out_dput;
ovl_dentry_version_inc(dentry->d_parent);
ovl_dentry_update(dentry, newdentry);
ovl_copyattr(newdentry->d_inode, inode);
d_instantiate(dentry, inode);
newdentry = NULL;
out_dput:
dput(newdentry);
out_unlock:
mutex_unlock(&udir->i_mutex);
return err;
}
static int ovl_lock_rename_workdir(struct dentry *workdir,
struct dentry *upperdir)
{
/* Workdir should not be the same as upperdir */
if (workdir == upperdir)
goto err;
/* Workdir should not be subdir of upperdir and vice versa */
if (lock_rename(workdir, upperdir) != NULL)
goto err_unlock;
return 0;
err_unlock:
unlock_rename(workdir, upperdir);
err:
pr_err("overlayfs: failed to lock workdir+upperdir\n");
return -EIO;
}
static struct dentry *ovl_clear_empty(struct dentry *dentry,
struct list_head *list)
{
struct dentry *workdir = ovl_workdir(dentry);
struct inode *wdir = workdir->d_inode;
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *udir = upperdir->d_inode;
struct path upperpath;
struct dentry *upper;
struct dentry *opaquedir;
struct kstat stat;
int err;
if (WARN_ON(!workdir))
return ERR_PTR(-EROFS);
err = ovl_lock_rename_workdir(workdir, upperdir);
if (err)
goto out;
ovl_path_upper(dentry, &upperpath);
err = vfs_getattr(&upperpath, &stat);
if (err)
goto out_unlock;
err = -ESTALE;
if (!S_ISDIR(stat.mode))
goto out_unlock;
upper = upperpath.dentry;
if (upper->d_parent->d_inode != udir)
goto out_unlock;
opaquedir = ovl_lookup_temp(workdir, dentry);
err = PTR_ERR(opaquedir);
if (IS_ERR(opaquedir))
goto out_unlock;
err = ovl_create_real(wdir, opaquedir, &stat, NULL, NULL, true);
if (err)
goto out_dput;
err = ovl_copy_xattr(upper, opaquedir);
if (err)
goto out_cleanup;
err = ovl_set_opaque(opaquedir);
if (err)
goto out_cleanup;
mutex_lock(&opaquedir->d_inode->i_mutex);
err = ovl_set_attr(opaquedir, &stat);
mutex_unlock(&opaquedir->d_inode->i_mutex);
if (err)
goto out_cleanup;
err = ovl_do_rename(wdir, opaquedir, udir, upper, RENAME_EXCHANGE);
if (err)
goto out_cleanup;
ovl_cleanup_whiteouts(upper, list);
ovl_cleanup(wdir, upper);
unlock_rename(workdir, upperdir);
/* dentry's upper doesn't match now, get rid of it */
d_drop(dentry);
return opaquedir;
out_cleanup:
ovl_cleanup(wdir, opaquedir);
out_dput:
dput(opaquedir);
out_unlock:
unlock_rename(workdir, upperdir);
out:
return ERR_PTR(err);
}
static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry)
{
int err;
struct dentry *ret = NULL;
LIST_HEAD(list);
err = ovl_check_empty_dir(dentry, &list);
if (err)
ret = ERR_PTR(err);
else {
/*
* If no upperdentry then skip clearing whiteouts.
*
* Can race with copy-up, since we don't hold the upperdir
* mutex. Doesn't matter, since copy-up can't create a
* non-empty directory from an empty one.
*/
if (ovl_dentry_upper(dentry))
ret = ovl_clear_empty(dentry, &list);
}
ovl_cache_free(&list);
return ret;
}
static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
struct kstat *stat, const char *link,
struct dentry *hardlink)
{
struct dentry *workdir = ovl_workdir(dentry);
struct inode *wdir = workdir->d_inode;
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *udir = upperdir->d_inode;
struct dentry *upper;
struct dentry *newdentry;
int err;
if (WARN_ON(!workdir))
return -EROFS;
err = ovl_lock_rename_workdir(workdir, upperdir);
if (err)
goto out;
newdentry = ovl_lookup_temp(workdir, dentry);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out_unlock;
upper = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto out_dput;
err = ovl_create_real(wdir, newdentry, stat, link, hardlink, true);
if (err)
goto out_dput2;
if (S_ISDIR(stat->mode)) {
err = ovl_set_opaque(newdentry);
if (err)
goto out_cleanup;
err = ovl_do_rename(wdir, newdentry, udir, upper,
RENAME_EXCHANGE);
if (err)
goto out_cleanup;
ovl_cleanup(wdir, upper);
} else {
err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
if (err)
goto out_cleanup;
}
ovl_dentry_version_inc(dentry->d_parent);
ovl_dentry_update(dentry, newdentry);
ovl_copyattr(newdentry->d_inode, inode);
d_instantiate(dentry, inode);
newdentry = NULL;
out_dput2:
dput(upper);
out_dput:
dput(newdentry);
out_unlock:
unlock_rename(workdir, upperdir);
out:
return err;
out_cleanup:
ovl_cleanup(wdir, newdentry);
goto out_dput2;
}
static int ovl_create_or_link(struct dentry *dentry, int mode, dev_t rdev,
const char *link, struct dentry *hardlink)
{
int err;
struct inode *inode;
struct kstat stat = {
.mode = mode,
.rdev = rdev,
};
err = -ENOMEM;
inode = ovl_new_inode(dentry->d_sb, mode, dentry->d_fsdata);
if (!inode)
goto out;
err = ovl_copy_up(dentry->d_parent);
if (err)
goto out_iput;
if (!ovl_dentry_is_opaque(dentry)) {
err = ovl_create_upper(dentry, inode, &stat, link, hardlink);
} else {
const struct cred *old_cred;
struct cred *override_cred;
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_iput;
/*
* CAP_SYS_ADMIN for setting opaque xattr
* CAP_DAC_OVERRIDE for create in workdir, rename
* CAP_FOWNER for removing whiteout from sticky dir
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
old_cred = override_creds(override_cred);
err = ovl_create_over_whiteout(dentry, inode, &stat, link,
hardlink);
revert_creds(old_cred);
put_cred(override_cred);
}
if (!err)
inode = NULL;
out_iput:
iput(inode);
out:
return err;
}
static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
const char *link)
{
int err;
err = ovl_want_write(dentry);
if (!err) {
err = ovl_create_or_link(dentry, mode, rdev, link, NULL);
ovl_drop_write(dentry);
}
return err;
}
static int ovl_create(struct inode *dir, struct dentry *dentry, umode_t mode,
bool excl)
{
return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL);
}
static int ovl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL);
}
static int ovl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
dev_t rdev)
{
/* Don't allow creation of "whiteout" on overlay */
if (S_ISCHR(mode) && rdev == WHITEOUT_DEV)
return -EPERM;
return ovl_create_object(dentry, mode, rdev, NULL);
}
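/*
* For reference (not part of this module): on the upper filesystem a
* whiteout is exactly such a character device with device number 0/0,
* which is why creating one through the overlay is refused above. On
* the upper layer it shows up in "ls -l" roughly as:
*
*	c--------- 1 root root 0, 0 ...  removed_name
*/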
static int ovl_symlink(struct inode *dir, struct dentry *dentry,
const char *link)
{
return ovl_create_object(dentry, S_IFLNK, 0, link);
}
static int ovl_link(struct dentry *old, struct inode *newdir,
struct dentry *new)
{
int err;
struct dentry *upper;
err = ovl_want_write(old);
if (err)
goto out;
err = ovl_copy_up(old);
if (err)
goto out_drop_write;
upper = ovl_dentry_upper(old);
err = ovl_create_or_link(new, upper->d_inode->i_mode, 0, NULL, upper);
out_drop_write:
ovl_drop_write(old);
out:
return err;
}
static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir)
{
struct dentry *workdir = ovl_workdir(dentry);
struct inode *wdir = workdir->d_inode;
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *udir = upperdir->d_inode;
struct dentry *whiteout;
struct dentry *upper;
struct dentry *opaquedir = NULL;
int err;
int flags = 0;
if (WARN_ON(!workdir))
return -EROFS;
if (is_dir) {
if (OVL_TYPE_MERGE_OR_LOWER(ovl_path_type(dentry))) {
opaquedir = ovl_check_empty_and_clear(dentry);
err = PTR_ERR(opaquedir);
if (IS_ERR(opaquedir))
goto out;
} else {
LIST_HEAD(list);
/*
* When removing an empty opaque directory, it
* makes no sense to replace it with an exact replica of
* itself. But emptiness still needs to be checked.
*/
err = ovl_check_empty_dir(dentry, &list);
ovl_cache_free(&list);
if (err)
goto out;
}
}
err = ovl_lock_rename_workdir(workdir, upperdir);
if (err)
goto out_dput;
upper = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto out_unlock;
err = -ESTALE;
if ((opaquedir && upper != opaquedir) ||
(!opaquedir && ovl_dentry_upper(dentry) &&
upper != ovl_dentry_upper(dentry))) {
goto out_dput_upper;
}
whiteout = ovl_whiteout(workdir, dentry);
err = PTR_ERR(whiteout);
if (IS_ERR(whiteout))
goto out_dput_upper;
if (d_is_dir(upper))
flags = RENAME_EXCHANGE;
err = ovl_do_rename(wdir, whiteout, udir, upper, flags);
if (err)
goto kill_whiteout;
if (flags)
ovl_cleanup(wdir, upper);
ovl_dentry_version_inc(dentry->d_parent);
out_d_drop:
d_drop(dentry);
dput(whiteout);
out_dput_upper:
dput(upper);
out_unlock:
unlock_rename(workdir, upperdir);
out_dput:
dput(opaquedir);
out:
return err;
kill_whiteout:
ovl_cleanup(wdir, whiteout);
goto out_d_drop;
}
static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
{
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *dir = upperdir->d_inode;
struct dentry *upper;
int err;
mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
upper = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto out_unlock;
err = -ESTALE;
if (upper == ovl_dentry_upper(dentry)) {
if (is_dir)
err = vfs_rmdir(dir, upper);
else
err = vfs_unlink(dir, upper, NULL);
ovl_dentry_version_inc(dentry->d_parent);
}
dput(upper);
/*
* Keeping this dentry hashed would mean having to release
* upperpath/lowerpath, which could only be done if we are the
* sole user of this dentry. Too tricky... Just unhash for
* now.
*/
if (!err)
d_drop(dentry);
out_unlock:
mutex_unlock(&dir->i_mutex);
return err;
}
static inline int ovl_check_sticky(struct dentry *dentry)
{
struct inode *dir = ovl_dentry_real(dentry->d_parent)->d_inode;
struct inode *inode = ovl_dentry_real(dentry)->d_inode;
if (check_sticky(dir, inode))
return -EPERM;
return 0;
}
static int ovl_do_remove(struct dentry *dentry, bool is_dir)
{
enum ovl_path_type type;
int err;
err = ovl_check_sticky(dentry);
if (err)
goto out;
err = ovl_want_write(dentry);
if (err)
goto out;
err = ovl_copy_up(dentry->d_parent);
if (err)
goto out_drop_write;
type = ovl_path_type(dentry);
if (OVL_TYPE_PURE_UPPER(type)) {
err = ovl_remove_upper(dentry, is_dir);
} else {
const struct cred *old_cred;
struct cred *override_cred;
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_drop_write;
/*
* CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
* CAP_DAC_OVERRIDE for create in workdir, rename
* CAP_FOWNER for removing whiteout from sticky dir
* CAP_FSETID for chmod of opaque dir
* CAP_CHOWN for chown of opaque dir
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
cap_raise(override_cred->cap_effective, CAP_FSETID);
cap_raise(override_cred->cap_effective, CAP_CHOWN);
old_cred = override_creds(override_cred);
err = ovl_remove_and_whiteout(dentry, is_dir);
revert_creds(old_cred);
put_cred(override_cred);
}
out_drop_write:
ovl_drop_write(dentry);
out:
return err;
}
static int ovl_unlink(struct inode *dir, struct dentry *dentry)
{
return ovl_do_remove(dentry, false);
}
static int ovl_rmdir(struct inode *dir, struct dentry *dentry)
{
return ovl_do_remove(dentry, true);
}
static int ovl_rename2(struct inode *olddir, struct dentry *old,
struct inode *newdir, struct dentry *new,
unsigned int flags)
{
int err;
enum ovl_path_type old_type;
enum ovl_path_type new_type;
struct dentry *old_upperdir;
struct dentry *new_upperdir;
struct dentry *olddentry;
struct dentry *newdentry;
struct dentry *trap;
bool old_opaque;
bool new_opaque;
bool new_create = false;
bool cleanup_whiteout = false;
bool overwrite = !(flags & RENAME_EXCHANGE);
bool is_dir = S_ISDIR(old->d_inode->i_mode);
bool new_is_dir = false;
struct dentry *opaquedir = NULL;
const struct cred *old_cred = NULL;
struct cred *override_cred = NULL;
err = -EINVAL;
if (flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE))
goto out;
flags &= ~RENAME_NOREPLACE;
err = ovl_check_sticky(old);
if (err)
goto out;
/* Don't copy up directory trees */
old_type = ovl_path_type(old);
err = -EXDEV;
if (OVL_TYPE_MERGE_OR_LOWER(old_type) && is_dir)
goto out;
if (new->d_inode) {
err = ovl_check_sticky(new);
if (err)
goto out;
if (S_ISDIR(new->d_inode->i_mode))
new_is_dir = true;
new_type = ovl_path_type(new);
err = -EXDEV;
if (!overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir)
goto out;
err = 0;
if (!OVL_TYPE_UPPER(new_type) && !OVL_TYPE_UPPER(old_type)) {
if (ovl_dentry_lower(old)->d_inode ==
ovl_dentry_lower(new)->d_inode)
goto out;
}
if (OVL_TYPE_UPPER(new_type) && OVL_TYPE_UPPER(old_type)) {
if (ovl_dentry_upper(old)->d_inode ==
ovl_dentry_upper(new)->d_inode)
goto out;
}
} else {
if (ovl_dentry_is_opaque(new))
new_type = __OVL_PATH_UPPER;
else
new_type = __OVL_PATH_UPPER | __OVL_PATH_PURE;
}
err = ovl_want_write(old);
if (err)
goto out;
err = ovl_copy_up(old);
if (err)
goto out_drop_write;
err = ovl_copy_up(new->d_parent);
if (err)
goto out_drop_write;
if (!overwrite) {
err = ovl_copy_up(new);
if (err)
goto out_drop_write;
}
old_opaque = !OVL_TYPE_PURE_UPPER(old_type);
new_opaque = !OVL_TYPE_PURE_UPPER(new_type);
if (old_opaque || new_opaque) {
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_drop_write;
/*
* CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
* CAP_DAC_OVERRIDE for create in workdir
* CAP_FOWNER for removing whiteout from sticky dir
* CAP_FSETID for chmod of opaque dir
* CAP_CHOWN for chown of opaque dir
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
cap_raise(override_cred->cap_effective, CAP_FSETID);
cap_raise(override_cred->cap_effective, CAP_CHOWN);
old_cred = override_creds(override_cred);
}
if (overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir) {
opaquedir = ovl_check_empty_and_clear(new);
err = PTR_ERR(opaquedir);
if (IS_ERR(opaquedir)) {
opaquedir = NULL;
goto out_revert_creds;
}
}
if (overwrite) {
if (old_opaque) {
if (new->d_inode || !new_opaque) {
/* Whiteout source */
flags |= RENAME_WHITEOUT;
} else {
/* Switch whiteouts */
flags |= RENAME_EXCHANGE;
}
} else if (is_dir && !new->d_inode && new_opaque) {
flags |= RENAME_EXCHANGE;
cleanup_whiteout = true;
}
}
old_upperdir = ovl_dentry_upper(old->d_parent);
new_upperdir = ovl_dentry_upper(new->d_parent);
trap = lock_rename(new_upperdir, old_upperdir);
olddentry = lookup_one_len(old->d_name.name, old_upperdir,
old->d_name.len);
err = PTR_ERR(olddentry);
if (IS_ERR(olddentry))
goto out_unlock;
err = -ESTALE;
if (olddentry != ovl_dentry_upper(old))
goto out_dput_old;
newdentry = lookup_one_len(new->d_name.name, new_upperdir,
new->d_name.len);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out_dput_old;
err = -ESTALE;
if (ovl_dentry_upper(new)) {
if (opaquedir) {
if (newdentry != opaquedir)
goto out_dput;
} else {
if (newdentry != ovl_dentry_upper(new))
goto out_dput;
}
} else {
new_create = true;
if (!d_is_negative(newdentry) &&
(!new_opaque || !ovl_is_whiteout(newdentry)))
goto out_dput;
}
if (olddentry == trap)
goto out_dput;
if (newdentry == trap)
goto out_dput;
if (is_dir && !old_opaque && new_opaque) {
err = ovl_set_opaque(olddentry);
if (err)
goto out_dput;
}
if (!overwrite && new_is_dir && old_opaque && !new_opaque) {
err = ovl_set_opaque(newdentry);
if (err)
goto out_dput;
}
if (old_opaque || new_opaque) {
err = ovl_do_rename(old_upperdir->d_inode, olddentry,
new_upperdir->d_inode, newdentry,
flags);
} else {
/* No debug for the plain case */
BUG_ON(flags & ~RENAME_EXCHANGE);
err = vfs_rename(old_upperdir->d_inode, olddentry,
new_upperdir->d_inode, newdentry,
NULL, flags);
}
if (err) {
if (is_dir && !old_opaque && new_opaque)
ovl_remove_opaque(olddentry);
if (!overwrite && new_is_dir && old_opaque && !new_opaque)
ovl_remove_opaque(newdentry);
goto out_dput;
}
if (is_dir && old_opaque && !new_opaque)
ovl_remove_opaque(olddentry);
if (!overwrite && new_is_dir && !old_opaque && new_opaque)
ovl_remove_opaque(newdentry);
if (old_opaque != new_opaque) {
ovl_dentry_set_opaque(old, new_opaque);
if (!overwrite)
ovl_dentry_set_opaque(new, old_opaque);
}
if (cleanup_whiteout)
ovl_cleanup(old_upperdir->d_inode, newdentry);
ovl_dentry_version_inc(old->d_parent);
ovl_dentry_version_inc(new->d_parent);
out_dput:
dput(newdentry);
out_dput_old:
dput(olddentry);
out_unlock:
unlock_rename(new_upperdir, old_upperdir);
out_revert_creds:
if (old_opaque || new_opaque) {
revert_creds(old_cred);
put_cred(override_cred);
}
out_drop_write:
ovl_drop_write(old);
out:
dput(opaquedir);
return err;
}
static int ovl_rename(struct inode *olddir, struct dentry *old,
struct inode *newdir, struct dentry *new)
{
return ovl_rename2(olddir, old, newdir, new, 0);
}
const struct inode_operations_wrapper ovl_dir_inode_operations = {
.ops = {
.lookup = ovl_lookup,
.mkdir = ovl_mkdir,
.symlink = ovl_symlink,
.unlink = ovl_unlink,
.rmdir = ovl_rmdir,
.rename = ovl_rename,
.link = ovl_link,
.setattr = ovl_setattr,
.create = ovl_create,
.mknod = ovl_mknod,
.permission = ovl_permission,
.getattr = ovl_dir_getattr,
.setxattr = ovl_setxattr,
.getxattr = ovl_getxattr,
.listxattr = ovl_listxattr,
.removexattr = ovl_removexattr,
},
.rename2 = ovl_rename2,
};


@@ -0,0 +1,442 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/xattr.h>
#include "overlayfs.h"
static int ovl_copy_up_last(struct dentry *dentry, struct iattr *attr,
bool no_data)
{
int err;
struct dentry *parent;
struct kstat stat;
struct path lowerpath;
parent = dget_parent(dentry);
err = ovl_copy_up(parent);
if (err)
goto out_dput_parent;
ovl_path_lower(dentry, &lowerpath);
err = vfs_getattr(&lowerpath, &stat);
if (err)
goto out_dput_parent;
if (no_data)
stat.size = 0;
err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat, attr);
out_dput_parent:
dput(parent);
return err;
}
int ovl_setattr(struct dentry *dentry, struct iattr *attr)
{
int err;
struct dentry *upperdentry;
err = ovl_want_write(dentry);
if (err)
goto out;
err = ovl_copy_up(dentry);
if (!err) {
upperdentry = ovl_dentry_upper(dentry);
mutex_lock(&upperdentry->d_inode->i_mutex);
err = notify_change(upperdentry, attr, NULL);
mutex_unlock(&upperdentry->d_inode->i_mutex);
}
ovl_drop_write(dentry);
out:
return err;
}
static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
struct path realpath;
ovl_path_real(dentry, &realpath);
return vfs_getattr(&realpath, stat);
}
int ovl_permission(struct inode *inode, int mask)
{
struct ovl_entry *oe;
struct dentry *alias = NULL;
struct inode *realinode;
struct dentry *realdentry;
bool is_upper;
int err;
if (S_ISDIR(inode->i_mode)) {
oe = inode->i_private;
} else if (mask & MAY_NOT_BLOCK) {
return -ECHILD;
} else {
/*
* For non-directories find an alias and get the info
* from there.
*/
alias = d_find_any_alias(inode);
if (WARN_ON(!alias))
return -ENOENT;
oe = alias->d_fsdata;
}
realdentry = ovl_entry_real(oe, &is_upper);
/* Careful in RCU walk mode */
realinode = ACCESS_ONCE(realdentry->d_inode);
if (!realinode) {
WARN_ON(!(mask & MAY_NOT_BLOCK));
err = -ENOENT;
goto out_dput;
}
if (mask & MAY_WRITE) {
umode_t mode = realinode->i_mode;
/*
* Writes will always be redirected to upper layer, so
* ignore lower layer being read-only.
*
* If the overlay itself is read-only then proceed
* with the permission check, don't return EROFS.
* This will only happen if this is the lower layer of
* another overlayfs.
*
* If upper fs becomes read-only after the overlay was
* constructed return EROFS to prevent modification of
* upper layer.
*/
err = -EROFS;
if (is_upper && !IS_RDONLY(inode) && IS_RDONLY(realinode) &&
(S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
goto out_dput;
}
err = __inode_permission(realinode, mask);
out_dput:
dput(alias);
return err;
}
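/*
* Illustration (not part of this module): the MAY_WRITE handling above
* means a file that exists only on a read-only lower layer can still be
* opened for writing through a writable overlay; the open triggers
* copy-up instead of failing with EROFS:
*
*	int fd = open("/mnt/overlay/lower_only_file", O_RDWR);
*	// succeeds; the file is copied up to the upper layer first
*/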
struct ovl_link_data {
struct dentry *realdentry;
void *cookie;
};
static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd)
{
void *ret;
struct dentry *realdentry;
struct inode *realinode;
struct ovl_link_data *data = NULL;
realdentry = ovl_dentry_real(dentry);
realinode = realdentry->d_inode;
if (WARN_ON(!realinode->i_op->follow_link))
return ERR_PTR(-EPERM);
if (realinode->i_op->put_link) {
data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL);
if (!data)
return ERR_PTR(-ENOMEM);
data->realdentry = realdentry;
}
ret = realinode->i_op->follow_link(realdentry, nd);
if (IS_ERR(ret)) {
kfree(data);
return ret;
}
if (data)
data->cookie = ret;
return data;
}
static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
{
struct inode *realinode;
struct ovl_link_data *data = c;
if (!data)
return;
realinode = data->realdentry->d_inode;
realinode->i_op->put_link(data->realdentry, nd, data->cookie);
kfree(data);
}
static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
{
struct path realpath;
struct inode *realinode;
ovl_path_real(dentry, &realpath);
realinode = realpath.dentry->d_inode;
if (!realinode->i_op->readlink)
return -EINVAL;
touch_atime(&realpath);
return realinode->i_op->readlink(realpath.dentry, buf, bufsiz);
}
static bool ovl_is_private_xattr(const char *name)
{
return strncmp(name, OVL_XATTR_PRE_NAME, OVL_XATTR_PRE_LEN) == 0;
}
int ovl_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags)
{
int err;
struct dentry *upperdentry;
err = ovl_want_write(dentry);
if (err)
goto out;
err = -EPERM;
if (ovl_is_private_xattr(name))
goto out_drop_write;
err = ovl_copy_up(dentry);
if (err)
goto out_drop_write;
upperdentry = ovl_dentry_upper(dentry);
err = vfs_setxattr(upperdentry, name, value, size, flags);
out_drop_write:
ovl_drop_write(dentry);
out:
return err;
}
static bool ovl_need_xattr_filter(struct dentry *dentry,
enum ovl_path_type type)
{
if ((type & (__OVL_PATH_PURE | __OVL_PATH_UPPER)) == __OVL_PATH_UPPER)
return S_ISDIR(dentry->d_inode->i_mode);
else
return false;
}
ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
void *value, size_t size)
{
struct path realpath;
enum ovl_path_type type = ovl_path_real(dentry, &realpath);
if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
return -ENODATA;
return vfs_getxattr(realpath.dentry, name, value, size);
}
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
{
struct path realpath;
enum ovl_path_type type = ovl_path_real(dentry, &realpath);
ssize_t res;
int off;
res = vfs_listxattr(realpath.dentry, list, size);
if (res <= 0 || size == 0)
return res;
if (!ovl_need_xattr_filter(dentry, type))
return res;
/* filter out private xattrs */
for (off = 0; off < res;) {
char *s = list + off;
size_t slen = strlen(s) + 1;
BUG_ON(off + slen > res);
if (ovl_is_private_xattr(s)) {
res -= slen;
memmove(s, s + slen, res - off);
} else {
off += slen;
}
}
return res;
}
int ovl_removexattr(struct dentry *dentry, const char *name)
{
int err;
struct path realpath;
enum ovl_path_type type = ovl_path_real(dentry, &realpath);
err = ovl_want_write(dentry);
if (err)
goto out;
err = -ENODATA;
if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
goto out_drop_write;
if (!OVL_TYPE_UPPER(type)) {
err = vfs_getxattr(realpath.dentry, name, NULL, 0);
if (err < 0)
goto out_drop_write;
err = ovl_copy_up(dentry);
if (err)
goto out_drop_write;
ovl_path_upper(dentry, &realpath);
}
err = vfs_removexattr(realpath.dentry, name);
out_drop_write:
ovl_drop_write(dentry);
out:
return err;
}
static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type,
struct dentry *realdentry)
{
if (OVL_TYPE_UPPER(type))
return false;
if (special_file(realdentry->d_inode->i_mode))
return false;
if (!(OPEN_FMODE(flags) & FMODE_WRITE) && !(flags & O_TRUNC))
return false;
return true;
}
static int ovl_dentry_open(struct dentry *dentry, struct file *file,
const struct cred *cred)
{
int err;
struct path realpath;
enum ovl_path_type type;
bool want_write = false;
type = ovl_path_real(dentry, &realpath);
if (!ovl_is_nocopyupw(dentry)) {
if (ovl_open_need_copy_up(file->f_flags, type,
realpath.dentry)) {
want_write = true;
err = ovl_want_write(dentry);
if (err)
goto out;
if (file->f_flags & O_TRUNC)
err = ovl_copy_up_last(dentry, NULL, true);
else
err = ovl_copy_up(dentry);
if (err)
goto out_drop_write;
ovl_path_upper(dentry, &realpath);
}
}
err = vfs_open(&realpath, file, cred);
out_drop_write:
if (want_write)
ovl_drop_write(dentry);
out:
return err;
}
static const struct inode_operations_wrapper ovl_file_inode_operations = {
.ops = {
.setattr = ovl_setattr,
.permission = ovl_permission,
.getattr = ovl_getattr,
.setxattr = ovl_setxattr,
.getxattr = ovl_getxattr,
.listxattr = ovl_listxattr,
.removexattr = ovl_removexattr,
},
.dentry_open = ovl_dentry_open,
};
static const struct inode_operations ovl_symlink_inode_operations = {
.setattr = ovl_setattr,
.follow_link = ovl_follow_link,
.put_link = ovl_put_link,
.readlink = ovl_readlink,
.getattr = ovl_getattr,
.setxattr = ovl_setxattr,
.getxattr = ovl_getxattr,
.listxattr = ovl_listxattr,
.removexattr = ovl_removexattr,
};
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
struct ovl_entry *oe)
{
struct inode *inode;
inode = new_inode(sb);
if (!inode)
return NULL;
mode &= S_IFMT;
inode->i_ino = get_next_ino();
inode->i_mode = mode;
inode->i_flags |= S_NOATIME | S_NOCMTIME;
switch (mode) {
case S_IFDIR:
inode->i_private = oe;
inode->i_op = &ovl_dir_inode_operations.ops;
inode->i_fop = &ovl_dir_operations;
inode->i_flags |= S_IOPS_WRAPPER;
break;
case S_IFLNK:
inode->i_op = &ovl_symlink_inode_operations;
break;
case S_IFREG:
case S_IFSOCK:
case S_IFBLK:
case S_IFCHR:
case S_IFIFO:
inode->i_op = &ovl_file_inode_operations.ops;
inode->i_flags |= S_IOPS_WRAPPER;
break;
default:
WARN(1, "illegal file type: %i\n", mode);
iput(inode);
inode = NULL;
}
return inode;
}


@@ -0,0 +1,200 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/kernel.h>
struct ovl_entry;
enum ovl_path_type {
__OVL_PATH_PURE = (1 << 0),
__OVL_PATH_UPPER = (1 << 1),
__OVL_PATH_MERGE = (1 << 2),
};
#define OVL_TYPE_UPPER(type) ((type) & __OVL_PATH_UPPER)
#define OVL_TYPE_MERGE(type) ((type) & __OVL_PATH_MERGE)
#define OVL_TYPE_PURE_UPPER(type) ((type) & __OVL_PATH_PURE)
#define OVL_TYPE_MERGE_OR_LOWER(type) \
(OVL_TYPE_MERGE(type) || !OVL_TYPE_UPPER(type))
#define OVL_XATTR_PRE_NAME "trusted.overlay."
#define OVL_XATTR_PRE_LEN 16
#define OVL_XATTR_OPAQUE OVL_XATTR_PRE_NAME"opaque"
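/*
* For reference (not part of this header): the opaque marker written by
* ovl_set_opaque() can be inspected from userspace on the upper
* filesystem, e.g.:
*
*	getfattr -n trusted.overlay.opaque /upper/some_dir
*	# trusted.overlay.opaque="y"
*/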
static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry)
{
int err = vfs_rmdir(dir, dentry);
pr_debug("rmdir(%pd2) = %i\n", dentry, err);
return err;
}
static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry)
{
int err = vfs_unlink(dir, dentry, NULL);
pr_debug("unlink(%pd2) = %i\n", dentry, err);
return err;
}
static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir,
struct dentry *new_dentry, bool debug)
{
int err = vfs_link(old_dentry, dir, new_dentry, NULL);
if (debug) {
pr_debug("link(%pd2, %pd2) = %i\n",
old_dentry, new_dentry, err);
}
return err;
}
static inline int ovl_do_create(struct inode *dir, struct dentry *dentry,
umode_t mode, bool debug)
{
int err = vfs_create(dir, dentry, mode, true);
if (debug)
pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err);
return err;
}
static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry,
umode_t mode, bool debug)
{
int err = vfs_mkdir(dir, dentry, mode);
if (debug)
pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err);
return err;
}
static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry,
umode_t mode, dev_t dev, bool debug)
{
int err = vfs_mknod(dir, dentry, mode, dev);
if (debug) {
pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n",
dentry, mode, dev, err);
}
return err;
}
static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry,
const char *oldname, bool debug)
{
int err = vfs_symlink(dir, dentry, oldname);
if (debug)
pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err);
return err;
}
static inline int ovl_do_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags)
{
int err = vfs_setxattr(dentry, name, value, size, flags);
pr_debug("setxattr(%pd2, \"%s\", \"%*s\", 0x%x) = %i\n",
dentry, name, (int) size, (char *) value, flags, err);
return err;
}
static inline int ovl_do_removexattr(struct dentry *dentry, const char *name)
{
int err = vfs_removexattr(dentry, name);
pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, err);
return err;
}
static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry,
struct inode *newdir, struct dentry *newdentry,
unsigned int flags)
{
int err;
pr_debug("rename2(%pd2, %pd2, 0x%x)\n",
olddentry, newdentry, flags);
err = vfs_rename(olddir, olddentry, newdir, newdentry, NULL, flags);
if (err) {
pr_debug("...rename2(%pd2, %pd2, ...) = %i\n",
olddentry, newdentry, err);
}
return err;
}
static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry)
{
int err = vfs_whiteout(dir, dentry);
pr_debug("whiteout(%pd2) = %i\n", dentry, err);
return err;
}
bool ovl_is_nocopyupw(struct dentry *dentry);
enum ovl_path_type ovl_path_type(struct dentry *dentry);
u64 ovl_dentry_version_get(struct dentry *dentry);
void ovl_dentry_version_inc(struct dentry *dentry);
void ovl_path_upper(struct dentry *dentry, struct path *path);
void ovl_path_lower(struct dentry *dentry, struct path *path);
enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path);
int ovl_path_next(int idx, struct dentry *dentry, struct path *path);
struct dentry *ovl_dentry_upper(struct dentry *dentry);
struct dentry *ovl_dentry_lower(struct dentry *dentry);
struct dentry *ovl_dentry_real(struct dentry *dentry);
struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper);
struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry);
void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache);
struct dentry *ovl_workdir(struct dentry *dentry);
int ovl_want_write(struct dentry *dentry);
void ovl_drop_write(struct dentry *dentry);
bool ovl_dentry_is_opaque(struct dentry *dentry);
void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque);
bool ovl_is_whiteout(struct dentry *dentry);
void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry);
struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags);
struct file *ovl_path_open(struct path *path, int flags);
struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry,
struct kstat *stat, const char *link);
/* readdir.c */
extern const struct file_operations ovl_dir_operations;
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list);
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list);
void ovl_cache_free(struct list_head *list);
/* inode.c */
int ovl_setattr(struct dentry *dentry, struct iattr *attr);
int ovl_permission(struct inode *inode, int mask);
int ovl_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags);
ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
void *value, size_t size);
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
int ovl_removexattr(struct dentry *dentry, const char *name);
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
struct ovl_entry *oe);
static inline void ovl_copyattr(struct inode *from, struct inode *to)
{
to->i_uid = from->i_uid;
to->i_gid = from->i_gid;
}
/* dir.c */
extern const struct inode_operations_wrapper ovl_dir_inode_operations;
struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry);
int ovl_create_real(struct inode *dir, struct dentry *newdentry,
struct kstat *stat, const char *link,
struct dentry *hardlink, bool debug);
void ovl_cleanup(struct inode *dir, struct dentry *dentry);
/* copy_up.c */
int ovl_copy_up(struct dentry *dentry);
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
struct path *lowerpath, struct kstat *stat,
struct iattr *attr);
int ovl_copy_xattr(struct dentry *old, struct dentry *new);
int ovl_set_attr(struct dentry *upper, struct kstat *stat);


@@ -0,0 +1,588 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/file.h>
#include <linux/xattr.h>
#include <linux/rbtree.h>
#include <linux/security.h>
#include <linux/cred.h>
#include "overlayfs.h"
struct ovl_cache_entry {
unsigned int len;
unsigned int type;
u64 ino;
struct list_head l_node;
struct rb_node node;
struct ovl_cache_entry *next_maybe_whiteout;
bool is_whiteout;
char name[];
};
struct ovl_dir_cache {
long refcount;
u64 version;
struct list_head entries;
};
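/*
* Minimal local stand-in (assumption: this mirrors the newer kernels'
* struct dir_context); only the actor callback is used here, via the
* old vfs_readdir() interface.
*/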
struct dir_context {
const filldir_t actor;
//loff_t pos;
};
struct ovl_readdir_data {
struct dir_context ctx;
bool is_merge;
struct rb_root root;
struct list_head *list;
struct list_head middle;
struct ovl_cache_entry *first_maybe_whiteout;
int count;
int err;
};
struct ovl_dir_file {
bool is_real;
bool is_upper;
struct ovl_dir_cache *cache;
struct list_head *cursor;
struct file *realfile;
struct file *upperfile;
};
static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n)
{
return container_of(n, struct ovl_cache_entry, node);
}
static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root,
const char *name, int len)
{
struct rb_node *node = root->rb_node;
int cmp;
while (node) {
struct ovl_cache_entry *p = ovl_cache_entry_from_node(node);
cmp = strncmp(name, p->name, len);
if (cmp > 0)
node = p->node.rb_right;
else if (cmp < 0 || len < p->len)
node = p->node.rb_left;
else
return p;
}
return NULL;
}
static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd,
const char *name, int len,
u64 ino, unsigned int d_type)
{
struct ovl_cache_entry *p;
size_t size = offsetof(struct ovl_cache_entry, name[len + 1]);
p = kmalloc(size, GFP_KERNEL);
if (!p)
return NULL;
memcpy(p->name, name, len);
p->name[len] = '\0';
p->len = len;
p->type = d_type;
p->ino = ino;
p->is_whiteout = false;
if (d_type == DT_CHR) {
p->next_maybe_whiteout = rdd->first_maybe_whiteout;
rdd->first_maybe_whiteout = p;
}
return p;
}
static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
const char *name, int len, u64 ino,
unsigned int d_type)
{
struct rb_node **newp = &rdd->root.rb_node;
struct rb_node *parent = NULL;
struct ovl_cache_entry *p;
while (*newp) {
int cmp;
struct ovl_cache_entry *tmp;
parent = *newp;
tmp = ovl_cache_entry_from_node(*newp);
cmp = strncmp(name, tmp->name, len);
if (cmp > 0)
newp = &tmp->node.rb_right;
else if (cmp < 0 || len < tmp->len)
newp = &tmp->node.rb_left;
else
return 0;
}
p = ovl_cache_entry_new(rdd, name, len, ino, d_type);
if (p == NULL)
return -ENOMEM;
list_add_tail(&p->l_node, rdd->list);
rb_link_node(&p->node, parent, newp);
rb_insert_color(&p->node, &rdd->root);
return 0;
}
static int ovl_fill_lower(struct ovl_readdir_data *rdd,
const char *name, int namelen,
loff_t offset, u64 ino, unsigned int d_type)
{
struct ovl_cache_entry *p;
p = ovl_cache_entry_find(&rdd->root, name, namelen);
if (p) {
list_move_tail(&p->l_node, &rdd->middle);
} else {
p = ovl_cache_entry_new(rdd, name, namelen, ino, d_type);
if (p == NULL)
rdd->err = -ENOMEM;
else
list_add_tail(&p->l_node, &rdd->middle);
}
return rdd->err;
}
void ovl_cache_free(struct list_head *list)
{
struct ovl_cache_entry *p;
struct ovl_cache_entry *n;
list_for_each_entry_safe(p, n, list, l_node)
kfree(p);
INIT_LIST_HEAD(list);
}
static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry)
{
struct ovl_dir_cache *cache = od->cache;
WARN_ON(cache->refcount <= 0);
cache->refcount--;
if (!cache->refcount) {
if (ovl_dir_cache(dentry) == cache)
ovl_set_dir_cache(dentry, NULL);
ovl_cache_free(&cache->entries);
kfree(cache);
}
}
static int ovl_fill_merge(void *buf, const char *name, int namelen,
loff_t offset, u64 ino, unsigned int d_type)
{
struct dir_context *ctx = buf;
struct ovl_readdir_data *rdd =
container_of(ctx, struct ovl_readdir_data, ctx);
rdd->count++;
if (!rdd->is_merge)
return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type);
else
return ovl_fill_lower(rdd, name, namelen, offset, ino, d_type);
}
static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd)
{
int err;
struct ovl_cache_entry *p;
struct dentry *dentry;
const struct cred *old_cred;
struct cred *override_cred;
override_cred = prepare_creds();
if (!override_cred)
return -ENOMEM;
/*
* CAP_DAC_OVERRIDE for lookup
*/
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
old_cred = override_creds(override_cred);
err = mutex_lock_killable(&dir->d_inode->i_mutex);
if (!err) {
while (rdd->first_maybe_whiteout) {
p = rdd->first_maybe_whiteout;
rdd->first_maybe_whiteout = p->next_maybe_whiteout;
dentry = lookup_one_len(p->name, dir, p->len);
if (!IS_ERR(dentry)) {
p->is_whiteout = ovl_is_whiteout(dentry);
dput(dentry);
}
}
mutex_unlock(&dir->d_inode->i_mutex);
}
revert_creds(old_cred);
put_cred(override_cred);
return err;
}
static inline int ovl_dir_read(struct path *realpath,
struct ovl_readdir_data *rdd)
{
struct file *realfile;
int err;
realfile = ovl_path_open(realpath, O_RDONLY | O_DIRECTORY);
if (IS_ERR(realfile))
return PTR_ERR(realfile);
rdd->first_maybe_whiteout = NULL;
//rdd->ctx.pos = 0;
do {
rdd->count = 0;
rdd->err = 0;
err = vfs_readdir(realfile, rdd->ctx.actor, rdd);
if (err >= 0)
err = rdd->err;
} while (!err && rdd->count);
if (!err && rdd->first_maybe_whiteout)
err = ovl_check_whiteouts(realpath->dentry, rdd);
fput(realfile);
return err;
}
static void ovl_dir_reset(struct file *file)
{
struct ovl_dir_file *od = file->private_data;
struct ovl_dir_cache *cache = od->cache;
struct dentry *dentry = file->f_path.dentry;
enum ovl_path_type type = ovl_path_type(dentry);
if (cache && ovl_dentry_version_get(dentry) != cache->version) {
ovl_cache_put(od, dentry);
od->cache = NULL;
od->cursor = NULL;
}
WARN_ON(!od->is_real && !OVL_TYPE_MERGE(type));
if (od->is_real && OVL_TYPE_MERGE(type))
od->is_real = false;
}
static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list)
{
int err;
struct path realpath;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_fill_merge,
.list = list,
.root = RB_ROOT,
.is_merge = false,
};
int idx, next;
for (idx = 0; idx != -1; idx = next) {
next = ovl_path_next(idx, dentry, &realpath);
if (next != -1) {
err = ovl_dir_read(&realpath, &rdd);
if (err)
break;
} else {
/*
* Insert lowest layer entries before upper ones, this
* allows offsets to be reasonably constant
*/
list_add(&rdd.middle, rdd.list);
rdd.is_merge = true;
err = ovl_dir_read(&realpath, &rdd);
list_del(&rdd.middle);
}
}
return err;
}
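/*
* Illustration (hypothetical layers): with upper = {a, c} and lower =
* {a, b}, the first pass reads the upper layer into the rb-tree and the
* list (a, c); the lower pass then moves duplicates and new names in
* front of the "middle" marker, yielding the merged order a, b, c --
* lower-layer entries first, remaining upper-only entries after them.
*/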
static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos)
{
struct list_head *p;
loff_t off = 0;
list_for_each(p, &od->cache->entries) {
if (off >= pos)
break;
off++;
}
/* Cursor is safe since the cache is stable */
od->cursor = p;
}
static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
{
int res;
struct ovl_dir_cache *cache;
cache = ovl_dir_cache(dentry);
if (cache && ovl_dentry_version_get(dentry) == cache->version) {
cache->refcount++;
return cache;
}
ovl_set_dir_cache(dentry, NULL);
cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL);
if (!cache)
return ERR_PTR(-ENOMEM);
cache->refcount = 1;
INIT_LIST_HEAD(&cache->entries);
res = ovl_dir_read_merged(dentry, &cache->entries);
if (res) {
ovl_cache_free(&cache->entries);
kfree(cache);
return ERR_PTR(res);
}
cache->version = ovl_dentry_version_get(dentry);
ovl_set_dir_cache(dentry, cache);
return cache;
}
static int ovl_readdir(struct file *file, void *buf, filldir_t filler)
{
struct ovl_dir_file *od = file->private_data;
struct dentry *dentry = file->f_path.dentry;
struct ovl_cache_entry *p;
int res;
if (!file->f_pos)
ovl_dir_reset(file);
if (od->is_real) {
res = vfs_readdir(od->realfile, filler, buf);
file->f_pos = od->realfile->f_pos;
return res;
}
if (!od->cache) {
struct ovl_dir_cache *cache;
cache = ovl_cache_get(dentry);
if (IS_ERR(cache))
return PTR_ERR(cache);
od->cache = cache;
ovl_seek_cursor(od, file->f_pos);
}
while (od->cursor != &od->cache->entries) {
p = list_entry(od->cursor, struct ovl_cache_entry, l_node);
if (!p->is_whiteout)
if (filler(buf, p->name, p->len, file->f_pos, p->ino, p->type))
break;
od->cursor = p->l_node.next;
file->f_pos++;
}
return 0;
}
static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin)
{
loff_t res;
struct ovl_dir_file *od = file->private_data;
mutex_lock(&file_inode(file)->i_mutex);
if (!file->f_pos)
ovl_dir_reset(file);
if (od->is_real) {
res = vfs_llseek(od->realfile, offset, origin);
file->f_pos = od->realfile->f_pos;
} else {
res = -EINVAL;
switch (origin) {
case SEEK_CUR:
offset += file->f_pos;
break;
case SEEK_SET:
break;
default:
goto out_unlock;
}
if (offset < 0)
goto out_unlock;
if (offset != file->f_pos) {
file->f_pos = offset;
if (od->cache)
ovl_seek_cursor(od, offset);
}
res = offset;
}
out_unlock:
mutex_unlock(&file_inode(file)->i_mutex);
return res;
}
static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
int datasync)
{
struct ovl_dir_file *od = file->private_data;
struct dentry *dentry = file->f_path.dentry;
struct file *realfile = od->realfile;
/*
* Need to check if we started out being a lower dir, but got copied up
*/
if (!od->is_upper && OVL_TYPE_UPPER(ovl_path_type(dentry))) {
struct inode *inode = file_inode(file);
realfile = lockless_dereference(od->upperfile);
if (!realfile) {
struct path upperpath;
ovl_path_upper(dentry, &upperpath);
realfile = ovl_path_open(&upperpath, O_RDONLY);
smp_mb__before_spinlock();
mutex_lock(&inode->i_mutex);
if (!od->upperfile) {
if (IS_ERR(realfile)) {
mutex_unlock(&inode->i_mutex);
return PTR_ERR(realfile);
}
od->upperfile = realfile;
} else {
/* somebody has beaten us to it */
if (!IS_ERR(realfile))
fput(realfile);
realfile = od->upperfile;
}
mutex_unlock(&inode->i_mutex);
}
}
return vfs_fsync_range(realfile, start, end, datasync);
}
static int ovl_dir_release(struct inode *inode, struct file *file)
{
struct ovl_dir_file *od = file->private_data;
if (od->cache) {
mutex_lock(&inode->i_mutex);
ovl_cache_put(od, file->f_path.dentry);
mutex_unlock(&inode->i_mutex);
}
fput(od->realfile);
if (od->upperfile)
fput(od->upperfile);
kfree(od);
return 0;
}
static int ovl_dir_open(struct inode *inode, struct file *file)
{
struct path realpath;
struct file *realfile;
struct ovl_dir_file *od;
enum ovl_path_type type;
od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL);
if (!od)
return -ENOMEM;
type = ovl_path_real(file->f_path.dentry, &realpath);
realfile = ovl_path_open(&realpath, file->f_flags);
if (IS_ERR(realfile)) {
kfree(od);
return PTR_ERR(realfile);
}
od->realfile = realfile;
od->is_real = !OVL_TYPE_MERGE(type);
od->is_upper = OVL_TYPE_UPPER(type);
file->private_data = od;
return 0;
}
const struct file_operations ovl_dir_operations = {
.read = generic_read_dir,
.open = ovl_dir_open,
.readdir = ovl_readdir,
.llseek = ovl_dir_llseek,
.fsync = ovl_dir_fsync,
.release = ovl_dir_release,
};
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list)
{
int err;
struct ovl_cache_entry *p;
err = ovl_dir_read_merged(dentry, list);
if (err)
return err;
err = 0;
list_for_each_entry(p, list, l_node) {
if (p->is_whiteout)
continue;
if (p->name[0] == '.') {
if (p->len == 1)
continue;
if (p->len == 2 && p->name[1] == '.')
continue;
}
err = -ENOTEMPTY;
break;
}
return err;
}
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
{
struct ovl_cache_entry *p;
mutex_lock_nested(&upper->d_inode->i_mutex, I_MUTEX_CHILD);
list_for_each_entry(p, list, l_node) {
struct dentry *dentry;
if (!p->is_whiteout)
continue;
dentry = lookup_one_len(p->name, upper, p->len);
if (IS_ERR(dentry)) {
pr_err("overlayfs: lookup '%s/%.*s' failed (%i)\n",
upper->d_name.name, p->len, p->name,
(int) PTR_ERR(dentry));
continue;
}
ovl_cleanup(upper->d_inode, dentry);
dput(dentry);
}
mutex_unlock(&upper->d_inode->i_mutex);
}

File diff suppressed because it is too large


@@ -0,0 +1,21 @@
KDIR ?= @KDIR@
ARCH ?= @ARCH@
KMODDIR = @KMODDIR@
src = @abs_srcdir@
obj-m += mcoverlay.o
mcoverlay-y := copy_up.o dir.o inode.o readdir.o super.o
.PHONY: clean install modules
modules:
$(MAKE) -C $(KDIR) M=$(PWD) SUBDIRS=$(PWD) ARCH=$(ARCH) modules
clean:
$(RM) .*.cmd *.mod.c *.o *.ko* Module.symvers modules.order -r .tmp*
install:
mkdir -p -m 755 $(KMODDIR)
install -m 644 mcoverlay.ko $(KMODDIR)
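# Typical out-of-tree invocation once configure has substituted KDIR, ARCH
# and KMODDIR (paths below are examples):
#   make KDIR=/lib/modules/<version>/build modules
#   make install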


@@ -0,0 +1,21 @@
KDIR ?= @KDIR@
ARCH ?= @ARCH@
KMODDIR = @KMODDIR@
src = @abs_srcdir@
obj-m += mcoverlay.o
mcoverlay-y := copy_up.o dir.o inode.o readdir.o super.o
.PHONY: clean install modules
modules:
$(MAKE) -C $(KDIR) M=$(PWD) SUBDIRS=$(PWD) ARCH=$(ARCH) modules
clean:
$(RM) .*.cmd *.mod.c *.o *.ko* Module.symvers modules.order -r .tmp*
install:
mkdir -p -m 755 $(KMODDIR)
install -m 644 mcoverlay.ko $(KMODDIR)


@@ -0,0 +1,460 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/splice.h>
#include <linux/xattr.h>
#include <linux/security.h>
#include <linux/uaccess.h>
#include <linux/sched.h>
#include <linux/namei.h>
#include <linux/fdtable.h>
#include <linux/ratelimit.h>
#include "overlayfs.h"
#define OVL_COPY_UP_CHUNK_SIZE (1 << 20)
static bool __read_mostly ovl_check_copy_up;
module_param_named(check_copy_up, ovl_check_copy_up, bool,
S_IWUSR | S_IRUGO);
MODULE_PARM_DESC(ovl_check_copy_up,
"Warn on copy-up when causing process also has a R/O fd open");
static int ovl_check_fd(const void *data, struct file *f, unsigned int fd)
{
const struct dentry *dentry = data;
if (f->f_inode == d_inode(dentry))
pr_warn_ratelimited("overlayfs: Warning: Copying up %pD, but open R/O on fd %u which will cease to be coherent [pid=%d %s]\n",
f, fd, current->pid, current->comm);
return 0;
}
/*
* Check the fds open by this process and warn if something like the following
* scenario is about to occur:
*
* fd1 = open("foo", O_RDONLY);
* fd2 = open("foo", O_RDWR);
*/
static void ovl_do_check_copy_up(struct dentry *dentry)
{
if (ovl_check_copy_up)
iterate_fd(current->files, 0, ovl_check_fd, dentry);
}
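/*
* Note (assumption based on the module_param_named() above): with the
* module built as mcoverlay.ko this knob should be visible as
* /sys/module/mcoverlay/parameters/check_copy_up and can be toggled at
* runtime, e.g.:
*
*	echo 1 > /sys/module/mcoverlay/parameters/check_copy_up
*/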
int ovl_copy_xattr(struct dentry *old, struct dentry *new, unsigned opt)
{
ssize_t list_size, size, value_size = 0;
char *buf, *name, *value = NULL;
int uninitialized_var(error);
if (!old->d_inode->i_op->getxattr ||
!new->d_inode->i_op->getxattr)
return 0;
list_size = vfs_listxattr(old, NULL, 0);
if (list_size <= 0) {
if (list_size == -EOPNOTSUPP)
return 0;
return list_size;
}
buf = kzalloc(list_size, GFP_KERNEL);
if (!buf)
return -ENOMEM;
list_size = vfs_listxattr(old, buf, list_size);
if (list_size <= 0) {
error = list_size;
goto out;
}
for (name = buf; name < (buf + list_size); name += strlen(name) + 1) {
retry:
size = vfs_getxattr(old, name, value, value_size);
if (size == -ERANGE)
size = vfs_getxattr(old, name, NULL, 0);
if (size < 0) {
if (OVL_OPT_NOFSCHECK(opt)) {
OVL_DEBUG("fail: old=%pd4, i_ino=%lu, name=%s\n",
old, old->d_inode->i_ino, name);
continue;
} else {
error = size;
break;
}
}
OVL_DEBUG("success: old=%pd4, i_ino=%lu, name=%s\n",
old, old->d_inode->i_ino, name);
if (size > value_size) {
void *new;
new = krealloc(value, size, GFP_KERNEL);
if (!new) {
error = -ENOMEM;
break;
}
value = new;
value_size = size;
goto retry;
}
error = vfs_setxattr(new, name, value, size, 0);
if (error)
break;
}
kfree(value);
out:
kfree(buf);
return error;
}
static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
{
struct file *old_file;
struct file *new_file;
loff_t old_pos = 0;
loff_t new_pos = 0;
int error = 0;
if (len == 0)
return 0;
old_file = ovl_path_open(old, O_LARGEFILE | O_RDONLY);
if (IS_ERR(old_file))
return PTR_ERR(old_file);
new_file = ovl_path_open(new, O_LARGEFILE | O_WRONLY);
if (IS_ERR(new_file)) {
error = PTR_ERR(new_file);
goto out_fput;
}
/* FIXME: copy up sparse files efficiently */
while (len) {
size_t this_len = OVL_COPY_UP_CHUNK_SIZE;
long bytes;
if (len < this_len)
this_len = len;
if (signal_pending_state(TASK_KILLABLE, current)) {
error = -EINTR;
break;
}
bytes = do_splice_direct(old_file, &old_pos,
new_file, &new_pos,
this_len, SPLICE_F_MOVE);
if (bytes <= 0) {
error = bytes;
break;
}
WARN_ON(old_pos != new_pos);
len -= bytes;
}
fput(new_file);
out_fput:
fput(old_file);
return error;
}
static char *ovl_read_symlink(struct dentry *realdentry)
{
int res;
char *buf;
struct inode *inode = realdentry->d_inode;
mm_segment_t old_fs;
res = -EINVAL;
if (!inode->i_op->readlink)
goto err;
res = -ENOMEM;
buf = (char *) __get_free_page(GFP_KERNEL);
if (!buf)
goto err;
old_fs = get_fs();
set_fs(get_ds());
/* The cast to a user pointer is valid due to the set_fs() */
res = inode->i_op->readlink(realdentry,
(char __user *)buf, PAGE_SIZE - 1);
set_fs(old_fs);
if (res < 0) {
free_page((unsigned long) buf);
goto err;
}
buf[res] = '\0';
return buf;
err:
return ERR_PTR(res);
}
static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
{
struct iattr attr = {
.ia_valid =
ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET,
.ia_atime = stat->atime,
.ia_mtime = stat->mtime,
};
return notify_change(upperdentry, &attr, NULL);
}
int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
{
int err = 0;
if (!S_ISLNK(stat->mode)) {
struct iattr attr = {
.ia_valid = ATTR_MODE,
.ia_mode = stat->mode,
};
err = notify_change(upperdentry, &attr, NULL);
}
if (!err) {
struct iattr attr = {
.ia_valid = ATTR_UID | ATTR_GID,
.ia_uid = stat->uid,
.ia_gid = stat->gid,
};
err = notify_change(upperdentry, &attr, NULL);
}
if (!err)
ovl_set_timestamps(upperdentry, stat);
return err;
}
static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
struct dentry *dentry, struct path *lowerpath,
struct kstat *stat, const char *link)
{
struct inode *wdir = workdir->d_inode;
struct inode *udir = upperdir->d_inode;
struct dentry *newdentry = NULL;
struct dentry *upper = NULL;
umode_t mode = stat->mode;
unsigned opt = ovl_get_config_opt(dentry);
int err;
newdentry = ovl_lookup_temp(workdir, dentry);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out;
upper = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto out1;
/* Can't properly set mode on creation because of the umask */
stat->mode &= S_IFMT;
err = ovl_create_real(wdir, newdentry, stat, link, NULL, true);
stat->mode = mode;
if (err)
goto out2;
if (S_ISREG(stat->mode)) {
struct path upperpath;
ovl_path_upper(dentry, &upperpath);
BUG_ON(upperpath.dentry != NULL);
upperpath.dentry = newdentry;
err = ovl_copy_up_data(lowerpath, &upperpath, stat->size);
if (err)
goto out_cleanup;
}
err = ovl_copy_xattr(lowerpath->dentry, newdentry, opt);
if (err)
goto out_cleanup;
inode_lock(newdentry->d_inode);
err = ovl_set_attr(newdentry, stat);
inode_unlock(newdentry->d_inode);
if (err)
goto out_cleanup;
err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
if (err)
goto out_cleanup;
ovl_dentry_update(dentry, newdentry);
newdentry = NULL;
/*
* Non-directories become opaque when copied up.
*/
if (!S_ISDIR(stat->mode))
ovl_dentry_set_opaque(dentry, true);
out2:
dput(upper);
out1:
dput(newdentry);
out:
return err;
out_cleanup:
ovl_cleanup(wdir, newdentry);
goto out2;
}
/*
* Copy up a single dentry
*
* Directory renames are only allowed on "pure upper" directories
* (already created on the upper filesystem, never copied up).
* Directories which are on the lower layer or are merged may not be
* renamed. For these -EXDEV is returned and userspace has to deal
* with it. This means that when copying up a directory we can rely
* on it and its ancestors being stable.
*
* Non-directory renames start with a copy up of the source if
* necessary. The actual rename will only proceed once the copy up
* was successful. Copy up uses the upper parent i_mutex for
* exclusion. Since rename can change d_parent it is possible that
* the copy up will lock the old parent. At that point the file will
* have already been copied up anyway.
*/
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
struct path *lowerpath, struct kstat *stat)
{
struct dentry *workdir = ovl_workdir(dentry);
int err;
struct kstat pstat;
struct path parentpath;
struct dentry *upperdir;
struct dentry *upperdentry;
const struct cred *old_cred;
struct cred *override_cred;
char *link = NULL;
if (WARN_ON(!workdir))
return -EROFS;
ovl_do_check_copy_up(lowerpath->dentry);
ovl_path_upper(parent, &parentpath);
upperdir = parentpath.dentry;
err = vfs_getattr(&parentpath, &pstat);
if (err)
return err;
if (S_ISLNK(stat->mode)) {
link = ovl_read_symlink(lowerpath->dentry);
if (IS_ERR(link))
return PTR_ERR(link);
}
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_free_link;
override_cred->fsuid = stat->uid;
override_cred->fsgid = stat->gid;
/*
* CAP_SYS_ADMIN for copying up extended attributes
* CAP_DAC_OVERRIDE for create
* CAP_FOWNER for chmod, timestamp update
* CAP_FSETID for chmod
* CAP_CHOWN for chown
* CAP_MKNOD for mknod
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
cap_raise(override_cred->cap_effective, CAP_FSETID);
cap_raise(override_cred->cap_effective, CAP_CHOWN);
cap_raise(override_cred->cap_effective, CAP_MKNOD);
old_cred = override_creds(override_cred);
err = -EIO;
if (lock_rename(workdir, upperdir) != NULL) {
pr_err("overlayfs: failed to lock workdir+upperdir\n");
goto out_unlock;
}
upperdentry = ovl_dentry_upper(dentry);
if (upperdentry) {
/* Raced with another copy-up? Nothing to do, then... */
err = 0;
goto out_unlock;
}
err = ovl_copy_up_locked(workdir, upperdir, dentry, lowerpath,
stat, link);
if (!err) {
/* Restore timestamps on parent (best effort) */
ovl_set_timestamps(upperdir, &pstat);
}
out_unlock:
unlock_rename(workdir, upperdir);
revert_creds(old_cred);
put_cred(override_cred);
out_free_link:
if (link)
free_page((unsigned long) link);
return err;
}
int ovl_copy_up(struct dentry *dentry)
{
int err;
err = 0;
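/*
* Copy up proceeds top down: each pass copies up the topmost
* ancestor that does not yet have an upper counterpart, until
* dentry itself has been copied up.
*/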
while (!err) {
struct dentry *next;
struct dentry *parent;
struct path lowerpath;
struct kstat stat;
enum ovl_path_type type = ovl_path_type(dentry);
if (OVL_TYPE_UPPER(type))
break;
next = dget(dentry);
/* find the topmost dentry not yet copied up */
for (;;) {
parent = dget_parent(next);
type = ovl_path_type(parent);
if (OVL_TYPE_UPPER(type))
break;
dput(next);
next = parent;
}
ovl_path_lower(next, &lowerpath);
err = vfs_getattr(&lowerpath, &stat);
if (!err)
err = ovl_copy_up_one(parent, next, &lowerpath, &stat);
dput(parent);
dput(next);
}
return err;
}


@@ -0,0 +1,969 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/xattr.h>
#include <linux/security.h>
#include <linux/cred.h>
#include "overlayfs.h"
void ovl_cleanup(struct inode *wdir, struct dentry *wdentry)
{
int err;
dget(wdentry);
if (d_is_dir(wdentry))
err = ovl_do_rmdir(wdir, wdentry);
else
err = ovl_do_unlink(wdir, wdentry);
dput(wdentry);
if (err) {
pr_err("overlayfs: cleanup of '%pd2' failed (%i)\n",
wdentry, err);
}
}
struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry)
{
struct dentry *temp;
char name[20];
snprintf(name, sizeof(name), "#%lx", (unsigned long) dentry);
temp = lookup_one_len(name, workdir, strlen(name));
if (!IS_ERR(temp) && temp->d_inode) {
pr_err("overlayfs: workdir/%s already exists\n", name);
dput(temp);
temp = ERR_PTR(-EIO);
}
return temp;
}
/* caller holds i_mutex on workdir */
static struct dentry *ovl_whiteout(struct dentry *workdir,
struct dentry *dentry)
{
int err;
struct dentry *whiteout;
struct inode *wdir = workdir->d_inode;
whiteout = ovl_lookup_temp(workdir, dentry);
if (IS_ERR(whiteout))
return whiteout;
err = ovl_do_whiteout(wdir, whiteout);
if (err) {
dput(whiteout);
whiteout = ERR_PTR(err);
}
return whiteout;
}
int ovl_create_real(struct inode *dir, struct dentry *newdentry,
struct kstat *stat, const char *link,
struct dentry *hardlink, bool debug)
{
int err;
if (newdentry->d_inode)
return -ESTALE;
if (hardlink) {
err = ovl_do_link(hardlink, dir, newdentry, debug);
} else {
switch (stat->mode & S_IFMT) {
case S_IFREG:
err = ovl_do_create(dir, newdentry, stat->mode, debug);
break;
case S_IFDIR:
err = ovl_do_mkdir(dir, newdentry, stat->mode, debug);
break;
case S_IFCHR:
case S_IFBLK:
case S_IFIFO:
case S_IFSOCK:
err = ovl_do_mknod(dir, newdentry,
stat->mode, stat->rdev, debug);
break;
case S_IFLNK:
err = ovl_do_symlink(dir, newdentry, link, debug);
break;
default:
err = -EPERM;
}
}
if (!err && WARN_ON(!newdentry->d_inode)) {
/*
* Not quite sure if non-instantiated dentry is legal or not.
* VFS doesn't seem to care so check and warn here.
*/
err = -ENOENT;
}
return err;
}
static int ovl_set_opaque(struct dentry *upperdentry)
{
return ovl_do_setxattr(upperdentry, OVL_XATTR_OPAQUE, "y", 1, 0);
}
static void ovl_remove_opaque(struct dentry *upperdentry)
{
int err;
err = ovl_do_removexattr(upperdentry, OVL_XATTR_OPAQUE);
if (err) {
pr_warn("overlayfs: failed to remove opaque from '%s' (%i)\n",
upperdentry->d_name.name, err);
}
}
static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
int err;
enum ovl_path_type type;
struct path realpath;
type = ovl_path_real(dentry, &realpath);
err = vfs_getattr(&realpath, stat);
if (err)
return err;
stat->dev = dentry->d_sb->s_dev;
stat->ino = dentry->d_inode->i_ino;
/*
* It's probably not worth it to count subdirs to get the
* correct link count. nlink=1 seems to pacify 'find' and
* other utilities.
*/
if (OVL_TYPE_MERGE(type))
stat->nlink = 1;
return 0;
}
static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
struct kstat *stat, const char *link,
struct dentry *hardlink)
{
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *udir = upperdir->d_inode;
struct dentry *newdentry;
int err;
inode_lock_nested(udir, I_MUTEX_PARENT);
newdentry = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out_unlock;
err = ovl_create_real(udir, newdentry, stat, link, hardlink, false);
if (err)
goto out_dput;
ovl_dentry_version_inc(dentry->d_parent);
ovl_dentry_update(dentry, newdentry);
ovl_copyattr(newdentry->d_inode, inode);
d_instantiate(dentry, inode);
newdentry = NULL;
out_dput:
dput(newdentry);
out_unlock:
inode_unlock(udir);
return err;
}
static int ovl_lock_rename_workdir(struct dentry *workdir,
struct dentry *upperdir)
{
/* Workdir should not be the same as upperdir */
if (workdir == upperdir)
goto err;
/* Workdir should not be subdir of upperdir and vice versa */
if (lock_rename(workdir, upperdir) != NULL)
goto err_unlock;
return 0;
err_unlock:
unlock_rename(workdir, upperdir);
err:
pr_err("overlayfs: failed to lock workdir+upperdir\n");
return -EIO;
}
static struct dentry *ovl_clear_empty(struct dentry *dentry,
struct list_head *list)
{
struct dentry *workdir = ovl_workdir(dentry);
struct inode *wdir = workdir->d_inode;
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *udir = upperdir->d_inode;
struct path upperpath;
struct dentry *upper;
struct dentry *opaquedir;
struct kstat stat;
unsigned opt = ovl_get_config_opt(dentry);
int err;
if (WARN_ON(!workdir))
return ERR_PTR(-EROFS);
err = ovl_lock_rename_workdir(workdir, upperdir);
if (err)
goto out;
ovl_path_upper(dentry, &upperpath);
err = vfs_getattr(&upperpath, &stat);
if (err)
goto out_unlock;
err = -ESTALE;
if (!S_ISDIR(stat.mode))
goto out_unlock;
upper = upperpath.dentry;
if (upper->d_parent->d_inode != udir)
goto out_unlock;
opaquedir = ovl_lookup_temp(workdir, dentry);
err = PTR_ERR(opaquedir);
if (IS_ERR(opaquedir))
goto out_unlock;
err = ovl_create_real(wdir, opaquedir, &stat, NULL, NULL, true);
if (err)
goto out_dput;
err = ovl_copy_xattr(upper, opaquedir, opt);
if (err)
goto out_cleanup;
err = ovl_set_opaque(opaquedir);
if (err)
goto out_cleanup;
inode_lock(opaquedir->d_inode);
err = ovl_set_attr(opaquedir, &stat);
inode_unlock(opaquedir->d_inode);
if (err)
goto out_cleanup;
err = ovl_do_rename(wdir, opaquedir, udir, upper, RENAME_EXCHANGE);
if (err)
goto out_cleanup;
ovl_cleanup_whiteouts(upper, list);
ovl_cleanup(wdir, upper);
unlock_rename(workdir, upperdir);
/* dentry's upper doesn't match now, get rid of it */
d_drop(dentry);
return opaquedir;
out_cleanup:
ovl_cleanup(wdir, opaquedir);
out_dput:
dput(opaquedir);
out_unlock:
unlock_rename(workdir, upperdir);
out:
return ERR_PTR(err);
}
static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry)
{
int err;
struct dentry *ret = NULL;
LIST_HEAD(list);
err = ovl_check_empty_dir(dentry, &list);
if (err)
ret = ERR_PTR(err);
else {
/*
* If no upperdentry then skip clearing whiteouts.
*
* Can race with copy-up, since we don't hold the upperdir
* mutex. Doesn't matter, since copy-up can't create a
* non-empty directory from an empty one.
*/
if (ovl_dentry_upper(dentry))
ret = ovl_clear_empty(dentry, &list);
}
ovl_cache_free(&list);
return ret;
}
static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
struct kstat *stat, const char *link,
struct dentry *hardlink)
{
struct dentry *workdir = ovl_workdir(dentry);
struct inode *wdir = workdir->d_inode;
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *udir = upperdir->d_inode;
struct dentry *upper;
struct dentry *newdentry;
int err;
if (WARN_ON(!workdir))
return -EROFS;
err = ovl_lock_rename_workdir(workdir, upperdir);
if (err)
goto out;
newdentry = ovl_lookup_temp(workdir, dentry);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out_unlock;
upper = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto out_dput;
err = ovl_create_real(wdir, newdentry, stat, link, hardlink, true);
if (err)
goto out_dput2;
if (S_ISDIR(stat->mode)) {
err = ovl_set_opaque(newdentry);
if (err)
goto out_cleanup;
err = ovl_do_rename(wdir, newdentry, udir, upper,
RENAME_EXCHANGE);
if (err)
goto out_cleanup;
ovl_cleanup(wdir, upper);
} else {
err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
if (err)
goto out_cleanup;
}
ovl_dentry_version_inc(dentry->d_parent);
ovl_dentry_update(dentry, newdentry);
ovl_copyattr(newdentry->d_inode, inode);
d_instantiate(dentry, inode);
newdentry = NULL;
out_dput2:
dput(upper);
out_dput:
dput(newdentry);
out_unlock:
unlock_rename(workdir, upperdir);
out:
return err;
out_cleanup:
ovl_cleanup(wdir, newdentry);
goto out_dput2;
}
static int ovl_create_or_link(struct dentry *dentry, int mode, dev_t rdev,
const char *link, struct dentry *hardlink)
{
int err;
struct inode *inode;
struct kstat stat = {
.mode = mode,
.rdev = rdev,
};
err = -ENOMEM;
inode = ovl_new_inode(dentry->d_sb, mode, dentry->d_fsdata);
if (!inode)
goto out;
err = ovl_copy_up(dentry->d_parent);
if (err)
goto out_iput;
if (!ovl_dentry_is_opaque(dentry)) {
err = ovl_create_upper(dentry, inode, &stat, link, hardlink);
} else {
const struct cred *old_cred;
struct cred *override_cred;
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_iput;
/*
* CAP_SYS_ADMIN for setting opaque xattr
* CAP_DAC_OVERRIDE for create in workdir, rename
* CAP_FOWNER for removing whiteout from sticky dir
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
old_cred = override_creds(override_cred);
err = ovl_create_over_whiteout(dentry, inode, &stat, link,
hardlink);
revert_creds(old_cred);
put_cred(override_cred);
}
if (!err)
inode = NULL;
out_iput:
iput(inode);
out:
return err;
}
static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
const char *link)
{
int err;
err = ovl_want_write(dentry);
if (!err) {
err = ovl_create_or_link(dentry, mode, rdev, link, NULL);
ovl_drop_write(dentry);
}
return err;
}
static int ovl_create(struct inode *dir, struct dentry *dentry, umode_t mode,
bool excl)
{
return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL);
}
static int ovl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL);
}
static int ovl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
dev_t rdev)
{
/* Don't allow creation of "whiteout" on overlay */
if (S_ISCHR(mode) && rdev == WHITEOUT_DEV)
return -EPERM;
return ovl_create_object(dentry, mode, rdev, NULL);
}
static int ovl_symlink(struct inode *dir, struct dentry *dentry,
const char *link)
{
return ovl_create_object(dentry, S_IFLNK, 0, link);
}
static int ovl_link(struct dentry *old, struct inode *newdir,
struct dentry *new)
{
int err;
struct dentry *upper;
err = ovl_want_write(old);
if (err)
goto out;
err = ovl_copy_up(old);
if (err)
goto out_drop_write;
upper = ovl_dentry_upper(old);
err = ovl_create_or_link(new, upper->d_inode->i_mode, 0, NULL, upper);
out_drop_write:
ovl_drop_write(old);
out:
return err;
}
static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir)
{
struct dentry *workdir = ovl_workdir(dentry);
struct inode *wdir = workdir->d_inode;
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *udir = upperdir->d_inode;
struct dentry *whiteout;
struct dentry *upper;
struct dentry *opaquedir = NULL;
int err;
int flags = 0;
if (WARN_ON(!workdir))
return -EROFS;
if (is_dir) {
if (OVL_TYPE_MERGE_OR_LOWER(ovl_path_type(dentry))) {
opaquedir = ovl_check_empty_and_clear(dentry);
err = PTR_ERR(opaquedir);
if (IS_ERR(opaquedir))
goto out;
} else {
LIST_HEAD(list);
/*
* When removing an empty opaque directory, it makes no sense
* to replace it with an exact replica of itself. But emptiness
* still needs to be checked.
*/
err = ovl_check_empty_dir(dentry, &list);
ovl_cache_free(&list);
if (err)
goto out;
}
}
err = ovl_lock_rename_workdir(workdir, upperdir);
if (err)
goto out_dput;
upper = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto out_unlock;
err = -ESTALE;
if ((opaquedir && upper != opaquedir) ||
(!opaquedir && ovl_dentry_upper(dentry) &&
upper != ovl_dentry_upper(dentry))) {
goto out_dput_upper;
}
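/*
* Create a whiteout in workdir and rename it over the victim. For
* a directory, RENAME_EXCHANGE is used so that the old directory
* ends up in workdir, where it is cleaned up below.
*/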
whiteout = ovl_whiteout(workdir, dentry);
err = PTR_ERR(whiteout);
if (IS_ERR(whiteout))
goto out_dput_upper;
if (d_is_dir(upper))
flags = RENAME_EXCHANGE;
err = ovl_do_rename(wdir, whiteout, udir, upper, flags);
if (err)
goto kill_whiteout;
if (flags)
ovl_cleanup(wdir, upper);
ovl_dentry_version_inc(dentry->d_parent);
out_d_drop:
d_drop(dentry);
dput(whiteout);
out_dput_upper:
dput(upper);
out_unlock:
unlock_rename(workdir, upperdir);
out_dput:
dput(opaquedir);
out:
return err;
kill_whiteout:
ovl_cleanup(wdir, whiteout);
goto out_d_drop;
}
static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
{
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *dir = upperdir->d_inode;
struct dentry *upper;
int err;
inode_lock_nested(dir, I_MUTEX_PARENT);
upper = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto out_unlock;
err = -ESTALE;
if (upper == ovl_dentry_upper(dentry)) {
if (is_dir)
err = vfs_rmdir(dir, upper);
else
err = vfs_unlink(dir, upper, NULL);
ovl_dentry_version_inc(dentry->d_parent);
}
dput(upper);
/*
* Keeping this dentry hashed would mean having to release
* upperpath/lowerpath, which could only be done if we are the
* sole user of this dentry. Too tricky... Just unhash for
* now.
*/
if (!err)
d_drop(dentry);
out_unlock:
inode_unlock(dir);
return err;
}
static inline int ovl_check_sticky(struct dentry *dentry)
{
struct inode *dir = ovl_dentry_real(dentry->d_parent)->d_inode;
struct inode *inode = ovl_dentry_real(dentry)->d_inode;
if (check_sticky(dir, inode))
return -EPERM;
return 0;
}
static int ovl_do_remove(struct dentry *dentry, bool is_dir)
{
enum ovl_path_type type;
int err;
err = ovl_check_sticky(dentry);
if (err)
goto out;
err = ovl_want_write(dentry);
if (err)
goto out;
err = ovl_copy_up(dentry->d_parent);
if (err)
goto out_drop_write;
type = ovl_path_type(dentry);
if (OVL_TYPE_PURE_UPPER(type)) {
err = ovl_remove_upper(dentry, is_dir);
} else {
const struct cred *old_cred;
struct cred *override_cred;
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_drop_write;
/*
* CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
* CAP_DAC_OVERRIDE for create in workdir, rename
* CAP_FOWNER for removing whiteout from sticky dir
* CAP_FSETID for chmod of opaque dir
* CAP_CHOWN for chown of opaque dir
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
cap_raise(override_cred->cap_effective, CAP_FSETID);
cap_raise(override_cred->cap_effective, CAP_CHOWN);
old_cred = override_creds(override_cred);
err = ovl_remove_and_whiteout(dentry, is_dir);
revert_creds(old_cred);
put_cred(override_cred);
}
out_drop_write:
ovl_drop_write(dentry);
out:
return err;
}
static int ovl_unlink(struct inode *dir, struct dentry *dentry)
{
return ovl_do_remove(dentry, false);
}
static int ovl_rmdir(struct inode *dir, struct dentry *dentry)
{
return ovl_do_remove(dentry, true);
}
static int ovl_rename2(struct inode *olddir, struct dentry *old,
struct inode *newdir, struct dentry *new,
unsigned int flags)
{
int err;
enum ovl_path_type old_type;
enum ovl_path_type new_type;
struct dentry *old_upperdir;
struct dentry *new_upperdir;
struct dentry *olddentry;
struct dentry *newdentry;
struct dentry *trap;
bool old_opaque;
bool new_opaque;
bool cleanup_whiteout = false;
bool overwrite = !(flags & RENAME_EXCHANGE);
bool is_dir = d_is_dir(old);
bool new_is_dir = false;
struct dentry *opaquedir = NULL;
const struct cred *old_cred = NULL;
struct cred *override_cred = NULL;
err = -EINVAL;
if (flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE))
goto out;
flags &= ~RENAME_NOREPLACE;
err = ovl_check_sticky(old);
if (err)
goto out;
/* Don't copy up directory trees */
old_type = ovl_path_type(old);
err = -EXDEV;
if (OVL_TYPE_MERGE_OR_LOWER(old_type) && is_dir)
goto out;
if (new->d_inode) {
err = ovl_check_sticky(new);
if (err)
goto out;
if (d_is_dir(new))
new_is_dir = true;
new_type = ovl_path_type(new);
err = -EXDEV;
if (!overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir)
goto out;
err = 0;
if (!OVL_TYPE_UPPER(new_type) && !OVL_TYPE_UPPER(old_type)) {
if (ovl_dentry_lower(old)->d_inode ==
ovl_dentry_lower(new)->d_inode)
goto out;
}
if (OVL_TYPE_UPPER(new_type) && OVL_TYPE_UPPER(old_type)) {
if (ovl_dentry_upper(old)->d_inode ==
ovl_dentry_upper(new)->d_inode)
goto out;
}
} else {
if (ovl_dentry_is_opaque(new))
new_type = __OVL_PATH_UPPER;
else
new_type = __OVL_PATH_UPPER | __OVL_PATH_PURE;
}
err = ovl_want_write(old);
if (err)
goto out;
err = ovl_copy_up(old);
if (err)
goto out_drop_write;
err = ovl_copy_up(new->d_parent);
if (err)
goto out_drop_write;
if (!overwrite) {
err = ovl_copy_up(new);
if (err)
goto out_drop_write;
}
old_opaque = !OVL_TYPE_PURE_UPPER(old_type);
new_opaque = !OVL_TYPE_PURE_UPPER(new_type);
if (old_opaque || new_opaque) {
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_drop_write;
/*
* CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
* CAP_DAC_OVERRIDE for create in workdir
* CAP_FOWNER for removing whiteout from sticky dir
* CAP_FSETID for chmod of opaque dir
* CAP_CHOWN for chown of opaque dir
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
cap_raise(override_cred->cap_effective, CAP_FSETID);
cap_raise(override_cred->cap_effective, CAP_CHOWN);
old_cred = override_creds(override_cred);
}
if (overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir) {
opaquedir = ovl_check_empty_and_clear(new);
err = PTR_ERR(opaquedir);
if (IS_ERR(opaquedir)) {
opaquedir = NULL;
goto out_revert_creds;
}
}
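/*
* Decide how whiteouts move around. If the source may cover lower
* content (old_opaque), renaming it away has to leave a whiteout
* behind: either exchange with an existing whiteout at the
* destination or ask the upper fs for RENAME_WHITEOUT.
*/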
if (overwrite) {
if (old_opaque) {
if (new->d_inode || !new_opaque) {
/* Whiteout source */
flags |= RENAME_WHITEOUT;
} else {
/* Switch whiteouts */
flags |= RENAME_EXCHANGE;
}
} else if (is_dir && !new->d_inode && new_opaque) {
flags |= RENAME_EXCHANGE;
cleanup_whiteout = true;
}
}
old_upperdir = ovl_dentry_upper(old->d_parent);
new_upperdir = ovl_dentry_upper(new->d_parent);
trap = lock_rename(new_upperdir, old_upperdir);
olddentry = lookup_one_len(old->d_name.name, old_upperdir,
old->d_name.len);
err = PTR_ERR(olddentry);
if (IS_ERR(olddentry))
goto out_unlock;
err = -ESTALE;
if (olddentry != ovl_dentry_upper(old))
goto out_dput_old;
newdentry = lookup_one_len(new->d_name.name, new_upperdir,
new->d_name.len);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out_dput_old;
err = -ESTALE;
if (ovl_dentry_upper(new)) {
if (opaquedir) {
if (newdentry != opaquedir)
goto out_dput;
} else {
if (newdentry != ovl_dentry_upper(new))
goto out_dput;
}
} else {
if (!d_is_negative(newdentry) &&
(!new_opaque || !ovl_is_whiteout(newdentry)))
goto out_dput;
}
if (olddentry == trap)
goto out_dput;
if (newdentry == trap)
goto out_dput;
if (is_dir && !old_opaque && new_opaque) {
err = ovl_set_opaque(olddentry);
if (err)
goto out_dput;
}
if (!overwrite && new_is_dir && old_opaque && !new_opaque) {
err = ovl_set_opaque(newdentry);
if (err)
goto out_dput;
}
if (old_opaque || new_opaque) {
err = ovl_do_rename(old_upperdir->d_inode, olddentry,
new_upperdir->d_inode, newdentry,
flags);
} else {
/* No debug for the plain case */
BUG_ON(flags & ~RENAME_EXCHANGE);
err = vfs_rename(old_upperdir->d_inode, olddentry,
new_upperdir->d_inode, newdentry,
NULL, flags);
}
if (err) {
if (is_dir && !old_opaque && new_opaque)
ovl_remove_opaque(olddentry);
if (!overwrite && new_is_dir && old_opaque && !new_opaque)
ovl_remove_opaque(newdentry);
goto out_dput;
}
if (is_dir && old_opaque && !new_opaque)
ovl_remove_opaque(olddentry);
if (!overwrite && new_is_dir && !old_opaque && new_opaque)
ovl_remove_opaque(newdentry);
/*
* The old dentry now lives in a different location. Dentries in
* the lowerstack are stale. We cannot drop them here because
* access to them is lockless. This can only be a pure upper or
* an opaque directory - numlower is zero. Or an upper non-dir
* entry - its pureness is tracked by the opaque flag.
*/
if (old_opaque != new_opaque) {
ovl_dentry_set_opaque(old, new_opaque);
if (!overwrite)
ovl_dentry_set_opaque(new, old_opaque);
}
if (cleanup_whiteout)
ovl_cleanup(old_upperdir->d_inode, newdentry);
ovl_dentry_version_inc(old->d_parent);
ovl_dentry_version_inc(new->d_parent);
out_dput:
dput(newdentry);
out_dput_old:
dput(olddentry);
out_unlock:
unlock_rename(new_upperdir, old_upperdir);
out_revert_creds:
if (old_opaque || new_opaque) {
revert_creds(old_cred);
put_cred(override_cred);
}
out_drop_write:
ovl_drop_write(old);
out:
dput(opaquedir);
return err;
}
const struct inode_operations ovl_dir_inode_operations = {
.lookup = ovl_lookup,
.mkdir = ovl_mkdir,
.symlink = ovl_symlink,
.unlink = ovl_unlink,
.rmdir = ovl_rmdir,
.rename2 = ovl_rename2,
.link = ovl_link,
.setattr = ovl_setattr,
.create = ovl_create,
.mknod = ovl_mknod,
.permission = ovl_permission,
.getattr = ovl_dir_getattr,
.setxattr = ovl_setxattr,
.getxattr = ovl_getxattr,
.listxattr = ovl_listxattr,
.removexattr = ovl_removexattr,
};


@@ -0,0 +1,494 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/xattr.h>
#include "overlayfs.h"
static int ovl_copy_up_truncate(struct dentry *dentry)
{
int err;
struct dentry *parent;
struct kstat stat;
struct path lowerpath;
parent = dget_parent(dentry);
err = ovl_copy_up(parent);
if (err)
goto out_dput_parent;
ovl_path_lower(dentry, &lowerpath);
err = vfs_getattr(&lowerpath, &stat);
if (err)
goto out_dput_parent;
stat.size = 0;
err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat);
out_dput_parent:
dput(parent);
return err;
}
int ovl_setattr(struct dentry *dentry, struct iattr *attr)
{
int err;
struct dentry *upperdentry;
unsigned opt = ovl_get_config_opt(dentry);
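/*
* "nocopyupw" (mcoverlayfs-specific option): silently ignore
* attribute changes instead of triggering a copy-up.
*/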
if (OVL_OPT_NOCOPYUPW(opt)) {
return 0;
}
/*
* Check for permissions before trying to copy-up. This is redundant
* since it will be rechecked later by ->setattr() on upper dentry. But
* without this, copy-up can be triggered by just about anybody.
*
* We don't initialize inode->size, which just means that
* inode_newsize_ok() will always check against MAX_LFS_FILESIZE and not
* check for a swapfile (which this won't be anyway).
*/
err = inode_change_ok(dentry->d_inode, attr);
if (err)
return err;
err = ovl_want_write(dentry);
if (err)
goto out;
if (attr->ia_valid & ATTR_SIZE) {
struct inode *realinode = d_inode(ovl_dentry_real(dentry));
err = -ETXTBSY;
if (atomic_read(&realinode->i_writecount) < 0)
goto out_drop_write;
}
err = ovl_copy_up(dentry);
if (!err) {
struct inode *winode = NULL;
upperdentry = ovl_dentry_upper(dentry);
if (attr->ia_valid & ATTR_SIZE) {
winode = d_inode(upperdentry);
err = get_write_access(winode);
if (err)
goto out_drop_write;
}
if (attr->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID))
attr->ia_valid &= ~ATTR_MODE;
inode_lock(upperdentry->d_inode);
err = notify_change(upperdentry, attr, NULL);
if (!err)
ovl_copyattr(upperdentry->d_inode, dentry->d_inode);
inode_unlock(upperdentry->d_inode);
if (winode)
put_write_access(winode);
}
out_drop_write:
ovl_drop_write(dentry);
out:
return err;
}
static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
struct path realpath;
ovl_path_real(dentry, &realpath);
return vfs_getattr(&realpath, stat);
}
int ovl_permission(struct inode *inode, int mask)
{
struct ovl_entry *oe;
struct dentry *alias = NULL;
struct inode *realinode;
struct dentry *realdentry;
bool is_upper;
int err;
if (S_ISDIR(inode->i_mode)) {
oe = inode->i_private;
} else if (mask & MAY_NOT_BLOCK) {
return -ECHILD;
} else {
/*
* For non-directories find an alias and get the info
* from there.
*/
alias = d_find_any_alias(inode);
if (WARN_ON(!alias))
return -ENOENT;
oe = alias->d_fsdata;
ovl_reset_ovl_entry(&oe, alias);
}
realdentry = ovl_entry_real(oe, &is_upper);
if (ovl_is_default_permissions(inode)) {
struct kstat stat;
struct path realpath = { .dentry = realdentry };
if (mask & MAY_NOT_BLOCK)
return -ECHILD;
realpath.mnt = ovl_entry_mnt_real(oe, inode, is_upper);
err = vfs_getattr(&realpath, &stat);
if (err)
goto out_dput;
err = -ESTALE;
if ((stat.mode ^ inode->i_mode) & S_IFMT)
goto out_dput;
inode->i_mode = stat.mode;
inode->i_uid = stat.uid;
inode->i_gid = stat.gid;
err = generic_permission(inode, mask);
goto out_dput;
}
/* Careful in RCU walk mode */
realinode = ACCESS_ONCE(realdentry->d_inode);
if (!realinode) {
WARN_ON(!(mask & MAY_NOT_BLOCK));
err = -ENOENT;
goto out_dput;
}
if (mask & MAY_WRITE) {
umode_t mode = realinode->i_mode;
/*
* Writes will always be redirected to upper layer, so
* ignore lower layer being read-only.
*
* If the overlay itself is read-only then proceed
* with the permission check, don't return EROFS.
* This will only happen if this is the lower layer of
* another overlayfs.
*
* If upper fs becomes read-only after the overlay was
* constructed return EROFS to prevent modification of
* upper layer.
*/
err = -EROFS;
if (is_upper && !IS_RDONLY(inode) && IS_RDONLY(realinode) &&
(S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
goto out_dput;
}
err = __inode_permission(realinode, mask);
out_dput:
dput(alias);
return err;
}
static const char *ovl_get_link(struct dentry *dentry,
struct inode *inode,
struct delayed_call *done)
{
struct dentry *realdentry;
struct inode *realinode;
if (!dentry)
return ERR_PTR(-ECHILD);
realdentry = ovl_dentry_real(dentry);
realinode = realdentry->d_inode;
if (WARN_ON(!realinode->i_op->get_link))
return ERR_PTR(-EPERM);
return realinode->i_op->get_link(realdentry, realinode, done);
}
static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
{
struct path realpath;
struct inode *realinode;
ovl_path_real(dentry, &realpath);
realinode = realpath.dentry->d_inode;
if (!realinode->i_op->readlink)
return -EINVAL;
touch_atime(&realpath);
return realinode->i_op->readlink(realpath.dentry, buf, bufsiz);
}
static bool ovl_is_private_xattr(const char *name)
{
return strncmp(name, OVL_XATTR_PRE_NAME, OVL_XATTR_PRE_LEN) == 0;
}
int ovl_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags)
{
int err;
struct dentry *upperdentry;
unsigned opt = ovl_get_config_opt(dentry);
if (OVL_OPT_NOCOPYUPW(opt)) {
return 0;
}
err = ovl_want_write(dentry);
if (err)
goto out;
err = -EPERM;
if (ovl_is_private_xattr(name))
goto out_drop_write;
err = ovl_copy_up(dentry);
if (err)
goto out_drop_write;
upperdentry = ovl_dentry_upper(dentry);
err = vfs_setxattr(upperdentry, name, value, size, flags);
out_drop_write:
ovl_drop_write(dentry);
out:
return err;
}
static bool ovl_need_xattr_filter(struct dentry *dentry,
enum ovl_path_type type)
{
if ((type & (__OVL_PATH_PURE | __OVL_PATH_UPPER)) == __OVL_PATH_UPPER)
return S_ISDIR(dentry->d_inode->i_mode);
else
return false;
}
ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
void *value, size_t size)
{
struct path realpath;
enum ovl_path_type type = ovl_path_real(dentry, &realpath);
if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
return -ENODATA;
return vfs_getxattr(realpath.dentry, name, value, size);
}
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
{
struct path realpath;
enum ovl_path_type type = ovl_path_real(dentry, &realpath);
ssize_t res;
int off;
res = vfs_listxattr(realpath.dentry, list, size);
if (res <= 0 || size == 0)
return res;
if (!ovl_need_xattr_filter(dentry, type))
return res;
/* filter out private xattrs */
for (off = 0; off < res;) {
char *s = list + off;
size_t slen = strlen(s) + 1;
BUG_ON(off + slen > res);
if (ovl_is_private_xattr(s)) {
res -= slen;
memmove(s, s + slen, res - off);
} else {
off += slen;
}
}
return res;
}
int ovl_removexattr(struct dentry *dentry, const char *name)
{
int err;
struct path realpath;
enum ovl_path_type type = ovl_path_real(dentry, &realpath);
unsigned opt = ovl_get_config_opt(dentry);
if (OVL_OPT_NOCOPYUPW(opt)) {
return 0;
}
err = ovl_want_write(dentry);
if (err)
goto out;
err = -ENODATA;
if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
goto out_drop_write;
if (!OVL_TYPE_UPPER(type)) {
err = vfs_getxattr(realpath.dentry, name, NULL, 0);
if (err < 0)
goto out_drop_write;
err = ovl_copy_up(dentry);
if (err)
goto out_drop_write;
ovl_path_upper(dentry, &realpath);
}
err = vfs_removexattr(realpath.dentry, name);
out_drop_write:
ovl_drop_write(dentry);
out:
return err;
}
static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type,
struct dentry *realdentry)
{
if (OVL_TYPE_UPPER(type))
return false;
if (special_file(realdentry->d_inode->i_mode))
return false;
if (!(OPEN_FMODE(flags) & FMODE_WRITE) && !(flags & O_TRUNC))
return false;
return true;
}
struct inode *ovl_d_select_inode(struct dentry *dentry, unsigned file_flags)
{
int err;
struct path realpath;
enum ovl_path_type type;
unsigned opt = ovl_get_config_opt(dentry);
if (d_is_dir(dentry))
return d_backing_inode(dentry);
type = ovl_path_real(dentry, &realpath);
if (!OVL_OPT_NOCOPYUPW(opt) &&
ovl_open_need_copy_up(file_flags, type, realpath.dentry)) {
OVL_DEBUG("copyup: realpath.dentry=%pd4, i_ino=%lu\n",
realpath.dentry, realpath.dentry->d_inode->i_ino);
err = ovl_want_write(dentry);
if (err)
return ERR_PTR(err);
if (file_flags & O_TRUNC)
err = ovl_copy_up_truncate(dentry);
else
err = ovl_copy_up(dentry);
ovl_drop_write(dentry);
if (err)
return ERR_PTR(err);
ovl_path_upper(dentry, &realpath);
}
if (realpath.dentry->d_flags & DCACHE_OP_SELECT_INODE)
return realpath.dentry->d_op->d_select_inode(realpath.dentry, file_flags);
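/*
* "nofscheck" (mcoverlayfs-specific option): for sysfs-backed
* entries, stash the overlay entry in i_private and take over the
* real dentry's d_fsdata.
*/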
if (OVL_OPT_NOFSCHECK(opt)) {
if (realpath.dentry->d_inode->i_sb->s_magic == SYSFS_MAGIC) {
OVL_DEBUG("sysfs: dentry=%pd4, i_ino=%lu\n",
dentry, dentry->d_inode->i_ino);
OVL_DEBUG("sysfs: realpath.dentry=%pd4, i_ino=%lu\n",
realpath.dentry, realpath.dentry->d_inode->i_ino);
if (!dentry->d_inode->i_private) {
dentry->d_inode->i_private = dentry->d_fsdata;
dentry->d_fsdata = realpath.dentry->d_fsdata;
}
}
}
return d_backing_inode(realpath.dentry);
}
static const struct inode_operations ovl_file_inode_operations = {
.setattr = ovl_setattr,
.permission = ovl_permission,
.getattr = ovl_getattr,
.setxattr = ovl_setxattr,
.getxattr = ovl_getxattr,
.listxattr = ovl_listxattr,
.removexattr = ovl_removexattr,
};
static const struct inode_operations ovl_symlink_inode_operations = {
.setattr = ovl_setattr,
.get_link = ovl_get_link,
.readlink = ovl_readlink,
.getattr = ovl_getattr,
.setxattr = ovl_setxattr,
.getxattr = ovl_getxattr,
.listxattr = ovl_listxattr,
.removexattr = ovl_removexattr,
};
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
struct ovl_entry *oe)
{
struct inode *inode;
inode = new_inode(sb);
if (!inode)
return NULL;
inode->i_ino = get_next_ino();
inode->i_mode = mode;
inode->i_flags |= S_NOATIME | S_NOCMTIME;
mode &= S_IFMT;
switch (mode) {
case S_IFDIR:
inode->i_private = oe;
inode->i_op = &ovl_dir_inode_operations;
inode->i_fop = &ovl_dir_operations;
break;
case S_IFLNK:
inode->i_op = &ovl_symlink_inode_operations;
break;
case S_IFREG:
case S_IFSOCK:
case S_IFBLK:
case S_IFCHR:
case S_IFIFO:
inode->i_op = &ovl_file_inode_operations;
break;
default:
WARN(1, "illegal file type: %i\n", mode);
iput(inode);
inode = NULL;
}
return inode;
}


@@ -0,0 +1,222 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/kernel.h>
//#define DEBUG
#ifdef DEBUG
#define OVL_DEBUG(format, ...) pr_err("[DEBUG] %s(): " format, __FUNCTION__, ##__VA_ARGS__)
#else
#define OVL_DEBUG(format, ...) do { } while (0)
#endif
struct ovl_entry;
enum ovl_path_type {
__OVL_PATH_PURE = (1 << 0),
__OVL_PATH_UPPER = (1 << 1),
__OVL_PATH_MERGE = (1 << 2),
};
#define OVL_TYPE_UPPER(type) ((type) & __OVL_PATH_UPPER)
#define OVL_TYPE_MERGE(type) ((type) & __OVL_PATH_MERGE)
#define OVL_TYPE_PURE_UPPER(type) ((type) & __OVL_PATH_PURE)
#define OVL_TYPE_MERGE_OR_LOWER(type) \
(OVL_TYPE_MERGE(type) || !OVL_TYPE_UPPER(type))
#define OVL_XATTR_PRE_NAME "trusted.overlay."
#define OVL_XATTR_PRE_LEN 16
#define OVL_XATTR_OPAQUE OVL_XATTR_PRE_NAME"opaque"
enum ovl_opt_bit {
__OVL_OPT_DEFAULT = 0,
__OVL_OPT_NOCOPYUPW = (1 << 0),
__OVL_OPT_NOFSCHECK = (1 << 1),
};
#define OVL_OPT_NOCOPYUPW(opt) ((opt) & __OVL_OPT_NOCOPYUPW)
#define OVL_OPT_NOFSCHECK(opt) ((opt) & __OVL_OPT_NOFSCHECK)
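/*
* The two option bits above are mcoverlayfs additions (not present
* in mainline overlayfs). Judging by the names, "nocopyupw"
* suppresses copy-up on modifying operations and "nofscheck"
* enables the sysfs special casing in ovl_d_select_inode(); the
* option parsing itself is presumably in the suppressed super.c
* diff below.
*/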
static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry)
{
int err = vfs_rmdir(dir, dentry);
pr_debug("rmdir(%pd2) = %i\n", dentry, err);
return err;
}
static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry)
{
int err = vfs_unlink(dir, dentry, NULL);
pr_debug("unlink(%pd2) = %i\n", dentry, err);
return err;
}
static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir,
struct dentry *new_dentry, bool debug)
{
int err = vfs_link(old_dentry, dir, new_dentry, NULL);
if (debug) {
pr_debug("link(%pd2, %pd2) = %i\n",
old_dentry, new_dentry, err);
}
return err;
}
static inline int ovl_do_create(struct inode *dir, struct dentry *dentry,
umode_t mode, bool debug)
{
int err = vfs_create(dir, dentry, mode, true);
if (debug)
pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err);
return err;
}
static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry,
umode_t mode, bool debug)
{
int err = vfs_mkdir(dir, dentry, mode);
if (debug)
pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err);
return err;
}
static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry,
umode_t mode, dev_t dev, bool debug)
{
int err = vfs_mknod(dir, dentry, mode, dev);
if (debug) {
pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n",
dentry, mode, dev, err);
}
return err;
}
static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry,
const char *oldname, bool debug)
{
int err = vfs_symlink(dir, dentry, oldname);
if (debug)
pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err);
return err;
}
static inline int ovl_do_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags)
{
int err = vfs_setxattr(dentry, name, value, size, flags);
pr_debug("setxattr(%pd2, \"%s\", \"%*s\", 0x%x) = %i\n",
dentry, name, (int) size, (char *) value, flags, err);
return err;
}
static inline int ovl_do_removexattr(struct dentry *dentry, const char *name)
{
int err = vfs_removexattr(dentry, name);
pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, err);
return err;
}
static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry,
struct inode *newdir, struct dentry *newdentry,
unsigned int flags)
{
int err;
pr_debug("rename2(%pd2, %pd2, 0x%x)\n",
olddentry, newdentry, flags);
err = vfs_rename(olddir, olddentry, newdir, newdentry, NULL, flags);
if (err) {
pr_debug("...rename2(%pd2, %pd2, ...) = %i\n",
olddentry, newdentry, err);
}
return err;
}
static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry)
{
int err = vfs_whiteout(dir, dentry);
pr_debug("whiteout(%pd2) = %i\n", dentry, err);
return err;
}
unsigned ovl_get_config_opt(struct dentry *dentry);
void ovl_reset_ovl_entry(struct ovl_entry **oe, struct dentry *dentry);
enum ovl_path_type ovl_path_type(struct dentry *dentry);
u64 ovl_dentry_version_get(struct dentry *dentry);
void ovl_dentry_version_inc(struct dentry *dentry);
void ovl_path_upper(struct dentry *dentry, struct path *path);
void ovl_path_lower(struct dentry *dentry, struct path *path);
enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path);
int ovl_path_next(int idx, struct dentry *dentry, struct path *path);
struct dentry *ovl_dentry_upper(struct dentry *dentry);
struct dentry *ovl_dentry_lower(struct dentry *dentry);
struct dentry *ovl_dentry_real(struct dentry *dentry);
struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper);
struct vfsmount *ovl_entry_mnt_real(struct ovl_entry *oe, struct inode *inode,
bool is_upper);
struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry);
bool ovl_is_default_permissions(struct inode *inode);
void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache);
struct dentry *ovl_workdir(struct dentry *dentry);
int ovl_want_write(struct dentry *dentry);
void ovl_drop_write(struct dentry *dentry);
bool ovl_dentry_is_opaque(struct dentry *dentry);
void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque);
bool ovl_is_whiteout(struct dentry *dentry);
void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry);
struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags);
struct file *ovl_path_open(struct path *path, int flags);
struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry,
struct kstat *stat, const char *link);
/* readdir.c */
extern const struct file_operations ovl_dir_operations;
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list);
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list);
void ovl_cache_free(struct list_head *list);
int ovl_check_d_type_supported(struct path *realpath);
/* inode.c */
int ovl_setattr(struct dentry *dentry, struct iattr *attr);
int ovl_permission(struct inode *inode, int mask);
int ovl_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags);
ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
void *value, size_t size);
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
int ovl_removexattr(struct dentry *dentry, const char *name);
struct inode *ovl_d_select_inode(struct dentry *dentry, unsigned file_flags);
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
struct ovl_entry *oe);
static inline void ovl_copyattr(struct inode *from, struct inode *to)
{
to->i_uid = from->i_uid;
to->i_gid = from->i_gid;
to->i_mode = from->i_mode;
}
/* dir.c */
extern const struct inode_operations ovl_dir_inode_operations;
struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry);
int ovl_create_real(struct inode *dir, struct dentry *newdentry,
struct kstat *stat, const char *link,
struct dentry *hardlink, bool debug);
void ovl_cleanup(struct inode *dir, struct dentry *dentry);
/* copy_up.c */
int ovl_copy_up(struct dentry *dentry);
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
struct path *lowerpath, struct kstat *stat);
int ovl_copy_xattr(struct dentry *old, struct dentry *new, unsigned opt);
int ovl_set_attr(struct dentry *upper, struct kstat *stat);


@@ -0,0 +1,616 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/file.h>
#include <linux/xattr.h>
#include <linux/rbtree.h>
#include <linux/security.h>
#include <linux/cred.h>
#include "overlayfs.h"
struct ovl_cache_entry {
unsigned int len;
unsigned int type;
u64 ino;
struct list_head l_node;
struct rb_node node;
struct ovl_cache_entry *next_maybe_whiteout;
bool is_whiteout;
char name[];
};
struct ovl_dir_cache {
long refcount;
u64 version;
struct list_head entries;
};
struct ovl_readdir_data {
struct dir_context ctx;
bool is_lowest;
struct rb_root root;
struct list_head *list;
struct list_head middle;
struct ovl_cache_entry *first_maybe_whiteout;
int count;
int err;
bool d_type_supported;
};
struct ovl_dir_file {
bool is_real;
bool is_upper;
struct ovl_dir_cache *cache;
struct list_head *cursor;
struct file *realfile;
struct file *upperfile;
};
static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n)
{
return container_of(n, struct ovl_cache_entry, node);
}
static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root,
const char *name, int len)
{
struct rb_node *node = root->rb_node;
int cmp;
while (node) {
struct ovl_cache_entry *p = ovl_cache_entry_from_node(node);
cmp = strncmp(name, p->name, len);
if (cmp > 0)
node = p->node.rb_right;
else if (cmp < 0 || len < p->len)
node = p->node.rb_left;
else
return p;
}
return NULL;
}
static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd,
const char *name, int len,
u64 ino, unsigned int d_type)
{
struct ovl_cache_entry *p;
size_t size = offsetof(struct ovl_cache_entry, name[len + 1]);
p = kmalloc(size, GFP_KERNEL);
if (!p)
return NULL;
memcpy(p->name, name, len);
p->name[len] = '\0';
p->len = len;
p->type = d_type;
p->ino = ino;
p->is_whiteout = false;
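/*
* Whiteouts are character devices, so remember every DT_CHR entry
* for a later lookup that checks whether it really is a whiteout.
*/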
if (d_type == DT_CHR) {
p->next_maybe_whiteout = rdd->first_maybe_whiteout;
rdd->first_maybe_whiteout = p;
}
return p;
}
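/*
* Insert an entry keyed by name into the rb-tree; names already seen
* in a higher layer are silently skipped.
*/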
static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
const char *name, int len, u64 ino,
unsigned int d_type)
{
struct rb_node **newp = &rdd->root.rb_node;
struct rb_node *parent = NULL;
struct ovl_cache_entry *p;
while (*newp) {
int cmp;
struct ovl_cache_entry *tmp;
parent = *newp;
tmp = ovl_cache_entry_from_node(*newp);
cmp = strncmp(name, tmp->name, len);
if (cmp > 0)
newp = &tmp->node.rb_right;
else if (cmp < 0 || len < tmp->len)
newp = &tmp->node.rb_left;
else
return 0;
}
p = ovl_cache_entry_new(rdd, name, len, ino, d_type);
if (p == NULL)
return -ENOMEM;
list_add_tail(&p->l_node, rdd->list);
rb_link_node(&p->node, parent, newp);
rb_insert_color(&p->node, &rdd->root);
return 0;
}
static int ovl_fill_lowest(struct ovl_readdir_data *rdd,
const char *name, int namelen,
loff_t offset, u64 ino, unsigned int d_type)
{
struct ovl_cache_entry *p;
p = ovl_cache_entry_find(&rdd->root, name, namelen);
if (p) {
list_move_tail(&p->l_node, &rdd->middle);
} else {
p = ovl_cache_entry_new(rdd, name, namelen, ino, d_type);
if (p == NULL)
rdd->err = -ENOMEM;
else
list_add_tail(&p->l_node, &rdd->middle);
}
return rdd->err;
}
void ovl_cache_free(struct list_head *list)
{
struct ovl_cache_entry *p;
struct ovl_cache_entry *n;
list_for_each_entry_safe(p, n, list, l_node)
kfree(p);
INIT_LIST_HEAD(list);
}
static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry)
{
struct ovl_dir_cache *cache = od->cache;
WARN_ON(cache->refcount <= 0);
cache->refcount--;
if (!cache->refcount) {
if (ovl_dir_cache(dentry) == cache)
ovl_set_dir_cache(dentry, NULL);
ovl_cache_free(&cache->entries);
kfree(cache);
}
}
static int ovl_fill_merge(struct dir_context *ctx, const char *name,
int namelen, loff_t offset, u64 ino,
unsigned int d_type)
{
struct ovl_readdir_data *rdd =
container_of(ctx, struct ovl_readdir_data, ctx);
rdd->count++;
if (!rdd->is_lowest)
return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type);
else
return ovl_fill_lowest(rdd, name, namelen, offset, ino, d_type);
}
static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd)
{
int err;
struct ovl_cache_entry *p;
struct dentry *dentry;
const struct cred *old_cred;
struct cred *override_cred;
override_cred = prepare_creds();
if (!override_cred)
return -ENOMEM;
/*
* CAP_DAC_OVERRIDE for lookup
*/
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
old_cred = override_creds(override_cred);
err = mutex_lock_killable(&dir->d_inode->i_mutex);
if (!err) {
while (rdd->first_maybe_whiteout) {
p = rdd->first_maybe_whiteout;
rdd->first_maybe_whiteout = p->next_maybe_whiteout;
dentry = lookup_one_len(p->name, dir, p->len);
if (!IS_ERR(dentry)) {
p->is_whiteout = ovl_is_whiteout(dentry);
dput(dentry);
}
}
inode_unlock(dir->d_inode);
}
revert_creds(old_cred);
put_cred(override_cred);
return err;
}
static inline int ovl_dir_read(struct path *realpath,
struct ovl_readdir_data *rdd)
{
struct file *realfile;
int err;
realfile = ovl_path_open(realpath, O_RDONLY | O_DIRECTORY);
if (IS_ERR(realfile))
return PTR_ERR(realfile);
rdd->first_maybe_whiteout = NULL;
rdd->ctx.pos = 0;
do {
rdd->count = 0;
rdd->err = 0;
err = iterate_dir(realfile, &rdd->ctx);
if (err >= 0)
err = rdd->err;
} while (!err && rdd->count);
if (!err && rdd->first_maybe_whiteout)
err = ovl_check_whiteouts(realpath->dentry, rdd);
fput(realfile);
return err;
}
static void ovl_dir_reset(struct file *file)
{
struct ovl_dir_file *od = file->private_data;
struct ovl_dir_cache *cache = od->cache;
struct dentry *dentry = file->f_path.dentry;
enum ovl_path_type type = ovl_path_type(dentry);
if (cache && ovl_dentry_version_get(dentry) != cache->version) {
ovl_cache_put(od, dentry);
od->cache = NULL;
od->cursor = NULL;
}
WARN_ON(!od->is_real && !OVL_TYPE_MERGE(type));
if (od->is_real && OVL_TYPE_MERGE(type))
od->is_real = false;
}
static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list)
{
int err;
struct path realpath;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_fill_merge,
.list = list,
.root = RB_ROOT,
.is_lowest = false,
};
int idx, next;
for (idx = 0; idx != -1; idx = next) {
next = ovl_path_next(idx, dentry, &realpath);
if (next != -1) {
err = ovl_dir_read(&realpath, &rdd);
if (err)
break;
} else {
/*
* Insert lowest layer entries before upper ones; this
* allows offsets to be reasonably constant.
*/
list_add(&rdd.middle, rdd.list);
rdd.is_lowest = true;
err = ovl_dir_read(&realpath, &rdd);
list_del(&rdd.middle);
}
}
return err;
}
static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos)
{
struct list_head *p;
loff_t off = 0;
list_for_each(p, &od->cache->entries) {
if (off >= pos)
break;
off++;
}
/* Cursor is safe since the cache is stable */
od->cursor = p;
}
static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
{
int res;
struct ovl_dir_cache *cache;
cache = ovl_dir_cache(dentry);
if (cache && ovl_dentry_version_get(dentry) == cache->version) {
cache->refcount++;
return cache;
}
ovl_set_dir_cache(dentry, NULL);
cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL);
if (!cache)
return ERR_PTR(-ENOMEM);
cache->refcount = 1;
INIT_LIST_HEAD(&cache->entries);
res = ovl_dir_read_merged(dentry, &cache->entries);
if (res) {
ovl_cache_free(&cache->entries);
kfree(cache);
return ERR_PTR(res);
}
cache->version = ovl_dentry_version_get(dentry);
ovl_set_dir_cache(dentry, cache);
return cache;
}
static int ovl_iterate(struct file *file, struct dir_context *ctx)
{
struct ovl_dir_file *od = file->private_data;
struct dentry *dentry = file->f_path.dentry;
struct ovl_cache_entry *p;
if (!ctx->pos)
ovl_dir_reset(file);
if (od->is_real)
return iterate_dir(od->realfile, ctx);
if (!od->cache) {
struct ovl_dir_cache *cache;
cache = ovl_cache_get(dentry);
if (IS_ERR(cache))
return PTR_ERR(cache);
od->cache = cache;
ovl_seek_cursor(od, ctx->pos);
}
while (od->cursor != &od->cache->entries) {
p = list_entry(od->cursor, struct ovl_cache_entry, l_node);
if (!p->is_whiteout)
if (!dir_emit(ctx, p->name, p->len, p->ino, p->type))
break;
od->cursor = p->l_node.next;
ctx->pos++;
}
return 0;
}
static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin)
{
loff_t res;
struct ovl_dir_file *od = file->private_data;
inode_lock(file_inode(file));
if (!file->f_pos)
ovl_dir_reset(file);
if (od->is_real) {
res = vfs_llseek(od->realfile, offset, origin);
file->f_pos = od->realfile->f_pos;
} else {
res = -EINVAL;
switch (origin) {
case SEEK_CUR:
offset += file->f_pos;
break;
case SEEK_SET:
break;
default:
goto out_unlock;
}
if (offset < 0)
goto out_unlock;
if (offset != file->f_pos) {
file->f_pos = offset;
if (od->cache)
ovl_seek_cursor(od, offset);
}
res = offset;
}
out_unlock:
inode_unlock(file_inode(file));
return res;
}
static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
int datasync)
{
struct ovl_dir_file *od = file->private_data;
struct dentry *dentry = file->f_path.dentry;
struct file *realfile = od->realfile;
/*
* Need to check if we started out being a lower dir, but got copied up
*/
if (!od->is_upper && OVL_TYPE_UPPER(ovl_path_type(dentry))) {
struct inode *inode = file_inode(file);
realfile = lockless_dereference(od->upperfile);
if (!realfile) {
struct path upperpath;
ovl_path_upper(dentry, &upperpath);
realfile = ovl_path_open(&upperpath, O_RDONLY);
smp_mb__before_spinlock();
inode_lock(inode);
if (!od->upperfile) {
if (IS_ERR(realfile)) {
inode_unlock(inode);
return PTR_ERR(realfile);
}
od->upperfile = realfile;
} else {
/* somebody has beaten us to it */
if (!IS_ERR(realfile))
fput(realfile);
realfile = od->upperfile;
}
inode_unlock(inode);
}
}
return vfs_fsync_range(realfile, start, end, datasync);
}
static int ovl_dir_release(struct inode *inode, struct file *file)
{
struct ovl_dir_file *od = file->private_data;
if (od->cache) {
inode_lock(inode);
ovl_cache_put(od, file->f_path.dentry);
inode_unlock(inode);
}
fput(od->realfile);
if (od->upperfile)
fput(od->upperfile);
kfree(od);
return 0;
}
static int ovl_dir_open(struct inode *inode, struct file *file)
{
struct path realpath;
struct file *realfile;
struct ovl_dir_file *od;
enum ovl_path_type type;
od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL);
if (!od)
return -ENOMEM;
type = ovl_path_real(file->f_path.dentry, &realpath);
realfile = ovl_path_open(&realpath, file->f_flags);
if (IS_ERR(realfile)) {
kfree(od);
return PTR_ERR(realfile);
}
od->realfile = realfile;
od->is_real = !OVL_TYPE_MERGE(type);
od->is_upper = OVL_TYPE_UPPER(type);
file->private_data = od;
return 0;
}
const struct file_operations ovl_dir_operations = {
.read = generic_read_dir,
.open = ovl_dir_open,
.iterate = ovl_iterate,
.llseek = ovl_dir_llseek,
.fsync = ovl_dir_fsync,
.release = ovl_dir_release,
};
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list)
{
int err;
struct ovl_cache_entry *p;
err = ovl_dir_read_merged(dentry, list);
if (err)
return err;
err = 0;
list_for_each_entry(p, list, l_node) {
if (p->is_whiteout)
continue;
if (p->name[0] == '.') {
if (p->len == 1)
continue;
if (p->len == 2 && p->name[1] == '.')
continue;
}
err = -ENOTEMPTY;
break;
}
return err;
}
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
{
struct ovl_cache_entry *p;
inode_lock_nested(upper->d_inode, I_MUTEX_CHILD);
list_for_each_entry(p, list, l_node) {
struct dentry *dentry;
if (!p->is_whiteout)
continue;
dentry = lookup_one_len(p->name, upper, p->len);
if (IS_ERR(dentry)) {
pr_err("overlayfs: lookup '%s/%.*s' failed (%i)\n",
upper->d_name.name, p->len, p->name,
(int) PTR_ERR(dentry));
continue;
}
if (dentry->d_inode)
ovl_cleanup(upper->d_inode, dentry);
dput(dentry);
}
inode_unlock(upper->d_inode);
}
static int ovl_check_d_type(struct dir_context *ctx, const char *name,
int namelen, loff_t offset, u64 ino,
unsigned int d_type)
{
struct ovl_readdir_data *rdd =
container_of(ctx, struct ovl_readdir_data, ctx);
/* Even if d_type is not supported, DT_DIR is returned for . and .. */
if (!strncmp(name, ".", namelen) || !strncmp(name, "..", namelen))
return 0;
if (d_type != DT_UNKNOWN)
rdd->d_type_supported = true;
return 0;
}
/*
* Returns 1 if d_type is supported, 0 if not supported/unknown, and a
* negative value if an error is encountered.
*/
int ovl_check_d_type_supported(struct path *realpath)
{
int err;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_check_d_type,
.d_type_supported = false,
};
err = ovl_dir_read(realpath, &rdd);
if (err)
return err;
return rdd.d_type_supported;
}

File diff suppressed because it is too large


@@ -17,6 +17,20 @@
#include <sys/socket.h>
#include <arpa/inet.h>
/* From ihk/linux/include/ihk/ihk_host_user.h */
#define PHYS_CHUNKS_DESC_SIZE 8192
struct dump_mem_chunk {
unsigned long addr;
unsigned long size;
};
typedef struct dump_mem_chunks_s {
int nr_chunks;
struct dump_mem_chunk chunks[];
} dump_mem_chunks_t;
/* ---------- */
#define CPU_TID_BASE 1000000
struct options {
@@ -53,6 +67,7 @@ static volatile int f_done = 0;
static bfd *symbfd = NULL;
static bfd *dumpbfd = NULL;
static asection *dumpscn = NULL;
static dump_mem_chunks_t *mem_chunks;
static int num_processors = -1;
static asymbol **symtab = NULL;
static ssize_t nsyms;
@@ -91,25 +106,35 @@ static uintptr_t virt_to_phys(uintptr_t va) {
static int read_physmem(uintptr_t pa, void *buf, size_t size) {
off_t off;
bfd_boolean ok;
int i;
if (pa < dumpscn->vma) {
printf("read_physmem(%lx,%p,%lx):too small pa. vma %lx\n", pa, buf, size, dumpscn->vma);
return 1;
}
off = pa - dumpscn->vma;
if (off >= dumpscn->size) {
printf("read_physmem(%lx,%p,%lx):too large pa. vma %lx size %lx\n", pa, buf, size, dumpscn->vma, dumpscn->size);
return 1;
}
if ((dumpscn->size - off) < size) {
printf("read_physmem(%lx,%p,%lx):too large size. vma %lx size %lx\n", pa, buf, size, dumpscn->vma, dumpscn->size);
off = 0;
/* Check if pa is valid in any chunks and figure
* out the global offset in dump section */
for (i = 0; i < mem_chunks->nr_chunks; ++i) {
if (mem_chunks->chunks[i].addr <= pa &&
((pa + size) <= (mem_chunks->chunks[i].addr +
mem_chunks->chunks[i].size))) {
off += (pa - mem_chunks->chunks[i].addr);
break;
}
off += mem_chunks->chunks[i].size;
}
if (i == mem_chunks->nr_chunks) {
printf("read_physmem: invalid addr 0x%lx\n", pa);
return 1;
}
ok = bfd_get_section_contents(dumpbfd, dumpscn, buf, off, size);
if (!ok) {
bfd_perror("read_physmem:bfd_get_section_contents");
return 1;
}
return 0;
} /* read_physmem() */
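
The rewritten read_physmem() no longer assumes a single contiguous physical range: the dump's "physmem" section stores the chunks back-to-back, so the file offset of a physical address is the sum of the sizes of all preceding chunks plus the offset inside the matching chunk. A stand-alone sketch of that translation (phys_to_dump_offset() is an illustrative helper, reusing the dump_mem_chunks_t layout defined above):

/* Returns the offset of [pa, pa+size) inside the "physmem" section,
 * or (off_t)-1 when no chunk covers the whole range. */
static off_t phys_to_dump_offset(const dump_mem_chunks_t *chunks,
                                 unsigned long pa, size_t size)
{
    off_t off = 0;
    int i;

    for (i = 0; i < chunks->nr_chunks; ++i) {
        const struct dump_mem_chunk *c = &chunks->chunks[i];

        if (c->addr <= pa && pa + size <= c->addr + c->size)
            return off + (pa - c->addr);
        off += c->size;
    }
    return (off_t)-1;
}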
@ -167,6 +192,7 @@ enum {
CURRENT_OFFSET,
RUNQ_OFFSET,
CPU_STATUS_OFFSET,
IDLE_THREAD_OFFSET,
/* process */
CTX_OFFSET,
@ -204,6 +230,7 @@ static int setup_constants(void) {
printf("CURRENT_OFFSET: %ld\n", K(CURRENT_OFFSET));
printf("RUNQ_OFFSET: %ld\n", K(RUNQ_OFFSET));
printf("CPU_STATUS_OFFSET: %ld\n", K(CPU_STATUS_OFFSET));
printf("IDLE_THREAD_OFFSET: %ld\n", K(IDLE_THREAD_OFFSET));
printf("CTX_OFFSET: %ld\n", K(CTX_OFFSET));
printf("SCHED_LIST_OFFSET: %ld\n", K(SCHED_LIST_OFFSET));
printf("PROC_OFFSET: %ld\n", K(PROC_OFFSET));
@ -251,6 +278,64 @@ static int setup_threads(void) {
ihk_mc_switch_context = lookup_symbol("ihk_mc_switch_context");
if (0) printf("ihk_mc_switch_context: %lx\n", ihk_mc_switch_context);
/* Set up idle threads first */
for (cpu = 0; cpu < num_processors; ++cpu) {
uintptr_t v;
uintptr_t thread;
uintptr_t proc;
int pid;
int tid;
struct thread_info *ti;
int status;
v = clv + (cpu * K(CPU_LOCAL_VAR_SIZE));
ti = malloc(sizeof(*ti));
if (!ti) {
perror("malloc");
return 1;
}
thread = v+K(IDLE_THREAD_OFFSET);
error = read_64(thread+K(PROC_OFFSET), &proc);
if (error) {
perror("proc");
return 1;
}
error = read_32(thread+K(STATUS_OFFSET), &status);
if (error) {
perror("status");
return 1;
}
error = read_32(proc+K(PID_OFFSET), &pid);
if (error) {
perror("pid");
return 1;
}
error = read_32(thread+K(TID_OFFSET), &tid);
if (error) {
perror("tid");
return 1;
}
ti->next = NULL;
ti->status = status;
ti->pid = pid;
ti->tid = tid;
ti->cpu = cpu;
ti->lcpu = cpu;
ti->process = thread;
ti->clv = v;
ti->x86_clv = locals + locals_span*cpu;
*titailp = ti;
titailp = &ti->next;
}
for (cpu = 0; cpu < num_processors; ++cpu) {
uintptr_t v;
uintptr_t head;
@ -448,6 +533,25 @@ static int setup_dump(char *fname) {
return 1;
}
mem_chunks = malloc(PHYS_CHUNKS_DESC_SIZE);
if (!mem_chunks) {
perror("allocating mem chunks descriptor: ");
return 1;
}
dumpscn = bfd_get_section_by_name(dumpbfd, "physchunks");
if (!dumpscn) {
bfd_perror("bfd_get_section_by_name");
return 1;
}
ok = bfd_get_section_contents(dumpbfd, dumpscn, mem_chunks,
0, PHYS_CHUNKS_DESC_SIZE);
if (!ok) {
bfd_perror("read_physmem:bfd_get_section_contents");
return 1;
}
dumpscn = bfd_get_section_by_name(dumpbfd, "physmem");
if (!dumpscn) {
bfd_perror("bfd_get_section_by_name");

View File

@ -41,6 +41,7 @@
#include <sys/mman.h>
#include <asm/unistd.h>
#include <sched.h>
#include <dirent.h>
#include <termios.h>
#include <sys/ioctl.h>
@ -101,6 +102,22 @@ int __glob_argc = -1;
char **__glob_argv = 0;
#endif
#ifdef ENABLE_MCOVERLAYFS
#undef ENABLE_MCOVERLAYFS
#ifndef RHEL_RELEASE_CODE
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,0,0) && LINUX_VERSION_CODE < KERNEL_VERSION(4,1,0)
#define ENABLE_MCOVERLAYFS 1
#endif // LINUX_VERSION_CODE == 4.0
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,6,0) && LINUX_VERSION_CODE < KERNEL_VERSION(4,7,0)
#define ENABLE_MCOVERLAYFS 1
#endif // LINUX_VERSION_CODE == 4.6
#else
#if RHEL_RELEASE_CODE == RHEL_RELEASE_VERSION(7,2)
#define ENABLE_MCOVERLAYFS 1
#endif // RHEL_RELEASE_CODE == 7.2
#endif // RHEL_RELEASE_CODE
#endif // ENABLE_MCOVERLAYFS
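
The range checks above work because KERNEL_VERSION() packs the three version components into one comparable integer; a quick illustration (the macro itself normally comes from <linux/version.h>):

/* KERNEL_VERSION(a,b,c) encodes a version as (a << 16) + (b << 8) + c,
 * so 4.6.7 becomes 0x040607 and falls inside
 * [KERNEL_VERSION(4,6,0), KERNEL_VERSION(4,7,0)). */
#include <stdio.h>

#define KERNEL_VERSION(a, b, c) (((a) << 16) + ((b) << 8) + (c))

int main(void)
{
    unsigned v = KERNEL_VERSION(4, 6, 7);

    printf("4.6.7 -> %#x, in 4.6.x: %d\n", v,
           v >= KERNEL_VERSION(4, 6, 0) && v < KERNEL_VERSION(4, 7, 0));
    return 0;
}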
typedef unsigned char cc_t;
typedef unsigned int speed_t;
typedef unsigned int tcflag_t;
@ -375,7 +392,7 @@ struct program_load_desc *load_interp(struct program_load_desc *desc0, FILE *fp)
unsigned char *dma_buf;
int lookup_exec_path(char *filename, char *path, int max_len)
int lookup_exec_path(char *filename, char *path, int max_len, int execvp)
{
int found;
int error;
@ -393,28 +410,27 @@ retry:
char *token, *string, *tofree;
char *PATH = getenv("COKERNEL_PATH");
if (!PATH) {
if (!execvp) {
if (strlen(filename) + 1 > max_len) {
return ENAMETOOLONG;
}
strcpy(path, filename);
error = access(path, X_OK);
if (error) {
return errno;
}
found = 1;
break;
}
if (!(PATH = getenv("COKERNEL_PATH"))) {
PATH = getenv("PATH");
}
if (strlen(filename) >= 255) {
return ENAMETOOLONG;
}
/* See first whether file is available in current working dir */
error = access(filename, X_OK);
if (error == 0) {
__dprintf("lookup_exec_path(): found %s in cwd\n", filename);
error = snprintf(path, max_len, "%s", filename);
if (error < 0 || error >= max_len) {
fprintf(stderr, "lookup_exec_path(): array too small?\n");
return ENOMEM;
}
found = 1;
break;
}
__dprintf("PATH: %s\n", PATH);
@ -442,6 +458,9 @@ retry:
}
free(tofree);
if(!found){
return ENOENT;
}
break;
}
@ -654,7 +673,7 @@ int load_elf_desc(char *filename, struct program_load_desc **desc_p,
return 0;
}
void transfer_image(int fd, struct program_load_desc *desc)
int transfer_image(int fd, struct program_load_desc *desc)
{
struct remote_transfer pt;
unsigned long s, e, flen, rpa;
@ -668,7 +687,10 @@ void transfer_image(int fd, struct program_load_desc *desc)
+ PAGE_SIZE - 1) & PAGE_MASK;
rpa = desc->sections[i].remote_pa;
fseek(fp, desc->sections[i].offset, SEEK_SET);
if (fseek(fp, desc->sections[i].offset, SEEK_SET) != 0) {
fprintf(stderr, "transfer_image(): error: seeking file position\n");
return -1;
}
flen = desc->sections[i].filesz;
__dprintf("seeked to %lx | size %ld\n",
@ -690,7 +712,20 @@ void transfer_image(int fd, struct program_load_desc *desc)
if (lr > flen) {
lr = flen;
}
fread(dma_buf + l, 1, lr, fp);
if (fread(dma_buf + l, 1, lr, fp) != lr) {
if (ferror(fp) > 0) {
fprintf(stderr, "transfer_image(): error: accessing file\n");
return -EINVAL;
}
else if (feof(fp) > 0) {
fprintf(stderr, "transfer_image(): file too short?\n");
return -EINVAL;
}
else {
/* TODO: handle smaller reads.. */
return -EINVAL;
}
}
flen -= lr;
}
else if (flen > 0) {
@ -699,7 +734,20 @@ void transfer_image(int fd, struct program_load_desc *desc)
} else {
lr = flen;
}
fread(dma_buf, 1, lr, fp);
if (fread(dma_buf, 1, lr, fp) != lr) {
if (ferror(fp) > 0) {
fprintf(stderr, "transfer_image(): error: accessing file\n");
return -EINVAL;
}
else if (feof(fp) > 0) {
fprintf(stderr, "transfer_image(): file too short?\n");
return -EINVAL;
}
else {
/* TODO: handle smaller reads.. */
return -EINVAL;
}
}
flen -= lr;
}
s += PAGE_SIZE;
@ -715,6 +763,8 @@ void transfer_image(int fd, struct program_load_desc *desc)
}
}
}
return 0;
}
void print_desc(struct program_load_desc *desc)
@ -837,7 +887,10 @@ struct thread_data_s {
pthread_mutex_t *lock;
pthread_barrier_t *init_ready;
} *thread_data;
int ncpu;
int n_threads;
pid_t master_tid;
pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
@ -848,7 +901,7 @@ static void *main_loop_thread_func(void *arg)
struct thread_data_s *td = (struct thread_data_s *)arg;
td->tid = gettid();
td->remote_tid = (int)td->tid;
td->remote_tid = -1;
pthread_barrier_wait(&init_ready);
td->ret = main_loop(td->fd, td->cpu, td->lock);
@ -931,7 +984,10 @@ act_signalfd4(struct syscall_wait_desc *w)
flags |= O_NONBLOCK;
if(tmp & SFD_CLOEXEC)
flags |= O_CLOEXEC;
pipe2(sfd->sigpipe, flags);
if (pipe2(sfd->sigpipe, flags) < 0) {
perror("pipe2 failed:");
return -1;
}
sfd->next = sigfdtop;
sigfdtop = sfd;
rc = sfd->sigpipe[0];
@ -962,7 +1018,11 @@ act_signalfd4(struct syscall_wait_desc *w)
rc = -EBADF;
else{
info = (struct signalfd_siginfo *)w->sr.args[2];
write(sfd->sigpipe[1], info, sizeof(struct signalfd_siginfo));
if (write(sfd->sigpipe[1], info, sizeof(struct signalfd_siginfo))
!= sizeof(struct signalfd_siginfo)) {
fprintf(stderr, "error: writing sigpipe\n");
rc = -EBADF;
}
}
break;
}
@ -1068,9 +1128,9 @@ void init_worker_threads(int fd)
int i;
pthread_mutex_init(&lock, NULL);
pthread_barrier_init(&init_ready, NULL, ncpu + 2);
pthread_barrier_init(&init_ready, NULL, n_threads + 2);
for (i = 0; i <= ncpu; ++i) {
for (i = 0; i <= n_threads; ++i) {
int ret;
thread_data[i].fd = fd;
@ -1091,79 +1151,43 @@ void init_worker_threads(int fd)
}
#ifdef ENABLE_MCOVERLAYFS
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,0,0) && LINUX_VERSION_CODE < KERNEL_VERSION(4,1,0)
#define READ_BUFSIZE 1024
static int isunshare(void)
static int find_mount_prefix(char *prefix)
{
int err = 0;
int ret;
int fd;
FILE *fp;
char *line = NULL;
size_t len = 0;
ssize_t read;
char proc_path[PATH_MAX];
ssize_t len_read;
char buf_read[READ_BUFSIZE + 1];
char *buf_read_off;
char *buf_find;
char buf_cmp[READ_BUFSIZE + 1];
char *buf_cmp_off;
ssize_t len_copy;
int ret = 0;
snprintf(proc_path, sizeof(proc_path), "/proc/%d/mounts", getpid());
fd = open(proc_path, O_RDONLY);
if (fd < 0) {
fprintf(stderr, "Error: Failed to open %s.\n", proc_path);
fp = fopen(proc_path, "r");
if (fp == NULL) {
return -1;
}
buf_cmp_off = buf_cmp;
while (1) {
len_read = read(fd, buf_read, READ_BUFSIZE);
if (len_read == -1) {
fprintf(stderr, "Error: Failed to read.\n");
err = -1;
break;
}
while ((read = getline(&line, &len, fp)) != -1) {
if (strlen(line) < strlen(prefix))
continue;
buf_read_off = buf_read;
while (1) {
if ((len_read - (buf_read_off - buf_read)) <= 0) {
break;
}
buf_find = memchr(buf_read_off, '\n',
len_read - (buf_read_off - buf_read));
if (buf_find) {
len_copy = buf_find - buf_read_off;
} else {
len_copy = len_read - (buf_read_off - buf_read);
}
memcpy(buf_cmp_off, buf_read_off, len_copy);
*(buf_cmp_off + len_copy) = '\0';
if (buf_find) {
buf_read_off = buf_read_off + len_copy + 1;
buf_cmp_off = buf_cmp;
ret = strncmp(buf_cmp, "mcoverlay /proc ", 16);
if (!ret) {
err = 1;
break;
}
} else {
buf_read_off = buf_read_off + len_copy;
buf_cmp_off = buf_cmp_off + len_copy;
break;
}
}
if (err == 1 || len_read == 0) {
if (!strncmp(line, prefix, strlen(prefix))) {
ret = 1;
break;
}
}
close(fd);
if (line)
free(line);
__dprintf("err=%d\n", err);
return err;
return ret;
}
static int isunshare(void)
{
return find_mount_prefix("mcoverlay /proc ");
}
#endif
#endif // ENABLE_MCOVERLAYFS
#define MCK_RLIMIT_AS 0
@ -1353,7 +1377,6 @@ int main(int argc, char **argv)
}
#ifdef ENABLE_MCOVERLAYFS
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,0,0) && LINUX_VERSION_CODE < KERNEL_VERSION(4,1,0)
__dprintf("mcoverlay enable\n");
char mcos_procdir[PATH_MAX];
char mcos_sysdir[PATH_MAX];
@ -1362,6 +1385,7 @@ int main(int argc, char **argv)
if (error == 0) {
struct sys_unshare_desc unshare_desc;
struct sys_mount_desc mount_desc;
struct sys_umount_desc umount_desc;
memset(&unshare_desc, '\0', sizeof unshare_desc);
memset(&mount_desc, '\0', sizeof mount_desc);
@ -1373,6 +1397,53 @@ int main(int argc, char **argv)
return 1;
}
/*
* Umount cgroup filesystems that may expose invalid NUMA
* information
*/
if (find_mount_prefix("cgroup /sys/fs/cgroup/cpu,cpuacct")) {
umount_desc.dir_name = "/sys/fs/cgroup/cpu,cpuacct";
if (ioctl(fd, MCEXEC_UP_SYS_UMOUNT,
(unsigned long)&umount_desc) != 0) {
fprintf(stderr,
"WARNING: Failed to umount cgroup/cpu,cpuacct. (%s)\n",
strerror(errno));
}
}
else if (find_mount_prefix("cgroup /sys/fs/cgroup/cpu")) {
umount_desc.dir_name = "/sys/fs/cgroup/cpu";
if (ioctl(fd, MCEXEC_UP_SYS_UMOUNT,
(unsigned long)&umount_desc) != 0) {
fprintf(stderr,
"WARNING: Failed to umount cgroup/cpu. (%s)\n",
strerror(errno));
}
}
if (find_mount_prefix("cgroup /sys/fs/cgroup/cpuset")) {
umount_desc.dir_name = "/sys/fs/cgroup/cpuset";
if (ioctl(fd, MCEXEC_UP_SYS_UMOUNT,
(unsigned long)&umount_desc) != 0) {
fprintf(stderr,
"WARNING: Failed to umount cgroup/cpuset. (%s)\n",
strerror(errno));
}
}
if (find_mount_prefix("cgroup /sys/fs/cgroup/memory")) {
umount_desc.dir_name = "/sys/fs/cgroup/memory/";
if (ioctl(fd, MCEXEC_UP_SYS_UMOUNT,
(unsigned long)&umount_desc) != 0) {
fprintf(stderr,
"WARNING: Failed to umount cgroup/memory. (%s)\n",
strerror(errno));
}
}
sprintf(mcos_procdir, "/tmp/mcos/mcos%d_proc", mcosid);
mount_desc.dev_name = mcos_procdir;
mount_desc.dir_name = "/proc";
@ -1401,12 +1472,11 @@ int main(int argc, char **argv)
} else if (error == -1) {
return 1;
}
#endif
#else
__dprintf("mcoverlay disable\n");
#endif // ENABLE_MCOVERLAYFS
if (lookup_exec_path(argv[optind], path, sizeof(path)) != 0) {
if (lookup_exec_path(argv[optind], path, sizeof(path), 1) != 0) {
fprintf(stderr, "error: finding file: %s\n", argv[optind]);
return 1;
}
@ -1418,7 +1488,7 @@ int main(int argc, char **argv)
/* Check whether shell script */
if (shell) {
if (lookup_exec_path(shell, shell_path, sizeof(shell_path)) != 0) {
if (lookup_exec_path(shell, shell_path, sizeof(shell_path), 0) != 0) {
fprintf(stderr, "error: finding file: %s\n", shell);
return 1;
}
@ -1480,7 +1550,21 @@ int main(int argc, char **argv)
return 1;
}
n_threads = ncpu;
/*
* XXX: keep thread_data ncpu sized even though there are only
* n_threads worker threads in the pool so that signaling code
* keeps working.
*
* TODO: fix signaling code to be independent of TIDs.
* TODO: implement dynamic thread pool resizing.
*/
thread_data = (struct thread_data_s *)malloc(sizeof(struct thread_data_s) * (ncpu + 1));
if (!thread_data) {
fprintf(stderr, "error: allocating thread pool data\n");
return 1;
}
memset(thread_data, '\0', sizeof(struct thread_data_s) * (ncpu + 1));
#if 0
@ -1522,7 +1606,10 @@ int main(int argc, char **argv)
}
print_desc(desc);
transfer_image(fd, desc);
if (transfer_image(fd, desc) < 0) {
fprintf(stderr, "error: transferring image\n");
return -1;
}
fflush(stdout);
fflush(stderr);
@ -1561,7 +1648,7 @@ int main(int argc, char **argv)
return 1;
}
for (i = 0; i <= ncpu; ++i) {
for (i = 0; i <= n_threads; ++i) {
pthread_join(thread_data[i].thread_id, NULL);
}
@ -1618,21 +1705,110 @@ do_generic_syscall(
ret = -errno;
}
/* Overlayfs /sys/X directory lseek() problem work around */
if (w->sr.number == __NR_lseek && ret == -EINVAL) {
char proc_path[512];
char path[512];
struct stat sb;
int len;
sprintf(proc_path, "/proc/self/fd/%d", (int)w->sr.args[0]);
/* Get filename (readlink() does not NUL-terminate) */
if ((len = readlink(proc_path, path, sizeof(path) - 1)) < 0) {
fprintf(stderr, "%s: error: readlink() failed for %s\n",
__FUNCTION__, proc_path);
goto out;
}
path[len] = 0;
/* Not in /sys? */
if (strncmp(path, "/sys/", 5))
goto out;
/* Stat */
if (stat(path, &sb) < 0) {
fprintf(stderr, "%s: error stat() failed for %s\n",
__FUNCTION__, path);
goto out;
}
/* Not dir? */
if ((sb.st_mode & S_IFMT) != S_IFDIR)
goto out;
ret = 0;
}
/* Fake that nodeX in /sys/devices/system/node do not exist,
* where X >= number of LWK NUMA nodes */
else if (w->sr.number == __NR_getdents && ret > 0) {
struct linux_dirent {
long d_ino;
off_t d_off;
unsigned short d_reclen;
char d_name[];
};
struct linux_dirent *d;
char *buf = (char *)w->sr.args[1];
int bpos = 0;
int nodes,len;
char proc_path[PATH_MAX];
char path[PATH_MAX];
sprintf(proc_path, "/proc/self/fd/%d", (int)w->sr.args[0]);
/* Get filename */
if ((len = readlink(proc_path, path, sizeof(path))) < 0) {
fprintf(stderr, "%s: error: readlink() failed for %s\n",
__FUNCTION__, proc_path);
goto out;
}
path[len] = 0;
/* Not /sys/devices/system/node ? */
if (strcmp(path, "/sys/devices/system/node"))
goto out;
nodes = ioctl(fd, MCEXEC_UP_GET_NODES, 0);
if (nodes == -1) {
goto out;
}
d = (struct linux_dirent *) (buf + bpos);
for (bpos = 0; bpos < ret; ) {
int nodeid, tmp_reclen;
d = (struct linux_dirent *) (buf + bpos);
if (sscanf(d->d_name, "node%d", &nodeid) != 1) {
bpos += d->d_reclen;
continue;
}
if (nodeid >= nodes) {
tmp_reclen = d->d_reclen;
memmove(buf + bpos,
buf + bpos + tmp_reclen,
ret - bpos - tmp_reclen);
ret -= tmp_reclen;
continue;
}
bpos += d->d_reclen;
}
}
out:
__dprintf("do_generic_syscall(%ld):%ld (%#lx)\n", w->sr.number, ret, ret);
return ret;
}
static void
kill_thread(unsigned long cpu)
kill_thread(unsigned long tid)
{
if(cpu >= 0 && cpu < ncpu){
pthread_kill(thread_data[cpu].thread_id, LOCALSIG);
}
else{
int i;
int i;
for (i = 0; i < ncpu; ++i) {
for (i = 0; i <= n_threads; ++i) {
if(thread_data[i].remote_tid == tid){
pthread_kill(thread_data[i].thread_id, LOCALSIG);
break;
}
}
}
@ -1738,9 +1914,7 @@ char *
chgpath(char *in, char *buf)
{
#ifdef ENABLE_MCOVERLAYFS
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,0,0) && LINUX_VERSION_CODE < KERNEL_VERSION(4,1,0)
return in;
#endif
#endif // ENABLE_MCOVERLAYFS
char *fn = in;
struct stat sb;
@ -1791,6 +1965,8 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock)
//pthread_mutex_lock(lock);
thread_data[cpu].remote_tid = w.sr.rtid;
switch (w.sr.number) {
case __NR_open:
ret = do_strncpy_from_user(fd, pathbuf, (void *)w.sr.args[0], PATH_MAX);
@ -1829,13 +2005,13 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock)
sig = 0;
term = 0;
do_syscall_return(fd, cpu, 0, 0, 0, 0, 0);
/* Drop executable file */
if ((ret = ioctl(fd, MCEXEC_UP_CLOSE_EXEC)) != 0) {
fprintf(stderr, "WARNING: close_exec() couldn't find exec file?\n");
}
do_syscall_return(fd, cpu, 0, 0, 0, 0, 0);
__dprintf("__NR_exit/__NR_exit_group: %ld (cpu_id: %d)\n",
w.sr.args[0], cpu);
if(w.sr.number == __NR_exit_group){
@ -1888,22 +2064,40 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock)
#endif
case __NR_gettid:{
int mode = w.sr.args[0];
int remote_pid = w.sr.args[1];
int newcpuid = w.sr.args[2];
int oldcpuid = w.sr.args[3];
int wtid = thread_data[newcpuid].remote_tid;
/*
* Number of TIDs and the remote physical address where TIDs are
* expected are passed in arg 4 and 5, respectively.
*/
if (w.sr.args[4] > 0) {
struct remote_transfer trans;
int i = 0;
int *tids = malloc(sizeof(int) * w.sr.args[4]);
if (!tids) {
fprintf(stderr, "__NR_gettid(): error allocating TIDs\n");
goto gettid_out;
}
if(mode == 0){
thread_data[ncpu].remote_tid = wtid;
thread_data[newcpuid].remote_tid = remote_pid;
}
else if(mode == 2){
thread_data[newcpuid].remote_tid = thread_data[oldcpuid].remote_tid;
thread_data[oldcpuid].remote_tid = wtid;
}
for (i = 0; i < ncpu && i < w.sr.args[4]; ++i) {
tids[i] = thread_data[i].tid;
}
do_syscall_return(fd, cpu, thread_data[newcpuid].remote_tid, 0, 0, 0, 0);
for (; i < ncpu; ++i) {
tids[i] = 0;
}
trans.userp = (void*)tids;
trans.rphys = w.sr.args[5];
trans.size = sizeof(int) * w.sr.args[4];
trans.direction = MCEXEC_UP_TRANSFER_TO_REMOTE;
if (ioctl(fd, MCEXEC_UP_TRANSFER, &trans) != 0) {
fprintf(stderr, "__NR_gettid(): error transfering TIDs\n");
}
free(tids);
}
gettid_out:
do_syscall_return(fd, cpu, 0, 0, 0, 0, 0);
break;
}
@ -1945,7 +2139,9 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock)
close(pipefds[0]);
pid = fork();
if(pid != 0){
write(pipefds[1], &pid, sizeof pid);
if (write(pipefds[1], &pid, sizeof pid) != sizeof(pid)) {
fprintf(stderr, "error: writing pipefds\n");
}
exit(0);
}
}
@ -1954,7 +2150,9 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock)
int st;
close(pipefds[1]);
read(pipefds[0], &npid, sizeof npid);
if (read(pipefds[0], &npid, sizeof npid) != sizeof(npid)) {
fprintf(stderr, "error: reading pipefds\n");
}
close(pipefds[0]);
waitpid(pid, &st, 0);
pid = npid;
@ -1994,7 +2192,6 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock)
/* Reinit signals and syscall threads */
init_sigaction();
init_worker_threads(fd);
__dprintf("pid(%d): signals and syscall threads OK\n",
getpid());
@ -2008,6 +2205,8 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock)
goto fork_child_sync_pipe;
}
init_worker_threads(fd);
fork_child_sync_pipe:
sem_post(&fs->sem);
if (fs->status)
@ -2118,7 +2317,7 @@ fork_err:
shell = NULL;
filename = (char *)w.sr.args[1];
if ((ret = lookup_exec_path(filename, path, sizeof(path)))
if ((ret = lookup_exec_path(filename, path, sizeof(path), 0))
!= 0) {
goto return_execve1;
}
@ -2132,7 +2331,7 @@ fork_err:
/* Check whether shell script */
if (shell) {
if ((ret = lookup_exec_path(shell, shell_path,
sizeof(shell_path))) != 0) {
sizeof(shell_path), 0)) != 0) {
fprintf(stderr, "execve(): error: finding file: %s\n", shell);
goto return_execve1;
}
@ -2153,6 +2352,7 @@ fork_err:
strcpy(desc->shell_path, shell_path);
}
desc->enable_vdso = enable_vdso;
__dprintf("execve(): load_elf_desc() for %s OK, num sections: %d\n",
path, desc->num_sections);
@ -2210,7 +2410,10 @@ return_execve1:
__dprintf("%s", "execve(): transfer ELF desc OK\n");
transfer_image(fd, desc);
if (transfer_image(fd, desc) != 0) {
fprintf(stderr, "error: transferring image\n");
return -1;
}
__dprintf("%s", "execve(): image transferred\n");
if (close_cloexec_fds(fd) < 0) {
@ -2220,6 +2423,23 @@ return_execve1:
ret = 0;
return_execve2:
#ifdef ENABLE_MCOVERLAYFS
{
struct sys_mount_desc mount_desc;
mount_desc.dev_name = NULL;
mount_desc.dir_name = "/proc";
mount_desc.type = NULL;
mount_desc.flags = MS_REMOUNT;
mount_desc.data = NULL;
if (ioctl(fd, MCEXEC_UP_SYS_MOUNT,
(unsigned long)&mount_desc) != 0) {
fprintf(stderr,
"WARNING: failed to remount /proc (%s)\n",
strerror(errno));
}
}
#endif
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;
@ -2262,6 +2482,53 @@ return_execve2:
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;
case __NR_setresuid:
ret = setresuid(w.sr.args[0], w.sr.args[1], w.sr.args[2]);
if(ret == -1)
ret = -errno;
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;
case __NR_setreuid:
ret = setreuid(w.sr.args[0], w.sr.args[1]);
if(ret == -1)
ret = -errno;
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;
case __NR_setuid:
ret = setuid(w.sr.args[0]);
if(ret == -1)
ret = -errno;
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;
case __NR_setresgid:
ret = setresgid(w.sr.args[0], w.sr.args[1], w.sr.args[2]);
if(ret == -1)
ret = -errno;
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;
case __NR_setregid:
ret = setregid(w.sr.args[0], w.sr.args[1]);
if(ret == -1)
ret = -errno;
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;
case __NR_setgid:
ret = setgid(w.sr.args[0]);
if(ret == -1)
ret = -errno;
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;
case __NR_setfsgid:
ret = setfsgid(w.sr.args[0]);
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;
case __NR_close:
if(w.sr.args[0] == fd)
ret = -EBADF;
@ -2295,7 +2562,9 @@ return_execve2:
break;
}
thread_data[cpu].remote_tid = -1;
//pthread_mutex_unlock(lock);
}
__dprint("timed out.\n");

View File

@ -6,12 +6,12 @@ OBJS += process.o copy.o waitq.o futex.o timer.o plist.o fileobj.o shmobj.o
OBJS += zeroobj.o procfs.o devobj.o sysfs.o
DEPSRCS=$(wildcard $(SRC)/*.c)
CFLAGS += -I$(SRC)/include -D__KERNEL__ -g
CFLAGS += -I$(SRC)/include -D__KERNEL__ -g -fno-omit-frame-pointer -fno-inline -fno-inline-small-functions
LDFLAGS += -e arch_start
IHKOBJ = ihk/ihk.o
include $(SRC)/config/config.$(TARGET)
include $(IHKBASE)/Makefile.common
include @abs_builddir@/../../ihk/cokernel/Makefile.common
# CFLAGS += -I$(SRC)/../arch/$(IHKARCH)/kernel/include -I$(SRC)/../lib/include

View File

@ -9,7 +9,7 @@ V ?= $(VERBOSE)
KERNEL = kernel.img
KERNELS = $(addsuffix /$(KERNEL),$(addprefix $(O)/,$(BUILD_TARGET)))
SUBCMD_OPTS = V='$(V)'
SUBCMD_OPTS = V='$(V)' BUILD_IHK_COKERNEL=@abs_builddir@/../../ihk/cokernel
$(if $(O),,$(error Specify the compilation target directory))
#$(if $(shell ls $(IHKBASE)/Makefile),,\

View File

@ -65,7 +65,7 @@ void ap_init(void)
{
struct ihk_mc_cpu_info *cpu_info;
int i;
int bsp_hw_id;
int bsp_hw_id, bsp_cpu_id;
ihk_mc_init_ap();
init_delay();
@ -78,13 +78,23 @@ void ap_init(void)
return;
}
kprintf("BSP HW ID = %d\n", bsp_hw_id);
bsp_cpu_id = 0;
for (i = 0; i < cpu_info->ncpus; ++i) {
if (cpu_info->hw_ids[i] == bsp_hw_id) {
bsp_cpu_id = i;
break;
}
}
kprintf("BSP: %d (HW ID: %d @ NUMA %d)\n", bsp_cpu_id,
bsp_hw_id, cpu_info->nodes[0]);
for (i = 0; i < cpu_info->ncpus; i++) {
if (cpu_info->hw_ids[i] == bsp_hw_id) {
continue;
}
kprintf("AP Booting: %d (HW ID: %d)\n", i, cpu_info->hw_ids[i]);
kprintf("AP Booting: %d (HW ID: %d @ NUMA %d)\n", i,
cpu_info->hw_ids[i], cpu_info->nodes[i]);
ihk_mc_boot_cpu(cpu_info->hw_ids[i], (unsigned long)ap_wait);
num_processors++;
@ -199,7 +209,7 @@ cpu_sysfs_setup(void)
/* setup table */
info = kmalloc(sizeof(*info) * num_processors, IHK_MC_AP_CRITICAL);
for (cpu = 0; cpu < num_processors; ++cpu) {
info[cpu].online = 10+cpu;
info[cpu].online = 1;
}
fake_cpu_infos = info;

View File

@ -32,7 +32,7 @@ void cpu_local_var_init(void)
z = sizeof(struct cpu_local_var) * num_processors;
z = (z + PAGE_SIZE - 1) >> PAGE_SHIFT;
clv = allocate_pages(z, IHK_MC_AP_CRITICAL);
clv = ihk_mc_alloc_pages(z, IHK_MC_AP_CRITICAL);
memset(clv, 0, z * PAGE_SIZE);
cpu_local_var_initialized = 1;
}

View File

@ -37,6 +37,8 @@ static void kprintf_wait(int len, unsigned long *flags_head, int *slide) {
if (head < tail) head += buf_len;
if (tail + len > buf_len) adj = buf_len - tail;
if (head > tail && head <= tail + len + adj) {
/* Advancing tail (the producer pointer) by len would
cross head (the consumer pointer) in the ring buffer */
if (mode != 1) {
*slide = 1;
break;
@ -70,6 +72,9 @@ void kputs(char *buf)
memcpy(kmsg_buf.str + kmsg_buf.tail, buf, len);
kmsg_buf.tail += len;
/* If advancing tail (the producer pointer) by len would
cross head (the consumer pointer) in the ring buffer, give up
the [head, tail] range because it has been overwritten */
if (slide == 1) {
kmsg_buf.head = kmsg_buf.tail + 1;
if (kmsg_buf.head >= kmsg_buf.len) kmsg_buf.head = 0;
@ -110,6 +115,7 @@ int __kprintf(const char *format, ...)
char buf[KPRINTF_LOCAL_BUF_LEN];
/* Copy into the local buf */
len = sprintf(buf, "[%3d]: ", ihk_mc_get_processor_id());
va_start(va, format);
len += vsnprintf(buf + len, KPRINTF_LOCAL_BUF_LEN - len - 2, format, va);
va_end(va);
@ -169,6 +175,17 @@ int kprintf(const char *format, ...)
return len;
}
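
The crossing test in kprintf_wait() above reduces to a small amount of modular index arithmetic. A simplified stand-alone model of the overwrite-vs-block decision (kmsg_may_write() is illustrative; it assumes a single producer, no locking, `mode` as documented below, and ignores the wrap-at-end adjustment the real code applies):

/* Returns 1 when the producer may write `len` bytes at `tail`,
 * 0 when a mode-1 writer must wait for the consumer to move `head`.
 * Indices are in [0, buf_len); the buffer is a ring. */
static int kmsg_may_write(int head, int tail, int len, int buf_len,
                          int mode)
{
    int h = head;

    if (h < tail)
        h += buf_len;       /* unwrap head past tail */
    if (h > tail && h <= tail + len)
        return mode != 1;   /* write would cross head: overwrite
                             * unless mcklogd (mode 1) drains it */
    return 1;
}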
/* mode:
0: mcklogd is not running.
When the kmsg buffer is full, the writer doesn't block
and overwrites the buffer.
1: mcklogd periodically retrieves kmsg.
When the kmsg buffer is full, the writer blocks until
someone retrieves kmsg.
2: mcklogd periodically retrieves kmsg.
When the kmsg buffer is full, the writer doesn't block
and overwrites the buffer.
*/
void kmsg_init(int mode)
{
ihk_mc_spinlock_init(&kmsg_lock);

View File

@ -78,51 +78,52 @@ static struct memobj *to_memobj(struct devobj *devobj)
/***********************************************************************
* devobj
*/
int devobj_create(int fd, size_t len, off_t off, struct memobj **objp, int *maxprotp)
int devobj_create(int fd, size_t len, off_t off, struct memobj **objp, int *maxprotp,
int prot, int populate_flags)
{
ihk_mc_user_context_t ctx;
struct pager_map_result result; // XXX: assumes contiguous physical
int error;
struct devobj *obj = NULL;
const size_t npages = (len + PAGE_SIZE - 1) / PAGE_SIZE;
const size_t pfn_npages = (npages / (PAGE_SIZE / sizeof(uintptr_t))) + 1;
dkprintf("devobj_create(%d,%lx,%lx)\n", fd, len, off);
#define MAX_PAGES_IN_DEVOBJ (PAGE_SIZE / sizeof(uintptr_t))
if (npages > MAX_PAGES_IN_DEVOBJ) {
error = -EFBIG;
kprintf("devobj_create(%d,%lx,%lx):too large len. %d\n", fd, len, off, error);
goto out;
}
dkprintf("%s: fd: %d, len: %lu, off: %lu \n", __FUNCTION__, fd, len, off);
obj = kmalloc(sizeof(*obj), IHK_MC_AP_NOWAIT);
if (!obj) {
error = -ENOMEM;
kprintf("devobj_create(%d,%lx,%lx):kmalloc failed. %d\n", fd, len, off, error);
kprintf("%s: error: fd: %d, len: %lu, off: %lu kmalloc failed.\n",
__FUNCTION__, fd, len, off);
goto out;
}
memset(obj, 0, sizeof(*obj));
obj->pfn_table = allocate_pages(1, IHK_MC_AP_NOWAIT);
obj->pfn_table = ihk_mc_alloc_pages(pfn_npages, IHK_MC_AP_NOWAIT);
if (!obj->pfn_table) {
error = -ENOMEM;
kprintf("devobj_create(%d,%lx,%lx):allocate_pages failed. %d\n", fd, len, off, error);
kprintf("%s: error: fd: %d, len: %lu, off: %lu allocating PFN failed.\n",
__FUNCTION__, fd, len, off);
goto out;
}
memset(obj->pfn_table, 0, 1*PAGE_SIZE);
memset(obj->pfn_table, 0, pfn_npages * PAGE_SIZE);
ihk_mc_syscall_arg0(&ctx) = PAGER_REQ_MAP;
ihk_mc_syscall_arg1(&ctx) = fd;
ihk_mc_syscall_arg2(&ctx) = len;
ihk_mc_syscall_arg3(&ctx) = off;
ihk_mc_syscall_arg4(&ctx) = virt_to_phys(&result);
ihk_mc_syscall_arg5(&ctx) = prot | populate_flags;
error = syscall_generic_forwarding(__NR_mmap, &ctx);
if (error) {
kprintf("devobj_create(%d,%lx,%lx):map failed. %d\n", fd, len, off, error);
kprintf("%s: error: fd: %d, len: %lu, off: %lu map failed.\n",
__FUNCTION__, fd, len, off);
goto out;
}
dkprintf("devobj_create:handle: %lx\n", result.handle);
dkprintf("devobj_create:maxprot: %x\n", result.maxprot);
dkprintf("%s: fd: %d, len: %lu, off: %lu, handle: %p, maxprot: %x\n",
__FUNCTION__, fd, len, off, result.handle, result.maxprot);
obj->memobj.ops = &devobj_ops;
obj->memobj.flags = MF_HAS_PAGER;
@ -140,11 +141,12 @@ int devobj_create(int fd, size_t len, off_t off, struct memobj **objp, int *maxp
out:
if (obj) {
if (obj->pfn_table) {
free_pages(obj->pfn_table, 1);
ihk_mc_free_pages(obj->pfn_table, pfn_npages);
}
kfree(obj);
}
dkprintf("devobj_create(%d,%lx,%lx): %d %p %x%d\n", fd, len, off, error, *objp, *maxprotp);
dkprintf("%s: ret: %d, fd: %d, len: %lu, off: %lu, handle: %p, maxprot: %x \n",
__FUNCTION__, error, fd, len, off, result.handle, result.maxprot);
return error;
}
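
The pfn_npages computation above sizes the PFN table in whole pages: with 4 KiB pages and 8-byte entries one page holds 512 PFNs, so npages / 512 + 1 table pages are allocated (the +1 over-allocates by up to one page instead of rounding up exactly). A worked example, assuming those sizes:

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE 4096UL

int main(void)
{
    size_t len = 4UL << 20;                              /* 4 MiB mapping */
    size_t npages = (len + PAGE_SIZE - 1) / PAGE_SIZE;   /* 1024 */
    size_t per_page = PAGE_SIZE / sizeof(uintptr_t);     /* 512 PFNs */
    size_t pfn_npages = npages / per_page + 1;           /* 3 pages */

    printf("npages=%zu pfn_npages=%zu\n", npages, pfn_npages);
    return 0;
}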
@ -164,6 +166,8 @@ static void devobj_release(struct memobj *memobj)
struct devobj *obj = to_devobj(memobj);
struct devobj *free_obj = NULL;
uintptr_t handle;
const size_t pfn_npages =
(obj->npages / (PAGE_SIZE / sizeof(uintptr_t))) + 1;
dkprintf("devobj_release(%p %lx)\n", obj, obj->handle);
@ -192,7 +196,7 @@ static void devobj_release(struct memobj *memobj)
}
if (obj->pfn_table) {
free_pages(obj->pfn_table, 1);
ihk_mc_free_pages(obj->pfn_table, pfn_npages);
}
kfree(free_obj);
}
@ -204,7 +208,7 @@ static void devobj_release(struct memobj *memobj)
static int devobj_get_page(struct memobj *memobj, off_t off, int p2align, uintptr_t *physp, unsigned long *flag)
{
const off_t pgoff = off >> PAGE_SHIFT;
const off_t pgoff = off / PAGE_SIZE;
struct devobj *obj = to_devobj(memobj);
int error;
uintptr_t pfn;
@ -216,7 +220,7 @@ static int devobj_get_page(struct memobj *memobj, off_t off, int p2align, uintpt
if ((pgoff < obj->pfn_pgoff) || ((obj->pfn_pgoff + obj->npages) <= pgoff)) {
error = -EFBIG;
kprintf("devobj_get_page(%p %lx,%lx,%d): out of range. %d\n", memobj, obj->handle, off, p2align, error);
kprintf("%s: error: out of range: off: %lu, page off: %lu obj->npages: %d\n", __FUNCTION__, off, pgoff, obj->npages);
goto out;
}
ix = pgoff - obj->pfn_pgoff;

View File

@ -182,7 +182,7 @@ int fileobj_create(int fd, struct memobj **objp, int *maxprotp)
error = syscall_generic_forwarding(__NR_mmap, &ctx);
if (error) {
kprintf("fileobj_create(%d):create failed. %d\n", fd, error);
dkprintf("fileobj_create(%d):create failed. %d\n", fd, error);
goto out;
}
@ -258,13 +258,24 @@ static void fileobj_release(struct memobj *memobj)
/* zap page_list */
for (;;) {
struct page *page;
int count;
void *page_va;
page = page_list_first(obj);
if (!page) {
break;
}
page_list_remove(obj, page);
page_va = phys_to_virt(page_to_phys(page));
if (ihk_atomic_read(&page->count) != 1) {
kprintf("%s: WARNING: page count for phys 0x%lx is invalid\n",
__FUNCTION__, page->phys);
}
if (page_unmap(page)) {
ihk_mc_free_pages(page_va, 1);
}
#if 0
count = ihk_atomic_sub_return(1, &page->count);
if (!((page->mode == PM_WILL_PAGEIO)
@ -281,7 +292,7 @@ static void fileobj_release(struct memobj *memobj)
}
page->mode = PM_NONE;
free_pages(phys_to_virt(page_to_phys(page)), 1);
#endif
}
obj_list_remove(free_obj);
ihk_mc_spinlock_unlock_noirq(&fileobj_list_lock);
@ -430,7 +441,7 @@ static int fileobj_get_page(struct memobj *memobj, off_t off, int p2align, uintp
goto out;
}
phys = virt_to_phys(virt);
page = phys_to_page(phys);
page = phys_to_page_insert_hash(phys);
if (page->mode != PM_NONE) {
panic("fileobj_get_page:invalid new page");
}
@ -502,10 +513,10 @@ static uintptr_t fileobj_copy_page(
memobj_lock(memobj);
for (;;) {
if (orgpage->mode != PM_MAPPED) {
if (!orgpage || orgpage->mode != PM_MAPPED) {
kprintf("fileobj_copy_page(%p,%lx,%d):"
"invalid cow page. %x\n",
memobj, orgpa, p2align, orgpage->mode);
memobj, orgpa, p2align, orgpage ? orgpage->mode : 0);
panic("fileobj_copy_page:invalid cow page");
}
count = ihk_atomic_read(&orgpage->count);
@ -527,7 +538,9 @@ static uintptr_t fileobj_copy_page(
memcpy(newkva, orgkva, pgsize);
ihk_atomic_dec(&orgpage->count);
newpa = virt_to_phys(newkva);
page_map(phys_to_page(newpa));
if (phys_to_page(newpa)) {
page_map(phys_to_page(newpa));
}
newkva = NULL; /* avoid ihk_mc_free_pages() */
break;
}
@ -563,6 +576,11 @@ static int fileobj_flush_page(struct memobj *memobj, uintptr_t phys,
ssize_t ss;
page = phys_to_page(phys);
if (!page) {
kprintf("%s: warning: tried to flush non-existing page for phys addr: 0x%lx\n",
__FUNCTION__, phys);
return 0;
}
memobj_unlock(&obj->memobj);
ihk_mc_syscall_arg0(&ctx) = PAGER_REQ_WRITE;

View File

@ -79,8 +79,6 @@
#define dkprintf(...) do { if (0) kprintf(__VA_ARGS__); } while (0)
#endif
extern struct sigpending *hassigpending(struct thread *thread);
int futex_cmpxchg_enabled;
/**

View File

@ -332,6 +332,9 @@ int prepare_process_ranges_args_envs(struct thread *thread,
goto err;
}
}
else {
vm->vdso_addr = NULL;
}
p->rprocess = (unsigned long)thread;
p->rpgtable = virt_to_phys(as->page_table);
@ -373,10 +376,16 @@ static int process_msg_prepare_process(unsigned long rphys)
}
n = p->num_sections;
if (n > 16) {
kprintf("%s: ERROR: more ELF sections than 16??\n",
__FUNCTION__);
return -ENOMEM;
}
dkprintf("# of sections: %d\n", n);
if((pn = ihk_mc_allocate(sizeof(struct program_load_desc)
+ sizeof(struct program_image_section) * n, IHK_MC_AP_NOWAIT)) == NULL){
if((pn = kmalloc(sizeof(struct program_load_desc)
+ sizeof(struct program_image_section) * n,
IHK_MC_AP_NOWAIT)) == NULL){
ihk_mc_unmap_virtual(p, npages, 0);
ihk_mc_unmap_memory(NULL, phys, sz);
return -ENOMEM;
@ -385,7 +394,7 @@ static int process_msg_prepare_process(unsigned long rphys)
+ sizeof(struct program_image_section) * n);
if((thread = create_thread(p->entry)) == NULL){
ihk_mc_free(pn);
kfree(pn);
ihk_mc_unmap_virtual(p, npages, 1);
ihk_mc_unmap_memory(NULL, phys, sz);
return -ENOMEM;
@ -435,7 +444,7 @@ static int process_msg_prepare_process(unsigned long rphys)
dkprintf("new process : %p [%d] / table : %p\n", proc, proc->pid,
vm->address_space->page_table);
ihk_mc_free(pn);
kfree(pn);
ihk_mc_unmap_virtual(p, npages, 1);
ihk_mc_unmap_memory(NULL, phys, sz);
@ -443,7 +452,7 @@ static int process_msg_prepare_process(unsigned long rphys)
return 0;
err:
ihk_mc_free(pn);
kfree(pn);
ihk_mc_unmap_virtual(p, npages, 1);
ihk_mc_unmap_memory(NULL, phys, sz);
destroy_thread(thread);
@ -452,7 +461,7 @@ err:
static void process_msg_init(struct ikc_scd_init_param *pcp, struct syscall_params *lparam)
{
lparam->response_va = allocate_pages(RESPONSE_PAGE_COUNT, 0);
lparam->response_va = ihk_mc_alloc_pages(RESPONSE_PAGE_COUNT, 0);
lparam->response_pa = virt_to_phys(lparam->response_va);
pcp->request_page = 0;
@ -521,46 +530,17 @@ static void syscall_channel_send(struct ihk_ikc_channel_desc *c,
}
extern unsigned long do_kill(struct thread *, int, int, int, struct siginfo *, int ptracecont);
extern void settid(struct thread *proc, int mode, int newcpuid, int oldcpuid);
extern void process_procfs_request(unsigned long rarg);
extern int memcheckall();
extern int freecheck(int runcount);
extern int runcount;
extern void terminate_host(int pid);
extern void debug_log(long);
static void req_get_cpu_mapping(long req_rpa)
{
size_t mapsize;
size_t size;
int npages;
long phys;
struct get_cpu_mapping_req *req;
struct cpu_mapping *buf;
size = sizeof(*req);
mapsize = size + (req_rpa & (PAGE_SIZE - 1));
npages = (mapsize + PAGE_SIZE - 1) >> PAGE_SHIFT;
phys = ihk_mc_map_memory(NULL, req_rpa, size);
req = ihk_mc_map_virtual(phys, npages, PTATTR_WRITABLE);
req->error = arch_get_cpu_mapping(&buf, &req->buf_elems);
if (!req->error) {
req->buf_rpa = virt_to_phys(buf);
}
ihk_mc_unmap_virtual(req, npages, 0);
ihk_mc_unmap_memory(NULL, phys, size);
return;
} /* req_get_cpu_mapping() */
static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
void *__packet, void *ihk_os)
{
struct ikc_scd_packet *packet = __packet;
struct ikc_scd_packet pckt;
int rc;
struct mcs_rwlock_node_irqsave lock;
struct thread *thread;
struct process *proc;
struct mcctrl_signal {
@ -572,22 +552,17 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
} *sp, info;
unsigned long pp;
int cpuid;
int ret = 0;
switch (packet->msg) {
case SCD_MSG_INIT_CHANNEL_ACKED:
dkprintf("SCD_MSG_INIT_CHANNEL_ACKED\n");
process_msg_init_acked(c, packet->arg);
return 0;
ret = 0;
break;
case SCD_MSG_PREPARE_PROCESS:
if (find_command_line("memdebug")) {
memcheckall();
if (runcount)
freecheck(runcount);
runcount++;
}
if((rc = process_msg_prepare_process(packet->arg)) == 0){
pckt.msg = SCD_MSG_PREPARE_PROCESS_ACKED;
pckt.err = 0;
@ -600,19 +575,20 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
pckt.arg = packet->arg;
syscall_channel_send(c, &pckt);
return 0;
ret = 0;
break;
case SCD_MSG_SCHEDULE_PROCESS:
cpuid = obtain_clone_cpuid();
if(cpuid == -1){
kprintf("No CPU available\n");
return -1;
ret = -1;
break;
}
dkprintf("SCD_MSG_SCHEDULE_PROCESS: %lx\n", packet->arg);
thread = (struct thread *)packet->arg;
proc = thread->proc;
settid(thread, 0, cpuid, -1);
thread->tid = proc->pid;
proc->status = PS_RUNNING;
thread->status = PS_RUNNING;
chain_thread(thread);
@ -620,7 +596,29 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
runq_add_thread(thread, cpuid);
//cpu_local_var(next) = (struct thread *)packet->arg;
return 0;
ret = 0;
break;
/*
* Used for syscall offload reply message to explicitly schedule in
* the waiting thread
*/
case SCD_MSG_WAKE_UP_SYSCALL_THREAD:
thread = find_thread(0, packet->ttid, &lock);
if (!thread) {
kprintf("%s: WARNING: no thread for SCD reply? TID: %d\n",
__FUNCTION__, packet->ttid);
ret = -EINVAL;
break;
}
thread_unlock(thread, &lock);
dkprintf("%s: SCD_MSG_WAKE_UP_SYSCALL_THREAD: waking up tid %d\n",
__FUNCTION__, packet->ttid);
waitq_wakeup(&thread->scd_wq);
ret = 0;
break;
case SCD_MSG_SEND_SIGNAL:
pp = ihk_mc_map_memory(NULL, packet->arg, sizeof(struct mcctrl_signal));
sp = (struct mcctrl_signal *)ihk_mc_map_virtual(pp, 1, PTATTR_WRITABLE | PTATTR_ACTIVE);
@ -635,18 +633,25 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
rc = do_kill(NULL, info.pid, info.tid, info.sig, &info.info, 0);
kprintf("SCD_MSG_SEND_SIGNAL: do_kill(pid=%d, tid=%d, sig=%d)=%d\n", info.pid, info.tid, info.sig, rc);
return 0;
ret = 0;
break;
case SCD_MSG_PROCFS_REQUEST:
process_procfs_request(packet->arg);
return 0;
ret = 0;
break;
case SCD_MSG_CLEANUP_PROCESS:
dkprintf("SCD_MSG_CLEANUP_PROCESS pid=%d\n", packet->pid);
terminate_host(packet->pid);
return 0;
ret = 0;
break;
case SCD_MSG_DEBUG_LOG:
dkprintf("SCD_MSG_DEBUG_LOG code=%lx\n", packet->arg);
debug_log(packet->arg);
return 0;
ret = 0;
break;
case SCD_MSG_SYSFS_REQ_SHOW:
case SCD_MSG_SYSFS_REQ_STORE:
@ -654,25 +659,21 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
sysfss_packet_handler(c, packet->msg, packet->err,
packet->sysfs_arg1, packet->sysfs_arg2,
packet->sysfs_arg3);
return 0;
case SCD_MSG_GET_CPU_MAPPING:
req_get_cpu_mapping(packet->arg);
pckt.msg = SCD_MSG_REPLY_GET_CPU_MAPPING;
pckt.arg = packet->arg;
syscall_channel_send(c, &pckt);
return 0;
ret = 0;
break;
default:
kprintf("syscall_pakcet_handler:unknown message "
"(%d.%d.%d.%d.%d.%#lx)\n",
packet->msg, packet->ref, packet->osnum,
packet->pid, packet->err, packet->arg);
return 0;
ret = 0;
break;
}
return 0;
ihk_ikc_release_packet((struct ihk_ikc_free_packet *)packet, c);
return ret;
}
void init_host_syscall_channel(void)

View File

@ -19,11 +19,13 @@
* CPU Local Storage (cls)
*/
struct malloc_header {
unsigned int check;
struct kmalloc_header {
unsigned int front_magic;
unsigned int cpu_id;
struct malloc_header *next;
unsigned long size;
struct list_head list;
int size; /* The size of this chunk without the header */
unsigned int end_magic;
/* 32 bytes */
};
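
Bracketing the header with front_magic and end_magic lets the allocator detect both an overflow from the preceding chunk and corruption of the header itself before trusting the embedded list pointers. A hedged sketch of such a check (the magic constants are illustrative placeholders, not McKernel's actual values):

#define KMALLOC_FRONT_MAGIC 0x5a5a5a5aU /* placeholder value */
#define KMALLOC_END_MAGIC   0xa5a5a5a5U /* placeholder value */

/* Returns 1 when the header looks intact, 0 when either guard
 * word has been clobbered. */
static int kmalloc_header_ok(const struct kmalloc_header *h)
{
    return h->front_magic == KMALLOC_FRONT_MAGIC &&
           h->end_magic == KMALLOC_END_MAGIC;
}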
#include <ihk/lock.h>
@ -38,8 +40,9 @@ extern ihk_spinlock_t cpu_status_lock;
struct cpu_local_var {
/* malloc */
struct malloc_header free_list;
struct malloc_header *remote_free_list;
struct list_head free_list;
struct list_head remote_free_list;
ihk_spinlock_t remote_free_list_lock;
struct thread idle;
struct process idle_proc;
@ -73,6 +76,7 @@ struct cpu_local_var {
int in_interrupt;
int no_preempt;
int timer_enabled;
int kmalloc_initialized;
} __attribute__((aligned(64)));

View File

@ -32,4 +32,6 @@ extern void cpu_sysfs_setup(void);
extern char *find_command_line(char *name);
extern int num_processors;
#endif

View File

@ -32,11 +32,10 @@ void *_kmalloc(int size, enum ihk_mc_ap_flag flag, char *file, int line);
void _kfree(void *ptr, char *file, int line);
void *__kmalloc(int size, enum ihk_mc_ap_flag flag);
void __kfree(void *ptr);
void *___kmalloc(int size, enum ihk_mc_ap_flag flag);
void ___kfree(void *ptr);
int _memcheck(void *ptr, char *msg, char *file, int line, int free);
int memcheckall();
int freecheck(int runcount);
void kmalloc_consolidate_free_list(void);
#endif

View File

@ -141,6 +141,7 @@ int fileobj_create(int fd, struct memobj **objp, int *maxprotp);
struct shmid_ds;
int shmobj_create(struct shmid_ds *ds, struct memobj **objp);
int zeroobj_create(struct memobj **objp);
int devobj_create(int fd, size_t len, off_t off, struct memobj **objp, int *maxprotp);
int devobj_create(int fd, size_t len, off_t off, struct memobj **objp, int *maxprotp,
int prot, int populate_flags);
#endif /* HEADER_MEMOBJ_H */

View File

@ -17,8 +17,9 @@
struct page {
struct list_head list;
struct list_head hash;
uint8_t mode;
uint8_t padding[3];
uint64_t phys;
ihk_atomic_t count;
off_t offset;
};
@ -38,9 +39,8 @@ enum page_mode {
struct page *phys_to_page(uintptr_t phys);
uintptr_t page_to_phys(struct page *page);
int page_unmap(struct page *page);
struct page *phys_to_page_insert_hash(uint64_t phys);
void *allocate_pages(int npages, enum ihk_mc_ap_flag flag);
void free_pages(void *va, int npages);
void begin_free_pages_pending(void);
void finish_free_pages_pending(void);

View File

@ -22,6 +22,7 @@
#include <memobj.h>
#include <affinity.h>
#include <syscall.h>
#include <bitops.h>
#define VR_NONE 0x0
#define VR_STACK 0x1
@ -29,6 +30,7 @@
#define VR_IO_NOCACHE 0x100
#define VR_REMOTE 0x200
#define VR_WRITE_COMBINED 0x400
#define VR_DONTFORK 0x800
#define VR_DEMAND_PAGING 0x1000
#define VR_PRIVATE 0x2000
#define VR_LOCKED 0x4000
@ -160,13 +162,78 @@
#endif
#define USER_STACK_NR_PAGES 8192
#define KERNEL_STACK_NR_PAGES 25
#define KERNEL_STACK_NR_PAGES 32
#define NOPHYS ((uintptr_t)-1)
#define PROCESS_NUMA_MASK_BITS 64
/*
* Both the MPOL_* mempolicy mode and the MPOL_F_* optional mode flags are
* passed by the user to either set_mempolicy() or mbind() in an 'int' actual.
* The MPOL_MODE_FLAGS macro determines the legal set of optional mode flags.
*/
/* Policies */
enum {
MPOL_DEFAULT,
MPOL_PREFERRED,
MPOL_BIND,
MPOL_INTERLEAVE,
MPOL_LOCAL,
MPOL_MAX, /* always last member of enum */
};
enum mpol_rebind_step {
MPOL_REBIND_ONCE, /* do rebind work at once(not by two step) */
MPOL_REBIND_STEP1, /* first step(set all the newly nodes) */
MPOL_REBIND_STEP2, /* second step(clean all the disallowed nodes)*/
MPOL_REBIND_NSTEP,
};
/* Flags for set_mempolicy */
#define MPOL_F_STATIC_NODES (1 << 15)
#define MPOL_F_RELATIVE_NODES (1 << 14)
/*
* MPOL_MODE_FLAGS is the union of all possible optional mode flags passed to
* either set_mempolicy() or mbind().
*/
#define MPOL_MODE_FLAGS (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES)
/* Flags for get_mempolicy */
#define MPOL_F_NODE (1<<0) /* return next IL mode instead of node mask */
#define MPOL_F_ADDR (1<<1) /* look up vma using address */
#define MPOL_F_MEMS_ALLOWED (1<<2) /* return allowed memories */
/* Flags for mbind */
#define MPOL_MF_STRICT (1<<0) /* Verify existing pages in the mapping */
#define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform
to policy */
#define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to policy */
#define MPOL_MF_LAZY (1<<3) /* Modifies '_MOVE: lazy migrate on fault */
#define MPOL_MF_INTERNAL (1<<4) /* Internal flags start here */
#define MPOL_MF_VALID (MPOL_MF_STRICT | \
MPOL_MF_MOVE | \
MPOL_MF_MOVE_ALL)
/*
* Internal flags that share the struct mempolicy flags word with
* "mode flags". These flags are allocated from bit 0 up, as they
* are never OR'ed into the mode in mempolicy API arguments.
*/
#define MPOL_F_SHARED (1 << 0) /* identify shared policies */
#define MPOL_F_LOCAL (1 << 1) /* preferred local allocation */
#define MPOL_F_REBINDING (1 << 2) /* identify policies in rebinding */
#define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */
#define MPOL_F_MORON (1 << 4) /* Migrate On pte_numa Reference On Node */
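
These MPOL_* values match the ones glibc exposes through <numaif.h>, so the new set_mempolicy()/get_mempolicy() support can be exercised with an ordinary user-space call. A minimal example binding the calling thread's allocations to node 0 (issuing the raw syscall to avoid a libnuma dependency):

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

#define MPOL_BIND 2

int main(void)
{
    unsigned long nodemask = 1UL; /* bit 0 set: NUMA node 0 only */

    if (syscall(SYS_set_mempolicy, MPOL_BIND, &nodemask,
                sizeof(nodemask) * 8) != 0) {
        perror("set_mempolicy");
        return 1;
    }
    printf("memory policy set to MPOL_BIND on node 0\n");
    return 0;
}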
#include <waitq.h>
#include <futex.h>
//#define TRACK_SYSCALLS
struct resource_set;
struct process_hash;
struct thread_hash;
@ -304,6 +371,13 @@ struct vm_range {
int padding;
};
struct vm_range_numa_policy {
struct list_head list;
unsigned long start, end;
DECLARE_BITMAP(numa_mask, PROCESS_NUMA_MASK_BITS);
int numa_mem_policy;
};
struct vm_regions {
unsigned long vm_start, vm_end;
unsigned long text_start, text_end;
@ -319,19 +393,21 @@ struct process_vm;
struct mckfd {
struct mckfd *next;
int fd;
int sig_no;
long data;
void *opt;
long (*read_cb)(struct mckfd *, ihk_mc_user_context_t *);
int (*ioctl_cb)(struct mckfd *, ihk_mc_user_context_t *);
long (*mmap_cb)(struct mckfd *, ihk_mc_user_context_t *);
int (*close_cb)(struct mckfd *, ihk_mc_user_context_t *);
int (*fcntl_cb)(struct mckfd *, ihk_mc_user_context_t *);
};
#define SFD_CLOEXEC 02000000
#define SFD_NONBLOCK 04000
struct sig_common {
ihk_spinlock_t lock;
mcs_rwlock_lock_t lock;
ihk_atomic_t use;
struct k_sigaction action[_NSIG];
struct list_head sigpending;
@ -346,6 +422,11 @@ struct sig_pending {
typedef void pgio_func_t(void *arg);
struct mcexec_tid {
int tid;
struct thread *thread;
};
/* Represents a node in the process fork tree; it may exist even after the
* corresponding process exited due to references from the parent and/or
* children and is used for implementing wait/waitpid without having a
@ -360,6 +441,9 @@ struct process {
// threads and children
struct list_head threads_list;
mcs_rwlock_lock_t threads_lock; // lock for threads_list
/* TID set of proxy process */
struct mcexec_tid *tids;
int nr_tids;
/* The ptracing process behave as the parent of the ptraced process
after using PTRACE_ATTACH except getppid. So we save it here. */
@ -384,7 +468,7 @@ struct process {
// V +---- |
// PS_STOPPED -----+
// (PS_TRACED)
int exit_status;
int exit_status; // only for zombie
/* Store exit_status for a group of threads when stopped by SIGSTOP.
exit_status can't be used because values of exit_status of threads
@ -496,6 +580,7 @@ struct thread {
// PS_TRACED
// PS_INTERRPUTIBLE
// PS_UNINTERRUPTIBLE
int exit_status;
// process vm
struct process_vm *vm;
@ -526,12 +611,20 @@ struct thread {
fp_regs_struct *fp_regs;
int in_syscall_offload;
#ifdef TRACK_SYSCALLS
int socc_enabled;
uint64_t *syscall_times;
uint32_t *syscall_cnts;
uint64_t *offload_times;
uint32_t *offload_cnts;
#endif // TRACK_SYSCALLS
// signal
struct sig_common *sigcommon;
sigset_t sigmask;
stack_t sigstack;
struct list_head sigpending;
ihk_spinlock_t sigpendinglock;
mcs_rwlock_lock_t sigpendinglock;
volatile int sigevent;
// gpio
@ -556,8 +649,13 @@ struct thread {
struct itimerval itimer_prof;
struct timespec itimer_virtual_value;
struct timespec itimer_prof_value;
/* Syscall offload wait queue head */
struct waitq scd_wq;
};
#define VM_RANGE_CACHE_SIZE 4
struct process_vm {
struct address_space *address_space;
struct list_head vm_range_list;
@ -580,6 +678,12 @@ struct process_vm {
int exiting;
long currss;
DECLARE_BITMAP(numa_mask, PROCESS_NUMA_MASK_BITS);
int numa_mem_policy;
/* Protected by memory_range_lock */
struct list_head vm_range_numa_policy_list;
struct vm_range *range_cache[VM_RANGE_CACHE_SIZE];
int range_cache_ind;
};
static inline int has_cap_ipc_lock(struct thread *th)
@ -675,5 +779,6 @@ void chain_process(struct process *);
void chain_thread(struct thread *);
void proc_init();
void set_timer();
struct sig_pending *hassigpending(struct thread *thread);
#endif

View File

@ -31,6 +31,7 @@
#define SCD_MSG_PREPARE_PROCESS_ACKED 0x2
#define SCD_MSG_PREPARE_PROCESS_NACKED 0x7
#define SCD_MSG_SCHEDULE_PROCESS 0x3
#define SCD_MSG_WAKE_UP_SYSCALL_THREAD 0x14
#define SCD_MSG_INIT_CHANNEL 0x5
#define SCD_MSG_INIT_CHANNEL_ACKED 0x6
@ -117,28 +118,6 @@ struct user_desc {
unsigned int lm:1;
};
struct ikc_scd_packet {
int msg;
int err;
union {
/* for traditional SCD_MSG_* */
struct {
int ref;
int osnum;
int pid;
int padding;
unsigned long arg;
};
/* for SCD_MSG_SYSFS_* */
struct {
long sysfs_arg1;
long sysfs_arg2;
long sysfs_arg3;
};
};
};
struct program_image_section {
unsigned long vaddr;
unsigned long len;
@ -210,13 +189,58 @@ struct ikc_scd_init_param {
};
struct syscall_request {
/* TID of requesting thread */
int rtid;
/*
* TID of target thread. Remote page fault response needs to designate the
* thread that must serve the request, 0 indicates any thread from the pool
*/
int ttid;
unsigned long valid;
unsigned long number;
unsigned long args[6];
};
struct ikc_scd_packet {
int msg;
int err;
union {
/* for traditional SCD_MSG_* */
struct {
int ref;
int osnum;
int pid;
unsigned long arg;
struct syscall_request req;
unsigned long resp_pa;
};
/* for SCD_MSG_SYSFS_* */
struct {
long sysfs_arg1;
long sysfs_arg2;
long sysfs_arg3;
};
/* SCD_MSG_SCHEDULE_THREAD */
struct {
int ttid;
};
};
char padding[12];
};
#define IHK_SCD_REQ_THREAD_SPINNING 0
#define IHK_SCD_REQ_THREAD_TO_BE_WOKEN 1
#define IHK_SCD_REQ_THREAD_DESCHEDULED 2
struct syscall_response {
/* TID of the thread that requested the service */
int ttid;
/* TID of the mcexec thread that is serving the request */
int stid;
unsigned long status;
unsigned long req_thread_status;
long ret;
unsigned long fault_address;
unsigned long fault_reason;
@ -307,7 +331,7 @@ void delete_proc_procfs_files(int pid);
void create_os_procfs_files(void);
void delete_os_procfs_files(void);
#define PROCFS_NAME_MAX 1000
#define PROCFS_NAME_MAX 768
struct procfs_read {
unsigned long pbuf; /* physical address of the host buffer (request) */

View File

@ -320,7 +320,7 @@ static void setup_remote_snooping_samples(void)
static void populate_sysfs(void)
{
cpu_sysfs_setup();
setup_remote_snooping_samples();
//setup_remote_snooping_samples();
} /* populate_sysfs() */
int host_ikc_inited = 0;
@ -371,7 +371,7 @@ int main(void)
}
kmsg_init(mode);
kputs("MCK started.\n");
kputs("IHK/McKernel started.\n");
arch_init();
@ -393,7 +393,7 @@ int main(void)
futex_init();
kputs("MCK/IHK booted.\n");
kputs("IHK/McKernel booted.\n");
#ifdef DCFA_KMOD
mc_cmd_client_init();

File diff suppressed because it is too large

View File

@ -53,7 +53,6 @@ static int copy_user_ranges(struct process_vm *vm, struct process_vm *orgvm);
extern void release_fp_regs(struct thread *proc);
extern void save_fp_regs(struct thread *proc);
extern void restore_fp_regs(struct thread *proc);
void settid(struct thread *proc, int mode, int newcpuid, int oldcpuid);
extern void __runq_add_proc(struct thread *proc, int cpu_id);
extern void terminate_host(int pid);
extern void lapic_timer_enable(unsigned int clocks);
@ -75,7 +74,6 @@ init_process(struct process *proc, struct process *parent)
{
/* These will be filled out when changing status */
proc->pid = -1;
proc->exit_status = -1;
proc->status = PS_RUNNING;
if(parent){
@ -205,15 +203,33 @@ detach_address_space(struct address_space *asp, int pid)
static int
init_process_vm(struct process *owner, struct address_space *asp, struct process_vm *vm)
{
int i;
ihk_mc_spinlock_init(&vm->memory_range_lock);
ihk_mc_spinlock_init(&vm->page_table_lock);
ihk_atomic_set(&vm->refcount, 1);
INIT_LIST_HEAD(&vm->vm_range_list);
INIT_LIST_HEAD(&vm->vm_range_numa_policy_list);
vm->address_space = asp;
vm->proc = owner;
vm->exiting = 0;
memset(&vm->numa_mask, 0, sizeof(vm->numa_mask));
for (i = 0; i < ihk_mc_get_nr_numa_nodes(); ++i) {
if (i >= PROCESS_NUMA_MASK_BITS) {
kprintf("%s: error: NUMA id is larger than mask size!\n",
__FUNCTION__);
break;
}
set_bit(i, &vm->numa_mask[0]);
}
vm->numa_mem_policy = MPOL_DEFAULT;
for (i = 0; i < VM_RANGE_CACHE_SIZE; ++i) {
vm->range_cache[i] = NULL;
}
vm->range_cache_ind = 0;
return 0;
}
@ -261,10 +277,10 @@ create_thread(unsigned long user_pc)
dkprintf("fork(): sigshared\n");
ihk_atomic_set(&thread->sigcommon->use, 1);
ihk_mc_spinlock_init(&thread->sigcommon->lock);
mcs_rwlock_init(&thread->sigcommon->lock);
INIT_LIST_HEAD(&thread->sigcommon->sigpending);
ihk_mc_spinlock_init(&thread->sigpendinglock);
mcs_rwlock_init(&thread->sigpendinglock);
INIT_LIST_HEAD(&thread->sigpending);
thread->sigstack.ss_sp = NULL;
@ -281,6 +297,7 @@ create_thread(unsigned long user_pc)
if(init_process_vm(proc, asp, vm) != 0){
goto err;
}
thread->exit_status = -1;
cpu_set(ihk_mc_get_processor_id(), &thread->vm->address_space->cpu_set,
&thread->vm->address_space->cpu_set_lock);
@ -338,6 +355,10 @@ clone_thread(struct thread *org, unsigned long pc, unsigned long sp,
proc = org->proc;
thread->vm = org->vm;
thread->proc = proc;
thread->sigstack.ss_sp = NULL;
thread->sigstack.ss_flags = SS_DISABLE;
thread->sigstack.ss_size = 0;
}
/* fork() */
else {
@ -368,6 +389,11 @@ clone_thread(struct thread *org, unsigned long pc, unsigned long sp,
kfree(proc);
goto err_free_proc;
}
memcpy(&proc->vm->numa_mask, &org->vm->numa_mask,
sizeof(proc->vm->numa_mask));
proc->vm->numa_mem_policy =
org->vm->numa_mem_policy;
thread->proc = proc;
thread->vm = proc->vm;
@ -383,9 +409,15 @@ clone_thread(struct thread *org, unsigned long pc, unsigned long sp,
goto err_free_proc;
}
thread->vm->vdso_addr = org->vm->vdso_addr;
thread->vm->vvar_addr = org->vm->vvar_addr;
thread->proc->maxrss = org->proc->maxrss;
thread->vm->currss = org->vm->currss;
thread->sigstack.ss_sp = org->sigstack.ss_sp;
thread->sigstack.ss_flags = org->sigstack.ss_flags;
thread->sigstack.ss_size = org->sigstack.ss_size;
dkprintf("fork(): copy_user_ranges() OK\n");
}
@ -409,14 +441,11 @@ clone_thread(struct thread *org, unsigned long pc, unsigned long sp,
memcpy(thread->sigcommon->action, org->sigcommon->action,
sizeof(struct k_sigaction) * _NSIG);
ihk_atomic_set(&thread->sigcommon->use, 1);
ihk_mc_spinlock_init(&thread->sigcommon->lock);
mcs_rwlock_init(&thread->sigcommon->lock);
INIT_LIST_HEAD(&thread->sigcommon->sigpending);
// TODO: copy signalfd
}
thread->sigstack.ss_sp = NULL;
thread->sigstack.ss_flags = SS_DISABLE;
thread->sigstack.ss_size = 0;
ihk_mc_spinlock_init(&thread->sigpendinglock);
mcs_rwlock_init(&thread->sigpendinglock);
INIT_LIST_HEAD(&thread->sigpending);
thread->sigmask = org->sigmask;
@ -566,6 +595,9 @@ static int copy_user_ranges(struct process_vm *vm, struct process_vm *orgvm)
break;
}
if(src_range->flag & VR_DONTFORK)
continue;
range = kmalloc(sizeof(struct vm_range), IHK_MC_AP_NOWAIT);
if (!range) {
goto err_rollback;
@ -707,6 +739,7 @@ int join_process_memory_range(struct process_vm *vm,
struct vm_range *surviving, struct vm_range *merging)
{
int error;
int i;
dkprintf("join_process_memory_range(%p,%lx-%lx,%lx-%lx)\n",
vm, surviving->start, surviving->end,
@ -735,7 +768,11 @@ int join_process_memory_range(struct process_vm *vm,
memobj_release(merging->memobj);
}
list_del(&merging->list);
ihk_mc_free(merging);
for (i = 0; i < VM_RANGE_CACHE_SIZE; ++i) {
if (vm->range_cache[i] == merging)
vm->range_cache[i] = surviving;
}
kfree(merging);
error = 0;
out:
@ -748,7 +785,7 @@ int free_process_memory_range(struct process_vm *vm, struct vm_range *range)
{
const intptr_t start0 = range->start;
const intptr_t end0 = range->end;
int error;
int error, i;
intptr_t start;
intptr_t end;
struct vm_range *neighbor;
@ -831,8 +868,13 @@ int free_process_memory_range(struct process_vm *vm, struct vm_range *range)
if (range->memobj) {
memobj_release(range->memobj);
}
list_del(&range->list);
ihk_mc_free(range);
for (i = 0; i < VM_RANGE_CACHE_SIZE; ++i) {
if (vm->range_cache[i] == range)
vm->range_cache[i] = NULL;
}
kfree(range);
dkprintf("free_process_memory_range(%p,%lx-%lx): 0\n",
vm, start0, end0);
@ -958,7 +1000,6 @@ enum ihk_mc_pt_attribute common_vrflag_to_ptattr(unsigned long flag, uint64_t fa
return attr;
}
/* XXX: indentation needs to be aligned */
int add_process_memory_range(struct process_vm *vm,
unsigned long start, unsigned long end,
unsigned long phys, unsigned long flag,
@ -1049,6 +1090,7 @@ int add_process_memory_range(struct process_vm *vm,
struct vm_range *lookup_process_memory_range(
struct process_vm *vm, uintptr_t start, uintptr_t end)
{
int i;
struct vm_range *range = NULL;
dkprintf("lookup_process_memory_range(%p,%lx,%lx)\n", vm, start, end);
@ -1057,6 +1099,16 @@ struct vm_range *lookup_process_memory_range(
goto out;
}
for (i = 0; i < VM_RANGE_CACHE_SIZE; ++i) {
int c_i = (i + vm->range_cache_ind) % VM_RANGE_CACHE_SIZE;
if (!vm->range_cache[c_i])
continue;
if (vm->range_cache[c_i]->start <= start &&
vm->range_cache[c_i]->end >= end)
return vm->range_cache[c_i];
}
list_for_each_entry(range, &vm->vm_range_list, list) {
if (end <= range->start) {
break;
@ -1068,6 +1120,12 @@ struct vm_range *lookup_process_memory_range(
range = NULL;
out:
if (range) {
vm->range_cache_ind = (vm->range_cache_ind - 1 + VM_RANGE_CACHE_SIZE)
% VM_RANGE_CACHE_SIZE;
vm->range_cache[vm->range_cache_ind] = range;
}
dkprintf("lookup_process_memory_range(%p,%lx,%lx): %p %lx-%lx\n",
vm, start, end, range,
range? range->start: 0, range? range->end: 0);
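
The cache maintained here is a small most-recently-used scheme: lookups probe the VM_RANGE_CACHE_SIZE slots starting at range_cache_ind, and a hit on the list walk installs the result one slot before the current index so it is probed first next time. A stand-alone sketch of the same rotation (mru_probe()/mru_insert() are illustrative names, with a hypothetical match callback standing in for the start/end range comparison):

#include <stddef.h>

#define CACHE_SIZE 4 /* mirrors VM_RANGE_CACHE_SIZE */

struct mru_cache {
    void *slot[CACHE_SIZE];
    int ind;
};

/* Probe from the most recently inserted slot outwards. */
static void *mru_probe(struct mru_cache *c,
                       int (*match)(void *entry, void *key), void *key)
{
    int i;

    for (i = 0; i < CACHE_SIZE; ++i) {
        void *e = c->slot[(i + c->ind) % CACHE_SIZE];

        if (e && match(e, key))
            return e;
    }
    return NULL;
}

/* Rotate the index backwards and install the new entry there,
 * making it the first slot probed on the next lookup. */
static void mru_insert(struct mru_cache *c, void *entry)
{
    c->ind = (c->ind - 1 + CACHE_SIZE) % CACHE_SIZE;
    c->slot[c->ind] = entry;
}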
@ -1508,7 +1566,7 @@ static int page_fault_process_memory_range(struct process_vm *vm, struct vm_rang
}
}
if (phys == NOPHYS) {
void *virt;
void *virt = NULL;
size_t npages;
retry:
@@ -1529,9 +1587,13 @@ retry:
kprintf("page_fault_process_memory_range(%p,%lx-%lx %lx,%lx,%lx):cannot allocate new page. %d\n", vm, range->start, range->end, range->flag, fault_addr, reason, error);
goto out;
}
dkprintf("%s: clearing 0x%lx:%lu\n",
__FUNCTION__, pgaddr, pgsize);
memset(virt, 0, pgsize);
phys = virt_to_phys(virt);
page_map(phys_to_page(phys));
if (phys_to_page(phys)) {
page_map(phys_to_page(phys));
}
}
}
else {
@@ -1543,10 +1605,12 @@ retry:
attr = arch_vrflag_to_ptattr(range->flag | memobj_flag, reason, ptep);
/*****/
if (((range->flag & VR_PRIVATE)
|| ((reason & PF_PATCH)
&& !(range->flag & VR_PROT_WRITE)))
&& (!page || page_is_in_memobj(page) || page_is_multi_mapped(page))) {
if (((range->flag & VR_PRIVATE) ||
((reason & PF_PATCH) && !(range->flag & VR_PROT_WRITE)))
&& ((!page && phys == NOPHYS) || (page &&
(page_is_in_memobj(page) ||
page_is_multi_mapped(page))))) {
if (!(attr & PTATTR_DIRTY)) {
attr &= ~PTATTR_WRITABLE;
}
@@ -1561,6 +1625,8 @@ retry:
kprintf("page_fault_process_memory_range(%p,%lx-%lx %lx,%lx,%lx):cannot allocate copy page. %d\n", vm, range->start, range->end, range->flag, fault_addr, reason, error);
goto out;
}
dkprintf("%s: copying 0x%lx:%lu\n",
__FUNCTION__, pgaddr, pgsize);
memcpy(virt, phys_to_virt(phys), pgsize);
phys = virt_to_phys(virt);
@@ -1624,10 +1690,9 @@ static int do_page_fault_process_vm(struct process_vm *vm, void *fault_addr0, ui
range = lookup_process_memory_range(vm, fault_addr, fault_addr+1);
if (range == NULL) {
error = -EFAULT;
dkprintf("[%d]do_page_fault_process_vm(%p,%lx,%lx):"
"out of range. %d\n",
ihk_mc_get_processor_id(), vm,
fault_addr0, reason, error);
dkprintf("do_page_fault_process_vm(): vm: %p, addr: %p, reason: %lx):"
"out of range: %d\n",
vm, fault_addr0, reason, error);
goto out;
}
@@ -1641,14 +1706,34 @@ static int do_page_fault_process_vm(struct process_vm *vm, void *fault_addr0, ui
"access denied. %d\n",
ihk_mc_get_processor_id(), vm,
fault_addr0, reason, error);
kprintf("%s: reason: %s%s%s%s%s%s%s\n", __FUNCTION__,
(reason & PF_PROT) ? "PF_PROT " : "",
(reason & PF_WRITE) ? "PF_WRITE " : "",
(reason & PF_USER) ? "PF_USER " : "",
(reason & PF_RSVD) ? "PF_RSVD " : "",
(reason & PF_INSTR) ? "PF_INSTR " : "",
(reason & PF_PATCH) ? "PF_PATCH " : "",
(reason & PF_POPULATE) ? "PF_POPULATE " : "");
kprintf("%s: range->flag & (%s%s%s)\n", __FUNCTION__,
(range->flag & VR_PROT_READ) ? "VR_PROT_READ " : "",
(range->flag & VR_PROT_WRITE) ? "VR_PROT_WRITE " : "",
(range->flag & VR_PROT_EXEC) ? "VR_PROT_EXEC " : "");
if (((range->flag & VR_PROT_MASK) == VR_PROT_NONE))
kprintf("if (((range->flag & VR_PROT_MASK) == VR_PROT_NONE))\n");
if (((reason & PF_WRITE) && !(reason & PF_PATCH)))
kprintf("if (((reason & PF_WRITE) && !(reason & PF_PATCH)))\n");
if (!(range->flag & VR_PROT_WRITE))
if (!(range->flag & VR_PROT_WRITE)) {
kprintf("if (!(range->flag & VR_PROT_WRITE))\n");
if ((reason & PF_INSTR) && !(range->flag & VR_PROT_EXEC))
//kprintf("setting VR_PROT_WRITE\n");
//range->flag |= VR_PROT_WRITE;
//goto cont;
}
if ((reason & PF_INSTR) && !(range->flag & VR_PROT_EXEC)) {
kprintf("if ((reason & PF_INSTR) && !(range->flag & VR_PROT_EXEC))\n");
//kprintf("setting VR_PROT_EXEC\n");
//range->flag |= VR_PROT_EXEC;
//goto cont;
}
goto out;
}
@@ -1868,14 +1953,14 @@ unsigned long extend_process_region(struct process_vm *vm,
aligned_end = (aligned_end + (LARGE_PAGE_SIZE - 1)) & LARGE_PAGE_MASK;
/* Fill in the gap between old_aligned_end and aligned_end
* with regular pages */
if((p = allocate_pages((aligned_end - old_aligned_end) >> PAGE_SHIFT,
if((p = ihk_mc_alloc_pages((aligned_end - old_aligned_end) >> PAGE_SHIFT,
IHK_MC_AP_NOWAIT)) == NULL){
return end;
}
if((rc = add_process_memory_range(vm, old_aligned_end,
aligned_end, virt_to_phys(p), flag,
LARGE_PAGE_SHIFT)) != 0){
free_pages(p, (aligned_end - old_aligned_end) >> PAGE_SHIFT);
ihk_mc_free_pages(p, (aligned_end - old_aligned_end) >> PAGE_SHIFT);
return end;
}
@@ -1888,7 +1973,7 @@ unsigned long extend_process_region(struct process_vm *vm,
(LARGE_PAGE_SIZE - 1)) & LARGE_PAGE_MASK;
address = aligned_new_end;
if((p = allocate_pages((aligned_new_end - aligned_end + LARGE_PAGE_SIZE) >> PAGE_SHIFT,
if((p = ihk_mc_alloc_pages((aligned_new_end - aligned_end + LARGE_PAGE_SIZE) >> PAGE_SHIFT,
IHK_MC_AP_NOWAIT)) == NULL){
return end;
}
@@ -1896,16 +1981,16 @@ unsigned long extend_process_region(struct process_vm *vm,
p_aligned = ((unsigned long)p + (LARGE_PAGE_SIZE - 1)) & LARGE_PAGE_MASK;
if (p_aligned > (unsigned long)p) {
free_pages(p, (p_aligned - (unsigned long)p) >> PAGE_SHIFT);
ihk_mc_free_pages(p, (p_aligned - (unsigned long)p) >> PAGE_SHIFT);
}
free_pages(
ihk_mc_free_pages(
(void *)(p_aligned + aligned_new_end - aligned_end),
(LARGE_PAGE_SIZE - (p_aligned - (unsigned long)p)) >> PAGE_SHIFT);
if((rc = add_process_memory_range(vm, aligned_end,
aligned_new_end, virt_to_phys((void *)p_aligned),
flag, LARGE_PAGE_SHIFT)) != 0){
free_pages(p, (aligned_new_end - aligned_end + LARGE_PAGE_SIZE) >> PAGE_SHIFT);
ihk_mc_free_pages(p, (aligned_new_end - aligned_end + LARGE_PAGE_SIZE) >> PAGE_SHIFT);
return end;
}
@@ -1923,7 +2008,7 @@ unsigned long extend_process_region(struct process_vm *vm,
p=0;
}else{
p = allocate_pages((aligned_new_end - aligned_end) >> PAGE_SHIFT, IHK_MC_AP_NOWAIT);
p = ihk_mc_alloc_pages((aligned_new_end - aligned_end) >> PAGE_SHIFT, IHK_MC_AP_NOWAIT);
if (!p) {
return end;
@@ -1932,7 +2017,7 @@ unsigned long extend_process_region(struct process_vm *vm,
if((rc = add_process_memory_range(vm, aligned_end, aligned_new_end,
(p==0?0:virt_to_phys(p)), flag, NULL, 0,
PAGE_SHIFT)) != 0){
free_pages(p, (aligned_new_end - aligned_end) >> PAGE_SHIFT);
ihk_mc_free_pages(p, (aligned_new_end - aligned_end) >> PAGE_SHIFT);
return end;
}
@@ -2045,6 +2130,7 @@ release_process(struct process *proc)
mcs_rwlock_writer_unlock(&parent->children_lock, &lock);
}
if (proc->tids) kfree(proc->tids);
kfree(proc);
}
@@ -2150,6 +2236,23 @@ release_sigcommon(struct sig_common *sigcommon)
kfree(sigcommon);
}
/*
* Release the TID from the process' TID set corresponding to this thread.
* NOTE: threads_lock must be held.
*/
void __release_tid(struct process *proc, struct thread *thread) {
int i;
for (i = 0; i < proc->nr_tids; ++i) {
if (proc->tids[i].thread != thread) continue;
proc->tids[i].thread = NULL;
dkprintf("%s: tid %d has been released by %p\n",
__FUNCTION__, thread->tid, thread);
break;
}
}
void destroy_thread(struct thread *thread)
{
struct sig_pending *pending;
@@ -2166,6 +2269,7 @@ void destroy_thread(struct thread *thread)
mcs_rwlock_writer_lock(&proc->threads_lock, &lock);
list_del(&thread->siblings_list);
__release_tid(proc, thread);
mcs_rwlock_writer_unlock(&proc->threads_lock, &lock);
cpu_clear(thread->cpu_id, &thread->vm->address_space->cpu_set,
@@ -2303,6 +2407,8 @@ static void idle(void)
}
if (v->status == CPU_STATUS_IDLE ||
v->status == CPU_STATUS_RESERVED) {
/* No work to do? Consolidate the kmalloc free list */
kmalloc_consolidate_free_list();
cpu_safe_halt();
}
else {
@@ -2416,6 +2522,7 @@ void sched_init(void)
ihk_mc_init_context(&idle_thread->ctx, NULL, idle);
ihk_mc_spinlock_init(&idle_thread->vm->memory_range_lock);
INIT_LIST_HEAD(&idle_thread->vm->vm_range_list);
INIT_LIST_HEAD(&idle_thread->vm->vm_range_numa_policy_list);
idle_thread->proc->pid = 0;
idle_thread->tid = ihk_mc_get_processor_id();
@@ -2491,7 +2598,6 @@ static void do_migrate(void)
cur_v->runq_len -= 1;
old_cpu_id = req->thread->cpu_id;
req->thread->cpu_id = cpu_id;
settid(req->thread, 2, cpu_id, old_cpu_id);
list_add_tail(&req->thread->sched_list, &v->runq);
v->runq_len += 1;
@@ -2541,13 +2647,8 @@ void schedule(void)
struct thread *last;
if (cpu_local_var(no_preempt)) {
dkprintf("no schedule() while no preemption! \n");
return;
}
if (cpu_local_var(current)
&& cpu_local_var(current)->in_syscall_offload) {
dkprintf("no schedule() while syscall offload!\n");
kprintf("%s: WARNING can't schedule() while no preemption, cnt: %d\n",
__FUNCTION__, cpu_local_var(no_preempt));
return;
}
@@ -2576,9 +2677,10 @@ redo:
if (v->flags & CPU_FLAG_NEED_MIGRATE) {
next = &cpu_local_var(idle);
} else {
/* Pick a new running process */
/* Pick a new running process or one that has a pending signal */
list_for_each_entry_safe(thread, tmp, &(v->runq), sched_list) {
if (thread->status == PS_RUNNING) {
if (thread->status == PS_RUNNING ||
(thread->status == PS_INTERRUPTIBLE && hassigpending(thread))) {
next = thread;
break;
}
@@ -2622,7 +2724,9 @@ redo:
restore_fp_regs(next);
}
ihk_mc_load_page_table(next->vm->address_space->page_table);
if (prev && prev->vm->address_space->page_table !=
next->vm->address_space->page_table)
ihk_mc_load_page_table(next->vm->address_space->page_table);
dkprintf("[%d] schedule: tlsblock_base: 0x%lX\n",
ihk_mc_get_processor_id(), next->tlsblock_base);
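On x86, loading a page table reloads CR3 and flushes the non-global TLB entries, so the guard added above skips the load whenever the outgoing and incoming threads share an address space, which every thread-to-thread switch inside one process does. A hedged simulation of the check (the structs below are stand-ins, not the kernel's):

```c
#include <stdio.h>

struct address_space { void *page_table; };
struct thread_s { struct address_space *as; };

static int loads;	/* counts simulated CR3 reloads */

static void load_page_table(void *pt) { (void)pt; ++loads; }

/* Reload only when the address space actually changes, mirroring the
 * prev/next comparison added in schedule(). */
static void switch_to(struct thread_s *prev, struct thread_s *next)
{
	if (!prev || prev->as->page_table != next->as->page_table)
		load_page_table(next->as->page_table);
}

int main(void)
{
	struct address_space a = { (void *)0x1000 }, b = { (void *)0x2000 };
	struct thread_s t1 = { &a }, t2 = { &a }, t3 = { &b };

	switch_to(NULL, &t1);	/* first switch: load */
	switch_to(&t1, &t2);	/* same process: skipped */
	switch_to(&t2, &t3);	/* different process: load */
	printf("page table loads: %d\n", loads);	/* prints 2 */
	return 0;
}
```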
@@ -2704,9 +2808,11 @@ sched_wakeup_thread(struct thread *thread, int valid_states)
int spin_slept = 0;
unsigned long irqstate;
struct cpu_local_var *v = get_cpu_local_var(thread->cpu_id);
struct process *proc = thread->proc;
struct mcs_rwlock_node updatelock;
dkprintf("sched_wakeup_process,proc->pid=%d,valid_states=%08x,proc->status=%08x,proc->cpu_id=%d,my cpu_id=%d\n",
thread->proc->pid, valid_states, thread->status, thread->cpu_id, ihk_mc_get_processor_id());
proc->pid, valid_states, thread->status, thread->cpu_id, ihk_mc_get_processor_id());
irqstate = ihk_mc_spinlock_lock(&(thread->spin_sleep_lock));
if (thread->spin_sleep > 0) {
@@ -2726,7 +2832,10 @@ sched_wakeup_thread(struct thread *thread, int valid_states)
irqstate = ihk_mc_spinlock_lock(&(v->runq_lock));
if (thread->status & valid_states) {
xchg4((int *)(&thread->proc->status), PS_RUNNING);
mcs_rwlock_writer_lock_noirq(&proc->update_lock, &updatelock);
if(proc->status != PS_EXITED)
proc->status = PS_RUNNING;
mcs_rwlock_writer_unlock_noirq(&proc->update_lock, &updatelock);
xchg4((int *)(&thread->status), PS_RUNNING);
status = 0;
}
@@ -2849,6 +2958,7 @@ find_thread(int pid, int tid, struct mcs_rwlock_node_irqsave *lock)
if(tid <= 0)
return NULL;
mcs_rwlock_reader_lock(&thash->lock[hash], lock);
retry:
list_for_each_entry(thread, &thash->list[hash], hash_list){
if(thread->tid == tid){
if(pid <= 0)
@@ -2857,6 +2967,13 @@ find_thread(int pid, int tid, struct mcs_rwlock_node_irqsave *lock)
return thread;
}
}
/* If no thread with pid == tid was found, we may be looking for a
 * specific thread (not the main thread of the process); try to find it
 * based on tid only */
if (pid > 0 && pid == tid) {
pid = 0;
goto retry;
}
mcs_rwlock_reader_unlock(&thash->lock[hash], lock);
return NULL;
}


@@ -23,6 +23,8 @@
#include <process.h>
#include <page.h>
#include <mman.h>
#include <bitmap.h>
#include <init.h>
//#define DEBUG_PRINT_PROCFS
@@ -35,6 +37,7 @@
extern int snprintf(char * buf, size_t size, const char *fmt, ...);
extern int sprintf(char * buf, const char *fmt, ...);
extern int sscanf(const char * buf, const char * fmt, ...);
extern int scnprintf(char * buf, size_t size, const char *fmt, ...);
extern int osnum;
@@ -281,6 +284,13 @@ process_procfs_request(unsigned long rarg)
ans = -EIO;
goto end;
}
if (pa < ihk_mc_get_memory_address(IHK_MC_GMA_MAP_START, 0) ||
pa >= ihk_mc_get_memory_address(IHK_MC_GMA_MAP_END, 0)) {
ans = -EIO;
goto end;
}
va = phys_to_virt(pa);
if(readwrite)
memcpy(va, buf + ans, size);
@@ -397,12 +407,34 @@ process_procfs_request(unsigned long rarg)
/*
* mcos%d/PID/status
*/
#define BITMASKS_BUF_SIZE 2048
if (strcmp(p, "status") == 0) {
extern int num_processors; /* kernel/ap.c */
struct vm_range *range;
unsigned long lockedsize = 0;
char tmp[1024];
char *tmp;
char *bitmasks;
int bitmasks_offset = 0;
char *cpu_bitmask, *cpu_list, *numa_bitmask, *numa_list;
int len;
tmp = kmalloc(8192, IHK_MC_AP_CRITICAL);
if (!tmp) {
kprintf("%s: error allocating /proc/self/status buffer\n",
__FUNCTION__);
ans = 0;
goto end;
}
bitmasks = kmalloc(BITMASKS_BUF_SIZE, IHK_MC_AP_CRITICAL);
if (!bitmasks) {
kprintf("%s: error allocating /proc/self/status bitmasks buffer\n",
__FUNCTION__);
kfree(tmp);
ans = 0;
goto end;
}
ihk_mc_spinlock_lock_noirq(&proc->vm->memory_range_lock);
list_for_each_entry(range, &proc->vm->vm_range_list, list) {
if(range->flag & VR_LOCKED)
@@ -410,13 +442,42 @@ process_procfs_request(unsigned long rarg)
}
ihk_mc_spinlock_unlock_noirq(&proc->vm->memory_range_lock);
cpu_bitmask = &bitmasks[bitmasks_offset];
bitmasks_offset += bitmap_scnprintf(cpu_bitmask,
BITMASKS_BUF_SIZE - bitmasks_offset,
thread->cpu_set.__bits, num_processors);
bitmasks_offset++;
cpu_list = &bitmasks[bitmasks_offset];
bitmasks_offset += bitmap_scnlistprintf(cpu_list,
BITMASKS_BUF_SIZE - bitmasks_offset,
thread->cpu_set.__bits, __CPU_SETSIZE);
bitmasks_offset++;
numa_bitmask = &bitmasks[bitmasks_offset];
bitmasks_offset += bitmap_scnprintf(numa_bitmask,
BITMASKS_BUF_SIZE - bitmasks_offset,
proc->vm->numa_mask, PROCESS_NUMA_MASK_BITS);
bitmasks_offset++;
numa_list = &bitmasks[bitmasks_offset];
bitmasks_offset += bitmap_scnlistprintf(numa_list,
BITMASKS_BUF_SIZE - bitmasks_offset,
proc->vm->numa_mask, PROCESS_NUMA_MASK_BITS);
bitmasks_offset++;
sprintf(tmp,
"Uid:\t%d\t%d\t%d\t%d\n"
"Gid:\t%d\t%d\t%d\t%d\n"
"VmLck:\t%9lu kB\n",
"VmLck:\t%9lu kB\n"
"Cpus_allowed:\t%s\n"
"Cpus_allowed_list:\t%s\n"
"Mems_allowed:\t%s\n"
"Mems_allowed_list:\t%s\n",
proc->ruid, proc->euid, proc->suid, proc->fsuid,
proc->rgid, proc->egid, proc->sgid, proc->fsgid,
(lockedsize + 1023) >> 10);
(lockedsize + 1023) >> 10,
cpu_bitmask, cpu_list, numa_bitmask, numa_list);
len = strlen(tmp);
if (r->offset < len) {
if (r->offset + r->count < len) {
@@ -430,6 +491,8 @@ process_procfs_request(unsigned long rarg)
ans = 0;
eof = 1;
}
kfree(tmp);
kfree(bitmasks);
goto end;
}
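The generated status text is served back through an offset/count window: only the slice the reader asked for is copied, and EOF is flagged once the window reaches the end of the buffer. A self-contained sketch of that windowing; the offset and count parameters mirror the diff's r->offset and r->count, and the rest is assumed:

```c
#include <stdio.h>
#include <string.h>

/* Serve one read of `count` bytes at `offset` from a generated text;
 * returns the number of bytes copied and sets *eof past the end. */
static int window_read(const char *text, char *buf,
		       size_t offset, size_t count, int *eof)
{
	size_t len = strlen(text);

	*eof = 0;
	if (offset >= len) {		/* nothing left to read */
		*eof = 1;
		return 0;
	}
	if (offset + count >= len) {	/* short final read */
		count = len - offset;
		*eof = 1;
	}
	memcpy(buf, text + offset, count);
	return (int)count;
}

int main(void)
{
	const char *status = "Uid:\t0\t0\t0\t0\nVmLck:\t0 kB\n";
	char buf[8];
	size_t off = 0;
	int n, eof;

	while ((n = window_read(status, buf, off, sizeof(buf), &eof)) > 0) {
		fwrite(buf, 1, (size_t)n, stdout);
		off += (size_t)n;
		if (eof)
			break;
	}
	return 0;
}
```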


@@ -240,14 +240,24 @@ void shmobj_destroy(struct shmobj *obj)
npages = (size_t)1 << (obj->pgshift - PAGE_SHIFT);
for (;;) {
struct page *page;
int count;
void *page_va;
page = page_list_first(obj);
if (!page) {
break;
}
page_list_remove(obj, page);
page_va = phys_to_virt(page_to_phys(page));
if (ihk_atomic_read(&page->count) != 1) {
kprintf("%s: WARNING: page count for phys 0x%lx is invalid\n",
__FUNCTION__, page->phys);
}
if (page_unmap(page)) {
ihk_mc_free_pages(page_va, npages);
}
#if 0
dkprintf("shmobj_destroy(%p):"
"release page. %p %#lx %d %d",
obj, page, page_to_phys(page),
@@ -265,7 +275,8 @@ void shmobj_destroy(struct shmobj *obj)
}
page->mode = PM_NONE;
free_pages(phys_to_virt(page_to_phys(page)), npages);
ihk_mc_free_pages(phys_to_virt(page_to_phys(page)), npages);
#endif
}
if (obj->index < 0) {
kfree(obj);
@@ -404,7 +415,7 @@ static int shmobj_get_page(struct memobj *memobj, off_t off, int p2align,
goto out;
}
phys = virt_to_phys(virt);
page = phys_to_page(phys);
page = phys_to_page_insert_hash(phys);
if (page->mode != PM_NONE) {
fkprintf("shmobj_get_page(%p,%#lx,%d,%p):"
"page %p %#lx %d %d %#lx\n",

File diff suppressed because it is too large.


@@ -75,7 +75,7 @@ sysfs_createf(struct sysfs_ops *ops, void *instance, int mode,
dkprintf("sysfs_createf(%p,%p,%#o,%s,...)\n",
ops, instance, mode, fmt);
param = allocate_pages(1, IHK_MC_AP_NOWAIT);
param = ihk_mc_alloc_pages(1, IHK_MC_AP_NOWAIT);
if (!param) {
error = -ENOMEM;
ekprintf("sysfs_createf:allocate_pages failed. %d\n", error);
@@ -134,7 +134,7 @@ sysfs_createf(struct sysfs_ops *ops, void *instance, int mode,
error = 0;
out:
if (param) {
free_pages(param, 1);
ihk_mc_free_pages(param, 1);
}
if (error) {
ekprintf("sysfs_createf(%p,%p,%#o,%s,...): %d\n",
@@ -156,7 +156,7 @@ sysfs_mkdirf(sysfs_handle_t *dirhp, const char *fmt, ...)
dkprintf("sysfs_mkdirf(%p,%s,...)\n", dirhp, fmt);
param = allocate_pages(1, IHK_MC_AP_NOWAIT);
param = ihk_mc_alloc_pages(1, IHK_MC_AP_NOWAIT);
if (!param) {
error = -ENOMEM;
ekprintf("sysfs_mkdirf:allocate_pages failed. %d\n", error);
@@ -208,7 +208,7 @@ sysfs_mkdirf(sysfs_handle_t *dirhp, const char *fmt, ...)
out:
if (param) {
free_pages(param, 1);
ihk_mc_free_pages(param, 1);
}
if (error) {
ekprintf("sysfs_mkdirf(%p,%s,...): %d\n", dirhp, fmt, error);
@@ -229,7 +229,7 @@ sysfs_symlinkf(sysfs_handle_t targeth, const char *fmt, ...)
dkprintf("sysfs_symlinkf(%#lx,%s,...)\n", targeth.handle, fmt);
param = allocate_pages(1, IHK_MC_AP_NOWAIT);
param = ihk_mc_alloc_pages(1, IHK_MC_AP_NOWAIT);
if (!param) {
error = -ENOMEM;
ekprintf("sysfs_symlinkf:allocate_pages failed. %d\n", error);
@@ -279,7 +279,7 @@ sysfs_symlinkf(sysfs_handle_t targeth, const char *fmt, ...)
error = 0;
out:
if (param) {
free_pages(param, 1);
ihk_mc_free_pages(param, 1);
}
if (error) {
ekprintf("sysfs_symlinkf(%#lx,%s,...): %d\n",
@@ -301,7 +301,7 @@ sysfs_lookupf(sysfs_handle_t *objhp, const char *fmt, ...)
dkprintf("sysfs_lookupf(%p,%s,...)\n", objhp, fmt);
param = allocate_pages(1, IHK_MC_AP_NOWAIT);
param = ihk_mc_alloc_pages(1, IHK_MC_AP_NOWAIT);
if (!param) {
error = -ENOMEM;
ekprintf("sysfs_lookupf:allocate_pages failed. %d\n", error);
@@ -353,7 +353,7 @@ sysfs_lookupf(sysfs_handle_t *objhp, const char *fmt, ...)
out:
if (param) {
free_pages(param, 1);
ihk_mc_free_pages(param, 1);
}
if (error) {
ekprintf("sysfs_lookupf(%p,%s,...): %d\n", objhp, fmt, error);
@@ -374,7 +374,7 @@ sysfs_unlinkf(int flags, const char *fmt, ...)
dkprintf("sysfs_unlinkf(%#x,%s,...)\n", flags, fmt);
param = allocate_pages(1, IHK_MC_AP_NOWAIT);
param = ihk_mc_alloc_pages(1, IHK_MC_AP_NOWAIT);
if (!param) {
error = -ENOMEM;
ekprintf("sysfs_unlinkf:allocate_pages failed. %d\n", error);
@@ -423,7 +423,7 @@ sysfs_unlinkf(int flags, const char *fmt, ...)
error = 0;
out:
if (param) {
free_pages(param, 1);
ihk_mc_free_pages(param, 1);
}
if (error) {
ekprintf("sysfs_unlinkf(%#x,%s,...): %d\n", flags, fmt, error);
@@ -601,14 +601,14 @@ sysfs_init(void)
}
sysfs_data_bufsize = PAGE_SIZE;
sysfs_data_buf = allocate_pages(1, IHK_MC_AP_NOWAIT);
sysfs_data_buf = ihk_mc_alloc_pages(1, IHK_MC_AP_NOWAIT);
if (!sysfs_data_buf) {
error = -ENOMEM;
ekprintf("sysfs_init:allocate_pages(buf) failed. %d\n", error);
goto out;
}
param = allocate_pages(1, IHK_MC_AP_NOWAIT);
param = ihk_mc_alloc_pages(1, IHK_MC_AP_NOWAIT);
if (!param) {
error = -ENOMEM;
ekprintf("sysfs_init:allocate_pages(param) failed. %d\n",
@@ -644,7 +644,7 @@ sysfs_init(void)
error = 0;
out:
if (param) {
free_pages(param, 1);
ihk_mc_free_pages(param, 1);
}
if (error) {
ekprintf("sysfs_init(): %d\n", error);


@@ -112,7 +112,7 @@ static int alloc_zeroobj(void)
goto out;
}
phys = virt_to_phys(virt);
page = phys_to_page(phys);
page = phys_to_page_insert_hash(phys);
if (page->mode != PM_NONE) {
fkprintf("alloc_zeroobj():"
@@ -172,6 +172,10 @@ static int zeroobj_get_page(struct memobj *memobj, off_t off, int p2align,
struct zeroobj *obj = to_zeroobj(memobj);
struct page *page;
/* Don't bother with the zero page; the page fault handler will
 * allocate and clear pages */
return 0;
dkprintf("zeroobj_get_page(%p,%#lx,%d,%p)\n",
memobj, off, p2align, physp);
if (off & ~PAGE_MASK) {

lib/bitmap.c (new file, 1179 lines): diff suppressed because it is too large.

@@ -140,3 +140,58 @@ found:
return result + ffz(tmp);
}
/**
* hweightN - returns the hamming weight of a N-bit word
* @x: the word to weigh
*
* The Hamming Weight of a number is the total number of bits set in it.
*/
unsigned int __sw_hweight32(unsigned int w)
{
#ifdef ARCH_HAS_FAST_MULTIPLIER
w -= (w >> 1) & 0x55555555;
w = (w & 0x33333333) + ((w >> 2) & 0x33333333);
w = (w + (w >> 4)) & 0x0f0f0f0f;
return (w * 0x01010101) >> 24;
#else
unsigned int res = w - ((w >> 1) & 0x55555555);
res = (res & 0x33333333) + ((res >> 2) & 0x33333333);
res = (res + (res >> 4)) & 0x0F0F0F0F;
res = res + (res >> 8);
return (res + (res >> 16)) & 0x000000FF;
#endif
}
unsigned int __sw_hweight16(unsigned int w)
{
unsigned int res = w - ((w >> 1) & 0x5555);
res = (res & 0x3333) + ((res >> 2) & 0x3333);
res = (res + (res >> 4)) & 0x0F0F;
return (res + (res >> 8)) & 0x00FF;
}
unsigned int __sw_hweight8(unsigned int w)
{
unsigned int res = w - ((w >> 1) & 0x55);
res = (res & 0x33) + ((res >> 2) & 0x33);
return (res + (res >> 4)) & 0x0F;
}
unsigned long __sw_hweight64(uint64_t w)
{
#ifdef ARCH_HAS_FAST_MULTIPLIER
w -= (w >> 1) & 0x5555555555555555ul;
w = (w & 0x3333333333333333ul) + ((w >> 2) & 0x3333333333333333ul);
w = (w + (w >> 4)) & 0x0f0f0f0f0f0f0f0ful;
return (w * 0x0101010101010101ul) >> 56;
#else
uint64_t res = w - ((w >> 1) & 0x5555555555555555ul);
res = (res & 0x3333333333333333ul) + ((res >> 2) & 0x3333333333333333ul);
res = (res + (res >> 4)) & 0x0F0F0F0F0F0F0F0Ful;
res = res + (res >> 8);
res = res + (res >> 16);
return (res + (res >> 32)) & 0x00000000000000FFul;
#endif
}
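Each #else branch above is a SWAR reduction: adjacent bit pairs are summed in place, then nibbles, bytes, and half-words, so the count completes in a handful of shifts and masks with no loop; 0xF0F0F0F0, for instance, has weight 16. A quick user-space check of the 32-bit variant against a naive reference:

```c
#include <stdint.h>
#include <stdio.h>

/* Same parallel (SWAR) reduction as __sw_hweight32 above. */
static unsigned int hweight32(unsigned int w)
{
	unsigned int res = w - ((w >> 1) & 0x55555555);	/* 2-bit sums */
	res = (res & 0x33333333) + ((res >> 2) & 0x33333333);	/* 4-bit sums */
	res = (res + (res >> 4)) & 0x0F0F0F0F;		/* 8-bit sums */
	res = res + (res >> 8);
	return (res + (res >> 16)) & 0x000000FF;
}

/* Naive reference: count bits one at a time. */
static unsigned int hweight32_ref(unsigned int w)
{
	unsigned int n = 0;

	while (w) {
		n += w & 1;
		w >>= 1;
	}
	return n;
}

int main(void)
{
	unsigned int v[] = { 0x0, 0x1, 0xF0F0F0F0u, 0xFFFFFFFFu };

	for (unsigned int i = 0; i < sizeof(v) / sizeof(v[0]); ++i)
		printf("%#010x -> %u (ref %u)\n", v[i],
		       hweight32(v[i]), hweight32_ref(v[i]));
	return 0;
}
```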

lib/include/bitmap.h (new file, 307 lines)

@@ -0,0 +1,307 @@
#ifndef __LINUX_BITMAP_H
#define __LINUX_BITMAP_H
#include <types.h>
#include <bitops.h>
#include <string.h>
/*
* bitmaps provide bit arrays that consume one or more unsigned
* longs. The bitmap interface and available operations are listed
* here, in bitmap.h
*
* Function implementations generic to all architectures are in
* lib/bitmap.c. Functions implementations that are architecture
* specific are in various include/asm-<arch>/bitops.h headers
* and other arch/<arch> specific files.
*
* See lib/bitmap.c for more details.
*/
/*
* The available bitmap operations and their rough meaning in the
* case that the bitmap is a single unsigned long are thus:
*
* Note that nbits should be always a compile time evaluable constant.
* Otherwise many inlines will generate horrible code.
*
* bitmap_zero(dst, nbits) *dst = 0UL
* bitmap_fill(dst, nbits) *dst = ~0UL
* bitmap_copy(dst, src, nbits) *dst = *src
* bitmap_and(dst, src1, src2, nbits) *dst = *src1 & *src2
* bitmap_or(dst, src1, src2, nbits) *dst = *src1 | *src2
* bitmap_xor(dst, src1, src2, nbits) *dst = *src1 ^ *src2
* bitmap_andnot(dst, src1, src2, nbits) *dst = *src1 & ~(*src2)
* bitmap_complement(dst, src, nbits) *dst = ~(*src)
* bitmap_equal(src1, src2, nbits) Are *src1 and *src2 equal?
* bitmap_intersects(src1, src2, nbits) Do *src1 and *src2 overlap?
* bitmap_subset(src1, src2, nbits) Is *src1 a subset of *src2?
* bitmap_empty(src, nbits) Are all bits zero in *src?
* bitmap_full(src, nbits) Are all bits set in *src?
* bitmap_weight(src, nbits) Hamming Weight: number set bits
* bitmap_set(dst, pos, nbits) Set specified bit area
* bitmap_clear(dst, pos, nbits) Clear specified bit area
* bitmap_find_next_zero_area(buf, len, pos, n, mask) Find bit free area
* bitmap_shift_right(dst, src, n, nbits) *dst = *src >> n
* bitmap_shift_left(dst, src, n, nbits) *dst = *src << n
* bitmap_remap(dst, src, old, new, nbits) *dst = map(old, new)(src)
* bitmap_bitremap(oldbit, old, new, nbits) newbit = map(old, new)(oldbit)
* bitmap_onto(dst, orig, relmap, nbits) *dst = orig relative to relmap
* bitmap_fold(dst, orig, sz, nbits) dst bits = orig bits mod sz
* bitmap_scnprintf(buf, len, src, nbits) Print bitmap src to buf
* bitmap_parse(buf, buflen, dst, nbits) Parse bitmap dst from kernel buf
* bitmap_parse_user(ubuf, ulen, dst, nbits) Parse bitmap dst from user buf
* bitmap_scnlistprintf(buf, len, src, nbits) Print bitmap src as list to buf
* bitmap_parselist(buf, dst, nbits) Parse bitmap dst from kernel buf
* bitmap_parselist_user(buf, dst, nbits) Parse bitmap dst from user buf
* bitmap_find_free_region(bitmap, bits, order) Find and allocate bit region
* bitmap_release_region(bitmap, pos, order) Free specified bit region
* bitmap_allocate_region(bitmap, pos, order) Allocate specified bit region
*/
/*
* Also the following operations in asm/bitops.h apply to bitmaps.
*
* set_bit(bit, addr) *addr |= bit
* clear_bit(bit, addr) *addr &= ~bit
* change_bit(bit, addr) *addr ^= bit
* test_bit(bit, addr) Is bit set in *addr?
* test_and_set_bit(bit, addr) Set bit and return old value
* test_and_clear_bit(bit, addr) Clear bit and return old value
* test_and_change_bit(bit, addr) Change bit and return old value
* find_first_zero_bit(addr, nbits) Position first zero bit in *addr
* find_first_bit(addr, nbits) Position first set bit in *addr
* find_next_zero_bit(addr, nbits, bit) Position next zero bit in *addr >= bit
* find_next_bit(addr, nbits, bit) Position next set bit in *addr >= bit
*/
/*
* The DECLARE_BITMAP(name,bits) macro, in linux/types.h, can be used
* to declare an array named 'name' of just enough unsigned longs to
* contain all bit positions from 0 to 'bits' - 1.
*/
/*
* lib/bitmap.c provides these functions:
*/
#define __user
#define __force
#define u32 uint32_t
extern int __bitmap_empty(const unsigned long *bitmap, int bits);
extern int __bitmap_full(const unsigned long *bitmap, int bits);
extern int __bitmap_equal(const unsigned long *bitmap1,
const unsigned long *bitmap2, int bits);
extern void __bitmap_complement(unsigned long *dst, const unsigned long *src,
int bits);
extern void __bitmap_shift_right(unsigned long *dst,
const unsigned long *src, int shift, int bits);
extern void __bitmap_shift_left(unsigned long *dst,
const unsigned long *src, int shift, int bits);
extern int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1,
const unsigned long *bitmap2, int bits);
extern void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1,
const unsigned long *bitmap2, int bits);
extern void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1,
const unsigned long *bitmap2, int bits);
extern int __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1,
const unsigned long *bitmap2, int bits);
extern int __bitmap_intersects(const unsigned long *bitmap1,
const unsigned long *bitmap2, int bits);
extern int __bitmap_subset(const unsigned long *bitmap1,
const unsigned long *bitmap2, int bits);
extern int __bitmap_weight(const unsigned long *bitmap, int bits);
extern void bitmap_set(unsigned long *map, int i, int len);
extern void bitmap_clear(unsigned long *map, int start, int nr);
extern unsigned long bitmap_find_next_zero_area(unsigned long *map,
unsigned long size,
unsigned long start,
unsigned int nr,
unsigned long align_mask);
extern int bitmap_scnprintf(char *buf, unsigned int len,
const unsigned long *src, int nbits);
extern int __bitmap_parse(const char *buf, unsigned int buflen, int is_user,
unsigned long *dst, int nbits);
extern int bitmap_parse_user(const char __user *ubuf, unsigned int ulen,
unsigned long *dst, int nbits);
extern int bitmap_scnlistprintf(char *buf, unsigned int len,
const unsigned long *src, int nbits);
extern int bitmap_parselist(const char *buf, unsigned long *maskp,
int nmaskbits);
extern int bitmap_parselist_user(const char __user *ubuf, unsigned int ulen,
unsigned long *dst, int nbits);
extern void bitmap_remap(unsigned long *dst, const unsigned long *src,
const unsigned long *old, const unsigned long *new, int bits);
extern int bitmap_bitremap(int oldbit,
const unsigned long *old, const unsigned long *new, int bits);
extern void bitmap_onto(unsigned long *dst, const unsigned long *orig,
const unsigned long *relmap, int bits);
extern void bitmap_fold(unsigned long *dst, const unsigned long *orig,
int sz, int bits);
extern int bitmap_find_free_region(unsigned long *bitmap, int bits, int order);
extern void bitmap_release_region(unsigned long *bitmap, int pos, int order);
extern int bitmap_allocate_region(unsigned long *bitmap, int pos, int order);
extern int bitmap_ord_to_pos(const unsigned long *bitmap, int n, int bits);
#define BITMAP_FIRST_WORD_MASK(start) (~0UL << ((start) % BITS_PER_LONG))
#define BITMAP_LAST_WORD_MASK(nbits) \
( \
((nbits) % BITS_PER_LONG) ? \
(1UL<<((nbits) % BITS_PER_LONG))-1 : ~0UL \
)
#define small_const_nbits(nbits) \
(__builtin_constant_p(nbits) && (nbits) <= BITS_PER_LONG)
static inline void bitmap_zero(unsigned long *dst, int nbits)
{
if (small_const_nbits(nbits))
*dst = 0UL;
else {
int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long);
memset(dst, 0, len);
}
}
static inline void bitmap_fill(unsigned long *dst, int nbits)
{
size_t nlongs = BITS_TO_LONGS(nbits);
if (!small_const_nbits(nbits)) {
int len = (nlongs - 1) * sizeof(unsigned long);
memset(dst, 0xff, len);
}
dst[nlongs - 1] = BITMAP_LAST_WORD_MASK(nbits);
}
static inline void bitmap_copy(unsigned long *dst, const unsigned long *src,
int nbits)
{
if (small_const_nbits(nbits))
*dst = *src;
else {
int len = BITS_TO_LONGS(nbits) * sizeof(unsigned long);
memcpy(dst, src, len);
}
}
static inline int bitmap_and(unsigned long *dst, const unsigned long *src1,
const unsigned long *src2, int nbits)
{
if (small_const_nbits(nbits))
return (*dst = *src1 & *src2) != 0;
return __bitmap_and(dst, src1, src2, nbits);
}
static inline void bitmap_or(unsigned long *dst, const unsigned long *src1,
const unsigned long *src2, int nbits)
{
if (small_const_nbits(nbits))
*dst = *src1 | *src2;
else
__bitmap_or(dst, src1, src2, nbits);
}
static inline void bitmap_xor(unsigned long *dst, const unsigned long *src1,
const unsigned long *src2, int nbits)
{
if (small_const_nbits(nbits))
*dst = *src1 ^ *src2;
else
__bitmap_xor(dst, src1, src2, nbits);
}
static inline int bitmap_andnot(unsigned long *dst, const unsigned long *src1,
const unsigned long *src2, int nbits)
{
if (small_const_nbits(nbits))
return (*dst = *src1 & ~(*src2)) != 0;
return __bitmap_andnot(dst, src1, src2, nbits);
}
static inline void bitmap_complement(unsigned long *dst, const unsigned long *src,
int nbits)
{
if (small_const_nbits(nbits))
*dst = ~(*src) & BITMAP_LAST_WORD_MASK(nbits);
else
__bitmap_complement(dst, src, nbits);
}
static inline int bitmap_equal(const unsigned long *src1,
const unsigned long *src2, int nbits)
{
if (small_const_nbits(nbits))
return ! ((*src1 ^ *src2) & BITMAP_LAST_WORD_MASK(nbits));
else
return __bitmap_equal(src1, src2, nbits);
}
static inline int bitmap_intersects(const unsigned long *src1,
const unsigned long *src2, int nbits)
{
if (small_const_nbits(nbits))
return ((*src1 & *src2) & BITMAP_LAST_WORD_MASK(nbits)) != 0;
else
return __bitmap_intersects(src1, src2, nbits);
}
static inline int bitmap_subset(const unsigned long *src1,
const unsigned long *src2, int nbits)
{
if (small_const_nbits(nbits))
return ! ((*src1 & ~(*src2)) & BITMAP_LAST_WORD_MASK(nbits));
else
return __bitmap_subset(src1, src2, nbits);
}
static inline int bitmap_empty(const unsigned long *src, int nbits)
{
if (small_const_nbits(nbits))
return ! (*src & BITMAP_LAST_WORD_MASK(nbits));
else
return __bitmap_empty(src, nbits);
}
static inline int bitmap_full(const unsigned long *src, int nbits)
{
if (small_const_nbits(nbits))
return ! (~(*src) & BITMAP_LAST_WORD_MASK(nbits));
else
return __bitmap_full(src, nbits);
}
static inline int bitmap_weight(const unsigned long *src, int nbits)
{
if (small_const_nbits(nbits))
return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits));
return __bitmap_weight(src, nbits);
}
static inline void bitmap_shift_right(unsigned long *dst,
const unsigned long *src, int n, int nbits)
{
if (small_const_nbits(nbits))
*dst = *src >> n;
else
__bitmap_shift_right(dst, src, n, nbits);
}
static inline void bitmap_shift_left(unsigned long *dst,
const unsigned long *src, int n, int nbits)
{
if (small_const_nbits(nbits))
*dst = (*src << n) & BITMAP_LAST_WORD_MASK(nbits);
else
__bitmap_shift_left(dst, src, n, nbits);
}
static inline int bitmap_parse(const char *buf, unsigned int buflen,
unsigned long *maskp, int nmaskbits)
{
return __bitmap_parse(buf, buflen, 0, maskp, nmaskbits);
}
#endif /* __LINUX_BITMAP_H */
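Because the header mirrors the Linux bitmap API, typical use declares an array of unsigned longs and drives it with the set/weight/print helpers; that is how the new /proc/PID/status code renders Cpus_allowed ("ff") and Cpus_allowed_list ("0-7"). A hedged usage sketch with local re-implementations so it runs outside the kernel tree (set_bit_ and weight below are simplified stand-ins, not the kernel's):

```c
#include <stdio.h>
#include <string.h>

#define BITS_PER_LONG (8 * sizeof(unsigned long))
#define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)
#define DECLARE_BITMAP(name, bits) unsigned long name[BITS_TO_LONGS(bits)]

/* Simplified stand-in for the kernel's set_bit(). */
static void set_bit_(int nr, unsigned long *map)
{
	map[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
}

/* Simplified stand-in for bitmap_weight(): count set bits one by one. */
static int weight(const unsigned long *map, int nbits)
{
	int n = 0, i;

	for (i = 0; i < nbits; ++i)
		if (map[i / BITS_PER_LONG] >> (i % BITS_PER_LONG) & 1)
			++n;
	return n;
}

int main(void)
{
	DECLARE_BITMAP(cpus, 64);

	memset(cpus, 0, sizeof(cpus));
	for (int cpu = 0; cpu < 8; ++cpu)	/* CPUs 0-7 online */
		set_bit_(cpu, cpus);

	/* bitmap_scnprintf would render this mask as "ff",
	 * bitmap_scnlistprintf as "0-7" */
	printf("weight = %d, word0 = %#lx\n", weight(cpus, 64), cpus[0]);
	return 0;
}
```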


@@ -27,6 +27,31 @@ unsigned long find_first_bit(const unsigned long *addr,
unsigned long find_first_zero_bit(const unsigned long *addr,
unsigned long size);
static inline int test_bit(int nr, const void *addr)
{
const uint32_t *p = (const uint32_t *)addr;
return ((1UL << (nr & 31)) & (p[nr >> 5])) != 0;
}
extern unsigned int __sw_hweight32(unsigned int w);
extern unsigned int __sw_hweight16(unsigned int w);
extern unsigned int __sw_hweight8(unsigned int w);
extern unsigned long __sw_hweight64(uint64_t w);
static inline unsigned long hweight_long(unsigned long w)
{
return sizeof(w) == 4 ? __sw_hweight32(w) : __sw_hweight64(w);
}
#define BIT(nr) (1UL << (nr))
#define BIT_WORD(nr) ((nr) / BITS_PER_LONG)
#define BITS_PER_BYTE 8
#define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask))
#define __ALIGN_MASK(x, mask) __ALIGN_KERNEL_MASK((x), (mask))
#define __ALIGN_KERNEL(x, a) __ALIGN_KERNEL_MASK(x, (typeof(x))(a) - 1)
#define ALIGN(x, a) __ALIGN_KERNEL((x), (a))
#endif /*__ASSEMBLY__*/
#include <arch-bitops.h>
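ALIGN rounds x up to the next multiple of a power-of-two a by adding a - 1 and clearing the low bits: ALIGN(5, 8) is 8, and ALIGN(16, 8) stays 16. A tiny check using the macros copied from the hunk above (typeof is the GCC extension the kernel already relies on):

```c
#include <stdio.h>

#define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask))
#define __ALIGN_KERNEL(x, a) __ALIGN_KERNEL_MASK(x, (typeof(x))(a) - 1)
#define ALIGN(x, a) __ALIGN_KERNEL((x), (a))

int main(void)
{
	unsigned long v[] = { 0, 1, 5, 8, 9, 16 };

	for (unsigned int i = 0; i < sizeof(v) / sizeof(v[0]); ++i)
		printf("ALIGN(%lu, 8) = %lu\n", v[i], ALIGN(v[i], 8UL));
	/* prints 0, 8, 8, 8, 16, 16 */
	return 0;
}
```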


@@ -55,6 +55,7 @@ struct ihk_mc_cpu_info *ihk_mc_get_cpu_info(void);
void ihk_mc_boot_cpu(int cpuid, unsigned long pc);
int ihk_mc_get_processor_id(void);
int ihk_mc_get_hardware_processor_id(void);
int ihk_mc_get_numa_id(void);
void ihk_mc_delay_us(int us);
void ihk_mc_set_syscall_handler(long (*handler)(int, ihk_mc_user_context_t *));


@@ -72,8 +72,11 @@ struct ihk_mc_memory_node {
unsigned long ihk_mc_get_memory_address(enum ihk_mc_gma_type, int);
void ihk_mc_reserve_arch_pages(unsigned long start, unsigned long end,
void (*cb)(unsigned long, unsigned long, int));
struct ihk_page_allocator_desc;
void ihk_mc_reserve_arch_pages(struct ihk_page_allocator_desc *pa_allocator,
unsigned long start, unsigned long end,
void (*cb)(struct ihk_page_allocator_desc *,
unsigned long, unsigned long, int));
struct ihk_mc_pa_ops {
void *(*alloc_page)(int, int, enum ihk_mc_ap_flag);
@@ -100,14 +103,28 @@ void ihk_mc_map_micpa(unsigned long host_pa, unsigned long* mic_pa);
int ihk_mc_free_micpa(unsigned long mic_pa);
void ihk_mc_clean_micpa(void);
void *ihk_mc_alloc_aligned_pages(int npages, int p2align, enum ihk_mc_ap_flag flag);
void *ihk_mc_alloc_pages(int npages, enum ihk_mc_ap_flag flag);
void ihk_mc_free_pages(void *p, int npages);
void *ihk_mc_allocate(int size, enum ihk_mc_ap_flag flag);
void *_ihk_mc_alloc_aligned_pages(int npages, int p2align,
enum ihk_mc_ap_flag flag, char *file, int line);
#define ihk_mc_alloc_aligned_pages(npages, p2align, flag) ({\
void *r = _ihk_mc_alloc_aligned_pages(npages, p2align, flag, __FILE__, __LINE__);\
r;\
})
void *_ihk_mc_alloc_pages(int npages, enum ihk_mc_ap_flag flag,
char *file, int line);
#define ihk_mc_alloc_pages(npages, flag) ({\
void *r = _ihk_mc_alloc_pages(npages, flag, __FILE__, __LINE__);\
r;\
})
void _ihk_mc_free_pages(void *ptr, int npages, char *file, int line);
#define ihk_mc_free_pages(p, npages) ({\
_ihk_mc_free_pages(p, npages, __FILE__, __LINE__);\
})
void *ihk_mc_allocate(int size, int flag);
void ihk_mc_free(void *p);
void *arch_alloc_page(enum ihk_mc_ap_flag flag);
void arch_free_page(void *ptr);
int arch_get_smaller_page_size(void *args, size_t origsize, size_t *sizep, int *p2alignp);
typedef void *page_table_t;
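Redefining ihk_mc_alloc_pages and friends as macros that forward __FILE__ and __LINE__ to underscore-prefixed workers stamps every allocation with its call site, the hook behind the (default-off) allocation tracking elsewhere in this series. A minimal sketch of the pattern; the logging worker below is hypothetical, the real one records the pair in a tracker:

```c
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical worker: the real _ihk_mc_alloc_pages records
 * (file, line) in a tracker; here we just log what the macro captures. */
static void *_alloc_pages(int npages, const char *file, int line)
{
	printf("alloc %d page(s) from %s:%d\n", npages, file, line);
	return calloc((size_t)npages, 4096);	/* may return NULL; fine for a sketch */
}

/* The macro expands at the call site, so __FILE__/__LINE__ name the caller. */
#define alloc_pages(npages) _alloc_pages((npages), __FILE__, __LINE__)

int main(void)
{
	void *p = alloc_pages(2);	/* logged with this file and line */

	free(p);
	return 0;
}
```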
@@ -147,10 +164,24 @@ struct page_table *ihk_mc_pt_create(enum ihk_mc_ap_flag ap_flag);
/* XXX: proper use of struct page_table and page_table_t is unknown */
void ihk_mc_pt_destroy(struct page_table *pt);
void ihk_mc_load_page_table(struct page_table *pt);
int ihk_mc_pt_virt_to_phys_size(struct page_table *pt,
const void *virt,
unsigned long *phys,
unsigned long *size);
int ihk_mc_pt_virt_to_phys(struct page_table *pt,
const void *virt, unsigned long *phys);
uint64_t ihk_mc_pt_virt_to_pagemap(struct page_table *pt, unsigned long virt);
int ihk_mc_get_nr_numa_nodes(void);
struct smp_coreset;
int ihk_mc_get_numa_node(int id, int *linux_numa_id, int *type);
int ihk_mc_get_numa_distance(int i, int j);
int ihk_mc_get_nr_memory_chunks(void);
int ihk_mc_get_memory_chunk(int id,
unsigned long *start,
unsigned long *end,
int *numa_id);
void remote_flush_tlb_cpumask(struct process_vm *vm,
unsigned long addr, int cpu_id);


@@ -14,14 +14,24 @@
#ifndef __HEADER_GENERIC_IHK_PAGE_ALLOC
#define __HEADER_GENERIC_IHK_PAGE_ALLOC
#include <list.h>
/* XXX: Physical memory management shouldn't be part of IHK */
struct ihk_mc_numa_node {
int id;
int linux_numa_id;
int type;
struct list_head allocators;
};
struct ihk_page_allocator_desc {
unsigned long start;
unsigned long start, end;
unsigned int last;
unsigned int count;
unsigned int flag;
unsigned int shift;
ihk_spinlock_t lock;
unsigned int pad;
struct list_head list;
unsigned long map[0];
};
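With start/end and a list node in ihk_page_allocator_desc, each NUMA node can chain one allocator per physical memory chunk and serve requests from local chunks before spilling to remote ones. An array-based sketch of that policy; the local-first fallback order is an assumption here, and the kernel walks the node's allocators list with list_for_each_entry instead:

```c
#include <stdio.h>

/* Simplified stand-ins: each NUMA node owns several allocator
 * descriptors covering its physical memory chunks. */
struct allocator { unsigned long start, end, free_pages; };
struct numa_node { int id; struct allocator *chunks; int nr_chunks; };

/* Try the local node's chunks first (pass 0), then any other node
 * (pass 1), and take the first chunk with enough free pages. */
static struct allocator *alloc_from(struct numa_node *nodes, int nr_nodes,
				    int local, unsigned long npages)
{
	for (int pass = 0; pass < 2; ++pass)
		for (int n = 0; n < nr_nodes; ++n) {
			if ((pass == 0) != (n == local))
				continue;	/* wrong node for this pass */
			for (int c = 0; c < nodes[n].nr_chunks; ++c)
				if (nodes[n].chunks[c].free_pages >= npages) {
					nodes[n].chunks[c].free_pages -= npages;
					return &nodes[n].chunks[c];
				}
		}
	return NULL;
}

int main(void)
{
	struct allocator c0[] = { { 0x0, 0x100000, 16 } };
	struct allocator c1[] = { { 0x200000, 0x300000, 256 } };
	struct numa_node nodes[] = { { 0, c0, 1 }, { 1, c1, 1 } };

	/* Node 0 has only 16 free pages, so this spills to node 1. */
	struct allocator *a = alloc_from(nodes, 2, 0, 64);

	printf("allocated from chunk at %#lx\n", a ? a->start : 0UL);
	return 0;
}
```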


@@ -54,11 +54,11 @@ int ihk_mc_perfctr_start(unsigned long counter_mask);
int ihk_mc_perfctr_stop(unsigned long counter_mask);
int ihk_mc_perfctr_fixed_init(int counter, int mode);
int ihk_mc_perfctr_reset(int counter);
int ihk_mc_perfctr_set(int counter, unsigned long value);
int ihk_mc_perfctr_set(int counter, long value);
int ihk_mc_perfctr_read_mask(unsigned long counter_mask, unsigned long *value);
unsigned long ihk_mc_perfctr_read(int counter);
unsigned long ihk_mc_perfctr_read_msr(int counter);
int ihk_mc_perfctr_alloc_counter(unsigned long pmc_status);
int ihk_mc_perfctr_alloc_counter(unsigned int *type, unsigned long *config, unsigned long pmc_status);
#endif


@@ -14,6 +14,7 @@
#define __STRING_H
#include <types.h>
#include <arch-string.h>
size_t strlen(const char *p);
size_t strnlen(const char *p, size_t maxlen);
@@ -29,6 +30,17 @@ void *memcpy_long(void *dest, const void *src, size_t n);
int memcmp(const void *s1, const void *s2, size_t n);
void *memset(void *s, int n, size_t l);
#ifdef ARCH_FAST_MEMCPY
#define fast_memcpy __inline_memcpy
#else
#define fast_memcpy memcpy
#endif
extern int snprintf(char * buf, size_t size, const char *fmt, ...);
extern int sprintf(char * buf, const char *fmt, ...);
extern int sscanf(const char * buf, const char * fmt, ...);
extern int scnprintf(char * buf, size_t size, const char *fmt, ...);
unsigned long strtol(const char *cp, char **endp, unsigned int base);
int flatten_strings(int nr_strings, char *first, char **strings, char **flat);
int flatten_strings_from_user(int nr_strings, char *first, char **strings, char **flat);


@@ -52,7 +52,7 @@ void *__ihk_pagealloc_init(unsigned long start, unsigned long size,
desc = initial;
*pdescsize = descsize;
} else {
desc = (void *)allocate_pages(descsize, IHK_MC_AP_CRITICAL);
desc = (void *)ihk_mc_alloc_pages(descsize, IHK_MC_AP_CRITICAL);
}
if (!desc) {
kprintf("IHK: failed to allocate page-allocator-desc "\
@@ -64,13 +64,14 @@ void *__ihk_pagealloc_init(unsigned long start, unsigned long size,
memset(desc, 0, descsize * PAGE_SIZE);
desc->start = start;
desc->end = start + size;
desc->last = 0;
desc->count = mapaligned >> 3;
desc->shift = page_shift;
desc->flag = flag;
kprintf("Page allocator: %lx - %lx (%d)\n", start, start + size,
page_shift);
//kprintf("page allocator @ %lx - %lx (%d)\n", start, start + size,
// page_shift);
ihk_mc_spinlock_init(&desc->lock);
@@ -92,7 +93,7 @@ void ihk_pagealloc_destroy(void *__desc)
{
struct ihk_page_allocator_desc *desc = __desc;
free_pages(desc, desc->flag);
ihk_mc_free_pages(desc, desc->flag);
}
static unsigned long __ihk_pagealloc_large(struct ihk_page_allocator_desc *desc,