Compare commits

...

391 Commits
0.9.0 ... 1.1.3

Author SHA1 Message Date
beaf96b375 mcreboot/mcstop: proper error handling (revert previous state) 2016-10-28 14:29:10 +09:00
f1af1ffb8f NUMA: expose correct NUMA distances in sysfs 2016-10-27 14:29:15 +09:00
059fab2cc0 mcctrl: fix NULL pointer dereference for unbooted OS instance shutdown 2016-10-26 14:50:07 +09:00
f284a80656 Defrag memory in mcreboot.sh
Merge free physical pages to create large, physically contiguous
blocks with the following command.

    echo 1 > /proc/sys/vm/compact_memory
2016-10-25 16:35:43 +09:00
5f973ab51e IKC2: adjust master channel message queue size dynamically
Determine master channel's message queue size based on the number of
LWK CPUs so that all cores can communicate simultaneously during
syscall channel initialization.
2016-10-24 20:49:00 +09:00
60b6713957 IKC2: eliminate unused structures/fields of old IKC code 2016-10-24 15:41:27 +09:00
ebcf9a0d6d mcctrl: fix a bunch of -Wframe-larger-than warnings 2016-10-21 04:54:38 -04:00
942b7f8b78 mcreboot-smp-x86: eliminate unnecessary resource queries 2016-10-21 03:38:21 -04:00
0b0aa6c0e0 Start mcklogd before McKernel to avoid deadlock
McKernel blocks forever waiting for mcklogd to retrieve kmsg when
kmsg buffer is full with boot log and mcklogd isn't running.
2016-10-19 16:40:32 +09:00
9705a80c82 get/set_mempolicy(): support for query/set process level policy 2016-10-16 14:01:14 +09:00
99a02e2941 get_mempolicy(): store policy in per-process VM structure 2016-10-16 09:10:36 +09:00
b88d75720f __NR_gettid: use regular offloading channel (fixes unknown PID bug) 2016-10-15 11:46:01 +09:00
d2b677b6da get_mempolicy(): initial implementation 2016-10-14 21:34:32 +09:00
083645f203 mcreboot: purge Linux caches before reserving IHK resources 2016-10-14 21:34:32 +09:00
994b9a19ac NUMA: expose CPU and memory info in /proc/self/status 2016-10-14 21:34:32 +09:00
faa929e717 NUMA: add NUMA mask to process VM structure 2016-10-14 21:34:31 +09:00
3ee3a9df6d sysfs: fix bitmask and bitmask list-view display bug 2016-10-14 21:34:31 +09:00
73e1a4f1f9 NUMA: fill in /sys/devices/system/cpu/nodeX properly and sync with boot script 2016-10-14 21:34:31 +09:00
b068fde9cd NUMA: use IHK CPU and NUMA mappings for sysfs entries 2016-10-14 21:34:31 +09:00
167ea67dee NUMA: receive CPU info in array format 2016-10-14 21:34:31 +09:00
f33d85a27a eclair: support for multiple physical memory chunks 2016-10-14 21:34:31 +09:00
1e8239d72a kmalloc/pagealloc tracker: fix race condition bug 2016-10-14 21:34:31 +09:00
a51a0a6f13 page allocation tracker: support tracking partial deallocations 2016-10-14 21:34:31 +09:00
cc3f6e1a4f page_fault_process_memory_range(): fix double allocation leak 2016-10-14 21:34:31 +09:00
5db6c311f4 page alloc tracker: count freed pages in addr tracker objects 2016-10-14 21:34:31 +09:00
f4df713846 munmap(): fix memory leak in non page backed mappings 2016-10-14 21:34:31 +09:00
7176bb2a47 allow partial deallocation in page level allocation tracker 2016-10-14 21:34:30 +09:00
a6bd98cc02 MM: memory leak tracker for page level allocator 2016-10-14 21:34:30 +09:00
0f7462ae1c mm.h: eliminate global pa_allocator 2016-10-14 21:34:30 +09:00
0d8d915d82 fix KMALLOC_MIN_SIZE macro 2016-10-14 21:34:30 +09:00
8f4f68b877 eliminate arch_alloc_page() and move ihk_mc_alloc_pages() to arch independent code 2016-10-14 21:34:30 +09:00
8c0a5a5e61 page_hash_count_pages(): report page hash size in memory stat 2016-10-14 21:34:30 +09:00
ffd3f53785 page_unmap(): proper locking of hash table 2016-10-14 21:34:30 +09:00
f39fa54c39 NUMA: default policy: allocate from CPU's NUMA node 2016-10-14 21:34:30 +09:00
11125b0d68 fileobj and shmemobj: delete unused variables 2016-10-14 21:34:30 +09:00
3ae69d1290 NUMA: process CPU NUMA information 2016-10-14 21:34:30 +09:00
2929fbb803 NUMA: support multiple physical allocators 2016-10-14 21:34:30 +09:00
f4db8b96de fileobj/shmobj: release pages correctly according to dynamic page frame management 2016-10-14 21:34:30 +09:00
8eb3bf3559 physical page management: eliminate static page frame array and
maintain page structures dynamically covering only file mappings.
use hash table for address <-> page structure conversion.
2016-10-14 21:34:29 +09:00
326a4fcee4 mem_init(): parse NUMA information 2016-10-14 21:34:29 +09:00
9b82f1a52c use ihk_mc_alloc/free_pages() and eliminate direct calls to low level routines 2016-10-14 21:34:29 +09:00
f3da381752 ihk_mc_unmap_virtual: add flush_tlb_single
refs #778
2016-10-11 14:44:23 +09:00
8aa589a40c A signal may not sometimes arrive to a thread. 2016-10-04 14:35:25 +09:00
e03f377326 interrupt_syscall: interrupt valid thread 2016-10-03 00:49:56 +09:00
8d21846562 mcoverlayfs: supported Linux kernel 4.0 or rhel kernel 3.10.0-327
add mcoverlayfs(linux-3.10.0-327.36.1.el7 base)
2016-09-30 14:55:36 +09:00
3e1367caa1 mcoverlayfs: move mcoverlayfs(linux-4.0.9 base) to executer/kernel/mcoverlayfs/linux-4.0.9 2016-09-30 13:48:55 +09:00
02536b7724 Merge remote-tracking branch 'remotes/origin/ikc2'
Conflicts:
	executer/kernel/mcctrl/syscall.c
It is resolved.
2016-09-27 11:48:12 +09:00
e28725884f fix debug print 2016-09-19 17:29:41 +09:00
c2b3fb7236 Modify interrupt load balancing policy on reboot/stop
* Fix the timing of stopping irqbalance when booting McKernel
2016-09-16 19:07:07 +09:00
2f95f7cda8 Modify interrupt load balancing policy on reboot/stop
When rebooting:
1. Stop irqbalance
2. Modify /proc/irq/*/smp_affinity so that McKernel cores are not
   included
3. Start irqbalance with McKernel cores and IHK IRQ banned from
   load balancing

When stopping:
1. Stop irqbalance
2. Restore /proc/irq/*/smp_affinity
3. Restart irqbalance with the system default settings

refs #760
2016-09-16 13:04:24 +09:00
e551aa17ed execve: do not search command PATH 2016-09-14 22:22:18 +09:00
e6d4c160cd mcexec: fix how to look for command
refs #754
2016-09-13 15:56:58 +09:00
9390fe5d2c signal: send signal to thread using thread-id. not cpu-id 2016-09-12 15:43:29 +09:00
419f5e495b set*[ug]id: propagate credentials to thread pool 2016-09-12 15:40:33 +09:00
673deadf37 fix syscall return type 2016-09-12 15:40:06 +09:00
20ea65b38c fix some vDSO bugs.
- vDSO sometimes becomes invalid.
- vDSO is not succeeded for child process.
- vDSO becomes invalid when execve.
refs #744
2016-09-04 23:13:00 +09:00
84665ff699 do_page_fault_process_vm(): fix error msg format that could cause another PF 2016-09-04 10:59:50 +09:00
bfbc94dfb0 mcctrl+mcexec: fix per-proc data allocation for fork() 2016-09-02 15:08:00 +09:00
f74dcfc2a1 Modify mcreboot.sh for job scheduler
1. Don't complain when logname command doesn't exist
2016-09-01 19:27:18 +09:00
7c562d0539 support madvise(MADV_DONTFORK) 2016-09-01 11:22:53 +09:00
b5e4459a34 support AVX-512 registers 2016-08-30 18:39:33 +09:00
782122b681 mcctrl: fix to rus_vm_fault() call by kworker process 2016-08-22 13:00:28 +09:00
d550bced78 kmalloc(): use macros to define size alignment 2016-08-19 12:51:28 +09:00
a7ee3f531b sched_setaffinity(): error handling for invalid input 2016-08-19 11:52:44 +09:00
b9439947a7 kmalloc(): re-implementation of memory leak tracking 2016-08-19 11:52:00 +09:00
3b60a95f13 kmalloc()/kfree() re-implementation 2016-08-18 21:51:36 +09:00
82ae6d7458 query_free_mem_interrupt_handler(): report number of free pages as kmsg 2016-08-18 14:52:05 +09:00
7ebc34ddcc do_fork(): fix tids memory leak; additional sanity checks 2016-08-18 14:31:52 +09:00
bd6a2c2311 sys_mmap(): correct initial address check 2016-08-18 07:32:31 +09:00
5fd68eae54 PF handler: fix up various error msgs 2016-08-18 07:31:25 +09:00
f5857cfc9e MM: use ihk_mc_{alloc/free}_pages() everywhere and fix free_pages() on kmalloc()ed object bug 2016-08-17 18:02:05 +09:00
1ce1b17a85 Specify facility used by mcklogd via option
1. You can specify facility through -f option of mcreboot.sh.
   Example:
   mcreboot.sh -k 1 -f LOG_LOCAL6
   Note that you need to specify "-k 1" or "-k 2" to start mcklogd.
2. Kill mcklogd if needed in mcreboot.sh and mcstop+release.sh.
2016-08-17 17:52:44 +09:00
a2456c3ed2 Modify mcstop+release.sh for job scheduler
1. Remove ihk.ko
2. Output message to stderr and return one on error
2016-08-17 17:32:06 +09:00
01d2ea1605 do_munmap(): do TLB flush per address in remote_tlb_flush_cpu_mask() 2016-08-17 15:08:30 +09:00
15783f09a0 Modify mcreboot.sh for job scheduler
1. Add an option to specify owner of device files
2. Output message to stderr and return one on error
2016-08-17 15:07:13 +09:00
9efd568e07 do_mmap(): simplify demand paging flags; avoid zeroobj and allocate pages directly 2016-08-17 14:00:05 +09:00
1a207e19c2 clean up a couple of debug messages 2016-08-17 13:55:36 +09:00
73cf93727b clone(): use CAS for TID allocation 2016-08-16 14:18:58 +09:00
4410e702d9 devobj: fix memory leak for device file mapping 2016-08-16 14:17:59 +09:00
f584e2ec25 increase kernel stack size and eliminate unused waitq declaration in do_syscall() 2016-08-16 09:20:55 +09:00
3aa06444f4 do_syscall(): allow descheduling threads in offloaded syscalls if CPU core oversubscribed 2016-08-16 08:58:22 +09:00
c897a56c34 __notify_syscall_requester(): use CAS or IKC to notify syscall completion 2016-08-16 08:56:05 +09:00
5e9957da0f syscall_response: introduction of req_thread_status field 2016-08-16 08:53:41 +09:00
6ff2d4abe7 mcctrl: store per-process data in hash table 2016-08-15 13:47:57 +09:00
e4239f1885 mcexec: use 16 threads initially in offload handler pool 2016-08-14 14:29:10 +09:00
fbbaaf5b54 mcctrl: use GFP_ATOMIC in atomic context 2016-08-14 14:28:21 +09:00
3fa3920bb3 fix a couple of debug msgs 2016-08-14 11:30:17 +09:00
45e51fcc07 mcctrl: fix padding for 128bytes SCD message 2016-08-14 11:29:02 +09:00
0884e3d543 IHK-IKC: map queue in McKernel as cacheable 2016-08-14 11:16:40 +09:00
e3c7c9b890 mcctrl: separate waiting threads and pending requests 2016-08-12 21:52:13 +09:00
f4155cc9e8 mcstop+release-smp-x86.sh: fix OS instance discovery bug 2016-08-12 12:27:04 +09:00
a01ae91051 mcctrl: use IKC packet pools 2016-08-12 12:26:14 +09:00
daca522d25 mcctrl: move kmalloc/kfree of wait queue head out of fast path 2016-08-12 10:18:58 +09:00
ec521feb15 do_syscall(): remove invalid reference 2016-08-09 17:16:47 +09:00
d7bc947a02 mcctrl: redesign mcctrl_channels for IKC packet based syscall offloading 2016-08-09 16:49:42 +09:00
fb84d4ef11 mcctrl: thread pool based system call offload handling 2016-08-08 19:43:05 +09:00
5fbeee953a mcctrl: clean up syscall offload wait code 2016-08-07 20:55:36 +09:00
4cefb4333f mcctrl: use atomic malloc in IRQ context 2016-08-06 08:54:55 +09:00
689da07ac6 ihk_mc_ikc_init_first_local(): hold ref to master channel 2016-08-06 08:52:14 +09:00
76981bcc18 mcctrl: move procfs TID processing into dedicated work queue 2016-08-04 15:22:40 +09:00
6aae35cb3d process: transfer TIDs in bulk and reuse them locally 2016-08-02 16:59:04 +09:00
dac6f2883e mcctrl procfs: use semaphores instead of spinlocks to avoid sleeping in GFP_KERNEL kmalloc() in atomic context 2016-08-01 20:33:51 +09:00
c484f766fa schedule(): schedule a sleeping processes if it has pending signals 2016-07-28 11:42:00 +09:00
57690479bd read/patch_process_vm(): map non-LWK physical addresses properly 2016-07-22 20:48:54 +09:00
d0539a9cac eclair: make idle threads visible 2016-07-22 18:06:11 +09:00
4c8f583c0c split_large_page(): avoid panic when splitting "non-mapped" large pages 2016-07-14 17:11:52 +09:00
6118faffa9 pager_req_pfn(): use FAULT_FLAG_USER only if defined 2016-07-13 18:05:31 +09:00
dad6470c60 clone_thread: fork(2) copy sigstack infos from parent 2016-07-13 16:15:01 +09:00
46c37fc8f3 setfsgid: fix to didn't change fsgid 2016-07-13 15:54:52 +09:00
f6908f21a8 do_kill: wake PS_INTERRUPTIBLE process when send SIGKILL
sched_wakeup_thread: don't change process status if process status is PS_EXITED
2016-07-13 14:06:32 +09:00
01d9d9a5ba devobj: allow arbitrary size device file mappings 2016-07-12 17:02:19 +09:00
c43d993a4d mcstop+release-smp-x86.sh.in: unload mcctrl after OS shutdown 2016-07-11 16:40:06 +09:00
7d9bbecd7a mcctrl: use IHK OS notifiers to establish/tear down syscall channels
This patch eliminates the need for rmmod/insmod the mcctrl module
every time an OS instance is rebooted.
2016-07-11 16:22:50 +09:00
d135731398 do_syscall(): allow schedule for another thread (Intel MPI+OpenMP issue) 2016-07-05 18:54:51 +09:00
5c190beb04 save fpregs when to call sighandler
refs #50
2016-07-05 15:26:00 +09:00
fc66556f9f mcexec: error handling and propagation 2016-06-24 15:35:38 -07:00
648bacc90f device file mappings: communicate map flags and fault missing translations 2016-06-24 12:44:59 -07:00
dd37443fc7 PAPI support: performance counter's overflow.
and support mckfd fcntl.
2016-06-24 13:50:12 +09:00
e34322702a x86_init_perfctr: discover perf counters dynamically from MSRs 2016-06-22 10:47:57 -07:00
e12997e6a9 mcreboot: support for CPU cores (-c) and memory (-m) arguments 2016-06-21 09:10:06 -07:00
fabaa806d3 Revert "Make executor code include executer/config.h": breaks out-of-tree compile
This reverts commit d90900b6e6.
2016-06-21 08:51:45 +09:00
a83ad620c8 devobj: allow read only device file mappings (OFED 3.3 support) 2016-06-21 06:57:59 +09:00
d90900b6e6 Make executor code include executer/config.h
Make the code "executer/kernel/mcctrl/arch/x86_64/archdeps.c"
to include "executer/config.h" instead of
non-existent "executer/kernel/mcctrl/config.h".
2016-06-09 18:40:39 +09:00
6d9a88e9f4 binfmt_mcexec: support post-K specification 2016-06-08 09:53:39 +09:00
d0ee60f9e3 mcoverlayfs: supported only Linux kernel 4.0 2016-06-03 18:36:55 +09:00
14ec92518e KVM support: detect KVM and avoid touching unimplemented MSRs 2016-05-26 01:11:08 +09:00
435e2bdeb4 support for Linux 4.6: use get_user_pages_remote() 2016-05-24 09:39:04 +09:00
f06d8041e3 don't send SIGCONT when sending SIGSTOP derived from PTRACE_ATTACH
refs #747
2016-05-19 10:54:12 +09:00
9b35eaca42 remote_flush_tlb_cpumask() dead locking
refs #728
2016-05-10 14:02:25 +09:00
130b1f4327 update PAPI support. other process and child process monitoring. 2016-04-26 19:01:47 +09:00
921280f85c Docker support: use task_XX_vnr() functions for accessing correct namespace 2016-04-21 09:59:49 -07:00
d4a0b32f06 support large pages 2016-04-21 23:22:55 +09:00
b3bec32e99 update_process_page_table: refactor 2016-04-21 23:22:55 +09:00
2048980820 remove ihk_mc_pt_alloc_range() 2016-04-21 23:22:54 +09:00
176f6d23a9 ihk_mc_pt_virt_to_pagemap: refactor 2016-04-21 23:22:54 +09:00
328175547f Revert "fix REQ-37: remap_one_page: remove to check page size"
This reverts commit 6790126a23.

- reverted commit should remove a 'pgsize' check in remap_one_page()
  instead of a 'pgsize' check in pte_make_fileoff().
- In IA-32e, PTE format varies with page size. Therefore 'pgsize'
  parameter of pte_make_fileoff() is preferable.
2016-04-21 23:22:54 +09:00
e2e0fad849 arch_clear_host_user_space: set zero to args[2]
to avoid duplicated per_proc_list entry.
2016-04-21 23:22:54 +09:00
397bf3f4a6 wait_zombie: don't wait attached process
refs #726
2016-04-21 20:28:36 +09:00
aa77228453 resupport ptrace(PTRACE_ATTACH)
refs #733
2016-04-21 20:13:27 +09:00
82cb8f95ed update PAPI support. 2016-04-18 13:07:45 +09:00
3f2b4e7282 do_wait: unlink child from children_list if child terminated
refs #724
2016-04-14 10:25:12 +09:00
d6784bb4a5 update auto-generated files 2016-04-11 22:25:53 +09:00
1bb948f43b hwloc support 2016-04-11 22:25:27 +09:00
2a1823d52c vdso: set enable bit of pvti_msr 2016-04-11 22:20:39 +09:00
89943dc5ba vdso: set physical address at pvti_msr 2016-04-11 22:20:39 +09:00
fceb02a44a vdso: add zero clear for pvti 2016-04-11 22:20:38 +09:00
7298d8e179 vdso: correct pvti array element type
struct pvclock_vsyscall_time_info <-- struct pvclock_vcpu_time_info
2016-04-11 22:20:38 +09:00
6f32544dde vdso: add static cast 2016-04-11 22:20:38 +09:00
10d248b3cc mcexec: include config.h 2016-04-11 22:20:38 +09:00
fb32120659 make mcoverlayfs optional (default: enabled) 2016-04-02 15:43:35 -04:00
73de203c16 update auto-generated files 2016-03-28 22:57:45 +09:00
41bb2ab5e6 support vdso which borrows clocksource from linux 2016-03-28 22:57:44 +09:00
a587c8f5e5 x86: encode cpu# in IA32_TSC_AUX and size of GDTe#15 2016-03-28 22:57:44 +09:00
0c53a5ca35 add NOPHYS which means no physical memory 2016-03-28 22:57:44 +09:00
c760a01a79 add pte_get_attr() 2016-03-28 22:57:44 +09:00
a2c29e8abf correct the value of tod_data.origin
tod_data.origin should hold a time when TSC is zero.
2016-03-28 22:57:39 +09:00
18add6a9bd shmctl(IPC_RMID): fix wrong owner/creator checking (revised)
Don't check owner/creator of the segment in case of superuser.
2016-03-28 16:02:24 +09:00
a083e6c2bf Revert "shmctl(IPC_RMID): fix wrong owner/creator checking"
This reverts commit 8b5b075f4c.

The reverted commit modifies IPC_SET instead of IPC_RMID.
2016-03-28 16:00:39 +09:00
a2548f5421 Revert "fix REQ-42"
This reverts commit 4a0682bbc1.

The reverted commit appears to be wrong, for example:
- arch_range_check()'s arguments and parameters are mismatch.
- arch_range_check() implementation is not checking range.

Conflicts:
	kernel/syscall.c
2016-03-28 13:51:57 +09:00
6790126a23 fix REQ-37: remap_one_page: remove to check page size 2016-03-27 14:05:00 +09:00
1195549f41 fix REQ-19: some syscalls change how to access user space 2016-03-27 11:43:53 +09:00
b0096a2740 fix REQ-51 2016-03-26 12:23:51 +09:00
a11479eba8 fix REQ-48 2016-03-25 13:05:53 +09:00
12eaea401e fix REQ-46 2016-03-25 12:59:18 +09:00
31595b7409 fix REQ-43 2016-03-25 12:57:31 +09:00
4a0682bbc1 fix REQ-42 2016-03-24 19:14:50 +09:00
932a287437 fix REQ-40 2016-03-24 13:46:13 +09:00
670741ae40 fix REQ-39 2016-03-24 13:45:15 +09:00
70b27e06ff eclair: change default kernel to ./mckernel.img 2016-03-23 20:00:57 +09:00
4c38ddb623 update auto-generated files 2016-03-23 20:00:57 +09:00
6f00ddced6 move eclair from ihk repository 2016-03-23 20:00:57 +09:00
c0eecd63c9 update auto-generated files 2016-03-23 20:00:57 +09:00
1fd0b03e78 move config.h.in
from executer/kernel/mcctrl/config.h.in
to   executer/config.h.in
2016-03-23 20:00:57 +09:00
6c59de9300 expand AC_PROT_CC only once 2016-03-23 20:00:57 +09:00
b1309a5d53 map PIE at map_end instead of at user_start 2016-03-23 19:14:28 +09:00
489cd6d1a2 refactor prepare_process_ranges_args_envs() 2016-03-23 19:14:28 +09:00
c9cc4330c8 mincore: take into account pages cached in memobj 2016-03-23 19:14:28 +09:00
604f846cd2 mincore: check [start..start+len) is in user region 2016-03-23 19:14:28 +09:00
e939cf6862 mincore: cosmetic changes 2016-03-23 19:14:28 +09:00
72f2e5ebe0 shmobj: implement lookup_page method 2016-03-23 19:14:28 +09:00
bd7dddd415 fileobj: implement lookup_page method 2016-03-23 19:14:28 +09:00
fbd9dc878b memobj: add lookup_page method 2016-03-23 19:14:28 +09:00
d6c51ff997 treat memory devices as regular files,
to enable processes to mmap() /dev/zero
2016-03-23 19:14:27 +09:00
86ac51157c add error checks to shmctl(SHM_UNLOCK) 2016-03-23 19:14:27 +09:00
b73fa2b972 add error checks to shmctl(SHM_LOCK) 2016-03-23 19:14:27 +09:00
798f69bceb add has_cap_ipc_lock() 2016-03-23 19:14:27 +09:00
e8be52a1ff shm: trace the amount of locked segment per user 2016-03-23 19:14:27 +09:00
8b5b075f4c shmctl(IPC_RMID): fix wrong owner/creator checking
Don't check owner/creator of the segment in case of superuser.
2016-03-23 19:14:27 +09:00
b214fc278a add has_cap_sys_admin() 2016-03-23 19:14:27 +09:00
b3ae7f46bd add rlim_t (a type of rlim_cur and rlim_max) 2016-03-23 19:14:27 +09:00
48167d3223 shmget: add "shmflg" checks for SHM_HUGE* 2016-03-23 19:14:27 +09:00
d65135c040 move sys_shmget() into arch-dependent code 2016-03-23 19:14:27 +09:00
1761acc4c3 eliminate geteuid(), getegid() and getpid() 2016-03-23 19:04:32 +09:00
d4d93df032 mmap: add "flags" checks for MAP_HUGE* 2016-03-23 19:04:32 +09:00
261bddb999 add a member pgshift into struct vm_range
pgshift indicates a page size in the range.
2016-03-23 19:04:32 +09:00
1a3bc851af mprotect: return -ENOMEM if specified range is out of range 2016-03-23 19:04:32 +09:00
15f572ef9c mmap: return -ENOMEM if specified range is out of range 2016-03-23 19:04:32 +09:00
81690c5b5a mmap: cosmetic changes 2016-03-23 19:04:32 +09:00
832c0f9afd refactor copy_user_ranges() 2016-03-23 19:04:32 +09:00
f92cac7751 add type casting to the argument of getlong_user() 2016-03-23 19:04:32 +09:00
e74eb1dd51 add some prototypes to <memory.h> 2016-03-23 19:04:32 +09:00
8f7b9072ea refactor some copyin/copyout functions
- copy_from_user()
- getlong_user()
- getint_user()
- copy_to_user()
- setlong_user()
- setint_user()
2016-03-23 19:04:32 +09:00
4595aa3079 pte_visitor_t(): change "pgsize" into "pgshift" 2016-03-23 19:04:32 +09:00
807d294ac4 signalfd4: fix initialize 2016-06-03 20:58:02 +09:00
c947dd0d49 sysfs: support /sys/devices/system/cpu/online 2016-03-22 20:25:34 +09:00
d192e6c0fe modify PAPI support 2016-03-22 15:52:59 +09:00
7dbbcb362f add PAPI support 2016-03-22 15:27:19 +09:00
593cf98015 add ACSL annotation 2016-03-16 15:42:32 +09:00
8dd9f5ef3f support profil 2016-03-12 16:47:19 +09:00
0eaf058a4f mcexec: -lrt to Makefile.in for supporting clock_gettime() on SUSE 2016-03-12 05:24:14 +09:00
1aac2c8e23 add CPU timer initialization (refs #402)
There is no actual initialization in x86 now.
The initialization rely on hardware reset and Linux initialization.
2016-03-11 19:20:37 +09:00
70e8dd7979 remove initialization of TSC (refs #362) 2016-03-11 19:17:29 +09:00
eb0700359b fix REQ-36 2016-03-10 10:33:38 +09:00
3f16a9443e ptrace_report_signal: save debug regs before to send SIGCHLD to tracer 2016-03-09 22:29:51 +09:00
bf0cf0a346 fix REQ-31 2016-03-08 15:19:03 +09:00
14b868907b fix REQ-27 2016-03-07 18:52:08 +09:00
dbc778e4fa support getrusage (work in progress) 2016-03-07 17:06:44 +09:00
7fac03d4de sysfs: support /sys/devices/system/cpu/offline,online,possible,present 2016-03-04 13:48:06 +09:00
26c0180374 rwlock_reader_lock: fix lock list jammed up 2016-03-03 22:47:48 +09:00
8ebb3a4231 schedule: migration free last thread if terminated 2016-03-03 22:44:44 +09:00
f1f1ba9c8c mcs_rwlock_reader_lock: temporary fix 2016-03-01 19:11:42 +09:00
6ce00b5f0f sysfs: samples of snooping ops 2016-02-29 19:59:04 +09:00
4ec0e02a89 sysfs: add snooping ops 2016-02-29 19:23:01 +09:00
8f9192ac36 mcctrl: workaround for out-of-tree build (2/2)
- update auto-generated file
2016-02-29 19:18:08 +09:00
80ce123ab6 mcctrl: workaround for out-of-tree build (1/2) 2016-02-29 19:18:08 +09:00
1dc8513cd3 fix REQ-20 2016-02-26 16:18:30 +09:00
b0054643c0 REQ-18 2016-02-26 16:17:23 +09:00
972ff73ecf mcexec: fix readlink
refs #692
2016-02-25 16:08:42 +09:00
1f8a859b47 mcctrl: update auto-generated files 2016-02-24 21:34:48 +09:00
2601d8a36f mcctrl: use zap_page_range() instead of madvise() 2016-02-24 21:34:48 +09:00
a713c2fcaa fix REQ-16 2016-02-24 20:58:07 +09:00
c4c5e435cc fix REQ-12 2016-02-24 20:57:45 +09:00
853b56c784 mcreboot-smp-x86.sh: add mount to create /tmp/mcos/linux_proc from /proc 2016-02-24 19:24:37 +09:00
863a5c5e5f fix REQ-2, REQ-6, REQ-8 2016-02-23 16:32:17 +09:00
ebce1cb031 Merge branch 'master' of postpeta.pccluster.org:mckernel 2016-02-22 13:34:00 +09:00
fff7744907 mcklogd support 2016-02-22 13:32:20 +09:00
27c3ed7e96 remove debug print 2016-02-21 15:17:42 +09:00
e2b28da32f signal handler support gdb stepi command 2016-02-21 14:55:34 +09:00
2c50b716fd support setitimer/getitimer 2016-02-19 15:25:05 +09:00
307b2b8da5 clock_gettime: support clock_id CLOCK_PROCESS_CPUTIME_ID and CLOCK_THREAD_CPUTIME_ID 2016-02-18 17:43:13 +09:00
eba2be8a35 support times 2016-02-18 13:14:18 +09:00
a997af71be support tkill
refs #664
2016-02-17 12:48:12 +09:00
e7c37b8000 mcreboot-smp-x86.sh: fix Failed to mount /sys/devices/virtual/mcos/mcos0/sys 2016-02-16 16:05:40 +09:00
8c40f94aa8 /proc/<PID>/mem: support read/write 2016-02-16 13:21:29 +09:00
da13bd408a mcexec: add to initialize some structures (REQ-56)
refs #718
2016-02-15 18:20:58 +09:00
c328d26b8d procfs(/proc/<PID>/task/<TID>/stat): fix memory corruption
refs #722
2016-02-15 15:10:00 +09:00
6cda6792a9 process_msg_init_acked: don't use PA 2016-02-14 22:47:52 +09:00
2d3fda1d0b flatten_strings: fix align (REQ-1) 2016-02-14 22:36:58 +09:00
5d43c135db procfs: (temporary fix) unsupported files are closed 2016-02-10 17:10:54 +09:00
a866192db7 refactoring /proc 2016-02-10 08:11:02 +09:00
c0cc6ac6db Add skeleton for perf_event_open. 2016-02-09 14:54:53 +09:00
14c5bc08c2 mcexec: check Linux version from actual kernel tree instead of system wide include 2016-02-09 14:07:08 +09:00
7f01d273d0 mcctrl: fix out-of-tree build (not finding config.h) 2016-02-09 12:45:58 +09:00
137e0a799c mcexec: unshare and mount request through mcctrl 2016-02-08 16:27:03 +09:00
f214ff1b57 mcctrl: add MCEXEC_UP_SYS_MOUNT, MCEXEC_UP_SYS_UNSHARE 2016-02-08 16:00:52 +09:00
0ce698eb1f mcexec: support for /sys mounted by mcoverlayfs 2016-02-08 11:36:03 +09:00
e601248bdc procfs: fix mcos%d/PID/auxv size 2016-02-08 09:38:27 +09:00
d8eeab9b89 mcoverlayfs: enable out of tree compilation 2016-02-01 00:35:53 +09:00
fdf031ac16 procfs: chown procfs entries (temporary hack)
refs #651
refs #699
2016-01-28 16:29:46 +09:00
1ffe740153 sysfs sample 2016-01-26 18:08:25 +09:00
72968d613e support sysfs interface for mcctrl 2016-01-26 18:08:25 +09:00
2e98f875c3 sysfs: attempt to remove empty directories only 2016-01-26 18:08:25 +09:00
a6cb9a6b93 sysfs: lookup_i(): refactoring 2016-01-26 18:08:25 +09:00
da0a91b9f7 mcctrl: denote full path in /proc/PID/exe 2016-01-26 16:21:52 +09:00
f093786bec x86: populating PML4e and PDPTe is now lock-free 2016-01-25 09:17:06 +09:00
368f155328 sigaction: support SA_NODEFER
refs #698
2016-01-21 18:48:10 +09:00
425f920013 mcctrl: delete procfs entries recursively to avoid leaking 2016-01-21 18:15:59 +09:00
dbddf37579 set termsig to mcexec spawned process 2016-01-21 12:08:47 +09:00
fa7a5ccd11 support /proc/self/exe (needed for GDB to attach to an existing process) 2016-01-19 18:23:02 +09:00
172bf0a389 sched_setaffinity: add permission check 2016-01-15 12:05:18 +09:00
9bafd166e3 futex: support FUTEX_CLOCK_REALTIME 2016-01-14 16:18:49 +09:00
2e31b8abd1 clock_gettime: clock_id != CLOCK_REALTIME -> offload to linux 2016-01-13 14:04:06 +09:00
a42ee00101 NR_execve: initialize local variable 'shell'
refs #696
2016-01-13 11:16:19 +09:00
f6935b0869 ptrace_setsiginfo: update received siginfo 2016-01-11 17:37:29 +09:00
03a7763a5e ptrace_conf: set received siginfo to default siginfo 2016-01-11 17:10:30 +09:00
3a2f7b0106 clone: support CLONE_PARENT 2016-01-11 16:49:02 +09:00
2819ec2197 fix extra copy which might cause page faults 2016-01-06 21:12:57 +09:00
f7d81a9281 fix typo 2016-01-06 21:12:57 +09:00
914faf042d add missing kfree() for channel lookup table 2016-01-06 21:12:57 +09:00
75c6a94839 delete struct member 'type' from address_space structure 2016-01-06 20:17:00 +09:00
f7b5b48266 support x2apic 2016-01-06 13:53:02 +09:00
f9bd83c726 ptrace: fix PTRACE_GETREGSET, PTRACE_SETREGSET bug
refs #608
2015-12-28 19:45:50 +09:00
edc275ce4f delete free_list_lock 2015-12-28 11:31:42 +09:00
d00ea61d1a ptrace_wakeup_sig: fix thread lock 2015-12-28 10:33:07 +09:00
01117e92c9 append file path to symlink if link path is absolute
refs #643
2015-12-25 15:50:39 +09:00
d477096cb0 getrlimit, setrlimit: offload to linux when an unknown parameter was specified
refs #660
2015-12-25 15:35:33 +09:00
f44ddfa3b3 support sigtimedwait 2015-12-24 12:35:45 +09:00
e0acd254b1 do_process_vm_read_writev: use process hash for remote process search 2015-12-22 09:47:00 +09:00
d0507f7e9f process_read/write_vm(): fix LTP bugs 2015-12-18 15:58:51 +09:00
0f8b2aba22 reset signal handlers when execve called 2015-12-18 12:46:53 +09:00
7e5c7445e2 fix ptrace_detach bug
refs #662
2015-12-16 17:41:57 +09:00
a055fb525d sysfs sample 2015-12-16 13:42:30 +09:00
8cb72df663 support McKernel's sysfs tree 2015-12-16 13:42:30 +09:00
e805249651 add strrchr() 2015-12-16 13:42:30 +09:00
06a7889e1f chown root mcexec 2015-12-15 16:22:14 +09:00
20deed09f0 mcexec: support for /proc mounted by mcoverlayfs 2015-12-14 14:47:05 +09:00
bb81f84709 support PIE executable for PVAS 2015-12-14 11:05:28 +09:00
5c1dad1660 GDB: async-shell.exp
refs #650
2015-11-26 17:07:13 +09:00
7f2220b8e9 set '\0' termination to readlink result.
refs #643
2015-11-26 16:58:15 +09:00
65dda3f24e mcoverlayfs: support mount options(nocopyupw, nofscheck) 2015-11-25 15:34:58 +09:00
544971d665 modify for PVAS 2015-11-25 14:27:20 +09:00
dbddab4356 mcoverlayfs: add overlayfs of the original(kernel 4.0.9) 2015-11-25 13:23:49 +09:00
12eb8a9bb0 mcctrl: move mcctrl to executer/kernel/mcctrl 2015-11-24 15:42:04 +09:00
828a3ea57a futex(): support for cross address-space futexes 2015-11-24 14:58:04 +09:00
eb6de9d1de delete debug code 2015-11-13 15:10:14 +09:00
42c8ef6539 do_fork(): fix CLONE_PARENT_SETTID bug 2015-11-13 12:46:09 +09:00
780d4fc29b futex_wait(): support for FUTEX_CLOCK_REALTIME 2015-11-13 12:46:02 +09:00
94fcc5bb9a futex_wait: add to check signal 2015-11-12 09:38:36 +09:00
e822fc47dd fix dead locking when kill subthreads 2015-11-11 23:03:43 +09:00
26492a2895 vsyscall_gettimeofday: make timeval from TSC 2015-11-11 19:45:14 +09:00
1a5ff7f535 gettimeofday: gather variables into new struct 2015-11-11 18:31:33 +09:00
4c181d7fc0 smp-x86: add supports for dump analyzer 2015-11-09 16:06:55 +09:00
be78eb752e time_init: fix zero divide on KVM 2015-11-06 19:31:42 +09:00
0ad7c8ac50 nanosleep: fix arguments to be delegated 2015-11-06 19:31:42 +09:00
e9458a6cd3 fix ptrace02 failed 2015-10-30 16:59:03 +09:00
9e3b0b5866 bug fix 'GDB: missing parent-child relationship'
refs #641
2015-10-30 15:06:27 +09:00
0eaa27291a thread: move clear_child_tid, etc. to main structure 2015-10-29 11:01:27 +09:00
0b07dd1b79 support madvise(MADV_REMOVE) partially
This MADV_REMOVE works with a mapping which is
- created with shmat() and
- not sharing memobj with other mappings.
2015-10-28 18:41:28 +09:00
c25f8c7a39 support settimeofday() 2015-10-27 19:21:50 +09:00
9e53ae20d4 add memory barriers
- rmb()
- wmb()
2015-10-27 19:21:50 +09:00
09c9ee58d1 add 64bit atomic operations
- ihk_atomic64_t
- IHK_ATOMIC64_INIT()
- ihk_atomic64_read()
- ihk_atomic64_inc()
2015-10-27 19:21:50 +09:00
153a59a6f4 gettimeofday: avoid per-cpu data in calculation
Because it is difficult to safely update per-cpu data of other cpus in
settimeofday().
2015-10-27 19:21:50 +09:00
cad72a8562 when SIGXCPU or SIGXFSZ, set coredump bit to exit status 2015-10-22 20:57:37 +09:00
343bfbd30a rename back status field 2015-10-22 20:26:50 +09:00
4e4f1208f7 delete unused member 2015-10-19 20:12:26 +09:00
a325a78866 refactoring to send signal 2015-10-15 17:10:02 +09:00
6ae99454da delete debug print 2015-10-15 06:51:41 +09:00
04e193de13 refactoring process structures 2015-10-13 23:04:08 +09:00
2ca46fabfd support reader/writer lock 2015-10-02 14:05:10 +09:00
5b737b499d fix cmpxchgq operand 2015-10-02 14:04:05 +09:00
cb4f3a4d65 take into account args/envs' offset in page
- prepare_process_ranges_args_envs()
2015-10-01 21:08:42 +09:00
51789fcd38 initialize idle_vm for page faults 2015-10-01 21:08:35 +09:00
9f50c5dc3a mcexec_wait_syscall: handle request even if signaled (reworked) 2015-09-29 19:53:40 +09:00
cd905f7ad1 Revert "mcexec_wait_syscall: handle request even if signaled"
This reverts commit d862f345be.
2015-09-29 19:52:36 +09:00
79266f6b97 x86_issue_ipi: keep interrupt disabled while issuing IPI 2015-09-29 19:10:01 +09:00
a666b69c2c make x86_issue_ipi() call wait_icr_idle() 2015-09-29 19:10:01 +09:00
47e8552eba move wait_icr_idle() before x86_issue_ipi() 2015-09-29 19:10:00 +09:00
8dd9175411 schedule: fix null pointer dereference 2015-09-29 19:10:00 +09:00
f08e0c0054 guess whether MSR_PLATFORM_INFO exists or not 2015-09-29 19:10:00 +09:00
d862f345be mcexec_wait_syscall: handle request even if signaled 2015-09-24 21:35:30 +09:00
a14768c49a kmalloc: fix missing unlock on out-of-memory path 2015-09-18 21:26:15 +09:00
56e57775e7 clone: fix error message 2015-09-18 21:26:15 +09:00
b3b752ba41 nanosleep: use copy_from_user instead of direct access 2015-09-17 21:46:32 +09:00
7b32f2f73b nanosleep: fix tscs_rem underflow issue 2015-09-17 21:46:26 +09:00
ea5a1a8693 nanosleep: update *rem whenever signaled 2015-09-17 21:44:49 +09:00
92f8fb2b2b nanosleep: use copy_to_user instead of direct access 2015-09-17 21:44:49 +09:00
a3e440414d nanosleep: cosmetic change 2015-09-17 21:44:49 +09:00
10ba03ccea mcreboot-smp-x86.sh: fix querying free irq 2015-09-17 13:19:07 +09:00
ccb7c30a05 page_fault_handler(): reenable preempt after failed PF when process is exiting 2015-09-17 10:05:32 +09:00
7dfeb8e7ce create demand-paging mapping in case of MAP_SHARED
On current McKernel, only mappings for demand paging can be shared.
Therefore, if MAP_SHARED and MAP_ANONYMOUS are specified and
anon_on_demand is disabled, then mmap(2) should create a mapping which
is for demand paging and is entirely populated with physical pages.
2015-09-16 21:38:00 +09:00
b1b706453f vsyscall: send SIGSEGV to the caller if syscall fails
On CentOS 7 (RHEL 7?), "errno" isn't set when vsyscall_gettimeofday
fails. So, in such case, vsyscall_gettimeofday send SIGSEGV to the
caller to report failure of gettimeofday operation.
2015-09-16 21:37:11 +09:00
bd5708286d make sys_gettimeofday() use copy_to_user() 2015-09-16 21:26:32 +09:00
c8a13cf213 make gettimeofday ignore NULL parameter 2015-09-16 21:26:24 +09:00
5ad0a03d18 make gettimeofday handle second parameter (timezone) 2015-09-16 21:25:29 +09:00
3819eec03f cosmetic changes
- sys_gettimeofday()
2015-09-16 21:13:12 +09:00
40b8587a8a schedule(): sync CPU_FLAG_NEED_RESCHED flag with clone and migrate 2015-09-16 19:22:40 +09:00
e7b1115572 mcreboot-smp-x86.sh: introduction of ihk_ikc_irq_core argument 2015-09-14 17:30:25 +09:00
e1a01803d0 disable demand paging on ANONYMOUS mappings unless anon_on_demand kernel argument is passed 2015-09-14 17:26:37 +09:00
69f4b0e1ad gettimeofday()/nanosleep(): check arguments, return on pending signal 2015-09-14 17:05:30 +09:00
0909a5bed5 tracee context is broken when tracee call execve 2015-09-03 10:05:25 +09:00
9dd224385e When SIGSEGV occurred on a tracee process, a tracee process freezes. 2015-09-01 17:37:56 +09:00
4176c59fd3 using d_path for solution to file path. 2015-08-28 13:01:34 +09:00
afeee5432f When envp is NULL, execve is delayed. 2015-08-28 13:00:45 +09:00
9ae5bcf46e gettimeofday(): an implementation based on CPU invariant TSC support 2015-08-24 23:53:56 +02:00
b8f166e608 mcreboot-smp-x86.sh: handle resource allocation after unloading; mcstop+release-smp-x86.sh 2015-08-22 18:55:53 +09:00
c85a9b99e1 a couple of cosmetic changes of debug messages 2015-08-22 18:53:14 +09:00
7c816a6b73 an implementation of the Mellor-Crummey Scott (MCS) lock 2015-08-20 15:26:52 +09:00
5a0cd3f53f ptrace_detach when exiting
refs #590
2015-08-18 18:03:09 +09:00
9fa62adfe7 execve(): stay compliant with locked context switching 2015-08-10 14:18:11 +09:00
f0ab8ec89a sched_request_migrate(): change CPU flags atomically 2015-08-10 12:45:59 +09:00
f4cc82578d check_need_resched(): no thread migration in IRQ context 2015-08-10 12:43:35 +09:00
9ba40dc0ff schedule(): hold runq lock for the entire duration of context switching
releasing the runq lock after loading page tables but before the actual
context switch can leave execution in an inconsistent state if the current
process is descheduled from an IRQ between these two steps.
this patch holds the runq lock with IRQs disabled and makes the context
switch a single atomic operation.
2015-08-10 12:37:12 +09:00
8d6c97ea5c schedule(): disable auto thread migration 2015-08-07 16:07:31 +09:00
386f59000a mcreboot-smp-x86.sh.in: grant real user rw permission on /dev/mcos* 2015-08-07 13:33:44 +09:00
215cd370a1 ap_init(): clean up AP boot kernel messages 2015-08-07 10:57:59 +09:00
0a0e2c04a0 support for dynamically toggling time sharing when CPU is oversubscribed 2015-08-07 08:51:50 +09:00
aa191b87d3 schedule(): use XSAVE/XRSTOR and swap floating point registers in context switch 2015-08-07 08:41:00 +09:00
d5c243571f cpu_clear_and_set(): atomic CPU mask update in migration code 2015-08-06 10:49:55 +09:00
328e69a335 schedule(): do not preempt while holding spinlocks or while in offloaded syscall 2015-08-06 10:36:13 +09:00
b77755d0f7 obtain_clone_cpuid(): always start from CPU 0 and fill in cores linearly 2015-07-28 20:20:47 +09:00
d7bae14707 TEMPORARY: schedule(): move threads when core is explicitly oversubscribed 2015-07-28 20:12:58 +09:00
4e58d08f5c schedule_timeout(): give a chance to other process in spin sleep if CPU core is oversubscribed 2015-07-28 20:06:56 +09:00
9b1e691588 fix thread migration code (i.e., sched_setaffinity())
- moved migration code into idle() process and updated schedule() to detect
  when a thread has moved to another CPU in order to avoid doing housekeeping
  on behalf of the original one
- start CPU head from core 0
- keeps track of nested interrupts
2015-07-24 20:09:17 +09:00
3988b0fc61 keep track of IRQ context and don't do thread migration there 2015-07-23 16:56:58 +09:00
54eb345847 settid(): prevent modifying tid after thread migration 2015-07-23 16:51:24 +09:00
bbe7aef95b fix calling do_signal (argument lacked) 2015-07-17 10:18:43 +09:00
1ff4cf68c2 support SA_RESTART flag and restart syscall 2015-07-16 16:33:14 +09:00
1bc84d3feb modify to copy credentials 2015-07-13 15:29:26 +09:00
f7d78c8b7d sched_getaffinity(): return EINVAL for 0 length request (fixes LTP sched_getaffinity01) 2015-07-10 11:00:43 +09:00
7647c99cc2 do_migrate(): disable IRQ while holding migq_lock to avoid deadlocking with reschedule interrupts 2015-07-09 15:23:28 +09:00
43a774fbfc sched_setaffinity(): undo target core change, avoid abort on length mismatch 2015-07-09 11:00:26 +09:00
140 changed files with 32152 additions and 7885 deletions

View File

@ -1,9 +1,11 @@
TARGET = @TARGET@
SBINDIR = @SBINDIR@
ETCDIR = @ETCDIR@
MANDIR = @MANDIR@
all::
@(cd executer/kernel; make modules)
@(cd executer/kernel/mcctrl; make modules)
@(cd executer/kernel/mcoverlayfs; make modules)
@(cd executer/user; make)
@case "$(TARGET)" in \
attached-mic | builtin-x86 | builtin-mic | smp-x86) \
@ -16,7 +18,8 @@ all::
esac
install::
@(cd executer/kernel; make install)
@(cd executer/kernel/mcctrl; make install)
@(cd executer/kernel/mcoverlayfs; make install)
@(cd executer/user; make install)
@case "$(TARGET)" in \
attached-mic | builtin-x86 | builtin-mic | smp-x86) \
@ -44,7 +47,11 @@ install::
;; \
smp-x86) \
mkdir -p -m 755 $(SBINDIR); \
install -m 755 arch/x86/tools/mcreboot-smp-x86.sh $(SBINDIR)/mcreboot; \
install -m 755 arch/x86/tools/mcreboot-smp-x86.sh $(SBINDIR)/mcreboot.sh; \
install -m 755 arch/x86/tools/mcstop+release-smp-x86.sh $(SBINDIR)/mcstop+release.sh; \
mkdir -p -m 755 $(ETCDIR); \
install -m 644 arch/x86/tools/irqbalance_mck.service $(ETCDIR)/irqbalance_mck.service; \
install -m 644 arch/x86/tools/irqbalance_mck.in $(ETCDIR)/irqbalance_mck.in; \
mkdir -p -m 755 $(MANDIR)/man1; \
install -m 644 arch/x86/tools/mcreboot.1 $(MANDIR)/man1/mcreboot.1; \
;; \
@ -55,7 +62,8 @@ install::
esac
clean::
@(cd executer/kernel; make clean)
@(cd executer/kernel/mcctrl; make clean)
@(cd executer/kernel/mcoverlayfs; make clean)
@(cd executer/user; make clean)
@case "$(TARGET)" in \
attached-mic | builtin-x86 | builtin-mic | smp-x86) \

View File

@ -10,7 +10,7 @@
* HISTORY
*/
#define X86_CPU_LOCAL_OFFSET_TSS 128
#define X86_CPU_LOCAL_OFFSET_TSS 176
#define X86_TSS_OFFSET_SP0 4
#define X86_CPU_LOCAL_OFFSET_SP0 \
(X86_CPU_LOCAL_OFFSET_TSS + X86_TSS_OFFSET_SP0)

File diff suppressed because it is too large Load Diff

View File

@ -78,11 +78,11 @@ int get_prstatus_size(void)
* \brief Fill a prstatus structure.
*
* \param head A pointer to a note structure.
* \param proc A pointer to the current process structure.
* \param thread A pointer to the current thread structure.
* \param regs0 A pointer to a x86_regs structure.
*/
void fill_prstatus(struct note *head, struct process *proc, void *regs0)
void fill_prstatus(struct note *head, struct thread *thread, void *regs0)
{
void *name;
struct elf_prstatus64 *prstatus;
@ -160,11 +160,11 @@ int get_prpsinfo_size(void)
* \brief Fill a prpsinfo structure.
*
* \param head A pointer to a note structure.
* \param proc A pointer to the current process structure.
* \param thread A pointer to the current thread structure.
* \param regs A pointer to a x86_regs structure.
*/
void fill_prpsinfo(struct note *head, struct process *proc, void *regs)
void fill_prpsinfo(struct note *head, struct thread *thread, void *regs)
{
void *name;
struct elf_prpsinfo64 *prpsinfo;
@ -176,8 +176,8 @@ void fill_prpsinfo(struct note *head, struct process *proc, void *regs)
memcpy(name, "CORE", sizeof("CORE"));
prpsinfo = (struct elf_prpsinfo64 *)(name + align32(sizeof("CORE")));
prpsinfo->pr_state = proc->ftn->status;
prpsinfo->pr_pid = proc->ftn->pid;
prpsinfo->pr_state = thread->status;
prpsinfo->pr_pid = thread->proc->pid;
/*
We leave most of the fields unfilled.
@ -210,11 +210,11 @@ int get_auxv_size(void)
* \brief Fill an AUXV structure.
*
* \param head A pointer to a note structure.
* \param proc A pointer to the current process structure.
* \param thread A pointer to the current thread structure.
* \param regs A pointer to a x86_regs structure.
*/
void fill_auxv(struct note *head, struct process *proc, void *regs)
void fill_auxv(struct note *head, struct thread *thread, void *regs)
{
void *name;
void *auxv;
@ -225,7 +225,7 @@ void fill_auxv(struct note *head, struct process *proc, void *regs)
name = (void *) (head + 1);
memcpy(name, "CORE", sizeof("CORE"));
auxv = name + align32(sizeof("CORE"));
memcpy(auxv, proc->saved_auxv, sizeof(unsigned long) * AUXV_LEN);
memcpy(auxv, thread->proc->saved_auxv, sizeof(unsigned long) * AUXV_LEN);
}
/**
@ -243,23 +243,23 @@ int get_note_size(void)
* \brief Fill the NOTE segment.
*
* \param head A pointer to a note structure.
* \param proc A pointer to the current process structure.
* \param thread A pointer to the current thread structure.
* \param regs A pointer to a x86_regs structure.
*/
void fill_note(void *note, struct process *proc, void *regs)
void fill_note(void *note, struct thread *thread, void *regs)
{
fill_prstatus(note, proc, regs);
fill_prstatus(note, thread, regs);
note += get_prstatus_size();
fill_prpsinfo(note, proc, regs);
fill_prpsinfo(note, thread, regs);
note += get_prpsinfo_size();
fill_auxv(note, proc, regs);
fill_auxv(note, thread, regs);
}
/**
* \brief Generate an image of the core file.
*
* \param proc A pointer to the current process structure.
* \param thread A pointer to the current thread structure.
* \param regs A pointer to a x86_regs structure.
* \param coretable(out) An array of core chunks.
* \param chunks(out) Number of the entries of coretable.
@ -271,7 +271,18 @@ void fill_note(void *note, struct process *proc, void *regs)
* should be zero.
*/
int gencore(struct process *proc, void *regs,
/*@
@ requires \valid(thread);
@ requires \valid(regs);
@ requires \valid(coretable);
@ requires \valid(chunks);
@ behavior success:
@ ensures \result == 0;
@ assigns coretable;
@ behavior failure:
@ ensures \result == -1;
@*/
int gencore(struct thread *thread, void *regs,
struct coretable **coretable, int *chunks)
{
struct coretable *ct = NULL;
@ -279,7 +290,7 @@ int gencore(struct process *proc, void *regs,
Elf64_Phdr *ph = NULL;
void *note = NULL;
struct vm_range *range;
struct process_vm *vm = proc->vm;
struct process_vm *vm = thread->vm;
int segs = 1; /* the first one is for NOTE */
int notesize, phsize, alignednotesize;
unsigned int offset = 0;
@ -306,7 +317,7 @@ int gencore(struct process *proc, void *regs,
unsigned long p, phys;
int prevzero = 0;
for (p = range->start; p < range->end; p += PAGE_SIZE) {
if (ihk_mc_pt_virt_to_phys(proc->vm->page_table,
if (ihk_mc_pt_virt_to_phys(thread->vm->address_space->page_table,
(void *)p, &phys) != 0) {
prevzero = 1;
} else {
@ -326,7 +337,7 @@ int gencore(struct process *proc, void *regs,
dkprintf("we have %d segs and %d chunks.\n\n", segs, *chunks);
{
struct vm_regions region = proc->vm->region;
struct vm_regions region = thread->vm->region;
dkprintf("text: %lx-%lx\n", region.text_start, region.text_end);
dkprintf("data: %lx-%lx\n", region.data_start, region.data_end);
@ -364,7 +375,7 @@ int gencore(struct process *proc, void *regs,
goto fail;
}
memset(note, 0, alignednotesize);
fill_note(note, proc, regs);
fill_note(note, thread, regs);
/* program header for NOTE segment is exceptional */
ph[0].p_type = PT_NOTE;
@ -434,7 +445,7 @@ int gencore(struct process *proc, void *regs,
for (start = p = range->start;
p < range->end; p += PAGE_SIZE) {
if (ihk_mc_pt_virt_to_phys(proc->vm->page_table,
if (ihk_mc_pt_virt_to_phys(thread->vm->address_space->page_table,
(void *)p, &phys) != 0) {
if (prevzero == 0) {
/* We begin a new chunk */
@ -472,9 +483,9 @@ int gencore(struct process *proc, void *regs,
i++;
}
} else {
if ((proc->vm->region.user_start <= range->start) &&
(range->end <= proc->vm->region.user_end)) {
if (ihk_mc_pt_virt_to_phys(proc->vm->page_table,
if ((thread->vm->region.user_start <= range->start) &&
(range->end <= thread->vm->region.user_end)) {
if (ihk_mc_pt_virt_to_phys(thread->vm->address_space->page_table,
(void *)range->start, &phys) != 0) {
dkprintf("could not convert user virtual address %lx"
"to physical address", range->start);
@ -510,6 +521,10 @@ int gencore(struct process *proc, void *regs,
* \param coretable An array of core chunks.
*/
/*@
@ requires \valid(coretable);
@ assigns \nothing;
@*/
void freecore(struct coretable **coretable)
{
struct coretable *ct = *coretable;

View File

@ -0,0 +1,98 @@
/**
* \file arch-bitops.h
* License details are found in the file LICENSE.
* \brief
* Find last set bit in word.
* \author Taku Shimosawa <shimosawa@is.s.u-tokyo.ac.jp> \par
* Copyright (C) 2011 - 2012 Taku Shimosawa
*/
/*
* HISTORY
*/
#ifndef HEADER_X86_COMMON_ARCH_BITOPS_H
#define HEADER_X86_COMMON_ARCH_BITOPS_H
#define ARCH_HAS_FAST_MULTIPLIER 1
/**
 * fls - find last (most significant) set bit in word
 * @x: the word to search
 *
 * The argument is examined as a 32-bit pattern, so negative values have
 * bit 31 set.
 *
 * Returns 0 if @x is 0, otherwise the 1-based position of the most
 * significant set bit (fls(1) == 1, fls(0x80000000) == 32).
 */
static inline int fls(int x)
{
	unsigned int bits = (unsigned int)x;

	if (bits == 0)
		return 0;

	/* clz counts the zero bits above the highest set bit */
	return 32 - __builtin_clz(bits);
}
/**
 * ffs - find first (least significant) set bit in word
 * @x: the word to search
 *
 * Follows the libc / compiler-builtin ffs convention rather than the
 * zero-based convention of the other bit helpers in this file:
 * the result is 0 when @x is 0, otherwise the 1-based position of the
 * least significant set bit (ffs(1) == 1, ffs(8) == 4).
 */
static inline int ffs(int x)
{
	if (x == 0)
		return 0;

	/* ctz yields the zero-based index; the contract is one-based */
	return __builtin_ctz((unsigned int)x) + 1;
}
/**
 * __ffs - find first set bit in word
 * @word: The word to search
 *
 * Returns the zero-based index of the least significant set bit.
 * The result is undefined when @word is 0, so callers must check
 * against 0 first.
 */
static inline unsigned long __ffs(unsigned long word)
{
	return (unsigned long)__builtin_ctzl(word);
}
/**
 * ffz - find first zero bit in word
 * @word: The word to search
 *
 * Returns the zero-based index of the least significant clear bit,
 * found by scanning the complement of @word for its first set bit.
 * The result is undefined when @word has no zero bit, so callers must
 * check against ~0UL first.
 */
static inline unsigned long ffz(unsigned long word)
{
	unsigned long inverted = ~word;

	return (unsigned long)__builtin_ctzl(inverted);
}
/*
 * ADDR: memory operand for the bit instructions below. It expands to the
 * object pointed to by a variable literally named `addr` in the enclosing
 * function, viewed as a volatile long.
 */
#define ADDR (*(volatile long *)addr)
/**
 * set_bit - set a bit in memory with a lock-prefixed instruction
 * @nr: bit number, passed to the instruction as an immediate or register
 * @addr: base address the bit number is relative to
 *
 * Issues "lock; btsl nr, *addr". The "memory" clobber also makes the
 * statement a compiler barrier.
 */
static inline void set_bit(int nr, volatile unsigned long *addr)
{
asm volatile("lock; btsl %1,%0"
: "+m" (ADDR)
: "Ir" (nr)
: "memory");
}
/**
 * clear_bit - clear a bit in memory with a lock-prefixed instruction
 * @nr: bit number, passed to the instruction as an immediate or register
 * @addr: base address the bit number is relative to
 *
 * Issues "lock; btrl nr, *addr" on the ADDR operand (the long that @addr
 * points to). The "memory" clobber also makes the statement a compiler
 * barrier.
 */
static inline void clear_bit(int nr, volatile unsigned long *addr)
{
asm volatile("lock; btrl %1,%0"
: "+m" (ADDR)
: "Ir" (nr)
: "memory");
}
#endif

View File

@ -0,0 +1,67 @@
/**
* \file futex.h
* Licence details are found in the file LICENSE.
*
* \brief
* Futex adaptation to McKernel
*
* \author Balazs Gerofi <bgerofi@riken.jp> \par
* Copyright (C) 2012 RIKEN AICS
*
*
* HISTORY:
*
*/
#ifndef _ARCH_FUTEX_H
#define _ARCH_FUTEX_H
#include <asm.h>
/*
 * __futex_atomic_op1(insn, ret, oldval, uaddr, oparg)
 *
 * Emits the single instruction `insn` (label 1) against the futex word.
 *
 * Operand map seen by `insn`:
 *   %0  oldval  - output register, preloaded with oparg ("0" constraint)
 *   %1  ret     - output register, preloaded with 0     ("1" constraint)
 *   %2  *uaddr  - read/write memory operand
 *   %3  -EFAULT - immediate
 *
 * The .fixup stub at label 3 stores -EFAULT into ret and jumps back to
 * label 2, i.e. just past `insn`. _ASM_EXTABLE(1b, 3b) pairs the
 * instruction with that stub; presumably a fault at label 1 resumes at
 * label 3 -- _ASM_EXTABLE comes from <asm.h> and is not visible here.
 */
#define __futex_atomic_op1(insn, ret, oldval, uaddr, oparg) \
asm volatile("1:\t" insn "\n" \
"2:\t.section .fixup,\"ax\"\n" \
"3:\tmov\t%3, %1\n" \
"\tjmp\t2b\n" \
"\t.previous\n" \
_ASM_EXTABLE(1b, 3b) \
: "=r" (oldval), "=r" (ret), "+m" (*uaddr) \
: "i" (-EFAULT), "0" (oparg), "1" (0))
/*
 * __futex_atomic_op2(insn, ret, oldval, uaddr, oparg)
 *
 * Read-modify-write of the futex word built as a compare-and-swap retry
 * loop:
 *   1: load *uaddr into oldval (constrained to the "a" register) and copy
 *      it into the scratch register `tem`;
 *      run `insn`, supplied by the caller (expected to combine oparg, %4,
 *      into the scratch value, %3);
 *   2: lock cmpxchgl stores the scratch value into *uaddr if *uaddr still
 *      equals oldval; otherwise jump back to 1 and retry.
 *
 * ret is preloaded with 0 ("1" constraint). The .fixup stub at label 4
 * stores -EFAULT (%5) into ret and jumps to label 3, past the loop. Both
 * the load at 1 and the cmpxchg at 2 are registered with _ASM_EXTABLE
 * (defined in <asm.h>, not visible here).
 *
 * NOTE: `tem` is not a macro parameter. The expansion site must have an
 * lvalue named `tem` in scope.
 */
#define __futex_atomic_op2(insn, ret, oldval, uaddr, oparg) \
asm volatile("1:\tmovl %2, %0\n" \
"\tmovl\t%0, %3\n" \
"\t" insn "\n" \
"2:\tlock; cmpxchgl %3, %2\n" \
"\tjnz\t1b\n" \
"3:\t.section .fixup,\"ax\"\n" \
"4:\tmov\t%5, %1\n" \
"\tjmp\t3b\n" \
"\t.previous\n" \
_ASM_EXTABLE(1b, 4b) \
_ASM_EXTABLE(2b, 4b) \
: "=&a" (oldval), "=&r" (ret), \
"+m" (*uaddr), "=&r" (tem) \
: "r" (oparg), "i" (-EFAULT), "1" (0))
/*
 * futex_atomic_cmpxchg_inatomic - compare-and-swap on a futex word
 * @uaddr:  address of the 32-bit futex word
 * @oldval: value the word is expected to hold
 * @newval: value to store if the word equals @oldval
 *
 * Executes "lock; cmpxchgl newval, *uaddr" with @oldval in the "a"
 * register and returns the resulting "a" register contents. That is the
 * value cmpxchg observed in *uaddr, which equals @oldval exactly when the
 * swap took place.
 *
 * The .fixup stub at label 3 overwrites the return register with -EFAULT.
 * When __UACCESS__ is defined, -EFAULT is also returned up front if
 * access_ok() rejects the address. In both cases the error code is
 * returned in-band, so it cannot be told apart from a futex word that
 * really contains that value.
 */
static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval,
int newval)
{
#ifdef __UACCESS__
if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
return -EFAULT;
#endif
asm volatile("1:\tlock; cmpxchgl %3, %1\n"
"2:\t.section .fixup, \"ax\"\n"
"3:\tmov %2, %0\n"
"\tjmp 2b\n"
"\t.previous\n"
_ASM_EXTABLE(1b, 3b)
: "=a" (oldval), "+m" (*uaddr)
: "i" (-EFAULT), "r" (newval), "0" (oldval)
: "memory"
);
return oldval;
}
#endif

View File

@ -5,15 +5,20 @@
#define __HEADER_X86_COMMON_ARCH_LOCK
#include <ihk/cpu.h>
#include <ihk/atomic.h>
//#define DEBUG_SPINLOCK
//#define DEBUG_MCS_RWLOCK
#ifdef DEBUG_SPINLOCK
#if defined(DEBUG_SPINLOCK) || defined(DEBUG_MCS_RWLOCK)
int __kprintf(const char *format, ...);
#endif
typedef int ihk_spinlock_t;
extern void preempt_enable(void);
extern void preempt_disable(void);
#define IHK_STATIC_SPINLOCK_FUNCS
static void ihk_mc_spinlock_init(ihk_spinlock_t *lock)
@ -22,7 +27,17 @@ static void ihk_mc_spinlock_init(ihk_spinlock_t *lock)
}
#define SPIN_LOCK_UNLOCKED 0
static void ihk_mc_spinlock_lock_noirq(ihk_spinlock_t *lock)
/*
 * ihk_mc_spinlock_lock_noirq(l): entry point for
 * __ihk_mc_spinlock_lock_noirq().
 *
 * With DEBUG_SPINLOCK defined, the call is bracketed by __kprintf() trace
 * lines carrying the value of ihk_mc_get_processor_id(), the lock pointer
 * and the caller's file and line. The debug form is wrapped in
 * do { } while (0) so that it is a single statement and stays correct in
 * an unbraced if/else. Note that it evaluates `l` twice.
 *
 * Without DEBUG_SPINLOCK the name is a plain alias of the implementation.
 */
#ifdef DEBUG_SPINLOCK
#define ihk_mc_spinlock_lock_noirq(l) do { \
	__kprintf("[%d] call ihk_mc_spinlock_lock_noirq %p %s:%d\n", ihk_mc_get_processor_id(), (l), __FILE__, __LINE__); \
	__ihk_mc_spinlock_lock_noirq(l); \
	__kprintf("[%d] ret ihk_mc_spinlock_lock_noirq\n", ihk_mc_get_processor_id()); \
} while (0)
#else
#define ihk_mc_spinlock_lock_noirq __ihk_mc_spinlock_lock_noirq
#endif
static void __ihk_mc_spinlock_lock_noirq(ihk_spinlock_t *lock)
{
int inc = 0x00010000;
int tmp;
@ -41,10 +56,8 @@ static void ihk_mc_spinlock_lock_noirq(ihk_spinlock_t *lock)
: "+Q" (inc), "+m" (*lock), "=r" (tmp) : : "memory", "cc");
#endif
#ifdef DEBUG_SPINLOCK
__kprintf("[%d] trying to grab lock: 0x%lX\n",
ihk_mc_get_processor_id(), lock);
#endif
preempt_disable();
asm volatile("lock; xaddl %0, %1\n"
"movzwl %w0, %2\n\t"
"shrl $16, %0\n\t"
@ -60,36 +73,455 @@ static void ihk_mc_spinlock_lock_noirq(ihk_spinlock_t *lock)
:
: "memory", "cc");
#ifdef DEBUG_SPINLOCK
__kprintf("[%d] holding lock: 0x%lX\n", ihk_mc_get_processor_id(), lock);
#endif
}
static unsigned long ihk_mc_spinlock_lock(ihk_spinlock_t *lock)
/*
 * ihk_mc_spinlock_lock(l): entry point for __ihk_mc_spinlock_lock().
 *
 * With DEBUG_SPINLOCK defined this is a GNU statement expression that
 * logs before and after the call through __kprintf() and evaluates to the
 * value returned by __ihk_mc_spinlock_lock(l).
 * NOTE: the debug form declares a local named `rc` and evaluates `l`
 * twice, so an argument expression that itself mentions `rc` or has side
 * effects will misbehave.
 *
 * Without DEBUG_SPINLOCK the name is a plain alias of the implementation.
 */
#ifdef DEBUG_SPINLOCK
#define ihk_mc_spinlock_lock(l) ({ unsigned long rc;\
__kprintf("[%d] call ihk_mc_spinlock_lock %p %s:%d\n", ihk_mc_get_processor_id(), (l), __FILE__, __LINE__); \
rc = __ihk_mc_spinlock_lock(l);\
__kprintf("[%d] ret ihk_mc_spinlock_lock\n", ihk_mc_get_processor_id()); rc;\
})
#else
#define ihk_mc_spinlock_lock __ihk_mc_spinlock_lock
#endif
static unsigned long __ihk_mc_spinlock_lock(ihk_spinlock_t *lock)
{
unsigned long flags;
flags = cpu_disable_interrupt_save();
ihk_mc_spinlock_lock_noirq(lock);
__ihk_mc_spinlock_lock_noirq(lock);
return flags;
}
static void ihk_mc_spinlock_unlock_noirq(ihk_spinlock_t *lock)
/*
 * ihk_mc_spinlock_unlock_noirq(l): entry point for
 * __ihk_mc_spinlock_unlock_noirq().
 *
 * With DEBUG_SPINLOCK defined, the call is bracketed by __kprintf() trace
 * lines (processor id, lock pointer, caller file and line). The debug
 * form is wrapped in do { } while (0) so that it is a single statement
 * and stays correct in an unbraced if/else. Note that it evaluates `l`
 * twice.
 *
 * Without DEBUG_SPINLOCK the name is a plain alias of the implementation.
 */
#ifdef DEBUG_SPINLOCK
#define ihk_mc_spinlock_unlock_noirq(l) do { \
	__kprintf("[%d] call ihk_mc_spinlock_unlock_noirq %p %s:%d\n", ihk_mc_get_processor_id(), (l), __FILE__, __LINE__); \
	__ihk_mc_spinlock_unlock_noirq(l); \
	__kprintf("[%d] ret ihk_mc_spinlock_unlock_noirq\n", ihk_mc_get_processor_id()); \
} while (0)
#else
#define ihk_mc_spinlock_unlock_noirq __ihk_mc_spinlock_unlock_noirq
#endif
/*
 * Release a spinlock without touching the interrupt state.
 *
 * Atomically increments the low 16-bit word of *lock ("lock incw"), then
 * re-enables preemption with preempt_enable().
 * NOTE(review): the low word is presumably the "now serving" half of a
 * ticket lock -- the acquire path adds 0x00010000 with xaddl, but it is
 * only partly visible here; confirm against the full lock routine.
 */
static void __ihk_mc_spinlock_unlock_noirq(ihk_spinlock_t *lock)
{
asm volatile ("lock incw %0" : "+m"(*lock) : : "memory", "cc");
preempt_enable();
}
static void ihk_mc_spinlock_unlock(ihk_spinlock_t *lock, unsigned long flags)
/*
 * ihk_mc_spinlock_unlock(l, f): entry point for
 * __ihk_mc_spinlock_unlock(); `f` is the flags value to hand back to it.
 *
 * With DEBUG_SPINLOCK defined, the call is bracketed by __kprintf() trace
 * lines (processor id, lock pointer, caller file and line). The debug
 * form is wrapped in do { } while (0) so that it is a single statement
 * and stays correct in an unbraced if/else. Note that it evaluates `l`
 * twice.
 *
 * Without DEBUG_SPINLOCK the name is a plain alias of the implementation.
 */
#ifdef DEBUG_SPINLOCK
#define ihk_mc_spinlock_unlock(l, f) do { \
	__kprintf("[%d] call ihk_mc_spinlock_unlock %p %s:%d\n", ihk_mc_get_processor_id(), (l), __FILE__, __LINE__); \
	__ihk_mc_spinlock_unlock((l), (f)); \
	__kprintf("[%d] ret ihk_mc_spinlock_unlock\n", ihk_mc_get_processor_id()); \
} while (0)
#else
#define ihk_mc_spinlock_unlock __ihk_mc_spinlock_unlock
#endif
static void __ihk_mc_spinlock_unlock(ihk_spinlock_t *lock, unsigned long flags)
{
ihk_mc_spinlock_unlock_noirq(lock);
__ihk_mc_spinlock_unlock_noirq(lock);
cpu_restore_interrupt(flags);
#ifdef DEBUG_SPINLOCK
__kprintf("[%d] released lock: 0x%lX\n", ihk_mc_get_processor_id(), lock);
}
/*
 * Mellor-Crummey Scott (MCS) queue lock.
 *
 * The same node type is used for the lock itself and for each waiter's
 * queue entry. Nodes are aligned to 64 bytes.
 */
typedef struct mcs_lock_node {
	unsigned long locked;		/* non-zero while the owner of this node must wait */
	struct mcs_lock_node *next;	/* successor in the queue (queue tail when used as the lock) */
} __attribute__((aligned(64))) mcs_lock_node_t;

/*
 * Put an MCS node into its idle state: no successor, not waiting.
 */
static void mcs_lock_init(struct mcs_lock_node *node)
{
	node->next = NULL;
	node->locked = 0;
}
/*
 * Acquire an MCS lock.
 *
 * \param lock The lock; its `next` field serves as the queue tail pointer.
 * \param node Caller-provided queue entry. It is linked into the queue
 *             here, and mcs_lock_unlock() reads it again on release.
 *
 * The node is swapped in as the new tail with xchg8() (used here as an
 * atomic exchange that returns the previous tail). With no previous tail
 * the lock is taken immediately. Otherwise the node is marked as waiting,
 * linked behind its predecessor, and the caller spins until the
 * predecessor clears node->locked.
 */
static void mcs_lock_lock(struct mcs_lock_node *lock,
struct mcs_lock_node *node)
{
struct mcs_lock_node *pred;
node->next = NULL;
node->locked = 0;
pred = (struct mcs_lock_node *)xchg8((unsigned long *)&lock->next,
(unsigned long)node);
if (pred) {
/* mark ourselves as waiting before the predecessor can see the link */
node->locked = 1;
pred->next = node;
while (node->locked != 0) {
cpu_pause();
}
}
}
/*
 * Release an MCS lock.
 *
 * \param lock The lock; its `next` field serves as the queue tail pointer.
 * \param node The queue entry that was passed to mcs_lock_lock().
 *
 * With no visible successor, the tail is reset from `node` to NULL using
 * atomic_cmpxchg8() (used here as a compare-and-swap returning the
 * previous value). If that succeeds the queue is empty and we are done.
 * If it fails, another CPU has already swapped itself in as the tail but
 * has not yet published the link, so we wait for node->next to appear.
 * Finally the successor is released by clearing its `locked` flag.
 */
static void mcs_lock_unlock(struct mcs_lock_node *lock,
struct mcs_lock_node *node)
{
if (node->next == NULL) {
struct mcs_lock_node *old = (struct mcs_lock_node *)
atomic_cmpxchg8((unsigned long *)&lock->next,
(unsigned long)node, (unsigned long)0);
if (old == node) {
return;
}
/* a successor is enqueuing itself: wait until it links in */
while (node->next == NULL) {
cpu_pause();
}
}
node->next->locked = 0;
}
// reader/writer lock
/*
 * Queue entry of the MCS reader/writer lock. The same structure is also
 * embedded in the lock as the "common reader" node, which stands for all
 * readers currently holding the lock.
 */
typedef struct mcs_rwlock_node {
ihk_atomic_t count; // number of readers (only meaningful in the common reader node)
char type; // node type, one of MCS_RWLOCK_TYPE_*
#define MCS_RWLOCK_TYPE_COMMON_READER 0
#define MCS_RWLOCK_TYPE_READER 1
#define MCS_RWLOCK_TYPE_WRITER 2
char locked; // MCS_RWLOCK_LOCKED while the owner of this node must wait
#define MCS_RWLOCK_LOCKED 1
#define MCS_RWLOCK_UNLOCKED 0
char dmy1; // not read in this file; __mcs_rwlock_reader_lock_noirq() stores the processor id here (truncated to char)
char dmy2; // unused
struct mcs_rwlock_node *next; // successor in the queue
} __attribute__((aligned(64))) mcs_rwlock_node_t;
/*
 * Queue entry for the interrupt-disabling lock variants: a regular node
 * plus the value returned by cpu_disable_interrupt_save() at lock time,
 * which is handed back to cpu_restore_interrupt() at unlock time.
 */
typedef struct mcs_rwlock_node_irqsave {
struct mcs_rwlock_node node;
unsigned long irqsave;
} __attribute__((aligned(64))) mcs_rwlock_node_irqsave_t;
/*
 * The reader/writer lock itself.
 *
 * `reader` is the common reader node: its `count` holds the number of
 * active readers, and its `next` is the head used when readers are
 * released. `node` is the queue tail that lockers swap themselves into;
 * it is NULL when the lock is free.
 */
typedef struct mcs_rwlock_lock {
struct mcs_rwlock_node reader; /* common reader lock */
struct mcs_rwlock_node *node; /* base */
} __attribute__((aligned(64))) mcs_rwlock_lock_t;
static void
mcs_rwlock_init(struct mcs_rwlock_lock *lock)
{
ihk_atomic_set(&lock->reader.count, 0);
lock->reader.type = MCS_RWLOCK_TYPE_COMMON_READER;
lock->node = NULL;
}
/*
 * mcs_rwlock_writer_lock_noirq(l, n): entry point for
 * __mcs_rwlock_writer_lock_noirq().
 *
 * With DEBUG_MCS_RWLOCK defined, the call is bracketed by __kprintf()
 * trace lines (processor id, lock pointer, caller file and line). The
 * debug form is wrapped in do { } while (0) so that it is a single
 * statement and stays correct in an unbraced if/else. Note that it
 * evaluates `l` twice.
 *
 * Without DEBUG_MCS_RWLOCK the name is a plain alias of the implementation.
 */
#ifdef DEBUG_MCS_RWLOCK
#define mcs_rwlock_writer_lock_noirq(l, n) do { \
	__kprintf("[%d] call mcs_rwlock_writer_lock_noirq %p %s:%d\n", ihk_mc_get_processor_id(), (l), __FILE__, __LINE__); \
	__mcs_rwlock_writer_lock_noirq((l), (n)); \
	__kprintf("[%d] ret mcs_rwlock_writer_lock_noirq\n", ihk_mc_get_processor_id()); \
} while (0)
#else
#define mcs_rwlock_writer_lock_noirq __mcs_rwlock_writer_lock_noirq
#endif
/*
 * Acquire the lock for writing; the interrupt state is left untouched.
 *
 * \param lock The reader/writer lock.
 * \param node Caller-provided queue entry. It is linked into the queue
 *             here, and the writer unlock path reads it again.
 *
 * Preemption is disabled here and stays disabled on return; the matching
 * writer unlock calls preempt_enable().
 * The node is swapped in as the queue tail. With no previous tail the
 * writer owns the lock at once; otherwise it links itself behind the
 * predecessor and spins until its `locked` flag is cleared.
 */
static void
__mcs_rwlock_writer_lock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node *node)
{
struct mcs_rwlock_node *pred;
preempt_disable();
node->type = MCS_RWLOCK_TYPE_WRITER;
node->next = NULL;
pred = (struct mcs_rwlock_node *)xchg8((unsigned long *)&lock->node,
(unsigned long)node);
if (pred) {
/* mark ourselves as waiting before the predecessor can see the link */
node->locked = MCS_RWLOCK_LOCKED;
pred->next = node;
while (node->locked != MCS_RWLOCK_UNLOCKED) {
cpu_pause();
}
}
}
/*
 * Release every reader node reachable from the common reader node.
 *
 * Walks the queue starting at lock->reader. Each node of type READER is
 * unlinked from the list and granted the lock; nodes of any other type
 * are left in place and stepped over. lock->reader.count is incremented
 * once on entry and once more for every reader after the first, i.e.
 * once per reader released. The first reader found is remembered in `f`
 * and is released last, after the walk has finished.
 *
 * `f` is dereferenced unconditionally at the end, so the queue must hold
 * at least one READER node. Every call site in this file either points
 * lock->reader.next at a reader node or checks its type before calling.
 */
static void
mcs_rwlock_unlock_readers(struct mcs_rwlock_lock *lock)
{
struct mcs_rwlock_node *p;
struct mcs_rwlock_node *f = NULL;
struct mcs_rwlock_node *n;
int breakf = 0;
ihk_atomic_inc(&lock->reader.count); // protect to unlock reader
for(p = &lock->reader; p->next; p = n){
n = p->next;
if(p->next->type == MCS_RWLOCK_TYPE_READER){
/* unlink reader n: p now points at whatever followed n */
p->next = n->next;
if(lock->node == n){
/* n looks like the queue tail: try to make p the tail instead */
struct mcs_rwlock_node *old;
old = (struct mcs_rwlock_node *)atomic_cmpxchg8(
(unsigned long *)&lock->node,
(unsigned long)n,
(unsigned long)p);
if(old != n){ // couldn't change
/* someone enqueued behind n meanwhile: wait for the link, then splice */
while (n->next == NULL) {
cpu_pause();
}
p->next = n->next;
}
else{
/* n was the last node: nothing more to scan */
breakf = 1;
}
}
else if(p->next == NULL){
/* n is not the tail but its successor has not linked in yet */
while (n->next == NULL) {
cpu_pause();
}
p->next = n->next;
}
if(f){
/* every reader after the first gets its own count and is released now */
ihk_atomic_inc(&lock->reader.count);
n->locked = MCS_RWLOCK_UNLOCKED;
}
else
f = n;
/* stay on p: its new successor has not been examined yet */
n = p;
if(breakf)
break;
}
/* before advancing, wait until n either has a successor or is the tail */
if(n->next == NULL && lock->node != n){
while (n->next == NULL && lock->node != n) {
cpu_pause();
}
}
}
/* release the first reader last */
f->locked = MCS_RWLOCK_UNLOCKED;
}
/*
 * mcs_rwlock_writer_unlock_noirq(l, n): entry point for
 * __mcs_rwlock_writer_unlock_noirq().
 *
 * With DEBUG_MCS_RWLOCK defined, the call is bracketed by __kprintf()
 * trace lines (processor id, lock pointer, caller file and line). The
 * debug form is wrapped in do { } while (0) so that it is a single
 * statement and stays correct in an unbraced if/else. Note that it
 * evaluates `l` twice.
 *
 * Without DEBUG_MCS_RWLOCK the name is a plain alias of the implementation.
 */
#ifdef DEBUG_MCS_RWLOCK
#define mcs_rwlock_writer_unlock_noirq(l, n) do { \
	__kprintf("[%d] call mcs_rwlock_writer_unlock_noirq %p %s:%d\n", ihk_mc_get_processor_id(), (l), __FILE__, __LINE__); \
	__mcs_rwlock_writer_unlock_noirq((l), (n)); \
	__kprintf("[%d] ret mcs_rwlock_writer_unlock_noirq\n", ihk_mc_get_processor_id()); \
} while (0)
#else
#define mcs_rwlock_writer_unlock_noirq __mcs_rwlock_writer_unlock_noirq
#endif
/*
 * Release a write lock; the interrupt state is left untouched.
 *
 * \param lock The reader/writer lock.
 * \param node The queue entry that was used to take the write lock.
 *
 * With no visible successor, the tail is reset from `node` to NULL. If
 * that fails, a successor is enqueuing itself and we wait for its link.
 * A READER successor is handed to mcs_rwlock_unlock_readers() through
 * lock->reader.next; any other successor is released directly by
 * clearing its `locked` flag.
 * preempt_enable() is called on every path before returning.
 */
static void
__mcs_rwlock_writer_unlock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node *node)
{
if (node->next == NULL) {
struct mcs_rwlock_node *old = (struct mcs_rwlock_node *)
atomic_cmpxchg8((unsigned long *)&lock->node,
(unsigned long)node, (unsigned long)0);
if (old == node) {
/* queue was empty: lock is now free */
goto out;
}
/* a successor is enqueuing itself: wait until it links in */
while (node->next == NULL) {
cpu_pause();
}
}
if(node->next->type == MCS_RWLOCK_TYPE_READER){
lock->reader.next = node->next;
mcs_rwlock_unlock_readers(lock);
}
else{
node->next->locked = MCS_RWLOCK_UNLOCKED;
}
out:
preempt_enable();
}
/*
 * mcs_rwlock_reader_lock_noirq(l, n): entry point for
 * __mcs_rwlock_reader_lock_noirq().
 *
 * With DEBUG_MCS_RWLOCK defined, the call is bracketed by __kprintf()
 * trace lines (processor id, lock pointer, caller file and line). The
 * debug form is wrapped in do { } while (0) so that it is a single
 * statement and stays correct in an unbraced if/else. Note that it
 * evaluates `l` twice.
 *
 * Without DEBUG_MCS_RWLOCK the name is a plain alias of the implementation.
 */
#ifdef DEBUG_MCS_RWLOCK
#define mcs_rwlock_reader_lock_noirq(l, n) do { \
	__kprintf("[%d] call mcs_rwlock_reader_lock_noirq %p %s:%d\n", ihk_mc_get_processor_id(), (l), __FILE__, __LINE__); \
	__mcs_rwlock_reader_lock_noirq((l), (n)); \
	__kprintf("[%d] ret mcs_rwlock_reader_lock_noirq\n", ihk_mc_get_processor_id()); \
} while (0)
#else
#define mcs_rwlock_reader_lock_noirq __mcs_rwlock_reader_lock_noirq
#endif
/*
 * Increment *v by one unless it is currently zero.
 *
 * Retries with atomic_cmpxchg4() until either the counter is observed to
 * be zero (nothing is written) or the compare-and-swap of seen -> seen + 1
 * succeeds.
 *
 * Returns the last value observed before the update: 0 means the counter
 * was zero and has been left untouched, non-zero means it was incremented.
 */
static inline unsigned int
atomic_inc_ifnot0(ihk_atomic_t *v)
{
	unsigned int *counter = (unsigned int *)(&(v)->counter);
	unsigned int seen;
	unsigned int prev;

	for (;;) {
		seen = *counter;
		if (seen == 0)
			break;

		prev = atomic_cmpxchg4(counter, seen, seen + 1);
		if (prev == seen)
			break;
	}

	return seen;
}
/*
 * Acquire the lock for reading; the interrupt state is left untouched.
 *
 * \param lock The reader/writer lock.
 * \param node Caller-provided queue entry.
 *
 * Preemption is disabled here and stays disabled on return; the matching
 * reader unlock calls preempt_enable().
 *
 * The node is swapped in as the queue tail, then one of three paths runs:
 *  - no previous tail: the node is handed to mcs_rwlock_unlock_readers(),
 *    which grants it the lock and counts it as an active reader;
 *  - the previous tail is the common reader node and its count is
 *    non-zero (readers are active): the count is bumped and the common
 *    reader is restored as the tail, so this reader joins the current
 *    holders. If another locker has already queued behind us, the
 *    restore fails and we go through mcs_rwlock_unlock_readers() instead;
 *  - otherwise: link behind the predecessor and spin until released.
 */
static void
__mcs_rwlock_reader_lock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node *node)
{
struct mcs_rwlock_node *pred;
preempt_disable();
node->type = MCS_RWLOCK_TYPE_READER;
node->next = NULL;
/* records the locking CPU; dmy1 is not read anywhere in this file */
node->dmy1 = ihk_mc_get_processor_id();
pred = (struct mcs_rwlock_node *)xchg8((unsigned long *)&lock->node,
(unsigned long)node);
if (pred) {
if(pred == &lock->reader){
if(atomic_inc_ifnot0(&pred->count)){
/* readers are active: try to put the common reader back as tail */
struct mcs_rwlock_node *old;
old = (struct mcs_rwlock_node *)atomic_cmpxchg8(
(unsigned long *)&lock->node,
(unsigned long)node,
(unsigned long)pred);
if (old == node) {
goto out;
}
/* somebody queued behind us: wait for the link to appear */
while (node->next == NULL) {
cpu_pause();
}
node->locked = MCS_RWLOCK_LOCKED;
lock->reader.next = node;
mcs_rwlock_unlock_readers(lock);
/* unlock_readers counted this node again: drop the extra count taken above */
ihk_atomic_dec(&pred->count);
goto out;
}
}
/* queue behind the predecessor and wait to be released */
node->locked = MCS_RWLOCK_LOCKED;
pred->next = node;
while (node->locked != MCS_RWLOCK_UNLOCKED) {
cpu_pause();
}
}
else {
/* lock was free: become the first reader */
lock->reader.next = node;
mcs_rwlock_unlock_readers(lock);
}
out:
return;
}
/*
 * mcs_rwlock_reader_unlock_noirq(l, n): entry point for
 * __mcs_rwlock_reader_unlock_noirq().
 *
 * With DEBUG_MCS_RWLOCK defined, the call is bracketed by __kprintf()
 * trace lines (processor id, lock pointer, caller file and line). The
 * debug form is wrapped in do { } while (0) so that it is a single
 * statement and stays correct in an unbraced if/else. Note that it
 * evaluates `l` twice.
 *
 * Without DEBUG_MCS_RWLOCK the name is a plain alias of the implementation.
 */
#ifdef DEBUG_MCS_RWLOCK
#define mcs_rwlock_reader_unlock_noirq(l, n) do { \
	__kprintf("[%d] call mcs_rwlock_reader_unlock_noirq %p %s:%d\n", ihk_mc_get_processor_id(), (l), __FILE__, __LINE__); \
	__mcs_rwlock_reader_unlock_noirq((l), (n)); \
	__kprintf("[%d] ret mcs_rwlock_reader_unlock_noirq\n", ihk_mc_get_processor_id()); \
} while (0)
#else
#define mcs_rwlock_reader_unlock_noirq __mcs_rwlock_reader_unlock_noirq
#endif
/*
 * Release a read lock; the interrupt state is left untouched.
 *
 * \param lock The reader/writer lock.
 * \param node The reader's queue entry. This function does not use it;
 *             all state is kept in the common reader node.
 *
 * Decrements the active reader count; only the reader that brings it to
 * zero goes on to hand the lock over. With no successor behind the common
 * reader node, the tail is reset from the common reader to NULL. If that
 * fails, a successor is enqueuing itself and we wait for its link. A
 * READER successor is released through mcs_rwlock_unlock_readers(); any
 * other successor is released directly.
 * preempt_enable() is called on every path before returning.
 */
static void
__mcs_rwlock_reader_unlock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node *node)
{
/* other readers still hold the lock: nothing to hand over */
if(ihk_atomic_dec_return(&lock->reader.count))
goto out;
if (lock->reader.next == NULL) {
struct mcs_rwlock_node *old;
old = (struct mcs_rwlock_node *)atomic_cmpxchg8(
(unsigned long *)&lock->node,
(unsigned long)&lock->reader,
(unsigned long)0);
if (old == &lock->reader) {
/* queue was empty: lock is now free */
goto out;
}
/* a successor is enqueuing itself: wait until it links in */
while (lock->reader.next == NULL) {
cpu_pause();
}
}
if(lock->reader.next->type == MCS_RWLOCK_TYPE_READER){
mcs_rwlock_unlock_readers(lock);
}
else{
lock->reader.next->locked = MCS_RWLOCK_UNLOCKED;
}
out:
preempt_enable();
}
/*
 * mcs_rwlock_writer_lock(l, n): entry point for __mcs_rwlock_writer_lock().
 *
 * With DEBUG_MCS_RWLOCK defined, the call is bracketed by __kprintf()
 * trace lines (processor id, lock pointer, caller file and line). The
 * debug form is wrapped in do { } while (0) so that it is a single
 * statement and stays correct in an unbraced if/else. Note that it
 * evaluates `l` twice.
 *
 * Without DEBUG_MCS_RWLOCK the name is a plain alias of the implementation.
 */
#ifdef DEBUG_MCS_RWLOCK
#define mcs_rwlock_writer_lock(l, n) do { \
	__kprintf("[%d] call mcs_rwlock_writer_lock %p %s:%d\n", ihk_mc_get_processor_id(), (l), __FILE__, __LINE__); \
	__mcs_rwlock_writer_lock((l), (n)); \
	__kprintf("[%d] ret mcs_rwlock_writer_lock\n", ihk_mc_get_processor_id()); \
} while (0)
#else
#define mcs_rwlock_writer_lock __mcs_rwlock_writer_lock
#endif
/*
 * Acquire the lock for writing with interrupts disabled.
 *
 * The value returned by cpu_disable_interrupt_save() is stored in
 * node->irqsave before the embedded queue node is used to take the lock.
 */
static void
__mcs_rwlock_writer_lock(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node_irqsave *node)
{
	unsigned long saved_flags = cpu_disable_interrupt_save();

	node->irqsave = saved_flags;
	__mcs_rwlock_writer_lock_noirq(lock, &node->node);
}
/*
 * mcs_rwlock_writer_unlock(l, n): entry point for
 * __mcs_rwlock_writer_unlock().
 *
 * With DEBUG_MCS_RWLOCK defined, the call is bracketed by __kprintf()
 * trace lines (processor id, lock pointer, caller file and line). The
 * debug form is wrapped in do { } while (0) so that it is a single
 * statement and stays correct in an unbraced if/else. Note that it
 * evaluates `l` twice.
 *
 * Without DEBUG_MCS_RWLOCK the name is a plain alias of the implementation.
 */
#ifdef DEBUG_MCS_RWLOCK
#define mcs_rwlock_writer_unlock(l, n) do { \
	__kprintf("[%d] call mcs_rwlock_writer_unlock %p %s:%d\n", ihk_mc_get_processor_id(), (l), __FILE__, __LINE__); \
	__mcs_rwlock_writer_unlock((l), (n)); \
	__kprintf("[%d] ret mcs_rwlock_writer_unlock\n", ihk_mc_get_processor_id()); \
} while (0)
#else
#define mcs_rwlock_writer_unlock __mcs_rwlock_writer_unlock
#endif
/*
 * Release a write lock, then hand the value stored in node->irqsave
 * back to cpu_restore_interrupt().
 */
static void
__mcs_rwlock_writer_unlock(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node_irqsave *node)
{
	struct mcs_rwlock_node *qnode = &node->node;

	__mcs_rwlock_writer_unlock_noirq(lock, qnode);
	cpu_restore_interrupt(node->irqsave);
}
/*
 * mcs_rwlock_reader_lock(l, n): entry point for __mcs_rwlock_reader_lock().
 *
 * With DEBUG_MCS_RWLOCK defined, the call is bracketed by __kprintf()
 * trace lines (processor id, lock pointer, caller file and line). The
 * debug form is wrapped in do { } while (0) so that it is a single
 * statement and stays correct in an unbraced if/else. Note that it
 * evaluates `l` twice.
 *
 * Without DEBUG_MCS_RWLOCK the name is a plain alias of the implementation.
 */
#ifdef DEBUG_MCS_RWLOCK
#define mcs_rwlock_reader_lock(l, n) do { \
	__kprintf("[%d] call mcs_rwlock_reader_lock %p %s:%d\n", ihk_mc_get_processor_id(), (l), __FILE__, __LINE__); \
	__mcs_rwlock_reader_lock((l), (n)); \
	__kprintf("[%d] ret mcs_rwlock_reader_lock\n", ihk_mc_get_processor_id()); \
} while (0)
#else
#define mcs_rwlock_reader_lock __mcs_rwlock_reader_lock
#endif
/*
 * Acquire the lock for reading with interrupts disabled.
 *
 * The value returned by cpu_disable_interrupt_save() is stored in
 * node->irqsave before the embedded queue node is used to take the lock.
 */
static void
__mcs_rwlock_reader_lock(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node_irqsave *node)
{
	unsigned long saved_flags = cpu_disable_interrupt_save();

	node->irqsave = saved_flags;
	__mcs_rwlock_reader_lock_noirq(lock, &node->node);
}
/*
 * mcs_rwlock_reader_unlock(l, n): entry point for
 * __mcs_rwlock_reader_unlock().
 *
 * With DEBUG_MCS_RWLOCK defined, the call is bracketed by __kprintf()
 * trace lines (processor id, lock pointer, caller file and line). The
 * debug form is wrapped in do { } while (0) so that it is a single
 * statement and stays correct in an unbraced if/else. Note that it
 * evaluates `l` twice.
 *
 * Without DEBUG_MCS_RWLOCK the name is a plain alias of the implementation.
 */
#ifdef DEBUG_MCS_RWLOCK
#define mcs_rwlock_reader_unlock(l, n) do { \
	__kprintf("[%d] call mcs_rwlock_reader_unlock %p %s:%d\n", ihk_mc_get_processor_id(), (l), __FILE__, __LINE__); \
	__mcs_rwlock_reader_unlock((l), (n)); \
	__kprintf("[%d] ret mcs_rwlock_reader_unlock\n", ihk_mc_get_processor_id()); \
} while (0)
#else
#define mcs_rwlock_reader_unlock __mcs_rwlock_reader_unlock
#endif
/*
 * Release a read lock, then hand the value stored in node->irqsave
 * back to cpu_restore_interrupt().
 */
static void
__mcs_rwlock_reader_unlock(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node_irqsave *node)
{
	struct mcs_rwlock_node *qnode = &node->node;

	__mcs_rwlock_reader_unlock_noirq(lock, qnode);
	cpu_restore_interrupt(node->irqsave);
}
#endif

View File

@ -22,6 +22,7 @@
#define USER_CS_ENTRY 6
#define USER_DS_ENTRY 7
#define GLOBAL_TSS_ENTRY 8
#define GETCPU_ENTRY 15
#define KERNEL_CS (KERNEL_CS_ENTRY * 8)
#define KERNEL_DS (KERNEL_DS_ENTRY * 8)
@ -40,10 +41,12 @@
#define LARGE_PAGE_P2ALIGN (LARGE_PAGE_SHIFT - PAGE_SHIFT)
#define USER_END 0x0000800000000000UL
#define TASK_UNMAPPED_BASE 0x00002AAAAAA00000UL
#define MAP_ST_START 0xffff800000000000UL
#define MAP_VMAP_START 0xfffff00000000000UL
#define MAP_FIXED_START 0xffffffff70000000UL
#define MAP_KERNEL_START 0xffffffff80000000UL
#define STACK_TOP(region) ((region)->user_end)
#define MAP_VMAP_SIZE 0x0000000100000000UL
@ -65,6 +68,8 @@
#define PF_PRESENT ((pte_t)0x01) /* entry is valid */
#define PF_WRITABLE ((pte_t)0x02)
#define PFLX_PWT ((pte_t)0x08)
#define PFLX_PCD ((pte_t)0x10)
#define PF_SIZE ((pte_t)0x80) /* entry points large page */
#define PFL4_PRESENT ((pte_t)0x01)
@ -74,8 +79,8 @@
#define PFL3_PRESENT ((pte_t)0x01)
#define PFL3_WRITABLE ((pte_t)0x02)
#define PFL3_USER ((pte_t)0x04)
#define PFL3_PWT ((pte_t)0x08)
#define PFL3_PCD ((pte_t)0x10)
#define PFL3_PWT PFLX_PWT
#define PFL3_PCD PFLX_PCD
#define PFL3_ACCESSED ((pte_t)0x20)
#define PFL3_DIRTY ((pte_t)0x40)
#define PFL3_SIZE ((pte_t)0x80) /* Used in 1G page */
@ -86,8 +91,8 @@
#define PFL2_PRESENT ((pte_t)0x01)
#define PFL2_WRITABLE ((pte_t)0x02)
#define PFL2_USER ((pte_t)0x04)
#define PFL2_PWT ((pte_t)0x08)
#define PFL2_PCD ((pte_t)0x10)
#define PFL2_PWT PFLX_PWT
#define PFL2_PCD PFLX_PCD
#define PFL2_ACCESSED ((pte_t)0x20)
#define PFL2_DIRTY ((pte_t)0x40)
#define PFL2_SIZE ((pte_t)0x80) /* Used in 2M page */
@ -98,8 +103,8 @@
#define PFL1_PRESENT ((pte_t)0x01)
#define PFL1_WRITABLE ((pte_t)0x02)
#define PFL1_USER ((pte_t)0x04)
#define PFL1_PWT ((pte_t)0x08)
#define PFL1_PCD ((pte_t)0x10)
#define PFL1_PWT PFLX_PWT
#define PFL1_PCD PFLX_PCD
#define PFL1_ACCESSED ((pte_t)0x20)
#define PFL1_DIRTY ((pte_t)0x40)
#define PFL1_IGNORED_11 ((pte_t)1 << 11)
@ -152,6 +157,8 @@ enum ihk_mc_pt_attribute {
PTATTR_WRITE_COMBINED = 0x40000,
};
enum ihk_mc_pt_attribute attr_mask;
static inline int pte_is_null(pte_t *ptep)
{
return (*ptep == PTE_NULL);
@ -207,6 +214,27 @@ static inline off_t pte_get_off(pte_t *ptep, size_t pgsize)
return (off_t)(*ptep & PAGE_MASK);
}
/*
 * Rebuild the ihk_mc_pt_attribute flags that a page table entry encodes.
 *
 * ptep:   entry to decode
 * pgsize: size of the page this entry maps; only used to decide whether
 *         the SIZE bit of a level 2 / level 3 entry marks a large page
 *
 * Returns the entry bits selected by attr_mask, plus the cache attribute
 * and PTATTR_LARGEPAGE derived from the hardware bits.
 */
static inline enum ihk_mc_pt_attribute pte_get_attr(pte_t *ptep, size_t pgsize)
{
	const pte_t entry = *ptep;
	enum ihk_mc_pt_attribute flags = entry & attr_mask;

	/* With PWT set, PCD tells uncachable apart from write-combined */
	if (entry & PFLX_PWT) {
		flags |= (entry & PFLX_PCD) ? PTATTR_UNCACHABLE
					    : PTATTR_WRITE_COMBINED;
	}

	/* The SIZE bit only counts at the level matching pgsize */
	if ((pgsize == PTL2_SIZE) && (entry & PFL2_SIZE)) {
		flags |= PTATTR_LARGEPAGE;
	}
	else if ((pgsize == PTL3_SIZE) && (entry & PFL3_SIZE)) {
		flags |= PTATTR_LARGEPAGE;
	}

	return flags;
} /* pte_get_attr() */
static inline void pte_make_null(pte_t *ptep, size_t pgsize)
{
*ptep = PTE_NULL;
@ -278,7 +306,7 @@ struct page_table;
void set_pte(pte_t *ppte, unsigned long phys, enum ihk_mc_pt_attribute attr);
pte_t *get_pte(struct page_table *pt, void *virt, enum ihk_mc_pt_attribute attr);
void *early_alloc_page(void);
void *early_alloc_pages(int nr_pages);
void *get_last_early_heap(void);
void flush_tlb(void);
void flush_tlb_single(unsigned long addr);
@ -290,5 +318,5 @@ extern unsigned long ap_trampoline;
#define AP_TRAMPOLINE_SIZE 0x2000
/* Local is cachable */
#define IHK_IKC_QUEUE_PT_ATTR (PTATTR_NO_EXECUTE | PTATTR_WRITABLE | PTATTR_UNCACHABLE)
#define IHK_IKC_QUEUE_PT_ATTR (PTATTR_NO_EXECUTE | PTATTR_WRITABLE)
#endif

View File

@ -0,0 +1,18 @@
/**
* \file auxvec.h
* License details are found in the file LICENSE.
* \brief
* Declare architecture-dependent constants for auxiliary vector
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com>
* Copyright (C) 2016 RIKEN AICS
*/
/*
* HISTORY
*/
#ifndef ARCH_AUXVEC_H
#define ARCH_AUXVEC_H
#define AT_SYSINFO_EHDR 33
#endif

View File

@ -0,0 +1,37 @@
/**
* \file cpu.h
* License details are found in the file LICENSE.
* \brief
* Declare architecture-dependent types and functions to control CPU.
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com>
* Copyright (C) 2015 RIKEN AICS
*/
/*
* HISTORY
*/
#ifndef ARCH_CPU_H
#define ARCH_CPU_H
#include <ihk/cpu.h>
/*
 * Read memory barrier.
 * Implemented solely by calling barrier().
 * NOTE(review): barrier() comes from outside this header (presumably a
 * compiler-only barrier from <ihk/cpu.h>) - confirm.
 */
static inline void rmb(void)
{
barrier();
}
/*
 * Write memory barrier.
 * Implemented solely by calling barrier().
 * NOTE(review): barrier() comes from outside this header (presumably a
 * compiler-only barrier from <ihk/cpu.h>) - confirm.
 */
static inline void wmb(void)
{
barrier();
}
/*
 * Read the time stamp counter of the calling CPU.
 *
 * Executes RDTSC, which delivers the counter in EDX:EAX, and returns the
 * two halves combined into one 64-bit value.
 *
 * Declared inline like the other helpers in this header so that files
 * including it without calling read_tsc() do not get an unused copy.
 * The __asm__ spelling also compiles in strict ISO C modes.
 */
static inline unsigned long read_tsc(void)
{
	unsigned int low, high;

	__asm__ __volatile__("rdtsc" : "=a"(low), "=d"(high));

	return (low | ((unsigned long)high << 32));
}
#endif /* ARCH_CPU_H */

View File

@ -0,0 +1,16 @@
#ifndef __ARCH_MM_H
#define __ARCH_MM_H
struct process_vm;
/*
 * Architecture hook; the body is intentionally empty in this header.
 * Declared with a (void) prototype so that calls passing arguments are
 * rejected by the compiler.
 */
static inline void
flush_nfo_tlb(void)
{
}
/*
 * Per address space variant of the hook; does nothing in this header.
 * vm: address space the request refers to (unused here)
 */
static inline void flush_nfo_tlb_mm(struct process_vm *vm)
{
	(void)vm;
}
#endif

View File

@ -27,6 +27,10 @@
#define MAP_STACK 0x00020000
#define MAP_HUGETLB 0x00040000
#define MAP_HUGE_SHIFT 26
#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT)
#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT)
/*
* for mlockall()
*/

View File

@ -13,6 +13,11 @@
#ifndef HEADER_ARCH_SHM_H
#define HEADER_ARCH_SHM_H
/* shmflg */
#define SHM_HUGE_SHIFT 26
#define SHM_HUGE_2MB (21 << SHM_HUGE_SHIFT)
#define SHM_HUGE_1GB (30 << SHM_HUGE_SHIFT)
struct ipc_perm {
key_t key;
uid_t uid;
@ -34,7 +39,8 @@ struct shmid_ds {
pid_t shm_cpid;
pid_t shm_lpid;
uint64_t shm_nattch;
uint8_t padding[16];
uint8_t padding[12];
int init_pgshift;
};
#endif /* HEADER_ARCH_SHM_H */

View File

@ -22,7 +22,7 @@
* - 4096 : kernel stack
*/
#define X86_CPU_LOCAL_OFFSET_TSS 128
#define X86_CPU_LOCAL_OFFSET_TSS 176
#define X86_CPU_LOCAL_OFFSET_KSTACK 16
#define X86_CPU_LOCAL_OFFSET_USTACK 24
@ -39,13 +39,13 @@ struct x86_cpu_local_variables {
struct x86_desc_ptr gdt_ptr;
unsigned short pad[3];
/* 48 */
uint64_t gdt[10];
/* 128 */
uint64_t gdt[16];
/* 176 */
struct tss64 tss;
/* 232 */
/* 280 */
unsigned long paniced;
uint64_t panic_regs[21];
/* 408 */
/* 456 */
} __attribute__((packed));
struct x86_cpu_local_variables *get_x86_cpu_local_variable(int id);

View File

@ -1,40 +1,7 @@
#ifndef _ASM_GENERIC_ERRNO_BASE_H
#define _ASM_GENERIC_ERRNO_BASE_H
#ifndef _ERRNO_BASE_H
#define _ERRNO_BASE_H
#define EPERM 1 /* Operation not permitted */
#define ENOENT 2 /* No such file or directory */
#define ESRCH 3 /* No such process */
#define EINTR 4 /* Interrupted system call */
#define EIO 5 /* I/O error */
#define ENXIO 6 /* No such device or address */
#define E2BIG 7 /* Argument list too long */
#define ENOEXEC 8 /* Exec format error */
#define EBADF 9 /* Bad file number */
#define ECHILD 10 /* No child processes */
#define EAGAIN 11 /* Try again */
#define ENOMEM 12 /* Out of memory */
#define EACCES 13 /* Permission denied */
#define EFAULT 14 /* Bad address */
#define ENOTBLK 15 /* Block device required */
#define EBUSY 16 /* Device or resource busy */
#define EEXIST 17 /* File exists */
#define EXDEV 18 /* Cross-device link */
#define ENODEV 19 /* No such device */
#define ENOTDIR 20 /* Not a directory */
#define EISDIR 21 /* Is a directory */
#define EINVAL 22 /* Invalid argument */
#define ENFILE 23 /* File table overflow */
#define EMFILE 24 /* Too many open files */
#define ENOTTY 25 /* Not a typewriter */
#define ETXTBSY 26 /* Text file busy */
#define EFBIG 27 /* File too large */
#define ENOSPC 28 /* No space left on device */
#define ESPIPE 29 /* Illegal seek */
#define EROFS 30 /* Read-only file system */
#define EMLINK 31 /* Too many links */
#define EPIPE 32 /* Broken pipe */
#define EDOM 33 /* Math argument out of domain of func */
#define ERANGE 34 /* Math result not representable */
#include <generic-errno.h>
#define EDEADLK 35 /* Resource deadlock would occur */
#define ENAMETOOLONG 36 /* File name too long */
@ -141,29 +108,4 @@
#define ERFKILL 132 /* Operation not possible due to RF-kill */
#ifdef __KERNEL__
/* Should never be seen by user programs */
#define ERESTARTSYS 512
#define ERESTARTNOINTR 513
#define ERESTARTNOHAND 514 /* restart if no handler.. */
#define ENOIOCTLCMD 515 /* No ioctl command */
#define ERESTART_RESTARTBLOCK 516 /* restart by calling sys_restart_syscall */
/* Defined for the NFSv3 protocol */
#define EBADHANDLE 521 /* Illegal NFS file handle */
#define ENOTSYNC 522 /* Update synchronization mismatch */
#define EBADCOOKIE 523 /* Cookie is stale */
#define ENOTSUPP 524 /* Operation is not supported */
#define ETOOSMALL 525 /* Buffer or request is too small */
#define ESERVERFAULT 526 /* An untranslatable error occurred */
#define EBADTYPE 527 /* Type not supported by server */
#define EJUKEBOX 528 /* Request initiated, but will not complete before timeout */
#define EIOCBQUEUED 529 /* iocb queued, will get completion event */
#define EIOCBRETRY 530 /* iocb queued, will trigger a retry */
#endif
#endif

View File

@ -13,6 +13,10 @@
#ifndef HEADER_X86_COMMON_IHK_ATOMIC_H
#define HEADER_X86_COMMON_IHK_ATOMIC_H
/***********************************************************************
* ihk_atomic_t
*/
typedef struct {
int counter;
} ihk_atomic_t;
@ -95,6 +99,30 @@ static inline int ihk_atomic_sub_return(int i, ihk_atomic_t *v)
#define ihk_atomic_inc_return(v) (ihk_atomic_add_return(1, v))
#define ihk_atomic_dec_return(v) (ihk_atomic_sub_return(1, v))
/***********************************************************************
* ihk_atomic64_t
*/
typedef struct {
long counter64;
} ihk_atomic64_t;
#define IHK_ATOMIC64_INIT(i) { .counter64 = (i) }
/*
 * Return the current value of a 64-bit atomic counter.
 * The load goes through a volatile-qualified pointer so the compiler
 * performs it on every call.
 */
static inline long ihk_atomic64_read(const ihk_atomic64_t *v)
{
	const volatile long *counter = &v->counter64;

	return *counter;
}
/*
 * Atomically add one to the 64-bit counter in *v.
 * Uses a lock-prefixed incq on the counter's memory operand; no value
 * is returned.
 */
static inline void ihk_atomic64_inc(ihk_atomic64_t *v)
{
asm volatile ("lock incq %0" : "+m"(v->counter64));
}
/***********************************************************************
* others
*/
/*
* Note: no "lock" prefix even on SMP: xchg always implies lock anyway
* Note 2: xchg has side effect, so that attribute volatile is necessary,
@ -112,6 +140,17 @@ static inline int ihk_atomic_sub_return(int i, ihk_atomic_t *v)
__x; \
})
/*
 * Atomically exchange the 64-bit word at *ptr with x.
 *
 * ptr: word to update
 * x:   value to store
 *
 * Returns the value *ptr held before the exchange.
 *
 * xchgq both reads and writes its memory operand, so the operand is
 * declared read-write ("+m") instead of input-only; the register operand
 * is likewise read-write. The "memory" clobber is kept so the exchange
 * still acts as a compiler barrier. No lock prefix is needed: xchg with
 * a memory operand is always locked.
 */
static inline unsigned long xchg8(unsigned long *ptr, unsigned long x)
{
	__asm__ __volatile__("xchgq %0,%1"
			     : "+r" (x), "+m" (*(volatile unsigned long *)ptr)
			     :
			     : "memory");

	return x;
}
#define __xchg(x, ptr, size) \
({ \
__typeof(*(ptr)) __x = (x); \
@ -150,5 +189,30 @@ static inline int ihk_atomic_sub_return(int i, ihk_atomic_t *v)
#define xchg(ptr, v) \
__xchg((v), (ptr), sizeof(*ptr))
/*
 * Atomic compare-and-exchange on a 64-bit word.
 *
 * addr:   word to operate on
 * oldval: value *addr is expected to hold (passed in rax)
 * newval: value to store when *addr equals oldval
 *
 * Returns the content of rax after the locked cmpxchgq, i.e. the value
 * found in *addr; it equals oldval exactly when the store took place.
 */
static inline unsigned long atomic_cmpxchg8(unsigned long *addr,
unsigned long oldval,
unsigned long newval)
{
/* "0" ties the oldval input to the same register as the "=a" output */
asm volatile("lock; cmpxchgq %2, %1\n"
: "=a" (oldval), "+m" (*addr)
: "r" (newval), "0" (oldval)
: "memory"
);
return oldval;
}
/*
 * Atomic compare-and-exchange on a 32-bit word.
 *
 * addr:   word to operate on
 * oldval: value *addr is expected to hold (passed in eax)
 * newval: value to store when *addr equals oldval
 *
 * Returns the content of eax after the locked cmpxchgl, i.e. the value
 * found in *addr, widened to unsigned long; it equals oldval exactly
 * when the store took place.
 */
static inline unsigned long atomic_cmpxchg4(unsigned int *addr,
unsigned int oldval,
unsigned int newval)
{
/* "0" ties the oldval input to the same register as the "=a" output */
asm volatile("lock; cmpxchgl %2, %1\n"
: "=a" (oldval), "+m" (*addr)
: "r" (newval), "0" (oldval)
: "memory"
);
return oldval;
}
#endif

View File

@ -31,9 +31,5 @@ typedef int64_t off_t;
#define NULL ((void *)0)
#define BITS_PER_LONG_SHIFT 6
#define BITS_PER_LONG (1 << BITS_PER_LONG_SHIFT)
#endif

View File

@ -0,0 +1,17 @@
/**
* \file prctl.h
* License details are found in the file LICENSE.
*/
/*
* HISTORY
*/
#ifndef __ARCH_PRCTL_H
#define __ARCH_PRCTL_H
#define ARCH_SET_GS 0x1001
#define ARCH_SET_FS 0x1002
#define ARCH_GET_FS 0x1003
#define ARCH_GET_GS 0x1004
#endif

View File

@ -58,6 +58,7 @@
#define MSR_IA32_ENERGY_PERF_BIAS 0x000001b0
#define MSR_NHM_TURBO_RATIO_LIMIT 0x000001ad
#define MSR_IA32_CR_PAT 0x00000277
#define MSR_IA32_XSS 0xda0
#define CVAL(event, mask) \

View File

@ -90,10 +90,6 @@ enum __rlimit_resource
#define RLIM_NLIMITS __RLIM_NLIMITS
};
struct rlimit {
uint64_t rlim_cur; /* Soft limit */
uint64_t rlim_max; /* Hard limit (ceiling for rlim_cur) */
};
#include <generic-rlimit.h>
#endif

View File

@ -20,12 +20,13 @@
* syscall_name[] only, no handler exists.
*/
SYSCALL_DELEGATED(0, read)
SYSCALL_HANDLED(0, read)
SYSCALL_DELEGATED(1, write)
SYSCALL_DELEGATED(2, open)
SYSCALL_HANDLED(3, close)
SYSCALL_DELEGATED(4, stat)
SYSCALL_DELEGATED(5, fstat)
SYSCALL_DELEGATED(7, poll)
SYSCALL_DELEGATED(8, lseek)
SYSCALL_HANDLED(9, mmap)
SYSCALL_HANDLED(10, mprotect)
@ -34,11 +35,12 @@ SYSCALL_HANDLED(12, brk)
SYSCALL_HANDLED(13, rt_sigaction)
SYSCALL_HANDLED(14, rt_sigprocmask)
SYSCALL_HANDLED(15, rt_sigreturn)
SYSCALL_DELEGATED(16, ioctl)
SYSCALL_HANDLED(16, ioctl)
SYSCALL_DELEGATED(17, pread64)
SYSCALL_DELEGATED(18, pwrite64)
SYSCALL_DELEGATED(20, writev)
SYSCALL_DELEGATED(21, access)
SYSCALL_DELEGATED(23, select)
SYSCALL_HANDLED(24, sched_yield)
SYSCALL_HANDLED(25, mremap)
SYSCALL_HANDLED(26, msync)
@ -48,6 +50,9 @@ SYSCALL_HANDLED(29, shmget)
SYSCALL_HANDLED(30, shmat)
SYSCALL_HANDLED(31, shmctl)
SYSCALL_HANDLED(34, pause)
SYSCALL_HANDLED(35, nanosleep)
SYSCALL_HANDLED(36, getitimer)
SYSCALL_HANDLED(38, setitimer)
SYSCALL_HANDLED(39, getpid)
SYSCALL_HANDLED(56, clone)
SYSCALL_DELEGATED(57, fork)
@ -57,12 +62,17 @@ SYSCALL_HANDLED(60, exit)
SYSCALL_HANDLED(61, wait4)
SYSCALL_HANDLED(62, kill)
SYSCALL_DELEGATED(63, uname)
SYSCALL_DELEGATED(65, semop)
SYSCALL_HANDLED(67, shmdt)
SYSCALL_DELEGATED(72, fcntl)
SYSCALL_DELEGATED(69, msgsnd)
SYSCALL_DELEGATED(70, msgrcv)
SYSCALL_HANDLED(72, fcntl)
SYSCALL_DELEGATED(79, getcwd)
SYSCALL_DELEGATED(89, readlink)
SYSCALL_DELEGATED(96, gettimeofday)
SYSCALL_HANDLED(96, gettimeofday)
SYSCALL_HANDLED(97, getrlimit)
SYSCALL_HANDLED(98, getrusage)
SYSCALL_HANDLED(100, times)
SYSCALL_HANDLED(101, ptrace)
SYSCALL_HANDLED(102, getuid)
SYSCALL_HANDLED(104, getgid)
@ -99,25 +109,36 @@ SYSCALL_HANDLED(151, mlockall)
SYSCALL_HANDLED(152, munlockall)
SYSCALL_HANDLED(158, arch_prctl)
SYSCALL_HANDLED(160, setrlimit)
SYSCALL_HANDLED(164, settimeofday)
SYSCALL_HANDLED(186, gettid)
SYSCALL_HANDLED(200, tkill)
SYSCALL_DELEGATED(201, time)
SYSCALL_HANDLED(202, futex)
SYSCALL_HANDLED(203, sched_setaffinity)
SYSCALL_HANDLED(204, sched_getaffinity)
SYSCALL_DELEGATED(208, io_getevents)
SYSCALL_HANDLED(216, remap_file_pages)
SYSCALL_DELEGATED(217, getdents64)
SYSCALL_HANDLED(218, set_tid_address)
SYSCALL_DELEGATED(220, semtimedop)
SYSCALL_HANDLED(228, clock_gettime)
SYSCALL_DELEGATED(230, clock_nanosleep)
SYSCALL_HANDLED(231, exit_group)
SYSCALL_DELEGATED(232, epoll_wait)
SYSCALL_HANDLED(234, tgkill)
SYSCALL_HANDLED(237, mbind)
SYSCALL_HANDLED(238, set_mempolicy)
SYSCALL_HANDLED(239, get_mempolicy)
SYSCALL_HANDLED(247, waitid)
SYSCALL_HANDLED(256, migrate_pages)
SYSCALL_DELEGATED(270, pselect6)
SYSCALL_DELEGATED(271, ppoll)
SYSCALL_HANDLED(273, set_robust_list)
SYSCALL_HANDLED(279, move_pages)
SYSCALL_DELEGATED(281, epoll_pwait)
SYSCALL_HANDLED(282, signalfd)
SYSCALL_HANDLED(289, signalfd4)
SYSCALL_HANDLED(298, perf_event_open)
#ifdef DCFA_KMOD
SYSCALL_HANDLED(303, mod_call)
#endif

View File

@ -13,7 +13,7 @@
* 2013/?? - bgerofi + shimosawa: handle rsp correctly for nested interrupts
*/
#define X86_CPU_LOCAL_OFFSET_TSS 128
#define X86_CPU_LOCAL_OFFSET_TSS 176
#define X86_TSS_OFFSET_SP0 4
#define X86_CPU_LOCAL_OFFSET_SP0 \
(X86_CPU_LOCAL_OFFSET_TSS + X86_TSS_OFFSET_SP0)
@ -206,9 +206,12 @@ x86_syscall:
.globl enter_user_mode
enter_user_mode:
callq release_runq_lock
movq $0, %rdi
movq %rsp, %rsi
call check_signal
call check_signal
movq $0, %rdi
call set_cputime
POP_ALL_REGS
addq $8, %rsp
iretq

View File

@ -38,6 +38,11 @@ void init_processors_local(int max_id)
kprintf("locals = %p\n", locals);
}
/*@
@ requires \valid(id);
@ ensures \result == locals + (LOCALS_SPAN * id);
@ assigns \nothing;
@*/
struct x86_cpu_local_variables *get_x86_cpu_local_variable(int id)
{
return (struct x86_cpu_local_variables *)
@ -98,6 +103,10 @@ void init_boot_processor_local(void)
}
/** IHK **/
/*@
@ ensures \result == %gs;
@ assigns \nothing;
*/
int ihk_mc_get_processor_id(void)
{
int id;
@ -107,6 +116,10 @@ int ihk_mc_get_processor_id(void)
return id;
}
/*@
@ ensures \result == (locals + (LOCALS_SPAN * %gs))->apic_id;
@ assigns \nothing;
*/
int ihk_mc_get_hardware_processor_id(void)
{
struct x86_cpu_local_variables *v = get_x86_this_cpu_local();

File diff suppressed because it is too large Load Diff

View File

@ -16,6 +16,7 @@
#include <memory.h>
#include <string.h>
extern int num_processors;
extern void arch_set_mikc_queue(void *r, void *w);
ihk_ikc_ph_t arch_master_channel_packet_handler;
@ -23,22 +24,28 @@ int ihk_mc_ikc_init_first_local(struct ihk_ikc_channel_desc *channel,
ihk_ikc_ph_t packet_handler)
{
struct ihk_ikc_queue_head *rq, *wq;
size_t mikc_queue_pages;
ihk_ikc_system_init(NULL);
memset(channel, 0, sizeof(struct ihk_ikc_channel_desc));
/* Place both sides in this side */
rq = arch_alloc_page(IHK_MC_AP_CRITICAL);
wq = arch_alloc_page(IHK_MC_AP_CRITICAL);
mikc_queue_pages = ((num_processors * MASTER_IKCQ_PKTSIZE)
+ (PAGE_SIZE - 1)) / PAGE_SIZE;
ihk_ikc_init_queue(rq, 0, 0, PAGE_SIZE, MASTER_IKCQ_PKTSIZE);
ihk_ikc_init_queue(wq, 0, 0, PAGE_SIZE, MASTER_IKCQ_PKTSIZE);
/* Place both sides in this side */
rq = ihk_mc_alloc_pages(mikc_queue_pages, IHK_MC_AP_CRITICAL);
wq = ihk_mc_alloc_pages(mikc_queue_pages, IHK_MC_AP_CRITICAL);
ihk_ikc_init_queue(rq, 0, 0,
mikc_queue_pages * PAGE_SIZE, MASTER_IKCQ_PKTSIZE);
ihk_ikc_init_queue(wq, 0, 0,
mikc_queue_pages * PAGE_SIZE, MASTER_IKCQ_PKTSIZE);
arch_master_channel_packet_handler = packet_handler;
ihk_ikc_init_desc(channel, IKC_OS_HOST, 0, rq, wq,
ihk_ikc_master_channel_packet_handler);
ihk_ikc_master_channel_packet_handler, channel);
ihk_ikc_enable_channel(channel);
/* Set boot parameter */

View File

@ -12,19 +12,72 @@
#include <errno.h>
#include <ihk/debug.h>
#include <registers.h>
#include <mc_perf_event.h>
extern unsigned int *x86_march_perfmap;
extern int running_on_kvm(void);
#define X86_CR4_PCE 0x00000100
int perf_counters_discovered = 0;
int X86_IA32_NUM_PERF_COUNTERS = 0;
unsigned long X86_IA32_PERF_COUNTERS_MASK = 0;
int X86_IA32_NUM_FIXED_PERF_COUNTERS = 0;
unsigned long X86_IA32_FIXED_PERF_COUNTERS_MASK = 0;
/*
 * Set up the performance monitoring counters on the calling CPU.
 *
 * Does nothing when running on KVM. Otherwise it
 *  - sets CR4.PCE so that the counters can be read from user space,
 *  - on the first call only (guarded by perf_counters_discovered), queries
 *    CPUID leaf 0x0a for the number of generic and fixed counters and
 *    derives the global counter masks from them,
 *  - clears the low 12 bits of the fixed counter control MSR and every
 *    generic event select MSR,
 *  - sets the bits of all discovered counters in the global control MSR.
 */
void x86_init_perfctr(void)
{
	int i = 0;
	unsigned long reg;
	unsigned long value = 0;
	uint64_t op;
	uint64_t eax;
	uint64_t ebx;
	uint64_t ecx;
	uint64_t edx;

	/* Do not do it on KVM */
	if (running_on_kvm()) return;

	/* Allow PMC to be read from user space */
	asm volatile("movq %%cr4, %0" : "=r"(reg));
	reg |= X86_CR4_PCE;
	asm volatile("movq %0, %%cr4" : : "r"(reg));

	/* Detect number of supported performance counters */
	if (!perf_counters_discovered) {
		/* See Table 35.2 - Architectural MSRs in Vol 3C */
		op = 0x0a;
		asm volatile("cpuid" : "=a"(eax),"=b"(ebx),"=c"(ecx),"=d"(edx):"a"(op));

		/* Generic counter count is taken from bits 15:8 of EAX */
		X86_IA32_NUM_PERF_COUNTERS = ((eax & 0xFF00) >> 8);
		/*
		 * Build the mask with an unsigned long shift, like the fixed
		 * counter mask below: an int shift is undefined once the
		 * reported count reaches the width of int.
		 */
		X86_IA32_PERF_COUNTERS_MASK = (1UL << X86_IA32_NUM_PERF_COUNTERS) - 1;

		/* Fixed counter count is taken from the low 4 bits of EDX */
		X86_IA32_NUM_FIXED_PERF_COUNTERS = (edx & 0x0F);
		X86_IA32_FIXED_PERF_COUNTERS_MASK =
			((1UL << X86_IA32_NUM_FIXED_PERF_COUNTERS) - 1) <<
			X86_IA32_BASE_FIXED_PERF_COUNTERS;

		perf_counters_discovered = 1;
		kprintf("X86_IA32_NUM_PERF_COUNTERS: %d, X86_IA32_NUM_FIXED_PERF_COUNTERS: %d\n",
			X86_IA32_NUM_PERF_COUNTERS, X86_IA32_NUM_FIXED_PERF_COUNTERS);
	}

	/* Clear Fixed Counter Control */
	value = rdmsr(MSR_PERF_FIXED_CTRL);
	value &= 0xfffffffffffff000L;
	wrmsr(MSR_PERF_FIXED_CTRL, value);

	/* Clear Generic Counter Control */
	for(i = 0; i < X86_IA32_NUM_PERF_COUNTERS; i++) {
		wrmsr(MSR_IA32_PERFEVTSEL0 + i, 0);
	}

	/* Enable PMC Control */
	value = rdmsr(MSR_PERF_GLOBAL_CTRL);
	value |= X86_IA32_PERF_COUNTERS_MASK;
	value |= X86_IA32_FIXED_PERF_COUNTERS_MASK;
	wrmsr(MSR_PERF_GLOBAL_CTRL, value);
}
static int set_perfctr_x86_direct(int counter, int mode, unsigned int value)
@ -33,20 +86,53 @@ static int set_perfctr_x86_direct(int counter, int mode, unsigned int value)
return -EINVAL;
}
if (mode & PERFCTR_USER_MODE) {
// clear mode flags
value &= ~(3 << 16);
// set mode flags
if(mode & PERFCTR_USER_MODE) {
value |= 1 << 16;
}
if (mode & PERFCTR_KERNEL_MODE) {
}
if(mode & PERFCTR_KERNEL_MODE) {
value |= 1 << 17;
}
}
// wrmsr(MSR_PERF_GLOBAL_CTRL, 0);
value |= (1 << 22) | (1 << 18); /* EN */
value |= (1 << 20); /* Enable overflow interrupt */
wrmsr(MSR_IA32_PERFEVTSEL0 + counter, value);
kprintf("wrmsr: %d <= %x\n", MSR_PERF_GLOBAL_CTRL, 0);
kprintf("wrmsr: %d <= %x\n", MSR_IA32_PERFEVTSEL0 + counter, value);
//kprintf("wrmsr: %d <= %x\n", MSR_PERF_GLOBAL_CTRL, 0);
//kprintf("wrmsr: %d <= %x\n", MSR_IA32_PERFEVTSEL0 + counter, value);
return 0;
}
/*
 * Load a value into a generic or fixed performance counter.
 *
 * counter: bit position of the counter in the global counter masks
 *          (generic counters in X86_IA32_PERF_COUNTERS_MASK, fixed ones in
 *          X86_IA32_FIXED_PERF_COUNTERS_MASK)
 * val:     value to write; only the low 40 bits are kept
 *
 * Returns 0 on success, -EINVAL when counter is negative, does not fit
 * into the 64-bit mask, or does not name a discovered counter.
 */
static int set_pmc_x86_direct(int counter, long val)
{
	unsigned long cnt_bit = 0;

	/*
	 * Shifting by the width of the type or more is undefined, so an
	 * out of range index must be rejected before the mask bit is built
	 * instead of aliasing onto some other counter.
	 */
	if (counter < 0 || counter >= (int)(sizeof(cnt_bit) * 8)) {
		return -EINVAL;
	}

	val &= 0x000000ffffffffff; // 40bit Mask

	cnt_bit = 1UL << counter;
	if ( cnt_bit & X86_IA32_PERF_COUNTERS_MASK ) {
		// set generic pmc
		wrmsr(MSR_IA32_PMC0 + counter, val);
	}
	else if ( cnt_bit & X86_IA32_FIXED_PERF_COUNTERS_MASK ) {
		// set fixed pmc
		wrmsr(MSR_IA32_FIXED_CTR0 + counter - X86_IA32_BASE_FIXED_PERF_COUNTERS, val);
	}
	else {
		return -EINVAL;
	}

	return 0;
}
@ -57,6 +143,45 @@ static int set_perfctr_x86(int counter, int event, int mask, int inv, int count,
CVAL2(event, mask, inv, count));
}
/*
 * Program the control field of one fixed performance counter.
 *
 * counter: counter number in the global numbering; fixed counters start
 *          at X86_IA32_BASE_FIXED_PERF_COUNTERS
 * mode:    PERFCTR_USER_MODE and/or PERFCTR_KERNEL_MODE
 *
 * Each fixed counter owns a 4-bit field in MSR_PERF_FIXED_CTRL. The field
 * is cleared first, then bit 1 is set for user mode and bit 0 for kernel
 * mode. Note that the field mask is a 32-bit quantity that is widened
 * after inversion, so bits 63:32 of the value read from the MSR are
 * written back as zero.
 *
 * Returns 0 on success, -EINVAL when counter is not a fixed counter.
 */
static int set_fixed_counter(int counter, int mode)
{
	const int slot = counter - X86_IA32_BASE_FIXED_PERF_COUNTERS;
	unsigned int field_mask;
	unsigned int field = 0;
	unsigned long ctrl;

	if (slot < 0 || slot >= X86_IA32_NUM_FIXED_PERF_COUNTERS) {
		return -EINVAL;
	}

	// drop whatever was configured for this counter before
	ctrl = rdmsr(MSR_PERF_FIXED_CTRL);
	field_mask = 0xfU << (slot * 4);
	ctrl &= ~field_mask;

	if (mode & PERFCTR_USER_MODE) {
		field |= 0x2;
	}
	if (mode & PERFCTR_KERNEL_MODE) {
		field |= 0x1;
	}

	ctrl |= field << (slot * 4);
	wrmsr(MSR_PERF_FIXED_CTRL, ctrl);

	return 0;
}
/*
 * Configure a generic performance counter from a raw event code.
 *
 * counter: generic counter index, 0 .. X86_IA32_NUM_PERF_COUNTERS - 1
 * code:    raw event select value handed to set_perfctr_x86_direct()
 * mode:    PERFCTR_USER_MODE / PERFCTR_KERNEL_MODE flags
 *
 * Returns -EINVAL for an out of range counter, otherwise the result of
 * set_perfctr_x86_direct().
 */
int ihk_mc_perfctr_init_raw(int counter, unsigned int code, int mode)
{
	const int in_range = (counter >= 0) &&
			     (counter < X86_IA32_NUM_PERF_COUNTERS);

	return in_range ? set_perfctr_x86_direct(counter, mode, code)
			: -EINVAL;
}
int ihk_mc_perfctr_init(int counter, enum ihk_perfctr_type type, int mode)
{
if (counter < 0 || counter >= X86_IA32_NUM_PERF_COUNTERS) {
@ -78,14 +203,15 @@ extern void x86_march_perfctr_start(unsigned long counter_mask);
int ihk_mc_perfctr_start(unsigned long counter_mask)
{
unsigned int value = 0;
unsigned long value = 0;
unsigned long mask = X86_IA32_PERF_COUNTERS_MASK | X86_IA32_FIXED_PERF_COUNTERS_MASK;
#ifdef HAVE_MARCH_PERFCTR_START
x86_march_perfctr_start(counter_mask);
#endif
counter_mask &= ((1 << X86_IA32_NUM_PERF_COUNTERS) - 1);
counter_mask &= mask;
value = rdmsr(MSR_PERF_GLOBAL_CTRL);
value |= counter_mask;
value |= counter_mask;
wrmsr(MSR_PERF_GLOBAL_CTRL, value);
return 0;
@ -93,25 +219,78 @@ int ihk_mc_perfctr_start(unsigned long counter_mask)
int ihk_mc_perfctr_stop(unsigned long counter_mask)
{
unsigned int value;
unsigned long value;
unsigned long mask = X86_IA32_PERF_COUNTERS_MASK | X86_IA32_FIXED_PERF_COUNTERS_MASK;
counter_mask &= ((1 << X86_IA32_NUM_PERF_COUNTERS) - 1);
counter_mask &= mask;
value = rdmsr(MSR_PERF_GLOBAL_CTRL);
value &= ~counter_mask;
wrmsr(MSR_PERF_GLOBAL_CTRL, value);
if(counter_mask >> 32 & 0x1) {
value = rdmsr(MSR_PERF_FIXED_CTRL);
value &= ~(0xf);
wrmsr(MSR_PERF_FIXED_CTRL, value);
}
if(counter_mask >> 32 & 0x2) {
value = rdmsr(MSR_PERF_FIXED_CTRL);
value &= ~(0xf << 4);
wrmsr(MSR_PERF_FIXED_CTRL, value);
}
if(counter_mask >> 32 & 0x4) {
value = rdmsr(MSR_PERF_FIXED_CTRL);
value &= ~(0xf << 8);
wrmsr(MSR_PERF_FIXED_CTRL, value);
}
return 0;
}
/*
 * Initialize one fixed performance counter with overflow interrupts on.
 *
 * counter: counter number in the global numbering; fixed counters start
 *          at X86_IA32_BASE_FIXED_PERF_COUNTERS
 * mode:    PERFCTR_USER_MODE and/or PERFCTR_KERNEL_MODE
 *
 * The counter's 4-bit field in MSR_PERF_FIXED_CTRL is cleared and then
 * rebuilt: bit 1 for user mode, bit 0 for kernel mode, bit 3 to raise a
 * PMI on overflow. The field mask is a 32-bit quantity that is widened
 * after inversion, so bits 63:32 of the value read from the MSR are
 * written back as zero.
 *
 * Returns 0 on success, -EINVAL when counter is not a fixed counter.
 */
int ihk_mc_perfctr_fixed_init(int counter, int mode)
{
	const int fixed_idx = counter - X86_IA32_BASE_FIXED_PERF_COUNTERS;
	const int shift = fixed_idx * 4;
	unsigned int clear_bits = 0xf;
	unsigned int new_bits;
	unsigned long msr_val;

	if (fixed_idx < 0 || fixed_idx >= X86_IA32_NUM_FIXED_PERF_COUNTERS) {
		return -EINVAL;
	}

	// wipe the previous setting of this counter
	msr_val = rdmsr(MSR_PERF_FIXED_CTRL);
	clear_bits <<= shift;
	msr_val &= ~clear_bits;

	// PMI on overflow is always requested
	new_bits = 1 << 3;
	if (mode & PERFCTR_KERNEL_MODE) {
		new_bits |= 1;
	}
	if (mode & PERFCTR_USER_MODE) {
		new_bits |= 1 << 1;
	}
	new_bits <<= shift;

	msr_val |= new_bits;
	wrmsr(MSR_PERF_FIXED_CTRL, msr_val);

	return 0;
}
int ihk_mc_perfctr_reset(int counter)
{
if (counter < 0 || counter >= X86_IA32_NUM_PERF_COUNTERS) {
return -EINVAL;
}
return set_pmc_x86_direct(counter, 0);
}
wrmsr(MSR_IA32_PMC0 + counter, 0);
return 0;
/*
 * Load val into the performance counter selected by counter.
 * Thin wrapper: forwards both arguments to set_pmc_x86_direct() and
 * returns its result unchanged.
 */
int ihk_mc_perfctr_set(int counter, long val)
{
return set_pmc_x86_direct(counter, val);
}
int ihk_mc_perfctr_read_mask(unsigned long counter_mask, unsigned long *value)
@ -129,10 +308,87 @@ int ihk_mc_perfctr_read_mask(unsigned long counter_mask, unsigned long *value)
unsigned long ihk_mc_perfctr_read(int counter)
{
if (counter < 0 || counter >= X86_IA32_NUM_PERF_COUNTERS) {
unsigned long retval = 0;
unsigned long cnt_bit = 0;
if (counter < 0) {
return -EINVAL;
}
return rdpmc(counter);
cnt_bit = 1UL << counter;
if ( cnt_bit & X86_IA32_PERF_COUNTERS_MASK ) {
// read generic pmc
retval = rdpmc(counter);
}
else if ( cnt_bit & X86_IA32_FIXED_PERF_COUNTERS_MASK ) {
// read fixed pmc
retval = rdpmc((1 << 30) + (counter - X86_IA32_BASE_FIXED_PERF_COUNTERS));
}
else {
retval = -EINVAL;
}
return retval;
}
// read by rdmsr
unsigned long ihk_mc_perfctr_read_msr(int counter)
{
unsigned int idx = 0;
unsigned long retval = 0;
unsigned long cnt_bit = 0;
if (counter < 0) {
return -EINVAL;
}
cnt_bit = 1UL << counter;
if ( cnt_bit & X86_IA32_PERF_COUNTERS_MASK ) {
// read generic pmc
idx = MSR_IA32_PMC0 + counter;
retval = (unsigned long) rdmsr(idx);
}
else if ( cnt_bit & X86_IA32_FIXED_PERF_COUNTERS_MASK ) {
// read fixed pmc
idx = MSR_IA32_FIXED_CTR0 + counter;
retval = (unsigned long) rdmsr(idx);
}
else {
retval = -EINVAL;
}
return retval;
}
/*
 * Pick a free generic performance counter for an event.
 *
 * type:       in/out event type; PERF_TYPE_HARDWARE is rewritten to
 *             PERF_TYPE_RAW when the event is supported
 * config:     in/out event config; for PERF_COUNT_HW_INSTRUCTIONS it is
 *             replaced by the raw code 0x5300c0
 * pmc_status: bit mask of counters already in use (bit i set = counter i
 *             taken)
 *
 * Returns the index of the lowest generic counter whose bit is clear in
 * pmc_status, or -1 when the event type/config is not supported or all
 * generic counters are taken.
 */
int ihk_mc_perfctr_alloc_counter(unsigned int *type, unsigned long *config, unsigned long pmc_status)
{
	int ret = -1;
	int i = 0;

	if(*type == PERF_TYPE_HARDWARE) {
		switch(*config){
		case PERF_COUNT_HW_INSTRUCTIONS :
			*type = PERF_TYPE_RAW;
			*config = 0x5300c0;
			break;
		default :
			// Unexpected config
			return -1;
		}
	}
	else if(*type != PERF_TYPE_RAW) {
		return -1;
	}

	// find avail generic counter
	for(i = 0; i < X86_IA32_NUM_PERF_COUNTERS; i++) {
		/*
		 * pmc_status is an unsigned long mask, so test it with an
		 * unsigned long bit; an int shift is undefined from bit 31 on.
		 */
		if(!(pmc_status & (1UL << i))) {
			ret = i;
			break;
		}
	}

	return ret;
}

File diff suppressed because it is too large Load Diff

View File

@ -18,20 +18,93 @@
*/
#include <syscall.h>
#include <ihk/atomic.h>
#include <arch/cpu.h>
extern int vsyscall_gettimeofday(void *tv, void *tz)
extern int vsyscall_gettimeofday(struct timeval *tv, void *tz)
__attribute__ ((section (".vsyscall.gettimeofday")));
int vsyscall_gettimeofday(void *tv, void *tz)
struct tod_data_s tod_data
__attribute__ ((section(".vsyscall.gettimeofday.data"))) = {
.do_local = 0,
.version = IHK_ATOMIC64_INIT(0),
};
/*
 * Spin-wait hint used while polling in the vsyscall code.
 * Issues a single PAUSE instruction; the "memory" clobber makes the
 * compiler reload memory operands after the call.
 */
static inline void cpu_pause_for_vsyscall(void)
{
	asm volatile ("pause" ::: "memory");
} /* cpu_pause_for_vsyscall() */
/*
 * Compute the current time from the TSC and the shared tod_data record.
 *
 * ts: output; receives tod_data.origin advanced by the time that
 *     current_tsc / tod_data.clocks_per_sec represents
 *
 * tod_data.origin is copied inside a retry loop keyed on
 * tod_data.version: an odd version means an update is in progress, and a
 * version that changed across the copy means the copy must be repeated.
 * Note that tod_data.clocks_per_sec is read after the loop, outside the
 * version check.
 * NOTE(review): rdtsc() is defined outside this view - confirm it returns
 * the raw 64-bit counter.
 */
static inline void calculate_time_from_tsc(struct timespec *ts)
{
long ver;
unsigned long current_tsc;
__time_t sec_delta;
long ns_delta;
for (;;) {
/* wait until no writer holds the record (version is even) */
while ((ver = ihk_atomic64_read(&tod_data.version)) & 1) {
/* settimeofday() is in progress */
cpu_pause_for_vsyscall();
}
rmb();
*ts = tod_data.origin;
rmb();
/* the snapshot is usable only if the version did not move meanwhile */
if (ver == ihk_atomic64_read(&tod_data.version)) {
break;
}
/* settimeofday() has intervened */
cpu_pause_for_vsyscall();
}
current_tsc = rdtsc();
/* split the counter into whole seconds and the sub-second remainder */
sec_delta = current_tsc / tod_data.clocks_per_sec;
ns_delta = NS_PER_SEC * (current_tsc % tod_data.clocks_per_sec)
/ tod_data.clocks_per_sec;
/* calc. of ns_delta overflows if clocks_per_sec exceeds 18.44 GHz */
ts->tv_sec += sec_delta;
ts->tv_nsec += ns_delta;
/* both addends are below NS_PER_SEC, so one carry step is enough */
if (ts->tv_nsec >= NS_PER_SEC) {
ts->tv_nsec -= NS_PER_SEC;
++ts->tv_sec;
}
return;
} /* calculate_time_from_tsc() */
int vsyscall_gettimeofday(struct timeval *tv, void *tz)
{
int error;
struct timespec ats;
if (!tv && !tz) {
/* nothing to do */
return 0;
}
/* Do it locally if supported */
if (!tz && tod_data.do_local) {
calculate_time_from_tsc(&ats);
tv->tv_sec = ats.tv_sec;
tv->tv_usec = ats.tv_nsec / 1000;
return 0;
}
/* Otherwise syscall */
asm ("syscall" : "=a" (error)
: "a" (__NR_gettimeofday), "D" (tv), "S" (tz)
: "%rcx", "%r11", "memory");
if (error) {
*(int *)0 = 0; /* i.e. raise(SIGSEGV) */
}
return error;
}
} /* vsyscall_gettimeofday() */
extern long vsyscall_time(void *tp)
__attribute__ ((section (".vsyscall.time")));

View File

@ -0,0 +1,28 @@
# irqbalance is a daemon process that distributes interrupts across
# CPUS on SMP systems. The default is to rebalance once every 10
# seconds. This is the environment file that is specified to systemd via the
# EnvironmentFile key in the service unit file (or via whatever method the init
# system you're using has).
#
# ONESHOT=yes
# after starting, wait for a minute, then look at the interrupt
# load and balance it once; after balancing exit and do not change
# it again.
#IRQBALANCE_ONESHOT=
#
# IRQBALANCE_BANNED_CPUS
# 64 bit bitmask which allows you to indicate which cpu's should
# be skipped when rebalancing irqs. Cpu numbers which have their
# corresponding bits set to one in this mask will not have any
# irq's assigned to them on rebalance
#
IRQBALANCE_BANNED_CPUS=%mask%
#
# IRQBALANCE_ARGS
# append any args here to the irqbalance daemon as documented in the man page
#
IRQBALANCE_ARGS=--banirq=%banirq%

View File

@ -0,0 +1,10 @@
[Unit]
Description=irqbalance daemon
After=syslog.target
[Service]
EnvironmentFile=@ETCDIR@/irqbalance_mck
ExecStart=/usr/sbin/irqbalance --foreground $IRQBALANCE_ARGS
[Install]
WantedBy=multi-user.target

View File

@ -3,77 +3,461 @@
# IHK SMP-x86 example boot script.
# author: Balazs Gerofi <bgerofi@riken.jp>
# Copyright (C) 2014 RIKEN AICS
#
# This is an example script for loading IHK, configuring a partition and
# booting McKernel on it.
# The script reserves half of the CPU cores and 512MB of RAM from NUMA node 0
# when IHK is loaded for the first time, otherwise it destroys the current
# McKernel instance and reboots it using the same set of resources as it used
# previously.
#
# This is an example script for loading IHK, configuring a partition and
# booting McKernel on it. Unless specific CPUs and memory are requested,
# the script reserves half of the CPU cores and 512MB of RAM from
# NUMA node 0 when IHK is loaded for the first time.
# Otherwise, it destroys the current McKernel instance and reboots it using
# the same set of resources as it used previously.
# Note that the script does not output anything unless an error occurs.
prefix="@prefix@"
BINDIR="@BINDIR@"
SBINDIR="@SBINDIR@"
KMODDIR="@KMODDIR@"
KERNDIR="@KERNDIR@"
BINDIR="${prefix}/bin"
SBINDIR="${prefix}/sbin"
ETCDIR=@ETCDIR@
KMODDIR="${prefix}/kmod"
KERNDIR="${prefix}/@TARGET@/kernel"
ENABLE_MCOVERLAYFS="@ENABLE_MCOVERLAYFS@"
mem="512M@0"
cpus=""
# Get the number of CPUs on NUMA node 0
nr_cpus=`lscpu --parse | awk -F"," '{if ($4 == 0) print $4}' | wc -l`
if [ "${BASH_VERSINFO[0]}" -lt 4 ]; then
echo "You need at least bash-4.0 to run this script." >&2
exit 1
fi
# Use the second half of the cores
let nr_cpus="$nr_cpus / 2"
cpus=`lscpu --parse | awk -F"," '{if ($4 == 0) print $1}' | tail -n $nr_cpus | xargs echo -n | sed 's/ /,/g'`
if [ "$cpus" == "" ]; then echo "error: no available CPUs on NUMA node 0?"; exit; fi
INTERVAL=1
LOGMODE=0
facility="LOG_LOCAL6"
chown_option=`logname 2> /dev/null`
# Remove delegator if loaded
if [ "`lsmod | grep mcctrl`" != "" ]; then
if ! rmmod mcctrl; then echo "error: removing mcctrl"; exit; fi
if [ "`systemctl status irqbalance_mck.service 2> /dev/null |grep -E 'Active: active'`" != "" -o "`systemctl status irqbalance.service 2> /dev/null |grep -E 'Active: active'`" != "" ]; then
irqbalance_used="yes"
else
irqbalance_used="no"
fi
while getopts :i:k:c:m:o:f: OPT
do
case ${OPT} in
f) facility=${OPTARG}
;;
o) chown_option=${OPTARG}
;;
i) INTERVAL=${OPTARG}
expr "${INTERVAL}" + 1 > /dev/null 2>&1
if [ $? -ge 2 ]
then
echo "invalid -i value" >&2
exit 1
fi
if [ ${INTERVAL} -le 0 ]
then
echo "invalid -i value" >&2
exit 1
fi
;;
k) LOGMODE=${OPTARG}
expr "${LOGMODE}" + 1 > /dev/null 2>&1
if [ $? -ge 2 ]
then
echo "invalid -k value" >&2
exit 1
fi
if [ ${LOGMODE} -lt 0 -o ${LOGMODE} -gt 2 ]
then
echo "invalid -k value" >&2
exit 1
fi
;;
c) cpus=${OPTARG}
;;
m) mem=${OPTARG}
;;
*) echo "invalid option -${OPT}" >&2
exit 1
esac
done
#
# Revert any state that has been initialized before the error occurred.
#
# Usage: error_exit <state>
#   <state> names the last setup step that completed successfully. The
#   case arms below are ordered from the latest step to the earliest and
#   every arm ends in ';&' (fall through, bash >= 4), so cleanup starts at
#   the given state and continues through all earlier ones.
#   Cleanup failures only print a warning; the function always terminates
#   the script with exit status 1.
#
error_exit() {
local status=$1
case $status in
# mcoverlayfs related steps are reverted only when the overlay is in use
mcos_sys_mounted)
if [ "$enable_mcoverlay" == "yes" ]; then
umount /tmp/mcos/mcos0_sys
fi
;&
mcos_proc_mounted)
if [ "$enable_mcoverlay" == "yes" ]; then
umount /tmp/mcos/mcos0_proc
fi
;&
mcoverlayfs_loaded)
if [ "$enable_mcoverlay" == "yes" ]; then
rmmod mcoverlay
fi
;&
linux_proc_bind_mounted)
if [ "$enable_mcoverlay" == "yes" ]; then
umount /tmp/mcos/linux_proc
fi
;&
tmp_mcos_mounted)
if [ "$enable_mcoverlay" == "yes" ]; then
umount /tmp/mcos
fi
;&
tmp_mcos_created)
if [ "$enable_mcoverlay" == "yes" ]; then
rm -rf /tmp/mcos
fi
;&
os_created)
# Destroy all LWK instances
if ls /dev/mcos* 1>/dev/null 2>&1; then
for i in /dev/mcos*; do
# the instance index is what follows the 9 characters of "/dev/mcos"
ind=`echo $i|cut -c10-`;
if ! ${SBINDIR}/ihkconfig 0 destroy $ind; then
echo "warning: failed to destroy LWK instance $ind" >&2
fi
done
fi
;&
mcctrl_loaded)
rmmod mcctrl || echo "warning: failed to remove mcctrl" >&2
;&
mem_reserved)
# release whatever memory IHK currently reports as reserved
# (overwrites the global $mem; the script exits right after cleanup)
mem=`${SBINDIR}/ihkconfig 0 query mem`
if [ "${mem}" != "" ]; then
if ! ${SBINDIR}/ihkconfig 0 release mem $mem > /dev/null; then
echo "warning: failed to release memory" >&2
fi
fi
;&
cpus_reserved)
# release whatever CPUs IHK currently reports as reserved
# (overwrites the global $cpus; the script exits right after cleanup)
cpus=`${SBINDIR}/ihkconfig 0 query cpu`
if [ "${cpus}" != "" ]; then
if ! ${SBINDIR}/ihkconfig 0 release cpu $cpus > /dev/null; then
echo "warning: failed to release CPUs" >&2
fi
fi
;&
ihk_smp_loaded)
rmmod ihk_smp_x86 || echo "warning: failed to remove ihk_smp_x86" >&2
;&
ihk_loaded)
rmmod ihk || echo "warning: failed to remove ihk" >&2
;&
irqbalance_stopped)
# Only acts when irqbalance_mck.service is active: stop and disable it,
# copy the smp_affinity files saved under @ETCDIR@/proc/irq back to
# /proc/irq, then start the regular irqbalance.service again.
# NOTE(review): when irqbalance.service was stopped by this script but
# irqbalance_mck.service has not been started yet, nothing is restarted
# here - confirm this is intended.
if [ "`systemctl status irqbalance_mck.service 2> /dev/null |grep -E 'Active: active'`" != "" ]; then
if ! systemctl stop irqbalance_mck.service 2>/dev/null; then
echo "warning: failed to stop irqbalance_mck" >&2
fi
if ! systemctl disable irqbalance_mck.service >/dev/null 2>/dev/null; then
echo "warning: failed to disable irqbalance_mck" >&2
fi
if ! etcdir=@ETCDIR@ perl -e '$etcdir=$ENV{'etcdir'}; @files = grep { -f } glob "$etcdir/proc/irq/*/smp_affinity"; foreach $file (@files) { $dest = substr($file, length($etcdir)); if(0) {print "cp $file $dest\n";} system("cp $file $dest 2>/dev/null"); }'; then
echo "warning: failed to restore /proc/irq/*/smp_affinity" >&2
fi
if ! systemctl start irqbalance.service; then
echo "warning: failed to start irqbalance" >&2;
fi
fi
;&
initial)
# Nothing more to revert
;;
esac
exit 1
}
ihk_ikc_irq_core=0
release=`uname -r`
major=`echo ${release} | sed -e 's/^\([0-9]*\).*/\1/'`
minor=`echo ${release} | sed -e 's/^[0-9]*.\([0-9]*\).*/\1/'`
patch=`echo ${release} | sed -e 's/^[0-9]*.[0-9]*.\([0-9]*\).*/\1/'`
linux_version_code=`expr \( ${major} \* 65536 \) + \( ${minor} \* 256 \) + ${patch}`
rhel_release=`echo ${release} | sed -e 's/^[0-9]*.[0-9]*.[0-9]*-\([0-9]*\).*/\1/'`
if [ "${release}" == "${rhel_release}" ]; then
rhel_release="";
fi
enable_mcoverlay="no"
if [ "${ENABLE_MCOVERLAYFS}" == "yes" ]; then
if [ "${rhel_release}" == "" ]; then
if [ ${linux_version_code} -ge 262144 -a ${linux_version_code} -lt 262400 ]; then
enable_mcoverlay="yes"
fi
else
if [ ${linux_version_code} -eq 199168 -a ${rhel_release} -ge 327 ]; then
enable_mcoverlay="yes"
fi
fi
fi
# Figure out CPUs if not requested by user
if [ "$cpus" == "" ]; then
	# `lscpu --parse` starts its output with '#' header lines. Most of them have
	# fewer than four comma separated fields, and awk evaluates a missing $4 as 0,
	# so they would be counted as CPUs of NUMA node 0. Drop them before filtering.
	# Get the number of CPUs on NUMA node 0
	nr_cpus=`lscpu --parse | grep -v '^#' | awk -F"," '{if ($4 == 0) print $4}' | wc -l`
	# Use the second half of the cores
	let nr_cpus="$nr_cpus / 2"
	# Build a comma separated list from the last $nr_cpus CPU ids of node 0
	cpus=`lscpu --parse | grep -v '^#' | awk -F"," '{if ($4 == 0) print $1}' | tail -n $nr_cpus | xargs echo -n | sed 's/ /,/g'`
	if [ "$cpus" == "" ]; then
		echo "error: no available CPUs on NUMA node 0?" >&2
		exit 1
	fi
fi
# Remove mcoverlay if loaded
if [ "$enable_mcoverlay" == "yes" ]; then
if [ "`lsmod | grep mcoverlay`" != "" ]; then
if [ "`cat /proc/mounts | grep /tmp/mcos/mcos0_sys`" != "" ]; then umount -l /tmp/mcos/mcos0_sys; fi
if [ "`cat /proc/mounts | grep /tmp/mcos/mcos0_proc`" != "" ]; then umount -l /tmp/mcos/mcos0_proc; fi
if [ "`cat /proc/mounts | grep /tmp/mcos/linux_proc`" != "" ]; then umount -l /tmp/mcos/linux_proc; fi
if [ "`cat /proc/mounts | grep /tmp/mcos`" != "" ]; then umount -l /tmp/mcos; fi
if [ -e /tmp/mcos ]; then rm -rf /tmp/mcos; fi
if ! rmmod mcoverlay; then
echo "error: removing mcoverlay" >&2
error_exit "initial"
fi
fi
fi
# Stop irqbalance
if [ "${irqbalance_used}" == "yes" ]; then
systemctl stop irqbalance_mck.service 2>/dev/null
if ! systemctl stop irqbalance.service 2>/dev/null ; then
echo "error: stopping irqbalance" >&2
error_exit "initial"
fi;
fi
# Start mcklogd. Note that McKernel blocks when kmsg buffer is full
# with '-k 1' until mcklogd unblocks it so starting mcklogd must precede
# booting McKernel
if [ ${LOGMODE} -ne 0 ]; then
# Stop mcklogd which has survived McKernel shutdown because
# mcstop+release.sh is not used
pkill mcklogd
SBINDIR=${SBINDIR} ${SBINDIR}/mcklogd -i ${INTERVAL} -f ${facility}
fi
# Load IHK if not loaded
if [ "`lsmod | grep ihk`" == "" ]; then
if ! insmod ${KMODDIR}/ihk.ko; then echo "error: loading ihk"; exit; fi;
if ! insmod ${KMODDIR}/ihk.ko; then
echo "error: loading ihk" >&2
error_exit "irqbalance_stopped"
fi
fi
# Drop Linux caches to free memory
sync && echo 3 > /proc/sys/vm/drop_caches
# Merge free memory areas into large, physically contiguous ones
echo 1 > /proc/sys/vm/compact_memory 2>/dev/null
# Load IHK-SMP if not loaded and reserve CPUs and memory
if [ "`lsmod | grep ihk_smp_x86`" == "" ]; then
ihk_irq=""
for i in `seq 64 255`; do
if [ ! -d /proc/irq/$i ] && [ "`cat /proc/interrupts | grep ":" | awk '{print $1}' | grep -o '[0-9]*' | grep $i`" == "" ]; then
ihk_irq=$i
break
fi
done
if [ "$ihk_irq" == "" ]; then echo "error: no IRQ available"; exit; fi
if ! insmod ${KMODDIR}/ihk-smp-x86.ko ihk_start_irq=$ihk_irq; then echo "error: loading ihk-smp-x86"; exit; fi;
if ! ${SBINDIR}/ihkconfig 0 reserve cpu ${cpus}; then echo "error: reserving CPUs"; exit; fi
if ! ${SBINDIR}/ihkconfig 0 reserve mem ${mem}; then echo "error: reserving memory"; exit; fi
# Pick the first interrupt number in 64..255 that has no /proc/irq/<n> directory
# and does not appear as a numeric entry in the first column of /proc/interrupts.
ihk_irq=""
for i in `seq 64 255`; do
	# The number must be matched as a whole line (-x) and $i must be expanded by
	# the shell: a single-quoted '^$i$' pattern is passed to grep literally and
	# never matches, which silently disables the /proc/interrupts check.
	if [ ! -d /proc/irq/$i ] && [ "`cat /proc/interrupts | grep ":" | awk '{print $1}' | grep -o '[0-9]*' | grep -x -e "$i"`" == "" ]; then
		ihk_irq=$i
		break
	fi
done
if [ "$ihk_irq" == "" ]; then
echo "error: no IRQ available" >&2
error_exit "ihk_loaded"
fi
if ! insmod ${KMODDIR}/ihk-smp-x86.ko ihk_start_irq=$ihk_irq ihk_ikc_irq_core=$ihk_ikc_irq_core; then
echo "error: loading ihk-smp-x86" >&2
error_exit "ihk_loaded"
fi
if ! ${SBINDIR}/ihkconfig 0 reserve cpu ${cpus}; then
echo "error: reserving CPUs" >&2;
error_exit "ihk_smp_loaded"
fi
if ! ${SBINDIR}/ihkconfig 0 reserve mem ${mem}; then
echo "error: reserving memory" >&2
error_exit "cpus_reserved"
fi
fi
# Check for existing OS instance and destroy
if [ -c /dev/mcos0 ]; then
# Query CPU cores and memory of OS instance so that the same values are used as previously
if ! ${SBINDIR}/ihkosctl 0 query cpu > /dev/null; then echo "error: querying cpus"; exit; fi
cpus=`${SBINDIR}/ihkosctl 0 query cpu`
if ! ${SBINDIR}/ihkosctl 0 query mem > /dev/null; then echo "error: querying memory"; exit; fi
mem=`${SBINDIR}/ihkosctl 0 query mem`
if ! ${SBINDIR}/ihkconfig 0 destroy 0; then echo "warning: destroy failed"; fi
else
# Otherwise query IHK-SMP for resources
if ! ${SBINDIR}/ihkconfig 0 query cpu > /dev/null; then echo "error: querying cpus"; exit; fi
cpus=`${SBINDIR}/ihkconfig 0 query cpu`
if ! ${SBINDIR}/ihkconfig 0 query mem > /dev/null; then echo "error: querying memory"; exit; fi
mem=`${SBINDIR}/ihkconfig 0 query mem`
# Load mcctrl if not loaded
if [ "`lsmod | grep mcctrl`" == "" ]; then
if ! insmod ${KMODDIR}/mcctrl.ko; then
echo "error: inserting mcctrl.ko" >&2
error_exit "mem_reserved"
fi
fi
# Destroy all LWK instances
if ls /dev/mcos* 1>/dev/null 2>&1; then
for i in /dev/mcos*; do
ind=`echo $i|cut -c10-`;
if ! ${SBINDIR}/ihkconfig 0 destroy $ind; then
echo "error: destroying LWK instance $ind failed" >&2
error_exit "mcctrl_loaded"
fi
done
fi
# Create OS instance
if ! ${SBINDIR}/ihkconfig 0 create; then
echo "error: creating OS instance" >&2
error_exit "mcctrl_loaded"
fi
# Assign CPUs
if ! ${SBINDIR}/ihkosctl 0 assign cpu ${cpus}; then
echo "error: assign CPUs" >&2
error_exit "os_created"
fi
# Assign memory
if ! ${SBINDIR}/ihkosctl 0 assign mem ${mem}; then
echo "error: assign memory" >&2
error_exit "os_created"
fi
# Load kernel image
if ! ${SBINDIR}/ihkosctl 0 load ${KERNDIR}/mckernel.img; then
echo "error: loading kernel image: ${KERNDIR}/mckernel.img" >&2
error_exit "os_created"
fi
# Set kernel arguments
if ! ${SBINDIR}/ihkosctl 0 kargs "hidos ksyslogd=${LOGMODE}"; then
echo "error: setting kernel arguments" >&2
error_exit "os_created"
fi
# Boot OS instance
if ! ${SBINDIR}/ihkosctl 0 boot; then
echo "error: booting" >&2
error_exit "os_created"
fi
# Set device file ownership
if ! chown ${chown_option} /dev/mcd* /dev/mcos*; then
echo "warning: failed to chown device files" >&2
fi
# Overlay /proc, /sys with McKernel specific contents
if [ "$enable_mcoverlay" == "yes" ]; then
if [ ! -e /tmp/mcos ]; then mkdir -p /tmp/mcos; fi
if ! mount -t tmpfs tmpfs /tmp/mcos; then
echo "error: mount /tmp/mcos" >&2
error_exit "tmp_mcos_created"
fi
if [ ! -e /tmp/mcos/linux_proc ]; then mkdir -p /tmp/mcos/linux_proc; fi
if ! mount --bind /proc /tmp/mcos/linux_proc; then
echo "error: mount /tmp/mcos/linux_proc" >&2
error_exit "tmp_mcos_mounted"
fi
if ! insmod ${KMODDIR}/mcoverlay.ko; then
echo "error: inserting mcoverlay.ko" >&2
error_exit "linux_proc_bind_mounted"
fi
while [ ! -e /proc/mcos0 ]
do
sleep 1
done
if [ ! -e /tmp/mcos/mcos0_proc ]; then mkdir -p /tmp/mcos/mcos0_proc; fi
if [ ! -e /tmp/mcos/mcos0_proc_upper ]; then mkdir -p /tmp/mcos/mcos0_proc_upper; fi
if [ ! -e /tmp/mcos/mcos0_proc_work ]; then mkdir -p /tmp/mcos/mcos0_proc_work; fi
if ! mount -t mcoverlay mcoverlay -o lowerdir=/proc/mcos0:/proc,upperdir=/tmp/mcos/mcos0_proc_upper,workdir=/tmp/mcos/mcos0_proc_work,nocopyupw,nofscheck /tmp/mcos/mcos0_proc; then
echo "error: mounting /tmp/mcos/mcos0_proc" >&2
error_exit "mcoverlayfs_loaded"
fi
# TODO: How do we revert this in case of failure??
mount --make-rprivate /proc
while [ ! -e /sys/devices/virtual/mcos/mcos0/sys/setup_complete ]
do
sleep 0.1
done
if [ ! -e /tmp/mcos/mcos0_sys ]; then mkdir -p /tmp/mcos/mcos0_sys; fi
if [ ! -e /tmp/mcos/mcos0_sys_upper ]; then mkdir -p /tmp/mcos/mcos0_sys_upper; fi
if [ ! -e /tmp/mcos/mcos0_sys_work ]; then mkdir -p /tmp/mcos/mcos0_sys_work; fi
if ! mount -t mcoverlay mcoverlay -o lowerdir=/sys/devices/virtual/mcos/mcos0/sys:/sys,upperdir=/tmp/mcos/mcos0_sys_upper,workdir=/tmp/mcos/mcos0_sys_work,nocopyupw,nofscheck /tmp/mcos/mcos0_sys; then
echo "error: mount /tmp/mcos/mcos0_sys" >&2
error_exit "mcos_proc_mounted"
fi
# TODO: How do we revert this in case of failure??
mount --make-rprivate /sys
rm -rf /tmp/mcos/mcos0_sys/setup_complete
# Hide NUMA related files which are outside the LWK partition
for cpuid in `find /sys/devices/system/cpu/* -maxdepth 0 -name "cpu[0123456789]*" -printf "%f "`; do
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/devices/system/cpu/$cpuid" ]; then
rm -rf /tmp/mcos/mcos0_sys/devices/system/cpu/$cpuid
else
for nodeid in `find /sys/devices/system/cpu/$cpuid/* -maxdepth 0 -name "node[0123456789]*" -printf "%f "`; do
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/devices/system/cpu/$cpuid/$nodeid" ]; then
rm -f /tmp/mcos/mcos0_sys/devices/system/cpu/$cpuid/$nodeid
fi
done
fi
done
for nodeid in `find /sys/devices/system/node/* -maxdepth 0 -name "node[0123456789]*" -printf "%f "`; do
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/devices/system/node/$nodeid" ]; then
rm -rf /tmp/mcos/mcos0_sys/devices/system/node/$nodeid
else
# Delete non-existent symlinks
for cpuid in `find /sys/devices/system/node/$nodeid/* -maxdepth 0 -name "cpu[0123456789]*" -printf "%f "`; do
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/devices/system/node/$nodeid/$cpuid" ]; then
rm -f /tmp/mcos/mcos0_sys/devices/system/node/$nodeid/$cpuid
fi
done
rm -f /tmp/mcos/mcos0_sys/devices/system/node/$nodeid/memory*
fi
done
for cpuid in `find /sys/bus/cpu/devices/* -maxdepth 0 -name "cpu[0123456789]*" -printf "%f "`; do
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/bus/cpu/devices/$cpuid" ]; then
rm -rf /tmp/mcos/mcos0_sys/bus/cpu/devices/$cpuid
fi
done
fi
# Start irqbalance with CPUs and IRQ for McKernel banned
if [ "${irqbalance_used}" == "yes" ]; then
if ! etcdir=@ETCDIR@ perl -e 'use File::Copy qw(copy); $etcdir=$ENV{'etcdir'}; @files = grep { -f } glob "/proc/irq/*/smp_affinity"; foreach $file (@files) { $rel = substr($file, 1); $dir=substr($rel, 0, length($rel)-length("/smp_affinity")); if(0) { print "cp $file $etcdir/$rel\n";} if(system("mkdir -p $etcdir/$dir")){ exit 1;} if(!copy($file,"$etcdir/$rel")){ exit 1;} }'; then
echo "error: saving /proc/irq/*/smp_affinity" >&2
error_exit "mcos_sys_mounted"
fi;
ncpus=`lscpu | grep -E '^CPU\(s\):' | awk '{print $2}'`
smp_affinity_mask=`echo $cpus | ncpus=$ncpus perl -e 'while(<>){@tokens = split /,/;foreach $token (@tokens) {@nums = split /-/,$token; for($num = $nums[0]; $num <= $nums[$#nums]; $num++) {$ndx=int($num/32); $mask[$ndx] |= (1<<($num % 32))}}} $nint32s = int(($ENV{'ncpus'}+31)/32); for($j = $nint32s - 1; $j >= 0; $j--) { if($j != $nint32s - 1){print ",";} $nblks = $j == $nint32s - 1 ? int(($ENV{'ncpus'} % 32)/4) : 8; for($i = $nblks - 1;$i >= 0;$i--){ printf("%01x",($mask[$j] >> ($i*4)) & 0xf);}}'`
if ! ncpus=$ncpus smp_affinity_mask=$smp_affinity_mask perl -e '@dirs = grep { -d } glob "/proc/irq/*"; foreach $dir (@dirs) { $hit = 0; $affinity_str = `cat $dir/smp_affinity`; chomp $affinity_str; @int32strs = split /,/, $affinity_str; @int32strs_mask=split /,/, $ENV{'smp_affinity_mask'}; for($i=0;$i <= $#int32strs_mask; $i++) { $int32strs_inv[$i] = sprintf("%08x",hex($int32strs_mask[$i])^0xffffffff); if($i == 0) { $len = int((($ENV{'ncpus'}%32)+3)/4); $int32strs_inv[$i] = substr($int32strs_inv[$i], -$len, $len); } } $inv = join(",", @int32strs_inv); $nint32s = int(($ENV{'ncpus'}+31)/32); for($j = $nint32s - 1; $j >= 0; $j--) { if(hex($int32strs[$nint32s - 1 - $j]) & hex($int32strs_mask[$nint32s - 1 - $j])) { $hit = 1; }} if($hit == 1) { $cmd = "echo $inv > $dir/smp_affinity 2>/dev/null"; system $cmd;}}'; then
echo "error: modifying /proc/irq/*/smp_affinity" >&2
error_exit "mcos_sys_mounted"
fi
banirq=`cat /proc/interrupts| perl -e 'while(<>) { if(/^\s*(\d+).*IHK\-SMP\s*$/) {print $1;}}'`
sed "s/%mask%/$smp_affinity_mask/g" $ETCDIR/irqbalance_mck.in | sed "s/%banirq%/$banirq/g" > $ETCDIR/irqbalance_mck
if ! systemctl link $ETCDIR/irqbalance_mck.service >/dev/null 2>/dev/null; then
echo "error: linking irqbalance_mck" >&2
error_exit "mcos_sys_mounted"
fi
if ! systemctl start irqbalance_mck.service 2>/dev/null ; then
echo "error: starting irqbalance_mck" >&2
error_exit "mcos_sys_mounted"
fi
# echo cpus=$cpus mask=$smp_affinity_mask banirq=$banirq
fi
if ! ${SBINDIR}/ihkconfig 0 create; then echo "error: create"; exit; fi
if ! ${SBINDIR}/ihkosctl 0 assign cpu ${cpus}; then echo "error: assign CPUs"; exit; fi
if ! ${SBINDIR}/ihkosctl 0 assign mem ${mem}; then echo "error: assign memory"; exit; fi
if ! ${SBINDIR}/ihkosctl 0 load ${KERNDIR}/mckernel.img; then echo "error: loading kernel image"; exit; fi
if ! ${SBINDIR}/ihkosctl 0 kargs hidos; then echo "error: setting kernel arguments"; exit; fi
if ! ${SBINDIR}/ihkosctl 0 boot; then echo "error: booting"; exit; fi
if ! insmod ${KMODDIR}/mcctrl.ko; then echo "error: inserting mcctrl.ko"; exit; fi

View File

@ -0,0 +1,115 @@
#!/bin/bash
# IHK SMP-x86 example McKernel unload script.
# author: Balazs Gerofi <bgerofi@riken.jp>
# Copyright (C) 2015 RIKEN AICS
#
# This is an example script for destroying McKernel and releasing IHK resources.
# Note that the script does not output anything unless an error occurs.
#
# Order of operations, as implemented below:
#   1. destroy every LWK instance that has a /dev/mcos* node
#   2. release the CPUs and the memory reported by `ihkconfig 0 query`
#   3. unload mcctrl, mcoverlay (after unmounting /tmp/mcos/*), ihk_smp_x86, ihk
#   4. kill mcklogd
#   5. if irqbalance_mck is active: stop and disable it, copy the saved
#      smp_affinity files back and start the regular irqbalance service
# A failure in steps 1-3 exits with status 1, except for mcoverlay removal,
# which only warns. Step 5 only prints warnings.
#
# The @...@ tokens are placeholders to be substituted at configure time.
prefix="@prefix@"
BINDIR="@BINDIR@"
SBINDIR="@SBINDIR@"
ETCDIR=@ETCDIR@
KMODDIR="@KMODDIR@"
KERNDIR="@KERNDIR@"
mem=""
cpus=""
# No SMP module? Exit.
if [ "`lsmod | grep ihk_smp_x86`" == "" ]; then exit 0; fi
# Destroy all LWK instances
if ls /dev/mcos* 1>/dev/null 2>&1; then
for i in /dev/mcos*; do
# "/dev/mcos" is 9 characters long, so the instance index starts at column 10
ind=`echo $i|cut -c10-`;
if ! ${SBINDIR}/ihkconfig 0 destroy $ind; then
echo "error: destroying LWK instance $ind failed" >&2
exit 1
fi
done
fi
# Query IHK-SMP resources and release them
# The first query is run only for its exit status; the second one captures the output.
if ! ${SBINDIR}/ihkconfig 0 query cpu > /dev/null; then
echo "error: querying cpus" >&2
exit 1
fi
cpus=`${SBINDIR}/ihkconfig 0 query cpu`
# Nothing to release when the query printed nothing
if [ "${cpus}" != "" ]; then
if ! ${SBINDIR}/ihkconfig 0 release cpu $cpus > /dev/null; then
echo "error: releasing CPUs" >&2
exit 1
fi
fi
# Same check-then-capture sequence for memory
if ! ${SBINDIR}/ihkconfig 0 query mem > /dev/null; then
echo "error: querying memory" >&2
exit 1
fi
mem=`${SBINDIR}/ihkconfig 0 query mem`
if [ "${mem}" != "" ]; then
if ! ${SBINDIR}/ihkconfig 0 release mem $mem > /dev/null; then
echo "error: releasing memory" >&2
exit 1
fi
fi
# Remove delegator if loaded
if [ "`lsmod | grep mcctrl`" != "" ]; then
if ! rmmod mcctrl; then
echo "error: removing mcctrl" >&2
exit 1
fi
fi
# Remove mcoverlay if loaded
if [ "`lsmod | grep mcoverlay`" != "" ]; then
# Lazily unmount the mounts below /tmp/mcos first, then /tmp/mcos itself,
# and delete the directory before unloading the module
if [ "`cat /proc/mounts | grep /tmp/mcos/mcos0_sys`" != "" ]; then umount -l /tmp/mcos/mcos0_sys; fi
if [ "`cat /proc/mounts | grep /tmp/mcos/mcos0_proc`" != "" ]; then umount -l /tmp/mcos/mcos0_proc; fi
if [ "`cat /proc/mounts | grep /tmp/mcos/linux_proc`" != "" ]; then umount -l /tmp/mcos/linux_proc; fi
if [ "`cat /proc/mounts | grep /tmp/mcos`" != "" ]; then umount -l /tmp/mcos; fi
if [ -e /tmp/mcos ]; then rm -rf /tmp/mcos; fi
# Unlike the other modules, a failure here is reported but does not abort
if ! rmmod mcoverlay; then
echo "warning: failed to remove mcoverlay" >&2
fi
fi
# Remove SMP module
if [ "`lsmod | grep ihk_smp_x86`" != "" ]; then
if ! rmmod ihk_smp_x86; then
echo "error: removing ihk_smp_x86" >&2
exit 1
fi
fi
# Remove core module
# 'ihk\s' requires whitespace right after "ihk", so the name ihk_smp_x86 alone does not match
if [ "`lsmod | grep -E 'ihk\s' | awk '{print $1}'`" != "" ]; then
if ! rmmod ihk; then
echo "error: removing ihk" >&2
exit 1
fi
fi
# Stop mcklogd
pkill mcklogd
# Start irqbalance with the original settings
# Only done when the McKernel specific irqbalance_mck service is reported as active
if [ "`systemctl status irqbalance_mck.service 2> /dev/null |grep -E 'Active: active'`" != "" ]; then
if ! systemctl stop irqbalance_mck.service 2>/dev/null; then
echo "warning: failed to stop irqbalance_mck" >&2
fi
if ! systemctl disable irqbalance_mck.service >/dev/null 2>/dev/null; then
echo "warning: failed to disable irqbalance_mck" >&2
fi
# Copy every saved $etcdir/proc/irq/*/smp_affinity file to the same path with the
# $etcdir prefix stripped. The result of each cp is neither printed nor checked,
# so the warning below is reached only if perl itself exits non-zero.
if ! etcdir=@ETCDIR@ perl -e '$etcdir=$ENV{'etcdir'}; @files = grep { -f } glob "$etcdir/proc/irq/*/smp_affinity"; foreach $file (@files) { $dest = substr($file, length($etcdir)); if(0) {print "cp $file $dest\n";} system("cp $file $dest 2>/dev/null"); }'; then
echo "warning: failed to restore /proc/irq/*/smp_affinity" >&2
fi
if ! systemctl start irqbalance.service; then
echo "warning: failed to start irqbalance" >&2;
fi
fi

2483
configure vendored

File diff suppressed because it is too large Load Diff

View File

@ -27,10 +27,27 @@ AC_ARG_WITH([target],
[--with-target={attached-mic | builtin-mic | builtin-x86 | smp-x86}],[target, default is attached-mic]),
[WITH_TARGET=$withval],[WITH_TARGET=yes])
AC_ARG_WITH([system_map],
AS_HELP_STRING(
[--with-system_map=path],[Path to 'System.map file', default is /boot/System.map-uname_r]),
[WITH_SYSTEM_MAP=$withval],[WITH_SYSTEM_MAP=yes])
AC_ARG_ENABLE([dcfa],
[AS_HELP_STRING(
[--enable-dcfa],[Enable DCFA modules])],[],[enable_dcfa=no])
AC_ARG_ENABLE([memdump],
AC_HELP_STRING([--enable-memdump],
[enable dumping memory and analyzing a dump]),
[ENABLE_MEMDUMP=$enableval],
[ENABLE_MEMDUMP=default])
AC_ARG_ENABLE([mcoverlayfs],
AC_HELP_STRING([--enable-mcoverlayfs],
[enable mcoverlayfs implementation]),
[ENABLE_MCOVERLAYFS=$enableval],
[ENABLE_MCOVERLAYFS=yes])
case "X$WITH_KERNELSRC" in
Xyes | Xno | X)
WITH_KERNELSRC='/lib/modules/`uname -r`/build'
@ -49,9 +66,26 @@ fi
test "x$prefix" = xNONE && prefix="$ac_default_prefix"
case $WITH_TARGET in
attached-mic)
attached-mic|builtin-x86|smp-x86)
ARCH=`uname -m`
AC_PROG_CC
XCC=$CC
;;
builtin-mic)
ARCH=k1om
AC_CHECK_PROG(XCC,
[x86_64-$ARCH-linux-gcc],
[x86_64-$ARCH-linux-gcc],
[no])
CC=$XCC
;;
*)
AC_MSG_ERROR([target $WITH_TARGET is unknwon])
;;
esac
case $WITH_TARGET in
attached-mic)
if test "X$KERNDIR" = X; then
KERNDIR="$prefix/attached/kernel"
fi
@ -69,12 +103,6 @@ case $WITH_TARGET in
fi
;;
builtin-mic)
ARCH=k1om
AC_CHECK_PROG(XCC,
[x86_64-$ARCH-linux-gcc],
[x86_64-$ARCH-linux-gcc],
[no])
CC=$XCC
if test "X$KERNDIR" = X; then
KERNDIR="$prefix/attached/kernel"
fi
@ -92,9 +120,6 @@ case $WITH_TARGET in
fi
;;
builtin-x86)
ARCH=`uname -m`
AC_PROG_CC
XCC=$CC
if test "X$KERNDIR" = X; then
KERNDIR="$prefix/attached/kernel"
fi
@ -112,9 +137,6 @@ case $WITH_TARGET in
fi
;;
smp-x86)
ARCH=`uname -m`
AC_PROG_CC
XCC=$CC
if test "X$KERNDIR" = X; then
KERNDIR="$prefix/smp-x86/kernel"
fi
@ -124,6 +146,9 @@ case $WITH_TARGET in
if test "X$SBINDIR" = X; then
SBINDIR="$prefix/sbin"
fi
if test "X$ETCDIR" = X; then
ETCDIR="$prefix/etc"
fi
if test "X$KMODDIR" = X; then
KMODDIR="$prefix/kmod"
fi
@ -139,6 +164,116 @@ esac
KDIR="$WITH_KERNELSRC"
TARGET="$WITH_TARGET"
MCCTRL_LINUX_SYMTAB=""
case "X$WITH_SYSTEM_MAP" in
Xyes | Xno | X)
MCCTRL_LINUX_SYMTAB=""
;;
*)
MCCTRL_LINUX_SYMTAB="$WITH_SYSTEM_MAP"
;;
esac
# Locate System.map. Candidates in order of preference: the path given with
# --with-system_map, /boot/System.map-<running kernel release>, and System.map
# inside the kernel build directory KDIR.
AC_MSG_CHECKING([[for System.map]])
if test -f "$MCCTRL_LINUX_SYMTAB"; then
	# The user supplied path exists, keep it
	:
elif test -f "/boot/System.map-`uname -r`"; then
	MCCTRL_LINUX_SYMTAB="/boot/System.map-`uname -r`"
elif test -f "$KDIR/System.map"; then
	MCCTRL_LINUX_SYMTAB="$KDIR/System.map"
fi

# Compare with the POSIX "=" operator. "==" is a bash extension and makes
# test fail when configure is run by a strictly POSIX /bin/sh.
if test "x$MCCTRL_LINUX_SYMTAB" = "x"; then
	AC_MSG_ERROR([could not find])
fi

# The path is expanded through eval both here and in MCCTRL_LINUX_SYMTAB_CMD,
# so a value that embeds a command substitution is resolved at the time of use.
if test -z "`eval cat $MCCTRL_LINUX_SYMTAB`"; then
	AC_MSG_ERROR([could not read System.map file, no read permission?])
fi
AC_MSG_RESULT([$MCCTRL_LINUX_SYMTAB])
MCCTRL_LINUX_SYMTAB_CMD="cat $MCCTRL_LINUX_SYMTAB"
# MCCTRL_FIND_KSYM(SYMBOL)
# ------------------------------------------------------
# Search System.map for address of the given symbol and
# do one of three things in config.h:
# If not found, leave MCCTRL_KSYM_foo undefined
# If found to be exported, "#define MCCTRL_KSYM_foo 0"
# If found not to be exported, "#define MCCTRL_KSYM_foo 0x<value>"
#
# A non-empty second argument skips the export check, so the address
# is recorded even when a matching __ksymtab_ entry exists.
# The symbol table is read with the command in MCCTRL_LINUX_SYMTAB_CMD.
AC_DEFUN([MCCTRL_FIND_KSYM],[
	AC_MSG_CHECKING([[System.map for symbol $1]])
	dnl First space separated column of the matching System.map line is the address
	mcctrl_addr=`eval $MCCTRL_LINUX_SYMTAB_CMD | grep " $1\$" | cut -d' ' -f1`
	dnl Quoted so that an empty or multi-word result cannot break the test invocation
	if test -z "$mcctrl_addr"; then
		AC_MSG_RESULT([not found])
	else
		mcctrl_result=$mcctrl_addr
		mcctrl_addr="0x$mcctrl_addr"
		m4_ifval([$2],[],[
			dnl Use the exit status of the pipeline directly as the condition
			if eval $MCCTRL_LINUX_SYMTAB_CMD | grep " __ksymtab_$1\$" >/dev/null; then
				mcctrl_result="exported"
				mcctrl_addr="0"
			fi
		])
		AC_MSG_RESULT([$mcctrl_result])
		AC_DEFINE_UNQUOTED(MCCTRL_KSYM_[]$1,$mcctrl_addr,[Define to address of kernel symbol $1, or 0 if exported])
	fi
])
MCCTRL_FIND_KSYM([sys_mount])
MCCTRL_FIND_KSYM([sys_unshare])
MCCTRL_FIND_KSYM([zap_page_range])
MCCTRL_FIND_KSYM([vdso_image_64])
MCCTRL_FIND_KSYM([vdso_start])
MCCTRL_FIND_KSYM([vdso_end])
MCCTRL_FIND_KSYM([vdso_pages])
MCCTRL_FIND_KSYM([__vvar_page])
MCCTRL_FIND_KSYM([hpet_address])
MCCTRL_FIND_KSYM([hv_clock])
MCCTRL_FIND_KSYM([sys_readlink])
case $ENABLE_MEMDUMP in
yes|no|auto)
;;
default)
if test "x$WITH_TARGET" = "xsmp-x86" ; then
ENABLE_MEMDUMP=auto
else
ENABLE_MEMDUMP=no
fi
;;
*)
AC_MSG_ERROR([unknown memdump argument: $ENABLE_MEMDUMP])
;;
esac
# Unless memdump was explicitly disabled, probe for libbfd and bfd.h.
# With "yes" a missing library or header is a hard error; with "auto" the
# feature is silently turned off. Afterwards ENABLE_MEMDUMP is yes or no.
if test "x$ENABLE_MEMDUMP" != "xno" ; then
	enableval=yes
	AC_CHECK_LIB([bfd],[bfd_init],[],[enableval=no])
	AC_CHECK_HEADER([bfd.h],[],[enableval=no])
	# Two test invocations joined by && instead of the obsolescent -a operator
	if test "x$ENABLE_MEMDUMP" = "xyes" && test "x$enableval" = "xno" ; then
		AC_MSG_ERROR([memdump feature needs bfd.h and libbfd a.k.a binutils-devel])
	fi
	ENABLE_MEMDUMP=$enableval
fi
if test "x$ENABLE_MEMDUMP" = "xyes" ; then
AC_MSG_NOTICE([memdump feature is enabled])
AC_DEFINE([ENABLE_MEMDUMP],[1],[whether memdump feature is enabled])
uncomment_if_ENABLE_MEMDUMP=''
else
AC_MSG_NOTICE([memdump feature is disabled])
uncomment_if_ENABLE_MEMDUMP='#'
fi
if test "x$ENABLE_MCOVERLAYFS" = "xyes" ; then
AC_DEFINE([ENABLE_MCOVERLAYFS],[1],[whether mcoverlayfs is enabled])
AC_MSG_NOTICE([mcoverlayfs is enabled])
else
AC_MSG_NOTICE([mcoverlayfs is disabled])
fi
AC_SUBST(CC)
AC_SUBST(XCC)
AC_SUBST(ARCH)
@ -146,9 +281,11 @@ AC_SUBST(KDIR)
AC_SUBST(TARGET)
AC_SUBST(BINDIR)
AC_SUBST(SBINDIR)
AC_SUBST(ETCDIR)
AC_SUBST(KMODDIR)
AC_SUBST(KERNDIR)
AC_SUBST(MANDIR)
AC_SUBST(ENABLE_MCOVERLAYFS)
AC_SUBST(IHK_VERSION)
AC_SUBST(MCKERNEL_VERSION)
@ -156,19 +293,28 @@ AC_SUBST(DCFA_VERSION)
AC_SUBST(IHK_RELEASE_DATE)
AC_SUBST(MCKERNEL_RELEASE_DATE)
AC_SUBST(DCFA_RESEASE_DATE)
AC_SUBST(uncomment_if_ENABLE_MEMDUMP)
AC_CONFIG_HEADERS([executer/config.h])
AC_CONFIG_FILES([
Makefile
executer/user/Makefile
executer/kernel/Makefile
executer/kernel/mcctrl/Makefile
executer/kernel/mcctrl/arch/x86_64/Makefile
executer/kernel/mcoverlayfs/Makefile
executer/kernel/mcoverlayfs/linux-3.10.0-327.36.1.el7/Makefile
executer/kernel/mcoverlayfs/linux-4.0.9/Makefile
kernel/Makefile
kernel/Makefile.build
arch/x86/tools/mcreboot-attached-mic.sh
arch/x86/tools/mcshutdown-attached-mic.sh
arch/x86/tools/mcreboot-builtin-x86.sh
arch/x86/tools/mcreboot-smp-x86.sh
arch/x86/tools/mcstop+release-smp-x86.sh
arch/x86/tools/mcshutdown-builtin-x86.sh
arch/x86/tools/mcreboot.1:arch/x86/tools/mcreboot.1in
arch/x86/tools/irqbalance_mck.service
arch/x86/tools/irqbalance_mck.in
])
AS_IF([test "x$enable_dcfa" = xyes], [

91
executer/config.h.in Normal file
View File

@ -0,0 +1,91 @@
/* executer/config.h.in. Generated from configure.ac by autoheader. */
/* whether mcoverlayfs is enabled */
#undef ENABLE_MCOVERLAYFS
/* whether memdump feature is enabled */
#undef ENABLE_MEMDUMP
/* Define to 1 if you have the <inttypes.h> header file. */
#undef HAVE_INTTYPES_H
/* Define to 1 if you have the `bfd' library (-lbfd). */
#undef HAVE_LIBBFD
/* Define to 1 if you have the <memory.h> header file. */
#undef HAVE_MEMORY_H
/* Define to 1 if you have the <stdint.h> header file. */
#undef HAVE_STDINT_H
/* Define to 1 if you have the <stdlib.h> header file. */
#undef HAVE_STDLIB_H
/* Define to 1 if you have the <strings.h> header file. */
#undef HAVE_STRINGS_H
/* Define to 1 if you have the <string.h> header file. */
#undef HAVE_STRING_H
/* Define to 1 if you have the <sys/stat.h> header file. */
#undef HAVE_SYS_STAT_H
/* Define to 1 if you have the <sys/types.h> header file. */
#undef HAVE_SYS_TYPES_H
/* Define to 1 if you have the <unistd.h> header file. */
#undef HAVE_UNISTD_H
/* Define to address of kernel symbol __vvar_page, or 0 if exported */
#undef MCCTRL_KSYM___vvar_page
/* Define to address of kernel symbol hpet_address, or 0 if exported */
#undef MCCTRL_KSYM_hpet_address
/* Define to address of kernel symbol hv_clock, or 0 if exported */
#undef MCCTRL_KSYM_hv_clock
/* Define to address of kernel symbol sys_mount, or 0 if exported */
#undef MCCTRL_KSYM_sys_mount
/* Define to address of kernel symbol sys_readlink, or 0 if exported */
#undef MCCTRL_KSYM_sys_readlink
/* Define to address of kernel symbol sys_unshare, or 0 if exported */
#undef MCCTRL_KSYM_sys_unshare
/* Define to address of kernel symbol vdso_end, or 0 if exported */
#undef MCCTRL_KSYM_vdso_end
/* Define to address of kernel symbol vdso_image_64, or 0 if exported */
#undef MCCTRL_KSYM_vdso_image_64
/* Define to address of kernel symbol vdso_pages, or 0 if exported */
#undef MCCTRL_KSYM_vdso_pages
/* Define to address of kernel symbol vdso_start, or 0 if exported */
#undef MCCTRL_KSYM_vdso_start
/* Define to address of kernel symbol zap_page_range, or 0 if exported */
#undef MCCTRL_KSYM_zap_page_range
/* Define to the address where bug reports for this package should be sent. */
#undef PACKAGE_BUGREPORT
/* Define to the full name of this package. */
#undef PACKAGE_NAME
/* Define to the full name and version of this package. */
#undef PACKAGE_STRING
/* Define to the one symbol short name of this package. */
#undef PACKAGE_TARNAME
/* Define to the home page for this package. */
#undef PACKAGE_URL
/* Define to the version of this package. */
#undef PACKAGE_VERSION
/* Define to 1 if you have the ANSI C header files. */
#undef STDC_HEADERS

View File

@ -48,6 +48,9 @@
#define MCEXEC_UP_OPEN_EXEC 0x30a02912
#define MCEXEC_UP_CLOSE_EXEC 0x30a02913
#define MCEXEC_UP_SYS_MOUNT 0x30a02914
#define MCEXEC_UP_SYS_UNSHARE 0x30a02915
#define MCEXEC_UP_DEBUG_LOG 0x40000000
#define MCEXEC_UP_TRANSFER_TO_REMOTE 0
@ -83,6 +86,9 @@ struct program_load_desc {
int stack_prot;
int pgid;
int cred[8];
int reloc;
char enable_vdso;
char padding[7];
unsigned long entry;
unsigned long user_start;
unsigned long user_end;
@ -104,6 +110,13 @@ struct program_load_desc {
};
struct syscall_request {
/* TID of requesting thread */
int rtid;
/*
* TID of target thread. Remote page fault response needs to designate the
* thread that must serve the request, 0 indicates any thread from the pool
*/
int ttid;
unsigned long valid;
unsigned long number;
unsigned long args[6];
@ -122,8 +135,17 @@ struct syscall_load_desc {
unsigned long size;
};
#define IHK_SCD_REQ_THREAD_SPINNING 0
#define IHK_SCD_REQ_THREAD_TO_BE_WOKEN 1
#define IHK_SCD_REQ_THREAD_DESCHEDULED 2
struct syscall_response {
/* TID of the thread that requested the service */
int ttid;
/* TID of the mcexec thread that is serving or has served the request */
int stid;
unsigned long status;
unsigned long req_thread_status;
long ret;
unsigned long fault_address;
unsigned long fault_reason;
@ -166,4 +188,16 @@ struct newprocess_desc {
int pid;
};
/*
 * Argument block describing one mount request.
 * NOTE(review): presumably the payload of MCEXEC_UP_SYS_MOUNT; the field
 * names match the parameters of mount(2) -- confirm against the handler.
 */
struct sys_mount_desc {
char *dev_name;
char *dir_name;
char *type;
unsigned long flags;
void *data;
};
/*
 * Argument block holding a single flags word.
 * NOTE(review): presumably the payload of MCEXEC_UP_SYS_UNSHARE, with
 * unshare_flags forwarded to unshare(2) -- confirm against the handler.
 */
struct sys_unshare_desc {
unsigned long unshare_flags;
};
#endif

View File

@ -1,26 +0,0 @@
# Kbuild makefile template for the mcctrl kernel module.
# The @...@ tokens are placeholders to be substituted at configure time.
KDIR ?= @KDIR@
ARCH ?= @ARCH@
src = @abs_srcdir@
KMODDIR=@KMODDIR@
BINDIR=@BINDIR@
# IHK sources are expected three directory levels above this one
IHK_BASE=$(src)/../../../ihk
obj-m += mcctrl.o
# MCEXEC_PATH hands the installed location of mcexec to the module sources
ccflags-y := -I$(IHK_BASE)/linux/include -I$(IHK_BASE)/ikc/include -I$(IHK_BASE)/include -I$(src)/../include -mcmodel=kernel -mno-red-zone -DMCEXEC_PATH=\"$(BINDIR)/mcexec\"
mcctrl-y := driver.o control.o ikc.o syscall.o procfs.o binfmt_mcexec.o
# Symbol versions of the IHK core module, needed to resolve its exported symbols
KBUILD_EXTRA_SYMBOLS = @abs_builddir@/../../../ihk/linux/core/Module.symvers
.PHONY: clean install modules
# Build the module through the kernel build system found in KDIR
modules:
$(MAKE) -C $(KDIR) M=$(PWD) SUBDIRS=$(PWD) ARCH=$(ARCH) modules
# Remove build products
clean:
$(RM) .*.cmd *.mod.c *.o *.ko* Module.symvers modules.order -r .tmp*
# Copy mcctrl.ko into KMODDIR, creating the directory if needed
install:
mkdir -p -m 755 $(KMODDIR)
install -m 644 mcctrl.ko $(KMODDIR)

View File

@ -1,191 +0,0 @@
/**
* \file mcctrl.h
* License details are found in the file LICENSE.
* \brief
* define data structure
* \author Taku Shimosawa <shimosawa@is.s.u-tokyo.ac.jp> \par
* Copyright (C) 2011 - 2012 Taku Shimosawa
* \author Balazs Gerofi <bgerofi@riken.jp> \par
* Copyright (C) 2012 RIKEN AICS
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com> \par
* Copyright (C) 2012 - 2013 Hitachi, Ltd.
* \author Tomoki Shirasawa <tomoki.shirasawa.kk@hitachi-solutions.com> \par
* Copyright (C) 2012 - 2013 Hitachi, Ltd.
* \author Balazs Gerofi <bgerofi@is.s.u-tokyo.ac.jp> \par
* Copyright (C) 2013 The University of Tokyo
*/
/*
* HISTORY:
* 2013/11/07 hamada added <sys/resource.h> which is required by getrlimit(2)
* 2013/10/21 nakamura exclude interpreter's segment from data region
* 2013/10/11 nakamura mcexec: add a upper limit of the stack size
* 2013/10/11 nakamura mcexec: add a path prefix for interpreter search
* 2013/10/11 nakamura mcexec: add a interpreter invocation
* 2013/10/08 nakamura add a AT_ENTRY entry to the auxiliary vector
* 2013/09/02 shirasawa add terminate thread
* 2013/08/19 shirasawa mcexec forward signal to MIC process
* 2013/08/07 nakamura add page fault forwarding
* 2013/07/26 shirasawa mcexec print signum or exit status
* 2013/07/17 nakamura create more mcexec thread so that all cpu to be serviced
* 2013/04/17 nakamura add generic system call forwarding
*/
#ifndef HEADER_MCCTRL_H
#define HEADER_MCCTRL_H
#include <ihk/ihk_host_driver.h>
#include <uprotocol.h>
#include <linux/wait.h>
#include <ihk/ikc.h>
#include <ikc/master.h>
#define SCD_MSG_PREPARE_PROCESS 0x1
#define SCD_MSG_PREPARE_PROCESS_ACKED 0x2
#define SCD_MSG_PREPARE_PROCESS_NACKED 0x7
#define SCD_MSG_SCHEDULE_PROCESS 0x3
#define SCD_MSG_INIT_CHANNEL 0x5
#define SCD_MSG_INIT_CHANNEL_ACKED 0x6
#define SCD_MSG_SYSCALL_ONESIDE 0x4
#define SCD_MSG_SEND_SIGNAL 0x8
#define SCD_MSG_CLEANUP_PROCESS 0x9
#define SCD_MSG_PROCFS_CREATE 0x10
#define SCD_MSG_PROCFS_DELETE 0x11
#define SCD_MSG_PROCFS_REQUEST 0x12
#define SCD_MSG_PROCFS_ANSWER 0x13
#define SCD_MSG_DEBUG_LOG 0x20
#define DMA_PIN_SHIFT 21
#define DO_USER_MODE
#define __NR_coredump 999
/*
 * A length/address pair.
 * NOTE(review): presumably one entry of the table handed over with the
 * __NR_coredump pseudo system call defined above -- confirm against its users.
 */
struct coretable {
int len;
unsigned long addr;
};
/*
 * Packet type taken by mcctrl_ikc_send(). The msg, ref and arg members have
 * the same names as the parameters of mcctrl_ikc_send_msg().
 * NOTE(review): presumably msg carries one of the SCD_MSG_* codes -- confirm.
 */
struct ikc_scd_packet {
int msg;
int ref;
int osnum;
int pid;
int err;
unsigned long arg;
};
/*
 * Pairs an IHK OS handle with a program load descriptor.
 * NOTE(review): presumably per-open-file private data of the mcctrl
 * device -- confirm against the driver code.
 */
struct mcctrl_priv {
ihk_os_t os;
struct program_load_desc *desc;
};
/*
 * Four page values: request, response, doorbell and post.
 * NOTE(review): presumably the addresses exchanged during
 * SCD_MSG_INIT_CHANNEL -- confirm against the channel setup code.
 */
struct ikc_scd_init_param {
unsigned long request_page;
unsigned long response_page;
unsigned long doorbell_page;
unsigned long post_page;
};
/*
 * Block of eight unsigned long words; referenced through
 * syscall_params.post_va. The meaning of the words is not visible here.
 */
struct syscall_post {
unsigned long v[8];
};
/*
 * Locations of the request, response, post and doorbell areas of a channel.
 * Each area is stored as an address value (*_pa) together with a typed
 * pointer (*_va); the response area has an additional response_rpa value.
 * NOTE(review): presumably pa = physical address, rpa = physical address as
 * seen by the remote side, va = kernel virtual mapping -- confirm.
 */
struct syscall_params {
unsigned long request_pa;
struct syscall_request *request_va;
unsigned long response_rpa, response_pa;
struct syscall_response *response_va;
unsigned long post_pa;
struct syscall_post *post_va;
unsigned long doorbell_pa;
unsigned long *doorbell_va;
};
/*
 * List node that carries a wait queue head together with a pid and a req
 * value.
 * NOTE(review): presumably linked on mcctrl_channel.wq_list, one node per
 * waiting process -- confirm against syscall.c.
 */
struct wait_queue_head_list_node {
struct list_head list;
wait_queue_head_t wq_syscall;
int pid;
int req;
};
/*
 * State kept per channel: the IKC channel descriptor, the syscall area
 * locations (param), the init parameters, a DMA buffer pointer and a list
 * head with its spinlock.
 * NOTE(review): presumably wq_list_lock protects wq_list -- confirm.
 */
struct mcctrl_channel {
struct ihk_ikc_channel_desc *c;
struct syscall_params param;
struct ikc_scd_init_param init;
void *dma_buf;
struct list_head wq_list;
ihk_spinlock_t wq_list_lock;
};
/*
 * Per-process record, identified by pid.
 * NOTE(review): presumably linked on mcctrl_usrdata.per_proc_list -- confirm.
 */
struct mcctrl_per_proc_data {
struct list_head list;
int pid;
unsigned long rpgtable; /* per process, not per OS */
};
/*
 * Top-level mcctrl state: two IKC listen parameter sets, the OS handle, the
 * channel array with its element count, the doorbell location, job
 * bookkeeping fields, a wait queue and the per-process list with its lock.
 * This is the type passed to reserve_user_space() and to the peer channel
 * registry functions declared below.
 * NOTE(review): presumably one instance exists per OS instance and keys is
 * the storage of the peer channel registry -- confirm against the driver.
 */
struct mcctrl_usrdata {
struct ihk_ikc_listen_param listen_param;
struct ihk_ikc_listen_param listen_param2;
ihk_os_t os;
int num_channels;
struct mcctrl_channel *channels;
unsigned long *mcctrl_doorbell_va;
unsigned long mcctrl_doorbell_pa;
int remaining_job;
int base_cpu;
int job_pos;
int mcctrl_dma_abort;
unsigned long last_thread_exec;
wait_queue_head_t wq_prepare;
struct list_head per_proc_list;
ihk_spinlock_t per_proc_list_lock;
void **keys;
};
/*
 * Signal description: a signal number with target pid/tid, a cond field and
 * 128 bytes of additional info.
 * NOTE(review): presumably the payload that accompanies SCD_MSG_SEND_SIGNAL,
 * with info holding a siginfo image -- confirm against the sender.
 */
struct mcctrl_signal {
int cond;
int sig;
int pid;
int tid;
char info[128];
};
/*
 * IKC helpers. Implementations are not part of this header; the summaries
 * below are derived from names and signatures only -- confirm before relying
 * on them.
 */
/* Sends the packet *pisp to the given cpu of os. */
int mcctrl_ikc_send(ihk_os_t os, int cpu, struct ikc_scd_packet *pisp);
/* Same as above, with the packet given as separate msg, ref and arg values. */
int mcctrl_ikc_send_msg(ihk_os_t os, int cpu, int msg, int ref, unsigned long arg);
int mcctrl_ikc_is_valid_thread(ihk_os_t os, int cpu);
/* Reports a user address range through *startp and *endp. */
int reserve_user_space(struct mcctrl_usrdata *usrdata, unsigned long *startp,
unsigned long *endp);
/* syscall.c */
/* Registry mapping an opaque key to a mcctrl_channel: init, add, remove, look up. */
int init_peer_channel_registry(struct mcctrl_usrdata *ud);
int register_peer_channel(struct mcctrl_usrdata *ud, void *key, struct mcctrl_channel *ch);
int deregister_peer_channel(struct mcctrl_usrdata *ud, void *key, struct mcctrl_channel *ch);
struct mcctrl_channel *get_peer_channel(struct mcctrl_usrdata *ud, void *key);
int __do_in_kernel_syscall(ihk_os_t os, struct mcctrl_channel *c, struct syscall_request *sc);
#define PROCFS_NAME_MAX 1000
/*
 * Descriptor of one procfs read. Members marked (request) are filled in by
 * the requesting side, members marked (answer) by the side that serves it.
 * NOTE(review): presumably exchanged with SCD_MSG_PROCFS_REQUEST and
 * SCD_MSG_PROCFS_ANSWER -- confirm against procfs.c.
 */
struct procfs_read {
unsigned long pbuf; /* physical address of the host buffer (request) */
unsigned long offset; /* offset to read (request) */
int count; /* bytes to read (request) */
int eof; /* if eof is detected, 1 otherwise 0. (answer)*/
int ret; /* read bytes (answer) */
int status; /* non-zero if done (answer) */
int newcpu; /* migrated new cpu (answer) */
char fname[PROCFS_NAME_MAX]; /* procfs filename (request) */
};
/*
 * Descriptor of one procfs file, named by fname with the given mode.
 * NOTE(review): presumably exchanged with SCD_MSG_PROCFS_CREATE and
 * SCD_MSG_PROCFS_DELETE -- confirm against procfs.c.
 */
struct procfs_file {
int status; /* status of processing (answer) */
int mode; /* file mode (request) */
char fname[PROCFS_NAME_MAX]; /* procfs filename (request) */
};
#endif

View File

@ -0,0 +1,27 @@
# Kbuild makefile template for the mcctrl kernel module.
# NOTE(review): the @NAME@ placeholders are presumably substituted by
# configure when this template is instantiated -- confirm.
KDIR ?= @KDIR@
ARCH ?= @ARCH@
src = @abs_srcdir@
KMODDIR=@KMODDIR@
BINDIR=@BINDIR@
# Location of the IHK tree relative to this source directory.
IHK_BASE=$(src)/../../../../ihk
obj-m += mcctrl.o
# Include paths into IHK plus the path of mcexec, passed to the C code as the
# MCEXEC_PATH string macro.
ccflags-y := -I$(IHK_BASE)/linux/include -I$(IHK_BASE)/linux/include/ihk/arch/$(ARCH) -I$(IHK_BASE)/ikc/include -I$(IHK_BASE)/ikc/include/ikc/arch/$(ARCH) -I$(IHK_BASE)/include -I$(IHK_BASE)/include/arch/$(ARCH) -I$(src)/../../include -mcmodel=kernel -mno-red-zone -DMCEXEC_PATH=\"$(BINDIR)/mcexec\" -I@abs_builddir@
# Objects linked into mcctrl.ko; archdeps.o comes from the per-architecture
# directory.
mcctrl-y := driver.o control.o ikc.o syscall.o procfs.o binfmt_mcexec.o
mcctrl-y += sysfs.o sysfs_files.o arch/$(ARCH)/archdeps.o
# Symbol versions exported by the IHK core module.
KBUILD_EXTRA_SYMBOLS = @abs_builddir@/../../../../ihk/linux/core/Module.symvers
.PHONY: clean install modules
# Build the module out of tree against the kernel in $(KDIR).
modules:
$(MAKE) -C $(KDIR) M=$(PWD) SUBDIRS=$(PWD) ARCH=$(ARCH) modules
# Remove build products.
clean:
$(RM) .*.cmd *.mod.c *.o *.ko* Module.symvers modules.order -r .tmp*
# Copy mcctrl.ko into $(KMODDIR), creating the directory if needed.
install:
mkdir -p -m 755 $(KMODDIR)
install -m 644 mcctrl.ko $(KMODDIR)

View File

@ -0,0 +1 @@
# dummy file

View File

@ -0,0 +1,192 @@
#include <linux/version.h>
#include "../../config.h"
#include "../../mcctrl.h"
/*
 * Host kernel objects used by get_vdso_info().
 *
 * For each of the first five names a definition is emitted here only when
 * the corresponding MCCTRL_KSYM_<name> macro is defined AND non-zero; the
 * macro value is then used as the object's address.  When the macro is
 * undefined or zero nothing is defined here.
 * NOTE(review): the macros are presumably generated into config.h, and in
 * the "zero/undefined" case the names presumably resolve to the kernel's own
 * symbols -- confirm against the configure script.
 */
#ifdef MCCTRL_KSYM_vdso_image_64
#if MCCTRL_KSYM_vdso_image_64
struct vdso_image *vdso_image = (void *)MCCTRL_KSYM_vdso_image_64;
#endif
#endif
#ifdef MCCTRL_KSYM_vdso_start
#if MCCTRL_KSYM_vdso_start
void *vdso_start = (void *)MCCTRL_KSYM_vdso_start;
#endif
#endif
#ifdef MCCTRL_KSYM_vdso_end
#if MCCTRL_KSYM_vdso_end
void *vdso_end = (void *)MCCTRL_KSYM_vdso_end;
#endif
#endif
#ifdef MCCTRL_KSYM_vdso_pages
#if MCCTRL_KSYM_vdso_pages
struct page **vdso_pages = (void *)MCCTRL_KSYM_vdso_pages;
#endif
#endif
#ifdef MCCTRL_KSYM___vvar_page
#if MCCTRL_KSYM___vvar_page
void *__vvar_page = (void *)MCCTRL_KSYM___vvar_page;
#endif
#endif
/*
 * hpet_addressp is always defined; only its initialiser is selected by the
 * preprocessor:
 *   macro defined and non-zero -> the address given by the macro
 *   macro defined as zero      -> &hpet_address
 *   macro not defined          -> NULL
 */
long *hpet_addressp
#ifdef MCCTRL_KSYM_hpet_address
#if MCCTRL_KSYM_hpet_address
= (void *)MCCTRL_KSYM_hpet_address;
#else
= &hpet_address;
#endif
#else
= NULL;
#endif
/* hv_clockp follows the same three-way selection as hpet_addressp. */
void **hv_clockp
#ifdef MCCTRL_KSYM_hv_clock
#if MCCTRL_KSYM_hv_clock
= (void *)MCCTRL_KSYM_hv_clock;
#else
= &hv_clock;
#endif
#else
= NULL;
#endif
unsigned long
reserve_user_space_common(struct mcctrl_usrdata *usrdata, unsigned long start, unsigned long end);
int
reserve_user_space(struct mcctrl_usrdata *usrdata, unsigned long *startp, unsigned long *endp)
{
struct vm_area_struct *vma;
unsigned long start = 0L;
unsigned long end;
#define DESIRED_USER_END 0x800000000000
#define GAP_FOR_MCEXEC 0x008000000000UL
end = DESIRED_USER_END;
down_write(&current->mm->mmap_sem);
vma = find_vma(current->mm, 0);
if (vma) {
end = (vma->vm_start - GAP_FOR_MCEXEC) & ~(GAP_FOR_MCEXEC - 1);
}
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,5,0)
up_write(&current->mm->mmap_sem);
#endif
start = reserve_user_space_common(usrdata, start, end);
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,5,0)
up_write(&current->mm->mmap_sem);
#endif
if (IS_ERR_VALUE(start)) {
return start;
}
*startp = start;
*endp = end;
return 0;
}
/*
 * Fill in the struct vdso that lives at vdso_rpa with information about the
 * host kernel's vDSO pages, vvar page, HPET page and pvclock table, then
 * clear its busy flag.
 *
 * os:       OS instance whose device is used to map the structure.
 * vdso_rpa: address handed to ihk_device_map_memory(); the result is mapped
 *           with ihk_device_map_virtual() to obtain a usable pointer.
 *
 * Which fields are written, and with which values, depends on
 * LINUX_VERSION_CODE; fields not covered by a matching branch are left as
 * they were.  The results of the two mapping calls are not checked.
 */
void get_vdso_info(ihk_os_t os, long vdso_rpa)
{
ihk_device_t dev = ihk_os_to_dev(os);
long vdso_pa;
struct vdso *vdso;
size_t size;
int i;
vdso_pa = ihk_device_map_memory(dev, vdso_rpa, sizeof(*vdso));
vdso = ihk_device_map_virtual(dev, vdso_pa, sizeof(*vdso), NULL, 0);
/* VDSO pages */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,16,0)
/* Page count and physical addresses are derived from vdso_image. */
size = vdso_image->size;
vdso->vdso_npages = size >> PAGE_SHIFT;
/* More pages than vdso_physlist[] can hold: report none and skip the rest. */
if (vdso->vdso_npages > VDSO_MAXPAGES) {
vdso->vdso_npages = 0;
goto out;
}
for (i = 0; i < vdso->vdso_npages; ++i) {
vdso->vdso_physlist[i] = virt_to_phys(
vdso_image->data + (i * PAGE_SIZE));
}
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,23)
/* Size is vdso_end - vdso_start, rounded up to whole pages; the physical
 * addresses come from the vdso_pages[] array. */
size = vdso_end - vdso_start;
size = (size + PAGE_SIZE - 1) & PAGE_MASK;
vdso->vdso_npages = size >> PAGE_SHIFT;
if (vdso->vdso_npages > VDSO_MAXPAGES) {
vdso->vdso_npages = 0;
goto out;
}
for (i = 0; i < vdso->vdso_npages; ++i) {
vdso->vdso_physlist[i] = page_to_phys(vdso_pages[i]);
}
#endif
/* VVAR page */
/* NOTE(review): the small negative and npages-based vvar_virt values are
 * presumably offsets relative to the vDSO mapping rather than absolute
 * addresses -- confirm against the consumer of struct vdso. */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,5,0)
vdso->vvar_is_global = 0;
vdso->vvar_virt = (void *)(-3 * PAGE_SIZE);
vdso->vvar_phys = virt_to_phys(__vvar_page);
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,17,0)
vdso->vvar_is_global = 0;
vdso->vvar_virt = (void *)(-2 * PAGE_SIZE);
vdso->vvar_phys = virt_to_phys(__vvar_page);
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,16,0)
vdso->vvar_is_global = 0;
vdso->vvar_virt = (void *)(vdso->vdso_npages * PAGE_SIZE);
vdso->vvar_phys = virt_to_phys(__vvar_page);
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,1,0)
/* Here the virtual address is a fixmap address, flagged as global. */
vdso->vvar_is_global = 1;
vdso->vvar_virt = (void *)fix_to_virt(VVAR_PAGE);
vdso->vvar_phys = virt_to_phys(__vvar_page);
#endif
/* HPET page */
/* Only reported when hpet_addressp is set and the value it points to is
 * non-zero. */
if (hpet_addressp && *hpet_addressp) {
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,5,0)
vdso->hpet_is_global = 0;
vdso->hpet_virt = (void *)(-2 * PAGE_SIZE);
vdso->hpet_phys = *hpet_addressp;
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,17,0)
vdso->hpet_is_global = 0;
vdso->hpet_virt = (void *)(-1 * PAGE_SIZE);
vdso->hpet_phys = *hpet_addressp;
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,16,0)
vdso->hpet_is_global = 0;
vdso->hpet_virt = (void *)((vdso->vdso_npages + 1) * PAGE_SIZE);
vdso->hpet_phys = *hpet_addressp;
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,23)
vdso->hpet_is_global = 1;
vdso->hpet_virt = (void *)fix_to_virt(VSYSCALL_HPET);
vdso->hpet_phys = *hpet_addressp;
#endif
}
/* struct pvclock_vcpu_time_info table */
/* Only reported when hv_clockp is set and the pointer it refers to is
 * non-NULL. */
if (hv_clockp && *hv_clockp) {
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,5,0)
vdso->pvti_is_global = 0;
vdso->pvti_virt = (void *)(-1 * PAGE_SIZE);
vdso->pvti_phys = virt_to_phys(*hv_clockp);
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,8,0)
vdso->pvti_is_global = 1;
vdso->pvti_virt = (void *)fix_to_virt(PVCLOCK_FIXMAP_BEGIN);
vdso->pvti_phys = virt_to_phys(*hv_clockp);
#endif
}
out:
/* Write barrier so the stores above are ordered before busy is cleared.
 * NOTE(review): presumably the requester waits for busy == 0 -- confirm. */
wmb();
vdso->busy = 0;
ihk_device_unmap_virtual(dev, vdso, sizeof(*vdso));
ihk_device_unmap_memory(dev, vdso_pa, sizeof(*vdso));
return;
} /* get_vdso_info() */

View File

@ -45,7 +45,6 @@ static int load_elf(struct linux_binprm *bprm
#endif
)
{
char mcexec[BINPRM_BUF_SIZE];
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,36)
const
#endif
@ -60,12 +59,8 @@ static int load_elf(struct linux_binprm *bprm
int l;
} envdata;
envdata env[] = {
{.name = "MCEXEC"},
#define env_mcexec (env[0].val)
{.name = "MCEXEC_WL"},
#define env_mcexec_wl (env[1].val)
{.name = "MCEXEC_BL"},
#define env_mcexec_bl (env[2].val)
#define env_mcexec_wl (env[0].val)
{.name = NULL}
};
envdata *ep;
@ -80,7 +75,11 @@ static int load_elf(struct linux_binprm *bprm
char buf[32];
int l;
int pass;
char *pbuf;
const char *path;
if(bprm->envc == 0)
return -ENOEXEC;
if(memcmp(elf_ex->e_ident, ELFMAG, SELFMAG) != 0)
return -ENOEXEC;
if(elf_ex->e_type != ET_EXEC && elf_ex->e_type != ET_DYN)
@ -89,18 +88,29 @@ static int load_elf(struct linux_binprm *bprm
if(elf_ex->e_ident[EI_CLASS] != ELFCLASS64)
return -ENOEXEC;
cp = strrchr(bprm->interp, '/');
pbuf = kmalloc(1024, GFP_ATOMIC);
if (!pbuf) {
printk("%s: error: allocating pbuf\n", __FUNCTION__);
return -ENOMEM;
}
path = d_path(&bprm->file->f_path, pbuf, 1024);
if(!path || IS_ERR(path))
path = bprm->interp;
cp = strrchr(path, '/');
if(!cp ||
!strcmp(cp, "/mcexec") ||
!strcmp(cp, "/ihkosctl") ||
!strcmp(cp, "/ihkconfig"))
!strcmp(cp, "/ihkconfig")) {
kfree(pbuf);
return -ENOEXEC;
}
cnt[0] = bprm->argc;
cnt[1] = bprm->envc;
for(pass = 0; pass < 2; pass++){
p = bprm->p;
mode = cnt[0] == 0? (cnt[1] == 0? 2: 1): 0;
mode = cnt[0] == 0? 1: 0;
if(pass == 1){
for(ep = env; ep->name; ep++){
if(ep->l)
@ -112,11 +122,19 @@ static int load_elf(struct linux_binprm *bprm
for(i = 0, st = 0; mode != 2;){
if(st == 0){
off = p & ~PAGE_MASK;
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,6,0)
rc = get_user_pages_remote(current, bprm->mm,
bprm->p, 1, 0, 1,
&page, NULL);
#else
rc = get_user_pages(current, bprm->mm,
bprm->p, 1, 0, 1,
&page, NULL);
if(rc <= 0)
#endif
if(rc <= 0) {
kfree(pbuf);
return -EFAULT;
}
addr = kmap_atomic(page
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,4,0)
, KM_USER0
@ -182,68 +200,52 @@ static int load_elf(struct linux_binprm *bprm
}
}
if(!env_mcexec || !strcmp(env_mcexec, "0") || !strcmp(env_mcexec, "off"))
rc = 1;
else{
rc = 0;
if(strchr(env_mcexec, '/') && strlen(env_mcexec) < BINPRM_BUF_SIZE)
strcpy(mcexec, env_mcexec);
else
strcpy(mcexec, MCEXEC_PATH);
}
if(rc);
else if(env_mcexec_wl)
rc = !pathcheck(bprm->interp, env_mcexec_wl);
else if(env_mcexec_bl)
rc = pathcheck(bprm->interp, env_mcexec_bl);
if(env_mcexec_wl)
rc = !pathcheck(path, env_mcexec_wl);
else
rc = pathcheck(bprm->interp, "/usr:/bin:/sbin:/opt");
rc = 1;
for(ep = env; ep->name; ep++)
if(ep->val)
kfree(ep->val);
if(rc)
if(rc) {
kfree(pbuf);
return -ENOEXEC;
}
file = open_exec(mcexec);
if (IS_ERR(file))
file = open_exec(MCEXEC_PATH);
if (IS_ERR(file)) {
kfree(pbuf);
return -ENOEXEC;
}
rc = remove_arg_zero(bprm);
if (rc){
fput(file);
kfree(pbuf);
return rc;
}
rc = copy_strings_kernel(1, &bprm->interp, bprm);
if (rc < 0){
fput(file);
kfree(pbuf);
return rc;
}
bprm->argc++;
wp = mcexec;
wp = MCEXEC_PATH;
rc = copy_strings_kernel(1, &wp, bprm);
if (rc){
fput(file);
kfree(pbuf);
return rc;
}
bprm->argc++;
#if 1
rc = bprm_change_interp(mcexec, bprm);
rc = bprm_change_interp(MCEXEC_PATH, bprm);
if (rc < 0){
fput(file);
kfree(pbuf);
return rc;
}
#else
if(brpm->interp != bprm->filename)
kfree(brpm->interp);
kfree(brpm->filename);
bprm->filename = bprm->interp = kstrdup(mcexec, GFP_KERNEL);
if(!bprm->interp){
fput(file);
return -ENOMEM;
}
#endif
allow_write_access(bprm->file);
fput(bprm->file);
@ -251,8 +253,12 @@ static int load_elf(struct linux_binprm *bprm
rc = prepare_binprm(bprm);
if (rc < 0){
kfree(pbuf);
return rc;
}
kfree(pbuf);
return search_binary_handler(bprm
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,8,0)
, regs
@ -270,7 +276,7 @@ void __init binfmt_mcexec_init(void)
insert_binfmt(&mcexec_format);
}
void __exit binfmt_mcexec_exit(void)
void binfmt_mcexec_exit(void)
{
unregister_binfmt(&mcexec_format);
}

View File

@ -25,6 +25,7 @@
#include <linux/fs.h>
#include <linux/miscdevice.h>
#include <linux/slab.h>
#include <linux/device.h>
#include "mcctrl.h"
#define OS_MAX_MINOR 64
@ -67,6 +68,8 @@ static struct ihk_os_user_call_handler mcctrl_uchs[] = {
{ .request = MCEXEC_UP_CLOSE_EXEC, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_GET_CRED, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_GET_CREDV, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_SYS_MOUNT, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_SYS_UNSHARE, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_DEBUG_LOG, .func = mcctrl_ioctl },
};
@ -79,71 +82,118 @@ static struct ihk_os_user_call mcctrl_uc[OS_MAX_MINOR];
static ihk_os_t os[OS_MAX_MINOR];
static int __init mcctrl_init(void)
ihk_os_t osnum_to_os(int n)
{
return os[n];
}
/* OS event notifier implementation */
int mcctrl_os_boot_notifier(int os_index)
{
int i;
int rc;
rc = -ENOENT;
for(i = 0; i < OS_MAX_MINOR; i++){
os[i] = ihk_host_find_os(i, NULL);
if (os[i]) {
printk("OS #%d found.\n", i);
rc = 0;
}
}
if(rc){
printk("OS not found.\n");
return rc;
os[os_index] = ihk_host_find_os(os_index, NULL);
if (!os[os_index]) {
printk("mcctrl: error: OS ID %d couldn't be found\n", os_index);
return -EINVAL;
}
for(i = 0; i < OS_MAX_MINOR; i++){
if (os[i]) {
if (prepare_ikc_channels(os[i]) != 0) {
printk("Preparing syscall channels failed.\n");
os[i] = NULL;
}
}
if (prepare_ikc_channels(os[os_index]) != 0) {
printk("mcctrl: error: preparing IKC channels for OS %d\n", os_index);
os[os_index] = NULL;
return -EFAULT;
}
memcpy(mcctrl_uc + os_index, &mcctrl_uc_proto, sizeof mcctrl_uc_proto);
rc = ihk_os_register_user_call_handlers(os[os_index], mcctrl_uc + os_index);
if (rc < 0) {
destroy_ikc_channels(os[os_index]);
printk("mcctrl: error: registering callbacks for OS %d\n", os_index);
goto error_cleanup_channels;
}
procfs_init(os_index);
printk("mcctrl: OS ID %d boot event handled\n", os_index);
return 0;
error_cleanup_channels:
destroy_ikc_channels(os[os_index]);
os[os_index] = NULL;
return rc;
}
int mcctrl_os_shutdown_notifier(int os_index)
{
if (os[os_index]) {
sysfsm_cleanup(os[os_index]);
free_topology_info(os[os_index]);
ihk_os_unregister_user_call_handlers(os[os_index], mcctrl_uc + os_index);
destroy_ikc_channels(os[os_index]);
procfs_exit(os_index);
}
os[os_index] = NULL;
printk("mcctrl: OS ID %d shutdown event handled\n", os_index);
return 0;
}
static struct ihk_os_notifier_ops mcctrl_os_notifier_ops = {
.boot = mcctrl_os_boot_notifier,
.shutdown = mcctrl_os_shutdown_notifier,
};
static struct ihk_os_notifier mcctrl_os_notifier = {
.ops = &mcctrl_os_notifier_ops,
};
static int __init mcctrl_init(void)
{
int ret = 0;
int i;
#ifndef DO_USER_MODE
mcctrl_syscall_init();
#endif
rus_page_hash_init();
for(i = 0; i < OS_MAX_MINOR; i++){
if (os[i]) {
memcpy(mcctrl_uc + i, &mcctrl_uc_proto, sizeof mcctrl_uc_proto);
rc = ihk_os_register_user_call_handlers(os[i], mcctrl_uc + i);
if(rc < 0){
destroy_ikc_channels(os[i]);
os[i] = NULL;
}
procfs_init(i);
}
for (i = 0; i < OS_MAX_MINOR; ++i) {
os[i] = NULL;
}
rus_page_hash_init();
binfmt_mcexec_init();
return 0;
if ((ret = ihk_host_register_os_notifier(&mcctrl_os_notifier)) != 0) {
printk("mcctrl: error: registering OS notifier\n");
goto error;
}
printk("mcctrl: initialized successfully.\n");
return ret;
error:
binfmt_mcexec_exit();
rus_page_hash_put_pages();
return ret;
}
static void __exit mcctrl_exit(void)
{
int i;
binfmt_mcexec_exit();
printk("mcctrl: unregistered.\n");
for(i = 0; i < OS_MAX_MINOR; i++){
if(os[i]){
ihk_os_unregister_user_call_handlers(os[i], mcctrl_uc + i);
destroy_ikc_channels(os[i]);
procfs_exit(i);
}
if (ihk_host_deregister_os_notifier(&mcctrl_os_notifier) != 0) {
printk("mcctrl: warning: failed to deregister OS notifier??\n");
}
binfmt_mcexec_exit();
rus_page_hash_put_pages();
printk("mcctrl: unregistered.\n");
}
MODULE_LICENSE("GPL v2");

View File

@ -27,6 +27,7 @@
#include <linux/miscdevice.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/interrupt.h>
#include "mcctrl.h"
#ifdef ATTACHED_MIC
#include <sysdeps/mic/mic/micconst.h>
@ -34,25 +35,34 @@
#define REQUEST_SHIFT 16
//#define DEBUG_IKC
#ifdef DEBUG_IKC
#define dkprintf(...) kprintf(__VA_ARGS__)
#define ekprintf(...) kprintf(__VA_ARGS__)
#else
#define dkprintf(...) do { if (0) printk(__VA_ARGS__); } while (0)
#define ekprintf(...) printk(__VA_ARGS__)
#endif
//int num_channels;
//struct mcctrl_channel *channels;
void mcexec_prepare_ack(ihk_os_t os, unsigned long arg, int err);
static void mcctrl_ikc_init(ihk_os_t os, int cpu, unsigned long rphys, struct ihk_ikc_channel_desc *c);
int mcexec_syscall(struct mcctrl_channel *c, int pid, unsigned long arg);
void procfs_create(void *__os, int ref, int osnum, int pid, unsigned long arg);
void procfs_delete(void *__os, int osnum, unsigned long arg);
void procfs_answer(unsigned long arg, int err);
int mcexec_syscall(struct mcctrl_usrdata *ud, struct ikc_scd_packet *packet);
void sig_done(unsigned long arg, int err);
/* XXX: this runs in atomic context! */
static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
void *__packet, void *__os)
{
struct ikc_scd_packet *pisp = __packet;
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(__os);
int msg = pisp->msg;
switch (pisp->msg) {
switch (msg) {
case SCD_MSG_INIT_CHANNEL:
mcctrl_ikc_init(__os, pisp->ref, pisp->arg, c);
break;
@ -66,15 +76,7 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
break;
case SCD_MSG_SYSCALL_ONESIDE:
mcexec_syscall(usrdata->channels + pisp->ref, pisp->pid, pisp->arg);
break;
case SCD_MSG_PROCFS_CREATE:
procfs_create(__os, pisp->ref, pisp->osnum, pisp->pid, pisp->arg);
break;
case SCD_MSG_PROCFS_DELETE:
procfs_delete(__os, pisp->osnum, pisp->arg);
mcexec_syscall(usrdata, pisp);
break;
case SCD_MSG_PROCFS_ANSWER:
@ -84,6 +86,43 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
case SCD_MSG_SEND_SIGNAL:
sig_done(pisp->arg, pisp->err);
break;
case SCD_MSG_SYSFS_REQ_CREATE:
case SCD_MSG_SYSFS_REQ_MKDIR:
case SCD_MSG_SYSFS_REQ_SYMLINK:
case SCD_MSG_SYSFS_REQ_LOOKUP:
case SCD_MSG_SYSFS_REQ_UNLINK:
case SCD_MSG_SYSFS_REQ_SETUP:
case SCD_MSG_SYSFS_RESP_SHOW:
case SCD_MSG_SYSFS_RESP_STORE:
case SCD_MSG_SYSFS_RESP_RELEASE:
sysfsm_packet_handler(__os, pisp->msg, pisp->err,
pisp->sysfs_arg1, pisp->sysfs_arg2);
break;
case SCD_MSG_PROCFS_TID_CREATE:
case SCD_MSG_PROCFS_TID_DELETE:
procfsm_packet_handler(__os, pisp->msg, pisp->pid, pisp->arg);
break;
case SCD_MSG_GET_VDSO_INFO:
get_vdso_info(__os, pisp->arg);
break;
default:
printk(KERN_ERR "mcctrl:syscall_packet_handler:"
"unknown message (%d.%d.%d.%d.%d.%#lx)\n",
pisp->msg, pisp->ref, pisp->osnum, pisp->pid,
pisp->err, pisp->arg);
break;
}
/*
* SCD_MSG_SYSCALL_ONESIDE holds on to the packet and frees it in
* mcexec_ret_syscall(); for all other messages, free it here.
*/
if (msg != SCD_MSG_SYSCALL_ONESIDE) {
ihk_ikc_release_packet((struct ihk_ikc_free_packet *)__packet, c);
}
return 0;
}
@ -121,8 +160,6 @@ int mcctrl_ikc_set_recv_cpu(ihk_os_t os, int cpu)
ihk_ikc_channel_set_cpu(usrdata->channels[cpu].c,
ihk_ikc_get_processor_id());
kprintf("Setting the target to %d\n",
ihk_ikc_get_processor_id());
return 0;
}
@ -137,91 +174,26 @@ int mcctrl_ikc_is_valid_thread(ihk_os_t os, int cpu)
}
}
//unsigned long *mcctrl_doorbell_va;
//unsigned long mcctrl_doorbell_pa;
static void mcctrl_ikc_init(ihk_os_t os, int cpu, unsigned long rphys, struct ihk_ikc_channel_desc *c)
{
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
struct ikc_scd_packet packet;
struct mcctrl_channel *pmc = usrdata->channels + cpu;
unsigned long phys;
struct ikc_scd_init_param *rpm;
if(c->port == 502)
if (c->port == 502) {
pmc = usrdata->channels + usrdata->num_channels - 1;
if (!pmc) {
return;
}
printk("IKC init: cpu=%d port=%d\n", cpu, c->port);
phys = ihk_device_map_memory(ihk_os_to_dev(os), rphys,
sizeof(struct ikc_scd_init_param));
#ifdef CONFIG_MIC
rpm = ioremap_wc(phys, sizeof(struct ikc_scd_init_param));
#else
rpm = ihk_device_map_virtual(ihk_os_to_dev(os), phys,
sizeof(struct ikc_scd_init_param),
NULL, 0);
#endif
pmc->param.request_va =
(void *)__get_free_pages(GFP_KERNEL,
REQUEST_SHIFT - PAGE_SHIFT);
pmc->param.request_pa = virt_to_phys(pmc->param.request_va);
pmc->param.doorbell_va = usrdata->mcctrl_doorbell_va;
pmc->param.doorbell_pa = usrdata->mcctrl_doorbell_pa;
pmc->param.post_va = (void *)__get_free_page(GFP_KERNEL);
pmc->param.post_pa = virt_to_phys(pmc->param.post_va);
memset(pmc->param.doorbell_va, 0, PAGE_SIZE);
memset(pmc->param.request_va, 0, PAGE_SIZE);
memset(pmc->param.post_va, 0, PAGE_SIZE);
pmc->param.response_rpa = rpm->response_page;
pmc->param.response_pa
= ihk_device_map_memory(ihk_os_to_dev(os),
pmc->param.response_rpa,
PAGE_SIZE);
#ifdef CONFIG_MIC
pmc->param.response_va = ioremap_cache(pmc->param.response_pa,
PAGE_SIZE);
#else
pmc->param.response_va = ihk_device_map_virtual(ihk_os_to_dev(os),
pmc->param.response_pa,
PAGE_SIZE, NULL, 0);
#endif
pmc->dma_buf = (void *)__get_free_pages(GFP_KERNEL,
DMA_PIN_SHIFT - PAGE_SHIFT);
rpm->request_page = pmc->param.request_pa;
rpm->doorbell_page = pmc->param.doorbell_pa;
rpm->post_page = pmc->param.post_pa;
if (!pmc) {
kprintf("%s: error: no channel found?\n", __FUNCTION__);
return;
}
packet.msg = SCD_MSG_INIT_CHANNEL_ACKED;
packet.ref = cpu;
packet.arg = rphys;
printk("Request: %lx, Response: %lx, Doorbell: %lx\n",
pmc->param.request_pa, pmc->param.response_rpa,
pmc->param.doorbell_pa);
printk("Request: %p, Response: %p, Doorbell: %p\n",
pmc->param.request_va, pmc->param.response_va,
pmc->param.doorbell_va);
ihk_ikc_send(pmc->c, &packet, 0);
#ifdef CONFIG_MIC
iounmap(rpm);
#else
ihk_device_unmap_virtual(ihk_os_to_dev(os), rpm,
sizeof(struct ikc_scd_init_param));
#endif
ihk_device_unmap_memory(ihk_os_to_dev(os), phys,
sizeof(struct ikc_scd_init_param));
}
static int connect_handler(struct ihk_ikc_channel_info *param)
@ -240,11 +212,8 @@ static int connect_handler(struct ihk_ikc_channel_info *param)
}
param->packet_handler = syscall_packet_handler;
INIT_LIST_HEAD(&usrdata->channels[cpu].wq_list);
spin_lock_init(&usrdata->channels[cpu].wq_list_lock);
usrdata->channels[cpu].c = c;
kprintf("syscall: MC CPU %d connected. c=%p\n", cpu, c);
dkprintf("syscall: MC CPU %d connected. c=%p\n", cpu, c);
return 0;
}
@ -261,11 +230,8 @@ static int connect_handler2(struct ihk_ikc_channel_info *param)
param->packet_handler = syscall_packet_handler;
INIT_LIST_HEAD(&usrdata->channels[cpu].wq_list);
spin_lock_init(&usrdata->channels[cpu].wq_list_lock);
usrdata->channels[cpu].c = c;
kprintf("syscall: MC CPU %d connected. c=%p\n", cpu, c);
dkprintf("syscall: MC CPU %d connected. c=%p\n", cpu, c);
return 0;
}
@ -288,27 +254,29 @@ static struct ihk_ikc_listen_param listen_param2 = {
int prepare_ikc_channels(ihk_os_t os)
{
struct ihk_cpu_info *info;
struct mcctrl_usrdata *usrdata;
int error;
struct mcctrl_usrdata *usrdata;
int i;
usrdata = kzalloc(sizeof(struct mcctrl_usrdata), GFP_KERNEL);
usrdata->mcctrl_doorbell_va = (void *)__get_free_page(GFP_KERNEL);
usrdata->mcctrl_doorbell_pa = virt_to_phys(usrdata->mcctrl_doorbell_va);
info = ihk_os_get_cpu_info(os);
if (!info) {
printk("Error: cannot retrieve CPU info.\n");
usrdata->cpu_info = ihk_os_get_cpu_info(os);
usrdata->mem_info = ihk_os_get_memory_info(os);
if (!usrdata->cpu_info || !usrdata->mem_info) {
printk("Error: cannot obtain OS CPU and memory information.\n");
return -EINVAL;
}
if (info->n_cpus < 1) {
if (usrdata->cpu_info->n_cpus < 1) {
printk("Error: # of cpu is invalid.\n");
return -EINVAL;
}
usrdata->num_channels = info->n_cpus + 1;
usrdata->channels = kzalloc(sizeof(struct mcctrl_channel) * usrdata->num_channels,
GFP_KERNEL);
usrdata->num_channels = usrdata->cpu_info->n_cpus + 1;
usrdata->channels = kzalloc(sizeof(struct mcctrl_channel) *
usrdata->num_channels,
GFP_KERNEL);
if (!usrdata->channels) {
printk("Error: cannot allocate channels.\n");
return -ENOMEM;
@ -322,33 +290,20 @@ int prepare_ikc_channels(ihk_os_t os)
memcpy(&usrdata->listen_param2, &listen_param2, sizeof listen_param2);
ihk_ikc_listen_port(os, &usrdata->listen_param2);
INIT_LIST_HEAD(&usrdata->per_proc_list);
spin_lock_init(&usrdata->per_proc_list_lock);
error = init_peer_channel_registry(usrdata);
if (error) {
return error;
for (i = 0; i < MCCTRL_PER_PROC_DATA_HASH_SIZE; ++i) {
INIT_LIST_HEAD(&usrdata->per_proc_data_hash[i]);
rwlock_init(&usrdata->per_proc_data_hash_lock[i]);
}
INIT_LIST_HEAD(&usrdata->cpu_topology_list);
INIT_LIST_HEAD(&usrdata->node_topology_list);
return 0;
}
void __destroy_ikc_channel(ihk_os_t os, struct mcctrl_channel *pmc)
{
free_pages((unsigned long)pmc->param.request_va,
REQUEST_SHIFT - PAGE_SHIFT);
free_page((unsigned long)pmc->param.post_va);
#ifdef CONFIG_MIC
iounmap(pmc->param.response_va);
#else
ihk_device_unmap_virtual(ihk_os_to_dev(os), pmc->param.response_va,
PAGE_SIZE);
#endif
ihk_device_unmap_memory(ihk_os_to_dev(os),
pmc->param.response_pa, PAGE_SIZE);
free_pages((unsigned long)pmc->dma_buf,
DMA_PIN_SHIFT - PAGE_SHIFT);
return;
}
void destroy_ikc_channels(ihk_os_t os)
@ -356,6 +311,11 @@ void destroy_ikc_channels(ihk_os_t os)
int i;
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
if (!usrdata) {
printk("%s: WARNING: no mcctrl_usrdata found\n", __FUNCTION__);
return;
}
ihk_host_os_set_usrdata(os, NULL);
for (i = 0; i < usrdata->num_channels; i++) {
@ -366,7 +326,6 @@ void destroy_ikc_channels(ihk_os_t os)
printk("Channel #%d freed.\n", i);
}
}
free_page((unsigned long)usrdata->mcctrl_doorbell_va);
kfree(usrdata->channels);
kfree(usrdata);

View File

@ -0,0 +1,388 @@
/**
* \file mcctrl.h
* License details are found in the file LICENSE.
* \brief
* define data structure
* \author Taku Shimosawa <shimosawa@is.s.u-tokyo.ac.jp> \par
* Copyright (C) 2011 - 2012 Taku Shimosawa
* \author Balazs Gerofi <bgerofi@riken.jp> \par
* Copyright (C) 2012 RIKEN AICS
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com> \par
* Copyright (C) 2012 - 2013 Hitachi, Ltd.
* \author Tomoki Shirasawa <tomoki.shirasawa.kk@hitachi-solutions.com> \par
* Copyright (C) 2012 - 2013 Hitachi, Ltd.
* \author Balazs Gerofi <bgerofi@is.s.u-tokyo.ac.jp> \par
* Copyright (C) 2013 The University of Tokyo
*/
/*
* HISTORY:
* 2013/11/07 hamada added <sys/resource.h> which is required by getrlimit(2)
* 2013/10/21 nakamura exclude interpreter's segment from data region
* 2013/10/11 nakamura mcexec: add a upper limit of the stack size
* 2013/10/11 nakamura mcexec: add a path prefix for interpreter search
* 2013/10/11 nakamura mcexec: add a interpreter invocation
* 2013/10/08 nakamura add a AT_ENTRY entry to the auxiliary vector
* 2013/09/02 shirasawa add terminate thread
* 2013/08/19 shirasawa mcexec forward signal to MIC process
* 2013/08/07 nakamura add page fault forwarding
* 2013/07/26 shirasawa mcexec print signum or exit status
* 2013/07/17 nakamura create more mcexec thread so that all cpu to be serviced
* 2013/04/17 nakamura add generic system call forwarding
*/
#ifndef HEADER_MCCTRL_H
#define HEADER_MCCTRL_H
#include <linux/fs.h>
#include <ihk/ihk_host_driver.h>
#include <linux/resource.h>
#include <uprotocol.h>
#include <linux/wait.h>
#include <ihk/ikc.h>
#include <ikc/master.h>
#include <ihk/msr.h>
#include <linux/semaphore.h>
#include <linux/rwlock.h>
#include <linux/threads.h>
#include "sysfs.h"
#define SCD_MSG_PREPARE_PROCESS 0x1
#define SCD_MSG_PREPARE_PROCESS_ACKED 0x2
#define SCD_MSG_PREPARE_PROCESS_NACKED 0x7
#define SCD_MSG_SCHEDULE_PROCESS 0x3
#define SCD_MSG_WAKE_UP_SYSCALL_THREAD 0x14
#define SCD_MSG_INIT_CHANNEL 0x5
#define SCD_MSG_INIT_CHANNEL_ACKED 0x6
#define SCD_MSG_SYSCALL_ONESIDE 0x4
#define SCD_MSG_SEND_SIGNAL 0x8
#define SCD_MSG_CLEANUP_PROCESS 0x9
#define SCD_MSG_GET_VDSO_INFO 0xa
//#define SCD_MSG_GET_CPU_MAPPING 0xc
//#define SCD_MSG_REPLY_GET_CPU_MAPPING 0xd
#define SCD_MSG_PROCFS_CREATE 0x10
#define SCD_MSG_PROCFS_DELETE 0x11
#define SCD_MSG_PROCFS_REQUEST 0x12
#define SCD_MSG_PROCFS_ANSWER 0x13
#define SCD_MSG_DEBUG_LOG 0x20
#define SCD_MSG_SYSFS_REQ_CREATE 0x30
/* #define SCD_MSG_SYSFS_RESP_CREATE 0x31 */
#define SCD_MSG_SYSFS_REQ_MKDIR 0x32
/* #define SCD_MSG_SYSFS_RESP_MKDIR 0x33 */
#define SCD_MSG_SYSFS_REQ_SYMLINK 0x34
/* #define SCD_MSG_SYSFS_RESP_SYMLINK 0x35 */
#define SCD_MSG_SYSFS_REQ_LOOKUP 0x36
/* #define SCD_MSG_SYSFS_RESP_LOOKUP 0x37 */
#define SCD_MSG_SYSFS_REQ_UNLINK 0x38
/* #define SCD_MSG_SYSFS_RESP_UNLINK 0x39 */
#define SCD_MSG_SYSFS_REQ_SHOW 0x3a
#define SCD_MSG_SYSFS_RESP_SHOW 0x3b
#define SCD_MSG_SYSFS_REQ_STORE 0x3c
#define SCD_MSG_SYSFS_RESP_STORE 0x3d
#define SCD_MSG_SYSFS_REQ_RELEASE 0x3e
#define SCD_MSG_SYSFS_RESP_RELEASE 0x3f
#define SCD_MSG_SYSFS_REQ_SETUP 0x40
#define SCD_MSG_SYSFS_RESP_SETUP 0x41
/* #define SCD_MSG_SYSFS_REQ_CLEANUP 0x42 */
/* #define SCD_MSG_SYSFS_RESP_CLEANUP 0x43 */
#define SCD_MSG_PROCFS_TID_CREATE 0x44
#define SCD_MSG_PROCFS_TID_DELETE 0x45
#define DMA_PIN_SHIFT 21
#define DO_USER_MODE
#define __NR_coredump 999
struct coretable {
int len;
unsigned long addr;
};
/*
 * Packet exchanged over the IKC channels.  syscall_packet_handler()
 * dispatches on msg (one of the SCD_MSG_* codes); the anonymous union
 * members overlay each other, so which of them is meaningful depends on msg.
 */
struct ikc_scd_packet {
int msg;
int err;
union {
/* for traditional SCD_MSG_* */
struct {
int ref;
int osnum;
int pid;
unsigned long arg;
struct syscall_request req;
unsigned long resp_pa;
};
/* for SCD_MSG_SYSFS_* */
/* sysfs_arg1 and sysfs_arg2 are what the packet handler forwards to
 * sysfsm_packet_handler(). */
struct {
long sysfs_arg1;
long sysfs_arg2;
long sysfs_arg3;
};
/* SCD_MSG_SCHEDULE_THREAD */
/* NOTE(review): this header defines SCD_MSG_SCHEDULE_PROCESS but no
 * SCD_MSG_SCHEDULE_THREAD -- confirm which message uses ttid. */
struct {
int ttid;
};
};
char padding[12];
};
struct mcctrl_priv {
ihk_os_t os;
struct program_load_desc *desc;
};
struct ikc_scd_init_param {
unsigned long request_page;
unsigned long response_page;
unsigned long doorbell_page;
unsigned long post_page;
};
struct syscall_post {
unsigned long v[8];
};
struct syscall_params {
unsigned long request_pa;
struct syscall_request *request_va;
unsigned long response_rpa, response_pa;
struct syscall_response *response_va;
unsigned long post_pa;
struct syscall_post *post_va;
unsigned long doorbell_pa;
unsigned long *doorbell_va;
};
struct wait_queue_head_list_node {
struct list_head list;
wait_queue_head_t wq_syscall;
struct task_struct *task;
/* Denotes an exclusive wait for requester TID rtid */
int rtid;
int req;
struct ikc_scd_packet *packet;
};
struct mcctrl_channel {
struct ihk_ikc_channel_desc *c;
struct ikc_scd_init_param init;
void *dma_buf;
};
struct mcctrl_per_thread_data {
struct list_head hash;
struct task_struct *task;
void *data;
};
#define MCCTRL_PER_THREAD_DATA_HASH_SHIFT 8
#define MCCTRL_PER_THREAD_DATA_HASH_SIZE (1 << MCCTRL_PER_THREAD_DATA_HASH_SHIFT)
#define MCCTRL_PER_THREAD_DATA_HASH_MASK (MCCTRL_PER_THREAD_DATA_HASH_SIZE - 1)
struct mcctrl_per_proc_data {
struct list_head hash;
int pid;
unsigned long rpgtable; /* per process, not per OS */
struct list_head wq_list;
struct list_head wq_req_list;
struct list_head wq_list_exact;
ihk_spinlock_t wq_list_lock;
struct list_head per_thread_data_hash[MCCTRL_PER_THREAD_DATA_HASH_SIZE];
rwlock_t per_thread_data_hash_lock[MCCTRL_PER_THREAD_DATA_HASH_SIZE];
};
struct sysfsm_req {
int busy;
int padding;
long lresult;
wait_queue_head_t wq;
};
struct sysfsm_data {
size_t sysfs_bufsize;
void *sysfs_buf;
long sysfs_buf_rpa;
long sysfs_buf_pa;
struct kobject *sysfs_kobj;
struct sysfsm_node *sysfs_root;
struct semaphore sysfs_tree_sem;
struct semaphore sysfs_io_sem;
struct sysfsm_req sysfs_req;
ihk_os_t sysfs_os;
};
static inline int sysfs_inited(struct sysfsm_data *sdp)
{
return !!(sdp->sysfs_buf);
} /* sysfs_inited() */
struct cache_topology {
struct ihk_cache_topology *saved;
cpumask_t shared_cpu_map;
struct list_head chain;
};
struct cpu_topology {
//struct mcctrl_usrdata *udp;
struct ihk_cpu_topology *saved;
int mckernel_cpu_id;
cpumask_t core_siblings;
cpumask_t thread_siblings;
struct list_head chain;
struct list_head cache_list;
};
#define NODE_DISTANCE_S_SIZE 1024
struct node_topology {
struct ihk_node_topology *saved;
int mckernel_numa_id;
char mckernel_numa_distance_s[NODE_DISTANCE_S_SIZE];
cpumask_t cpumap;
struct list_head chain;
};
#define CPU_LONGS (((NR_CPUS) + (BITS_PER_LONG) - 1) / (BITS_PER_LONG))
#define MCCTRL_PER_PROC_DATA_HASH_SHIFT 7
#define MCCTRL_PER_PROC_DATA_HASH_SIZE (1 << MCCTRL_PER_PROC_DATA_HASH_SHIFT)
#define MCCTRL_PER_PROC_DATA_HASH_MASK (MCCTRL_PER_PROC_DATA_HASH_SIZE - 1)
/*
 * Per-OS-instance state of mcctrl.  prepare_ikc_channels() allocates it with
 * kzalloc() and the IKC packet handler retrieves it with
 * ihk_host_os_get_usrdata().
 */
struct mcctrl_usrdata {
/* Private copies of the two listen parameter templates; prepare_ikc_channels()
 * memcpy()s them in before calling ihk_ikc_listen_port(). */
struct ihk_ikc_listen_param listen_param;
struct ihk_ikc_listen_param listen_param2;
ihk_os_t os;
/* Number of entries in channels[]: cpu_info->n_cpus + 1. */
int num_channels;
/* kzalloc()ed array of num_channels channels. */
struct mcctrl_channel *channels;
int remaining_job;
int base_cpu;
int job_pos;
int mcctrl_dma_abort;
unsigned long last_thread_exec;
wait_queue_head_t wq_prepare;
/* Hash buckets of per-process data, one rwlock per bucket; every bucket
 * and lock is initialised in prepare_ikc_channels(). */
struct list_head per_proc_data_hash[MCCTRL_PER_PROC_DATA_HASH_SIZE];
rwlock_t per_proc_data_hash_lock[MCCTRL_PER_PROC_DATA_HASH_SIZE];
void **keys;
struct sysfsm_data sysfsm_data;
/* Bitmap sized to hold NR_CPUS bits (see CPU_LONGS). */
unsigned long cpu_online[CPU_LONGS];
/* Results of ihk_os_get_cpu_info() / ihk_os_get_memory_info(). */
struct ihk_cpu_info *cpu_info;
struct ihk_mem_info *mem_info;
/* List heads initialised in prepare_ikc_channels().
 * NOTE(review): presumably chains of struct cpu_topology and
 * struct node_topology -- confirm in sysfs_files.c. */
struct list_head cpu_topology_list;
struct list_head node_topology_list;
};
struct mcctrl_signal {
int cond;
int sig;
int pid;
int tid;
char info[128];
};
int mcctrl_ikc_send(ihk_os_t os, int cpu, struct ikc_scd_packet *pisp);
int mcctrl_ikc_send_msg(ihk_os_t os, int cpu, int msg, int ref, unsigned long arg);
int mcctrl_ikc_is_valid_thread(ihk_os_t os, int cpu);
ihk_os_t osnum_to_os(int n);
/* syscall.c */
int __do_in_kernel_syscall(ihk_os_t os, struct ikc_scd_packet *packet);

/* Per-process data bookkeeping, keyed by pid within one mcctrl_usrdata. */
int mcctrl_add_per_proc_data(struct mcctrl_usrdata *ud, int pid,
	struct mcctrl_per_proc_data *ppd);
int mcctrl_delete_per_proc_data(struct mcctrl_usrdata *ud, int pid);
/*
 * Deliberately declared without `inline`: this header carries no body for
 * the function, so an `inline` prototype makes GCC warn ("inline function
 * declared but never defined") in every includer and, under C99 inline
 * semantics, would leave no external definition to link against.
 */
struct mcctrl_per_proc_data *mcctrl_get_per_proc_data(
	struct mcctrl_usrdata *ud, int pid);

/* Per-thread data bookkeeping, keyed by task within one per-process entry. */
int mcctrl_add_per_thread_data(struct mcctrl_per_proc_data* ppd,
	struct task_struct *task, void *data);
int mcctrl_delete_per_thread_data(struct mcctrl_per_proc_data* ppd,
	struct task_struct *task);
/* Declared without `inline` for the same reason as above. */
struct mcctrl_per_thread_data *mcctrl_get_per_thread_data(
	struct mcctrl_per_proc_data *ppd, struct task_struct *task);

void __return_syscall(ihk_os_t os, struct ikc_scd_packet *packet,
	long ret, int stid);
#define PROCFS_NAME_MAX 768
/*
 * Request/answer record for one forwarded procfs read or write.
 * The requester fills the "(request)" fields and passes the physical
 * address of this structure to the other side, which fills the "(answer)"
 * fields and finally sets status to a non-zero value.
 */
struct procfs_read {
unsigned long pbuf; /* physical address of the host buffer (request) */
unsigned long offset; /* offset to read (request) */
int count; /* bytes to read (request) */
int eof; /* if eof is detected, 1 otherwise 0. (answer)*/
int ret; /* read bytes (answer) */
int status; /* non-zero if done (answer) */
int newcpu; /* migrated new cpu (answer) */
int readwrite; /* 0:read, 1:write */
char fname[PROCFS_NAME_MAX]; /* procfs filename (request) */
};
/*
 * Record describing a single procfs file by name and mode.
 * NOTE(review): no user of this structure is visible in this chunk;
 * confirm the request/answer direction against the sender.
 */
struct procfs_file {
int status; /* status of processing (answer) */
int mode; /* file mode (request) */
char fname[PROCFS_NAME_MAX]; /* procfs filename (request) */
};
void procfs_answer(unsigned int arg, int err);
int procfsm_packet_handler(void *os, int msg, int pid, unsigned long arg);
void add_tid_entry(int osnum, int pid, int tid);
void add_pid_entry(int osnum, int pid);
void delete_tid_entry(int osnum, int pid, int tid);
void delete_pid_entry(int osnum, int pid);
void proc_exe_link(int osnum, int pid, const char *path);
void procfs_init(int osnum);
void procfs_exit(int osnum);
/* sysfs_files.c */
void setup_sysfs_files(ihk_os_t os);
void reply_get_cpu_mapping(long req_pa);
void free_topology_info(ihk_os_t os);
/* archdep.c */
/* Number of slots in vdso.vdso_physlist[]. */
#define VDSO_MAXPAGES 2
/*
 * vDSO description record passed to get_vdso_info() by physical address.
 * NOTE(review): the field notes below are inferred from names only;
 * confirm against archdep.c.
 */
struct vdso {
long busy; /* presumably a completion flag for the request -- TODO confirm */
int vdso_npages; /* presumably the number of valid vdso_physlist[] slots */
char vvar_is_global;
char hpet_is_global;
char pvti_is_global;
char padding;
long vdso_physlist[VDSO_MAXPAGES];
/* Each area is described by a virtual address / physical address pair. */
void *vvar_virt;
long vvar_phys;
void *hpet_virt;
long hpet_phys;
void *pvti_virt;
long pvti_phys;
};
int reserve_user_space(struct mcctrl_usrdata *usrdata, unsigned long *startp,
unsigned long *endp);
void get_vdso_info(ihk_os_t os, long vdso_pa);
/*
 * Request record for fetching the CPU mapping; see reply_get_cpu_mapping().
 * The leading fields carry the IN/OUT annotations below; the trailing wait
 * queue is private work space for mcctrl.
 */
struct get_cpu_mapping_req {
int busy; /* INOUT: */
int error; /* OUT: */
long buf_rpa; /* OUT: physical address of struct cpu_mapping */
int buf_elems; /* OUT: # of elements of buf */
int padding;
/* work for mcctrl */
wait_queue_head_t wq;
};
#endif

View File

@ -0,0 +1,838 @@
/**
* \file procfs.c
* License details are found in the file LICENSE.
* \brief
* mcctrl procfs
* \author Naoki Hamada <nao@axe.bz> \par
* Copyright (C) 2014 AXE, Inc.
*/
/*
* HISTORY:
*/
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/proc_fs.h>
#include <linux/list.h>
#include <linux/uaccess.h>
#include <linux/fs.h>
#include <linux/resource.h>
#include <linux/interrupt.h>
#include "mcctrl.h"
#include <linux/version.h>
#include <linux/semaphore.h>
/*
 * Debug tracing: dprintk() forwards to printk() only when PROCFS_DEBUG is
 * defined; otherwise it expands to nothing, so its arguments are neither
 * evaluated nor type-checked.
 */
//#define PROCFS_DEBUG
#ifdef PROCFS_DEBUG
#define dprintk(...) printk(__VA_ARGS__)
#else
#define dprintk(...)
#endif
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,5,0)
typedef uid_t kuid_t;
typedef gid_t kgid_t;
#endif
/*
 * Static description of one procfs node, used by the *_entry_stuff tables.
 * A table is terminated by an element whose name is NULL (PROC_TERM).
 */
struct procfs_entry {
char *name; /* node name; NULL marks the end of a table */
mode_t mode; /* S_IF* file type OR-ed with permission bits */
const struct file_operations *fops; /* NULL selects the default forwarding ops */
};
/* Generic initializer for a struct procfs_entry. */
#define NOD(NAME, MODE, FOP) { \
.name = (NAME), \
.mode = MODE, \
.fops = FOP, \
}
/* Directory node. */
#define PROC_DIR(NAME, MODE) \
NOD(NAME, (S_IFDIR|(MODE)), NULL)
/* Regular file node. */
#define PROC_REG(NAME, MODE, fops) \
NOD(NAME, (S_IFREG|(MODE)), fops)
/* Table terminator. */
#define PROC_TERM \
NOD(NULL, 0, NULL)
static const struct procfs_entry tid_entry_stuff[];
static const struct procfs_entry pid_entry_stuff[];
static const struct procfs_entry base_entry_stuff[];
static const struct file_operations mckernel_forward_ro;
static const struct file_operations mckernel_forward;
static DECLARE_WAIT_QUEUE_HEAD(procfsq);
static ssize_t mckernel_procfs_read(struct file *file, char __user *buf,
size_t nbytes, loff_t *ppos);
/*
 * Private bookkeeping node of the procfs driver.
 * One node exists per created proc entry; nodes form a tree through
 * parent/children, with top-level nodes linked on procfs_file_list.
 */
struct procfs_list_entry;
struct procfs_list_entry {
struct list_head list; /* link in parent->children or procfs_file_list */
struct proc_dir_entry *entry; /* the Linux proc entry backing this node */
struct procfs_list_entry *parent; /* NULL for a top-level node */
struct list_head children; /* child nodes, linked through ->list */
int osnum; /* OS instance number, inherited from the parent on creation */
char *data; /* optional kmalloc'ed payload, freed together with the node */
char name[0]; /* NUL-terminated node name, allocated with the node */
};
/*
 * In the procfs_file_list, McKernel procfs files are
 * listed in the manner that the leaf file is located
 * always nearer to the list top than its parent node
 * file.
 *
 * procfs_file_list_lock is taken by the exported add/delete/init/exit
 * functions in this file around every lookup or modification of the list.
 */
LIST_HEAD(procfs_file_list);
DEFINE_SEMAPHORE(procfs_file_list_lock);
/*
 * Build the '/'-separated path of entry e by walking its ->parent chain.
 *
 * The path is assembled right-to-left in the tail of buf, so the returned
 * pointer lies inside buf and the string ends at buf[bufsize - 1].
 *
 * e        entry whose path is wanted (must not be NULL)
 * buf      scratch buffer
 * bufsize  size of buf in bytes (must be at least 1)
 *
 * Returns the start of the NUL-terminated path. If the full path does not
 * fit into buf, an empty string is returned instead of writing before the
 * start of the buffer.
 */
static char *
getpath(struct procfs_list_entry *e, char *buf, int bufsize)
{
	char *end = buf + bufsize - 1;
	char *w = end;

	*w = '\0';
	for (;;) {
		size_t l = strlen(e->name);

		/* Need room for this component plus a '/' if a parent follows. */
		if ((size_t)(w - buf) < l + (e->parent ? 1 : 0))
			return end;
		w -= l;
		memcpy(w, e->name, l);
		e = e->parent;
		if (!e)
			return w;
		w--;
		*w = '/';
	}
}
/**
* \brief Process SCD_MSG_PROCFS_ANSWER message.
*
* Wakes every task sleeping on procfsq; each waiter then re-checks the
* status field of its own request. Neither argument is used to select a
* waiter.
*
* \param arg sent argument (unused here)
* \param err error info (redundant; only printed in debug builds)
*/
void
procfs_answer(unsigned int arg, int err)
{
dprintk("procfs: received SCD_MSG_PROCFS_ANSWER message(err = %d).\n", err);
wake_up_interruptible(&procfsq);
}
static struct procfs_list_entry *
find_procfs_entry(struct procfs_list_entry *parent, const char *name)
{
struct list_head *list;
struct procfs_list_entry *e;
if(parent == NULL)
list = &procfs_file_list;
else
list = &parent->children;
list_for_each_entry(e, list, list) {
if(!strcmp(e->name, name))
return e;
}
return NULL;
}
/*
 * Remove the node top and, recursively, everything below it.
 *
 * Children are removed first, then the proc entry of top itself is
 * unregistered and the node (including its optional data payload) is freed.
 * The caller must not use top afterwards.
 */
static void
delete_procfs_entries(struct procfs_list_entry *top)
{
	struct procfs_list_entry *e;
	struct procfs_list_entry *n;

	list_del(&top->list);
	list_for_each_entry_safe(e, n, &top->children, list) {
		delete_procfs_entries(e);
	}
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
	/*
	 * Detach the proc entry of the node being removed. After the loop
	 * above `e` no longer refers to a valid node, so `top` must be used.
	 */
	top->entry->read_proc = NULL;
	top->entry->data = NULL;
#endif
	remove_proc_entry(top->name, top->parent ? top->parent->entry : NULL);
	kfree(top->data);	/* kfree(NULL) is a no-op */
	kfree(top);
}
/*
 * Create one proc entry below parent (or at the proc root if parent is
 * NULL) and the bookkeeping node that tracks it.
 *
 * An existing node with the same name is deleted first, together with its
 * subtree.
 *
 * parent  parent node, or NULL for a top-level entry
 * name    entry name
 * mode    S_IF* type bits OR-ed with permission bits
 * uid/gid owner recorded on the proc entry (regular files only on
 *         kernels >= 3.10, every entry type on older kernels)
 * opaque  symlink target string for S_IFLNK entries; for regular files an
 *         optional file_operations pointer (NULL selects mckernel_forward
 *         or mckernel_forward_ro depending on S_IWUSR); unused for
 *         directories
 *
 * Returns the new node, or NULL on allocation or registration failure.
 */
static struct procfs_list_entry *
add_procfs_entry(struct procfs_list_entry *parent, const char *name, int mode,
kuid_t uid, kgid_t gid, const void *opaque)
{
struct procfs_list_entry *e = find_procfs_entry(parent, name);
struct proc_dir_entry *pde;
struct proc_dir_entry *parent_pde = NULL;
int f_mode = mode & 0777;
/* Replace, rather than duplicate, an entry that already exists. */
if(e)
delete_procfs_entries(e);
/* The name is stored inline after the fixed part of the node. */
e = kmalloc(sizeof(struct procfs_list_entry) + strlen(name) + 1,
GFP_KERNEL);
if(!e){
kprintf("ERROR: not enough memory to create PROCFS entry.\n");
return NULL;
}
memset(e, '\0', sizeof(struct procfs_list_entry));
INIT_LIST_HEAD(&e->children);
strcpy(e->name, name);
if(parent)
parent_pde = parent->entry;
if (mode & S_IFDIR) {
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
pde = proc_mkdir(name, parent_pde);
#else
pde = proc_mkdir_data(name, f_mode, parent_pde, e);
#endif
}
else if ((mode & S_IFLNK) == S_IFLNK) {
pde = proc_symlink(name, parent_pde, (char *)opaque);
}
else {
const struct file_operations *fop;
/* Explicit ops win; otherwise pick read-write or read-only forwarding. */
if(opaque)
fop = (const struct file_operations *)opaque;
else if(mode & S_IWUSR)
fop = &mckernel_forward;
else
fop = &mckernel_forward_ro;
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
pde = create_proc_entry(name, f_mode, parent_pde);
if(pde)
pde->proc_fops = fop;
#else
pde = proc_create_data(name, f_mode, parent_pde, fop, e);
if(pde)
proc_set_user(pde, uid, gid);
#endif
}
if(!pde){
kprintf("ERROR: cannot create a PROCFS entry for %s.\n", name);
kfree(e);
return NULL;
}
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
/* Older kernels expose the proc entry fields directly. */
pde->uid = uid;
pde->gid = gid;
pde->data = e;
#endif
/* A top-level node keeps osnum 0 here; get_base_entry() sets it. */
if(parent)
e->osnum = parent->osnum;
e->entry = pde;
e->parent = parent;
list_add(&(e->list), parent? &(parent->children): &procfs_file_list);
return e;
}
/*
 * Create every entry of a NULL-name-terminated table below parent, all
 * owned by uid/gid. Failures of individual entries are not reported.
 */
static void
add_procfs_entries(struct procfs_list_entry *parent,
		const struct procfs_entry *entries, kuid_t uid, kgid_t gid)
{
	const struct procfs_entry *cur = entries;

	while (cur->name) {
		add_procfs_entry(parent, cur->name, cur->mode, uid, gid,
				cur->fops);
		cur++;
	}
}
/*
 * Return the credentials of the Linux task with the given pid, or NULL if
 * pid is not positive or no such task is found.
 *
 * NOTE(review): no locking or reference counting is visible around the
 * task lookup or on the returned pointer -- confirm that callers may
 * safely use it.
 */
static const struct cred *
get_pid_cred(int pid)
{
	struct task_struct *task;

	if (pid <= 0)
		return NULL;

	task = pid_task(find_vpid(pid), PIDTYPE_PID);
	if (!task)
		return NULL;

	return __task_cred(task);
}
/*
 * Find the top-level "mcos<osnum>" node, or return NULL if it does not
 * exist.
 */
static struct procfs_list_entry *
find_base_entry(int osnum)
{
	/* "mcos" (4) + widest "%d" output (11) + NUL */
	char name[16];

	sprintf(name, "mcos%d", osnum);
	return find_procfs_entry(NULL, name);
}
/*
 * Find the "<pid>" directory node below the "mcos<osnum>" node.
 * Returns NULL if either level is missing.
 */
static struct procfs_list_entry *
find_pid_entry(int osnum, int pid)
{
	char name[12];
	struct procfs_list_entry *base = find_base_entry(osnum);

	if (base == NULL)
		return NULL;

	sprintf(name, "%d", pid);
	return find_procfs_entry(base, name);
}
/*
 * Find the "<tid>" node below "mcos<osnum>/<pid>/task".
 * Returns NULL if any level of that path is missing.
 */
static struct procfs_list_entry *
find_tid_entry(int osnum, int pid, int tid)
{
	char name[12];
	struct procfs_list_entry *pid_dir;
	struct procfs_list_entry *task_dir;

	pid_dir = find_pid_entry(osnum, pid);
	if (pid_dir == NULL)
		return NULL;

	task_dir = find_procfs_entry(pid_dir, "task");
	if (task_dir == NULL)
		return NULL;

	sprintf(name, "%d", tid);
	return find_procfs_entry(task_dir, name);
}
/*
 * Return the top-level "mcos<osnum>" directory node, creating it (owned by
 * uid/gid 0, mode 0555) if it does not exist yet.
 *
 * Returns NULL if the node had to be created and creation failed.
 */
static struct procfs_list_entry *
get_base_entry(int osnum)
{
	struct procfs_list_entry *e;
	/* "mcos" (4) + widest "%d" output (11) + NUL */
	char name[16];
	kuid_t uid = KUIDT_INIT(0);
	kgid_t gid = KGIDT_INIT(0);

	sprintf(name, "mcos%d", osnum);
	e = find_procfs_entry(NULL, name);
	if (!e) {
		e = add_procfs_entry(NULL, name, S_IFDIR | 0555,
				uid, gid, NULL);
		/* add_procfs_entry() returns NULL on failure. */
		if (e)
			e->osnum = osnum;
	}
	return e;
}
/*
 * Return the "<pid>" directory node below "mcos<osnum>", creating it
 * (owned by uid/gid 0, mode 0555) if it does not exist yet.
 *
 * Returns NULL if the "mcos<osnum>" node does not exist or creation fails.
 */
static struct procfs_list_entry *
get_pid_entry(int osnum, int pid)
{
	struct procfs_list_entry *parent;
	struct procfs_list_entry *e;
	/* Sized for "mcos" (4) + widest "%d" output (11) + NUL */
	char name[16];
	kuid_t uid = KUIDT_INIT(0);
	kgid_t gid = KGIDT_INIT(0);

	sprintf(name, "mcos%d", osnum);
	parent = find_procfs_entry(NULL, name);
	if (!parent)
		return NULL;

	sprintf(name, "%d", pid);
	e = find_procfs_entry(parent, name);
	if (!e)
		e = add_procfs_entry(parent, name, S_IFDIR | 0555,
				uid, gid, NULL);
	return e;
}
/*
 * Return the "<tid>" directory node below "mcos<osnum>/<pid>/task",
 * creating it (owned by uid/gid 0, mode 0555) if it does not exist yet.
 *
 * Returns NULL if any parent level is missing or creation fails.
 */
static struct procfs_list_entry *
get_tid_entry(int osnum, int pid, int tid)
{
	struct procfs_list_entry *parent;
	struct procfs_list_entry *e;
	/* Sized for "mcos" (4) + widest "%d" output (11) + NUL */
	char name[16];
	kuid_t uid = KUIDT_INIT(0);
	kgid_t gid = KGIDT_INIT(0);

	sprintf(name, "mcos%d", osnum);
	parent = find_procfs_entry(NULL, name);
	if (!parent)
		return NULL;

	sprintf(name, "%d", pid);
	parent = find_procfs_entry(parent, name);
	if (!parent)
		return NULL;

	parent = find_procfs_entry(parent, "task");
	if (!parent)
		return NULL;

	sprintf(name, "%d", tid);
	e = find_procfs_entry(parent, name);
	if (!e)
		e = add_procfs_entry(parent, name, S_IFDIR | 0555,
				uid, gid, NULL);
	return e;
}
/*
 * Populate the directory of thread tid of process pid.
 *
 * Creates the files listed in tid_entry_stuff with the owner taken from
 * cred and, if the process directory already has an "exe" node, an "exe"
 * symlink pointing at the same target. Does nothing if the thread
 * directory cannot be obtained. The caller holds procfs_file_list_lock.
 */
static void
_add_tid_entry(int osnum, int pid, int tid, const struct cred *cred)
{
	struct procfs_list_entry *tid_dir = get_tid_entry(osnum, pid, tid);
	struct procfs_list_entry *pid_exe;

	if (!tid_dir)
		return;

	add_procfs_entries(tid_dir, tid_entry_stuff, cred->uid, cred->gid);

	/* tid directory -> "task" directory -> pid directory */
	pid_exe = find_procfs_entry(tid_dir->parent->parent, "exe");
	if (!pid_exe)
		return;

	add_procfs_entry(tid_dir, "exe", S_IFLNK | 0777,
			cred->uid, cred->gid, pid_exe->data);
}
void
add_tid_entry(int osnum, int pid, int tid)
{
const struct cred *cred = get_pid_cred(pid);
if(!cred)
return;
down(&procfs_file_list_lock);
_add_tid_entry(osnum, pid, tid, cred);
up(&procfs_file_list_lock);
}
/*
 * Create the procfs directory of process pid on OS instance osnum,
 * populate it from pid_entry_stuff and add the directory of its main
 * thread (tid == pid).
 *
 * Does nothing if no credentials are found for pid or if the process
 * directory cannot be obtained (e.g. procfs_init() was not run for osnum).
 */
void
add_pid_entry(int osnum, int pid)
{
	struct procfs_list_entry *parent;
	const struct cred *cred = get_pid_cred(pid);

	if (!cred)
		return;

	down(&procfs_file_list_lock);
	parent = get_pid_entry(osnum, pid);
	/*
	 * A NULL parent must not reach add_procfs_entries(): it would create
	 * the per-process files at the root of /proc instead.
	 */
	if (parent) {
		add_procfs_entries(parent, pid_entry_stuff,
				cred->uid, cred->gid);
		_add_tid_entry(osnum, pid, pid, cred);
	}
	up(&procfs_file_list_lock);
}
void
delete_tid_entry(int osnum, int pid, int tid)
{
struct procfs_list_entry *e;
down(&procfs_file_list_lock);
e = find_tid_entry(osnum, pid, tid);
if(e)
delete_procfs_entries(e);
up(&procfs_file_list_lock);
}
void
delete_pid_entry(int osnum, int pid)
{
struct procfs_list_entry *e;
down(&procfs_file_list_lock);
e = find_pid_entry(osnum, pid);
if(e)
delete_procfs_entries(e);
up(&procfs_file_list_lock);
}
void
proc_exe_link(int osnum, int pid, const char *path)
{
struct procfs_list_entry *parent;
kuid_t uid = KUIDT_INIT(0);
kgid_t gid = KGIDT_INIT(0);
down(&procfs_file_list_lock);
parent = find_pid_entry(osnum, pid);
if(parent){
struct procfs_list_entry *task;
struct procfs_list_entry *e;
e = add_procfs_entry(parent, "exe", S_IFLNK | 0777, uid, gid,
path);
e->data = kmalloc(strlen(path) + 1, GFP_KERNEL);
strcpy(e->data, path);
task = find_procfs_entry(parent, "task");
list_for_each_entry(parent, &task->children, list) {
add_procfs_entry(parent, "exe", S_IFLNK | 0777,
uid, gid, path);
}
}
up(&procfs_file_list_lock);
}
/**
 * \brief Initialization for procfs
 *
 * Creates the top-level "mcos<osnum>" directory (if needed) and the files
 * listed in base_entry_stuff below it. If the directory cannot be created
 * nothing else is added.
 *
 * \param osnum os number
 */
void
procfs_init(int osnum)
{
	struct procfs_list_entry *parent;
	kuid_t uid = KUIDT_INIT(0);
	kgid_t gid = KGIDT_INIT(0);

	down(&procfs_file_list_lock);
	parent = get_base_entry(osnum);
	/*
	 * A NULL parent must not reach add_procfs_entries(): it would create
	 * the base files at the root of /proc instead.
	 */
	if (parent)
		add_procfs_entries(parent, base_entry_stuff, uid, gid);
	up(&procfs_file_list_lock);
}
/**
* \brief Finalization for procfs
*
* \param osnum os number
*/
void
procfs_exit(int osnum)
{
struct procfs_list_entry *e;
down(&procfs_file_list_lock);
e = find_base_entry(osnum);
if (e) {
delete_procfs_entries(e);
}
up(&procfs_file_list_lock);
}
/**
 * \brief read() handler that forwards the request to McKernel
 *
 * Allocates a physically contiguous bounce buffer and a struct procfs_read
 * request, sends the physical address of the request with
 * SCD_MSG_PROCFS_REQUEST and sleeps on procfsq until the request's status
 * field becomes non-zero (see procfs_answer()) or one second has elapsed.
 *
 * \param file   open procfs file whose proc entry data is a
 *               struct procfs_list_entry
 * \param buf    user buffer receiving the data
 * \param nbytes size of buf
 * \param ppos   file position; advanced by the number of bytes returned
 *
 * \return number of bytes read; 0 if nbytes is 0 or the offset is
 *         negative; -ENOMEM, -EFAULT, -EINTR (wait interrupted by a
 *         signal), -EIO (timeout or no answer value), or the error
 *         returned by mcctrl_ikc_send()
 */
static ssize_t
mckernel_procfs_read(struct file *file, char __user *buf, size_t nbytes,
		loff_t *ppos)
{
	struct inode * inode = file->f_path.dentry->d_inode;
	char *kern_buffer = NULL;
	int order = 0;
	volatile struct procfs_read *r = NULL;
	struct ikc_scd_packet isp;
	int ret;
	long wait_ret;
	unsigned long pbuf;
	unsigned long count = nbytes;
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
	struct proc_dir_entry *dp = PDE(inode);
	struct procfs_list_entry *e = dp->data;
#else
	struct procfs_list_entry *e = PDE_DATA(inode);
#endif
	loff_t offset = *ppos;
	char pathbuf[PROCFS_NAME_MAX];
	char *path;

	/* Use the real buffer size, not a smaller hard-coded one. */
	path = getpath(e, pathbuf, sizeof(pathbuf));
	dprintk("mckernel_procfs_read: invoked for %s, offset: %lld, count: %lu\n",
		path, (long long)offset, count);

	if (count == 0 || offset < 0) {
		return 0;
	}

	/* Smallest power of two >= count, expressed as a page order. */
	while ((1UL << order) < count) ++order;
	if (order > 12) {
		order -= 12;
	}
	else {
		order = 1;
	}

	/* NOTE: we need physically contiguous memory to pass through IKC */
	kern_buffer = (char *)__get_free_pages(GFP_KERNEL, order);
	if (!kern_buffer) {
		printk("mckernel_procfs_read(): ERROR: allocating kernel buffer\n");
		return -ENOMEM;
	}
	pbuf = virt_to_phys(kern_buffer);

	r = kmalloc(sizeof(struct procfs_read), GFP_KERNEL);
	if (r == NULL) {
		ret = -ENOMEM;
		goto out;
	}
	r->pbuf = pbuf;
	r->eof = 0;
	r->ret = -EIO; /* default */
	r->status = 0;
	r->offset = offset;
	r->count = count;
	r->readwrite = 0;
	strncpy((char *)r->fname, path, PROCFS_NAME_MAX);
	r->fname[PROCFS_NAME_MAX - 1] = '\0'; /* strncpy may not terminate */

	/* Do not send uninitialized stack contents in unused packet fields. */
	memset(&isp, 0, sizeof(isp));
	isp.msg = SCD_MSG_PROCFS_REQUEST;
	isp.ref = 0;
	isp.arg = virt_to_phys(r);
	ret = mcctrl_ikc_send(osnum_to_os(e->osnum), 0, &isp);
	if (ret < 0) {
		goto out; /* error */
	}

	/* Wait for a reply. */
	ret = -EIO; /* default exit code */
	dprintk("now wait for a relpy\n");

	/*
	 * Wait for the status field of the procfs_read structure set ready.
	 * NOTE(review): on timeout or signal the buffers are freed below
	 * although the request may still be outstanding on the other side;
	 * confirm that a late answer cannot write into freed memory.
	 */
	wait_ret = wait_event_interruptible_timeout(procfsq, r->status != 0, HZ);
	if (wait_ret == 0) {
		kprintf("ERROR: mckernel_procfs_read: timeout (1 sec).\n");
		goto out;
	}
	if (wait_ret < 0) {
		/* Interrupted by a signal before an answer arrived. */
		ret = -EINTR;
		goto out;
	}

	/* Wake up and check the result. */
	dprintk("mckernel_procfs_read: woke up. ret: %d, eof: %d\n", r->ret, r->eof);

	/* Read the answer once; the structure is shared with the other side. */
	ret = r->ret;
	if (ret > 0) {
		/* Never copy out more than was requested and allocated. */
		if ((unsigned long)ret > count)
			ret = count;
		if (copy_to_user(buf, kern_buffer, ret)) {
			kprintf("ERROR: mckernel_procfs_read: copy_to_user failed.\n");
			ret = -EFAULT;
			goto out;
		}
		*ppos += ret;
	}
out:
	if (kern_buffer)
		free_pages((uintptr_t)kern_buffer, order);
	kfree((void *)r);	/* kfree(NULL) is a no-op */
	return ret;
}
/**
 * \brief write() handler that forwards the request to McKernel
 *
 * Copies the user data into a physically contiguous bounce buffer, sends a
 * struct procfs_read request (readwrite = 1) with SCD_MSG_PROCFS_REQUEST
 * and sleeps on procfsq until the request's status field becomes non-zero
 * (see procfs_answer()) or one second has elapsed.
 *
 * \param file   open procfs file whose proc entry data is a
 *               struct procfs_list_entry
 * \param buf    user buffer holding the data to write
 * \param nbytes number of bytes in buf
 * \param ppos   file position; advanced by the number of bytes written
 *
 * \return number of bytes written; 0 if nbytes is 0 or the offset is
 *         negative; -ENOMEM, -EFAULT, -EINTR (wait interrupted by a
 *         signal), -EIO (timeout or no answer value), or the error
 *         returned by mcctrl_ikc_send()
 */
static ssize_t
mckernel_procfs_write(struct file *file, const char __user *buf, size_t nbytes,
		loff_t *ppos)
{
	struct inode * inode = file->f_path.dentry->d_inode;
	char *kern_buffer = NULL;
	int order = 0;
	volatile struct procfs_read *r = NULL;
	struct ikc_scd_packet isp;
	int ret;
	long wait_ret;
	unsigned long pbuf;
	unsigned long count = nbytes;
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
	struct proc_dir_entry *dp = PDE(inode);
	struct procfs_list_entry *e = dp->data;
#else
	struct procfs_list_entry *e = PDE_DATA(inode);
#endif
	loff_t offset = *ppos;
	char pathbuf[PROCFS_NAME_MAX];
	char *path;

	/* Use the real buffer size, not a smaller hard-coded one. */
	path = getpath(e, pathbuf, sizeof(pathbuf));
	dprintk("mckernel_procfs_write: invoked for %s, offset: %lld, count: %lu\n",
		path, (long long)offset, count);

	if (count == 0 || offset < 0) {
		return 0;
	}

	/* Smallest power of two >= count, expressed as a page order. */
	while ((1UL << order) < count) ++order;
	if (order > 12) {
		order -= 12;
	}
	else {
		order = 1;
	}

	/* NOTE: we need physically contiguous memory to pass through IKC */
	kern_buffer = (char *)__get_free_pages(GFP_KERNEL, order);
	if (!kern_buffer) {
		printk("mckernel_procfs_write(): ERROR: allocating kernel buffer\n");
		return -ENOMEM;
	}
	if (copy_from_user(kern_buffer, buf, nbytes)) {
		ret = -EFAULT;
		goto out;
	}
	pbuf = virt_to_phys(kern_buffer);

	r = kmalloc(sizeof(struct procfs_read), GFP_KERNEL);
	if (r == NULL) {
		ret = -ENOMEM;
		goto out;
	}
	dprintk("offset: %llx, count: %lu\n", (long long)offset, count);
	r->pbuf = pbuf;
	r->eof = 0;
	r->ret = -EIO; /* default */
	r->status = 0;
	r->offset = offset;
	r->count = count;
	r->readwrite = 1;
	strncpy((char *)r->fname, path, PROCFS_NAME_MAX);
	r->fname[PROCFS_NAME_MAX - 1] = '\0'; /* strncpy may not terminate */

	/* Do not send uninitialized stack contents in unused packet fields. */
	memset(&isp, 0, sizeof(isp));
	isp.msg = SCD_MSG_PROCFS_REQUEST;
	isp.ref = 0;
	isp.arg = virt_to_phys(r);
	ret = mcctrl_ikc_send(osnum_to_os(e->osnum), 0, &isp);
	if (ret < 0) {
		goto out; /* error */
	}

	/* Wait for a reply. */
	ret = -EIO; /* default exit code */
	dprintk("now wait for a relpy\n");

	/*
	 * Wait for the status field of the procfs_read structure set ready.
	 * NOTE(review): on timeout or signal the buffers are freed below
	 * although the request may still be outstanding on the other side;
	 * confirm that a late answer cannot write into freed memory.
	 */
	wait_ret = wait_event_interruptible_timeout(procfsq, r->status != 0, HZ);
	if (wait_ret == 0) {
		kprintf("ERROR: mckernel_procfs_write: timeout (1 sec).\n");
		goto out;
	}
	if (wait_ret < 0) {
		/*
		 * Interrupted by a signal. Report -EINTR rather than asking
		 * for an automatic restart, which could apply the write twice.
		 */
		ret = -EINTR;
		goto out;
	}

	/* Wake up and check the result. */
	dprintk("mckernel_procfs_write: woke up. ret: %d, eof: %d\n", r->ret, r->eof);

	/* Read the answer once; the structure is shared with the other side. */
	ret = r->ret;
	if (ret > 0) {
		/* A write can never consume more than was supplied. */
		if ((unsigned long)ret > count)
			ret = count;
		*ppos += ret;
	}
out:
	if (kern_buffer)
		free_pages((uintptr_t)kern_buffer, order);
	kfree((void *)r);	/* kfree(NULL) is a no-op */
	return ret;
}
/*
 * llseek handler: supports origin 0 (absolute) and origin 1 (relative to
 * the current position); any other origin yields -EINVAL. Returns the new
 * file position.
 */
static loff_t
mckernel_procfs_lseek(struct file *file, loff_t offset, int orig)
{
	if (orig == 0)
		file->f_pos = offset;
	else if (orig == 1)
		file->f_pos += offset;
	else
		return -EINVAL;

	return file->f_pos;
}
/*
 * Deferred procfs request: a copy of the arguments given to
 * procfsm_packet_handler(), processed later by procfsm_work_main().
 */
struct procfs_work {
void *os; /* OS handle, converted with ihk_host_os_get_index() */
int msg; /* SCD_MSG_PROCFS_TID_CREATE or SCD_MSG_PROCFS_TID_DELETE */
int pid; /* process id */
unsigned long arg; /* passed as the thread id to add/delete_tid_entry() */
struct work_struct work; /* work item queued with schedule_work() */
};
/*
 * Work handler: carries out one deferred thread-directory create or delete
 * request and then frees the request.
 */
static void procfsm_work_main(struct work_struct *work0)
{
	struct procfs_work *req = container_of(work0, struct procfs_work, work);

	if (req->msg == SCD_MSG_PROCFS_TID_CREATE) {
		add_tid_entry(ihk_host_os_get_index(req->os), req->pid, req->arg);
	}
	else if (req->msg == SCD_MSG_PROCFS_TID_DELETE) {
		delete_tid_entry(ihk_host_os_get_index(req->os), req->pid, req->arg);
	}
	else {
		printk("%s: unknown work: msg: %d, pid: %d, arg: %lu)\n",
				__FUNCTION__, req->msg, req->pid, req->arg);
	}

	kfree(req);
}
/*
 * Queue a procfs request for later processing by procfsm_work_main().
 * The request is allocated with GFP_ATOMIC.
 *
 * Returns 0 on success, -1 if the request could not be allocated.
 */
int procfsm_packet_handler(void *os, int msg, int pid, unsigned long arg)
{
	struct procfs_work *job = kzalloc(sizeof(*job), GFP_ATOMIC);

	if (job == NULL) {
		printk("%s: kzalloc failed\n", __FUNCTION__);
		return -1;
	}

	job->os = os;
	job->msg = msg;
	job->pid = pid;
	job->arg = arg;

	INIT_WORK(&job->work, &procfsm_work_main);
	schedule_work(&job->work);
	return 0;
}
/*
 * File operations for read-only forwarded files: add_procfs_entry() picks
 * these when no explicit ops are given and S_IWUSR is not set in the mode.
 */
static const struct file_operations mckernel_forward_ro = {
.llseek = mckernel_procfs_lseek,
.read = mckernel_procfs_read,
.write = NULL,
};
/*
 * File operations for writable forwarded files: add_procfs_entry() picks
 * these when no explicit ops are given and S_IWUSR is set in the mode.
 */
static const struct file_operations mckernel_forward = {
.llseek = mckernel_procfs_lseek,
.read = mckernel_procfs_read,
.write = mckernel_procfs_write,
};
/*
 * Files created in every per-thread directory by _add_tid_entry().
 * Commented-out rows are entries that are currently not created.
 */
static const struct procfs_entry tid_entry_stuff[] = {
// PROC_REG("auxv", S_IRUSR, NULL),
// PROC_REG("clear_refs", S_IWUSR, NULL),
// PROC_REG("cmdline", S_IRUGO, NULL),
// PROC_REG("comm", S_IRUGO|S_IWUSR, NULL),
// PROC_REG("environ", S_IRUSR, NULL),
// PROC_LNK("exe", mckernel_readlink),
// PROC_REG("limits", S_IRUSR|S_IWUSR, NULL),
// PROC_REG("maps", S_IRUGO, NULL),
PROC_REG("mem", S_IRUSR|S_IWUSR, NULL),
// PROC_REG("pagemap", S_IRUGO, NULL),
// PROC_REG("smaps", S_IRUGO, NULL),
PROC_REG("stat", S_IRUGO, NULL),
// PROC_REG("statm", S_IRUGO, NULL),
// PROC_REG("status", S_IRUGO, NULL),
// PROC_REG("syscall", S_IRUGO, NULL),
// PROC_REG("wchan", S_IRUGO, NULL),
PROC_TERM
};
/*
 * Entries created in every per-process directory by add_pid_entry().
 * The "task" directory listed here is the parent of the per-thread
 * directories. Commented-out rows are entries that are currently not
 * created.
 * NOTE(review): "cgroup" and "cpuset" are registered with S_IXUSR only --
 * confirm that this mode is intended.
 */
static const struct procfs_entry pid_entry_stuff[] = {
PROC_REG("auxv", S_IRUSR, NULL),
PROC_REG("cgroup", S_IXUSR, NULL),
// PROC_REG("clear_refs", S_IWUSR, NULL),
PROC_REG("cmdline", S_IRUGO, NULL),
// PROC_REG("comm", S_IRUGO|S_IWUSR, NULL),
// PROC_REG("coredump_filter", S_IRUGO|S_IWUSR, NULL),
PROC_REG("cpuset", S_IXUSR, NULL),
// PROC_REG("environ", S_IRUSR, NULL),
// PROC_LNK("exe", mckernel_readlink),
// PROC_REG("limits", S_IRUSR|S_IWUSR, NULL),
PROC_REG("maps", S_IRUGO, NULL),
PROC_REG("mem", S_IRUSR|S_IWUSR, NULL),
PROC_REG("pagemap", S_IRUGO, NULL),
PROC_REG("smaps", S_IRUGO, NULL),
// PROC_REG("stat", S_IRUGO, NULL),
// PROC_REG("statm", S_IRUGO, NULL),
PROC_REG("status", S_IRUGO, NULL),
// PROC_REG("syscall", S_IRUGO, NULL),
PROC_DIR("task", S_IRUGO|S_IXUGO),
// PROC_REG("wchan", S_IRUGO, NULL),
PROC_TERM
};
/*
 * Files created directly below the "mcos<osnum>" directory by
 * procfs_init(). Commented-out rows are entries that are currently not
 * created.
 */
static const struct procfs_entry base_entry_stuff[] = {
// PROC_REG("cmdline", S_IRUGO, NULL),
// PROC_REG("cpuinfo", S_IRUGO, NULL),
// PROC_REG("meminfo", S_IRUGO, NULL),
// PROC_REG("pagetypeinfo",S_IRUGO, NULL),
// PROC_REG("softirq", S_IRUGO, NULL),
PROC_REG("stat", S_IRUGO, NULL),
// PROC_REG("uptime", S_IRUGO, NULL),
// PROC_REG("version", S_IRUGO, NULL),
// PROC_REG("vmallocinfo",S_IRUSR, NULL),
// PROC_REG("vmstat", S_IRUGO, NULL),
// PROC_REG("zoneinfo", S_IRUGO, NULL),
PROC_TERM
};

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,73 @@
/**
* \file sysfs.h
* License details are found in the file LICENSE.
* \brief
* sysfs framework API definitions
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com> \par
* Copyright (C) 2016 RIKEN AICS
*/
/*
* HISTORY:
*/
#ifndef MCCTRL_SYSFS_H
#define MCCTRL_SYSFS_H
#define SYSFS_PATH_MAX 1024
/* for sysfs_unlinkf() */
#define SYSFS_UNLINK_KEEP_ANCESTOR 0x01
/*
 * Callback table attached to a sysfs file created with sysfsm_createf().
 * Every callback receives the table itself and the instance pointer that
 * was registered together with it.
 */
struct sysfsm_ops {
/* Fill buf (at most bufsize bytes); presumably returns the length -- TODO confirm */
ssize_t (*show)(struct sysfsm_ops *ops, void *instance, void *buf,
size_t bufsize);
/* Consume bufsize bytes from buf; presumably returns the length -- TODO confirm */
ssize_t (*store)(struct sysfsm_ops *ops, void *instance,
const void *buf, size_t bufsize);
/* Presumably called when the file goes away -- TODO confirm */
void (*release)(struct sysfsm_ops *ops, void *instance);
};
/*
 * Opaque handle of a sysfs object, as produced by sysfsm_mkdirf() and
 * sysfsm_lookupf() and consumed by sysfsm_symlinkf().
 */
struct sysfs_handle {
long handle;
};
typedef struct sysfs_handle sysfs_handle_t;
/*
 * Description of a bitmap.
 * NOTE(review): presumably the instance argument for the bitmap snooping
 * ops (SYSFS_SNOOPING_OPS_pb / _pbl) -- confirm against the users.
 */
struct sysfsm_bitmap_param {
int nbits; /* presumably the number of valid bits behind ptr -- TODO confirm */
int padding;
void *ptr; /* presumably the bitmap storage -- TODO confirm */
};
/*
 * Small integers cast to pointers serve as "special" ops values instead of
 * a real struct sysfsm_ops address. The reserved range is
 * [SYSFS_SPECIAL_OPS_MIN, SYSFS_SPECIAL_OPS_MAX].
 */
#define SYSFS_SPECIAL_OPS_MIN ((void *)1)
#define SYSFS_SPECIAL_OPS_MAX ((void *)1000)
#define SYSFS_SNOOPING_OPS_d32 ((void *)1)
#define SYSFS_SNOOPING_OPS_d64 ((void *)2)
#define SYSFS_SNOOPING_OPS_u32 ((void *)3)
#define SYSFS_SNOOPING_OPS_u64 ((void *)4)
#define SYSFS_SNOOPING_OPS_s ((void *)5)
#define SYSFS_SNOOPING_OPS_pbl ((void *)6)
#define SYSFS_SNOOPING_OPS_pb ((void *)7)
#define SYSFS_SNOOPING_OPS_u32K ((void *)8)
/* Return 1 if ops lies inside the reserved special range, 0 otherwise. */
static inline int is_special_sysfs_ops(void *ops)
{
	const long value = (long)ops;
	const long lowest = (long)SYSFS_SPECIAL_OPS_MIN;
	const long highest = (long)SYSFS_SPECIAL_OPS_MAX;

	if (value < lowest)
		return 0;
	return value <= highest;
}
extern int sysfsm_createf(ihk_os_t os, struct sysfsm_ops *ops, void *instance,
int mode, const char *fmt, ...);
extern int sysfsm_mkdirf(ihk_os_t os, sysfs_handle_t *dirhp,
const char *fmt, ...);
extern int sysfsm_symlinkf(ihk_os_t os, sysfs_handle_t targeth,
const char *fmt, ...);
extern int sysfsm_lookupf(ihk_os_t os, sysfs_handle_t *objhp,
const char *fmt, ...);
extern int sysfsm_unlinkf(ihk_os_t os, int flags, const char *fmt, ...);
extern void sysfsm_cleanup(ihk_os_t os);
extern void sysfsm_packet_handler(void *os, int msg, int err, long arg1,
long arg2);
#endif /* MCCTRL_SYSFS_H */

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,88 @@
/**
* \file sysfs_msg.h
* License details are found in the file LICENSE.
* \brief
* message declarations for sysfs framework
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com> \par
* Copyright (C) 2015 RIKEN AICS
*/
/*
* HISTORY:
*/
#ifndef MCKERNEL_SYSFS_MSG_H
#define MCKERNEL_SYSFS_MSG_H
#define SYSFS_PATH_MAX 1024
/*
 * Message body of a sysfs "create file" request.
 * NOTE(review): field roles are inferred from names only; presumably
 * error is the result and busy a completion flag -- confirm against the
 * sender and receiver.
 */
struct sysfs_req_create_param {
int mode;
int error;
long client_ops;
long client_instance;
char path[SYSFS_PATH_MAX];
int padding;
int busy;
}; /* struct sysfs_req_create_param */
#define SYSFS_SPECIAL_OPS_MIN ((void *)1)
#define SYSFS_SPECIAL_OPS_MAX ((void *)1000)
#define SYSFS_SNOOPING_OPS_d32 ((void *)1)
#define SYSFS_SNOOPING_OPS_d64 ((void *)2)
#define SYSFS_SNOOPING_OPS_u32 ((void *)3)
#define SYSFS_SNOOPING_OPS_u64 ((void *)4)
#define SYSFS_SNOOPING_OPS_s ((void *)5)
#define SYSFS_SNOOPING_OPS_pbl ((void *)6)
#define SYSFS_SNOOPING_OPS_pb ((void *)7)
#define SYSFS_SNOOPING_OPS_u32K ((void *)8)
/*
 * Message body of a sysfs "make directory" request.
 * NOTE(review): presumably handle receives the handle of the new directory
 * and error/busy carry result and completion -- confirm against users.
 */
struct sysfs_req_mkdir_param {
int error;
int padding;
long handle;
char path[SYSFS_PATH_MAX];
int padding2;
int busy;
}; /* struct sysfs_req_mkdir_param */
/*
 * Message body of a sysfs "create symlink" request.
 * NOTE(review): presumably target is the handle of the link target and
 * path the name of the link -- confirm against users.
 */
struct sysfs_req_symlink_param {
int error;
int padding;
long target;
char path[SYSFS_PATH_MAX];
int padding2;
int busy;
}; /* struct sysfs_req_symlink_param */
/*
 * Message body of a sysfs "lookup" request.
 * NOTE(review): presumably handle receives the handle of the object found
 * at path -- confirm against users.
 */
struct sysfs_req_lookup_param {
int error;
int padding;
long handle;
char path[SYSFS_PATH_MAX];
int padding2;
int busy;
}; /* struct sysfs_req_lookup_param */
/* for sysfs_req_unlink_param.flags */
#define SYSFS_UNLINK_KEEP_ANCESTOR 0x01
/*
 * Message body of a sysfs "unlink" request.
 * flags takes SYSFS_UNLINK_* values such as SYSFS_UNLINK_KEEP_ANCESTOR.
 * NOTE(review): error/busy presumably carry result and completion --
 * confirm against users.
 */
struct sysfs_req_unlink_param {
int flags;
int error;
char path[SYSFS_PATH_MAX];
int padding;
int busy;
}; /* struct sysfs_req_unlink_param */
/*
 * Message body of the sysfs "setup" request.
 * padding3 occupies the space that path[] takes in the other request
 * structures of this header.
 * NOTE(review): presumably buf_rpa/bufsize describe a shared buffer by
 * physical address and size -- confirm against users.
 */
struct sysfs_req_setup_param {
int error;
int padding;
long buf_rpa;
long bufsize;
char padding3[SYSFS_PATH_MAX];
int padding2;
int busy;
}; /* struct sysfs_req_setup_param */
#endif /* MCKERNEL_SYSFS_MSG_H */

View File

@ -0,0 +1,40 @@
# Select the bundled mcoverlayfs source tree that matches the running kernel
# (if any) and delegate the modules/install targets to it.
ENABLE_MCOVERLAYFS=@ENABLE_MCOVERLAYFS@

# Decompose `uname -r` into the numeric kernel version and, when present,
# the distribution release number that follows the first '-'.
RELEASE=$(shell uname -r)
MAJOR=$(shell echo ${RELEASE} | sed -e 's/^\([0-9]*\).*/\1/')
MINOR=$(shell echo ${RELEASE} | sed -e 's/^[0-9]*.\([0-9]*\).*/\1/')
PATCH=$(shell echo ${RELEASE} | sed -e 's/^[0-9]*.[0-9]*.\([0-9]*\).*/\1/')
LINUX_VERSION_CODE=$(shell expr \( ${MAJOR} \* 65536 \) + \( ${MINOR} \* 256 \) + ${PATCH})
RHEL_RELEASE_TMP=$(shell echo ${RELEASE} | sed -e 's/^[0-9]*.[0-9]*.[0-9]*-\([0-9]*\).*/\1/')
# Use the POSIX "=" string comparison: "==" is a bashism and fails when
# /bin/sh is not bash.
RHEL_RELEASE=$(shell if [ "${RELEASE}" = "${RHEL_RELEASE_TMP}" ]; then echo ""; else echo ${RHEL_RELEASE_TMP}; fi)
BUILD_MODULE_TMP=$(shell if [ "${RHEL_RELEASE}" = "" ]; then echo "org"; else echo "rhel"; fi)
BUILD_MODULE=none
ifeq ($(ENABLE_MCOVERLAYFS),yes)
ifeq ($(BUILD_MODULE_TMP),org)
ifeq ($(BUILD_MODULE),none)
# 262144 = 4.0.0, 262400 = 4.1.0
BUILD_MODULE=$(shell if [ ${LINUX_VERSION_CODE} -ge 262144 -a ${LINUX_VERSION_CODE} -lt 262400 ]; then echo "linux-4.0.9"; else echo "none"; fi)
endif
endif
ifeq ($(BUILD_MODULE_TMP),rhel)
ifeq ($(BUILD_MODULE),none)
# 199168 = 3.10.0
BUILD_MODULE=$(shell if [ ${LINUX_VERSION_CODE} -eq 199168 -a ${RHEL_RELEASE} -eq 327 ]; then echo "linux-3.10.0-327.36.1.el7"; else echo "none"; fi)
endif
endif
endif

.PHONY: clean install modules

# $(MAKE) instead of a bare "make" so flags and the jobserver are inherited.
modules:
ifneq ($(BUILD_MODULE),none)
	@(cd $(BUILD_MODULE); $(MAKE) modules)
endif

clean:
	@(cd linux-3.10.0-327.36.1.el7; $(MAKE) clean)
	@(cd linux-4.0.9; $(MAKE) clean)

install:
ifneq ($(BUILD_MODULE),none)
	@(cd $(BUILD_MODULE); $(MAKE) install)
endif

View File

@ -0,0 +1,21 @@
# Out-of-tree build of the mcoverlay kernel module.
# KDIR, ARCH, KMODDIR and src are substituted by configure.
KDIR ?= @KDIR@
ARCH ?= @ARCH@
KMODDIR = @KMODDIR@
src = @abs_srcdir@
# mcoverlay.ko is linked from the objects listed in mcoverlay-y.
obj-m += mcoverlay.o
mcoverlay-y := copy_up.o dir.o inode.o readdir.o super.o
.PHONY: clean install modules
# Delegate the actual build to the kernel build system in KDIR.
modules:
$(MAKE) -C $(KDIR) M=$(PWD) SUBDIRS=$(PWD) ARCH=$(ARCH) modules
clean:
$(RM) .*.cmd *.mod.c *.o *.ko* Module.symvers modules.order -r .tmp*
# Install the built module into KMODDIR.
install:
mkdir -p -m 755 $(KMODDIR)
install -m 644 mcoverlay.ko $(KMODDIR)

View File

@ -0,0 +1,461 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/splice.h>
#include <linux/xattr.h>
#include <linux/security.h>
#include <linux/uaccess.h>
#include <linux/sched.h>
#include <linux/namei.h>
#include <linux/fdtable.h>
#include <linux/ratelimit.h>
#include "overlayfs.h"
#define OVL_COPY_UP_CHUNK_SIZE (1 << 20)
/*
 * Module parameter "check_copy_up": when non-zero, a copy-up warns if the
 * causing process also holds a file descriptor on the lower file.
 * The description must be registered under the parameter's external name
 * (check_copy_up), not the name of the backing variable.
 */
static unsigned ovl_check_copy_up = 1;
module_param_named(check_copy_up, ovl_check_copy_up, uint,
		   S_IWUSR | S_IRUGO);
MODULE_PARM_DESC(check_copy_up,
		 "Warn on copy-up when causing process also has a R/O fd open");
/*
 * iterate_fd() callback: emit a rate-limited warning if open file f refers
 * to the dentry passed in data. Always returns 0 so that iteration
 * continues over the remaining descriptors.
 */
static int ovl_check_fd(const void *data, struct file *f, unsigned fd)
{
	const struct dentry *target = data;

	if (f->f_path.dentry != target)
		return 0;

	pr_warn_ratelimited("overlayfs: Warning: Copying up %pD, but open R/O on fd %u which will cease to be coherent [pid=%d %s]\n",
			    f, fd, current->pid, current->comm);
	return 0;
}
/*
 * Scan the file descriptors of the current process and warn when the
 * following situation is about to arise:
 *
 *	fd1 = open("foo", O_RDONLY);
 *	fd2 = open("foo", O_RDWR);
 *
 * The scan is skipped entirely when the check_copy_up module parameter
 * is 0.
 */
static void ovl_do_check_copy_up(struct dentry *dentry)
{
	if (!ovl_check_copy_up)
		return;

	iterate_fd(current->files, 0, ovl_check_fd, dentry);
}
/*
 * Copy every extended attribute of old to new.
 *
 * Returns 0 on success, when either inode has no getxattr operation, or
 * when listing attributes is not supported; otherwise a negative errno
 * from listing, reading or setting an attribute, or -ENOMEM.
 */
int ovl_copy_xattr(struct dentry *old, struct dentry *new)
{
ssize_t list_size, size, value_size = 0;
char *buf, *name, *value = NULL;
int uninitialized_var(error);
if (!old->d_inode->i_op->getxattr ||
!new->d_inode->i_op->getxattr)
return 0;
/* First call only asks for the size of the name list. */
list_size = vfs_listxattr(old, NULL, 0);
if (list_size <= 0) {
if (list_size == -EOPNOTSUPP)
return 0;
return list_size;
}
buf = kzalloc(list_size, GFP_KERNEL);
if (!buf)
return -ENOMEM;
list_size = vfs_listxattr(old, buf, list_size);
if (list_size <= 0) {
error = list_size;
goto out;
}
/* buf holds consecutive NUL-terminated attribute names. */
for (name = buf; name < (buf + list_size); name += strlen(name) + 1) {
retry:
size = vfs_getxattr(old, name, value, value_size);
/* Value buffer too small: ask for the required size instead. */
if (size == -ERANGE)
size = vfs_getxattr(old, name, NULL, 0);
if (size < 0) {
error = size;
break;
}
/* Grow the shared value buffer and read this attribute again. */
if (size > value_size) {
/* NOTE: this local shadows the `new` parameter inside this block only. */
void *new;
new = krealloc(value, size, GFP_KERNEL);
if (!new) {
error = -ENOMEM;
break;
}
value = new;
value_size = size;
goto retry;
}
error = vfs_setxattr(new, name, value, size, 0);
if (error)
break;
}
kfree(value);
out:
kfree(buf);
return error;
}
/*
 * Copy len bytes of file data from old to new in chunks of at most
 * OVL_COPY_UP_CHUNK_SIZE using do_splice_direct().
 *
 * Returns 0 on success (including len == 0), -EINTR if a fatal signal is
 * pending between chunks, or the error from opening either file or from
 * the splice. A splice that returns 0 bytes ends the loop with error 0
 * even if len has not been reached.
 */
static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
{
struct file *old_file;
struct file *new_file;
loff_t old_pos = 0;
loff_t new_pos = 0;
int error = 0;
if (len == 0)
return 0;
old_file = ovl_path_open(old, O_RDONLY);
if (IS_ERR(old_file))
return PTR_ERR(old_file);
new_file = ovl_path_open(new, O_WRONLY);
if (IS_ERR(new_file)) {
error = PTR_ERR(new_file);
goto out_fput;
}
/* FIXME: copy up sparse files efficiently */
while (len) {
size_t this_len = OVL_COPY_UP_CHUNK_SIZE;
long bytes;
if (len < this_len)
this_len = len;
/* Allow a large copy to be aborted by a fatal signal. */
if (signal_pending_state(TASK_KILLABLE, current)) {
error = -EINTR;
break;
}
bytes = do_splice_direct(old_file, &old_pos,
new_file, &new_pos,
this_len, SPLICE_F_MOVE);
if (bytes <= 0) {
error = bytes;
break;
}
/* Both files are expected to advance in lock step. */
WARN_ON(old_pos != new_pos);
len -= bytes;
}
fput(new_file);
out_fput:
fput(old_file);
return error;
}
/*
 * Read the target of the symlink realdentry into a freshly allocated page.
 *
 * Returns the NUL-terminated target (to be released with free_page() by
 * the caller) or an ERR_PTR: -EINVAL if the inode has no readlink
 * operation, -ENOMEM if no page is available, or the error returned by
 * readlink.
 */
static char *ovl_read_symlink(struct dentry *realdentry)
{
	struct inode *inode = realdentry->d_inode;
	mm_segment_t saved_fs;
	char *page;
	int len;

	if (!inode->i_op->readlink)
		return ERR_PTR(-EINVAL);

	page = (char *) __get_free_page(GFP_KERNEL);
	if (!page)
		return ERR_PTR(-ENOMEM);

	saved_fs = get_fs();
	set_fs(get_ds());
	/* The cast to a user pointer is valid due to the set_fs() */
	len = inode->i_op->readlink(realdentry,
				    (char __user *)page, PAGE_SIZE - 1);
	set_fs(saved_fs);

	if (len < 0) {
		free_page((unsigned long) page);
		return ERR_PTR(len);
	}

	page[len] = '\0';
	return page;
}
static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
{
struct iattr attr = {
.ia_valid =
ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET,
.ia_atime = stat->atime,
.ia_mtime = stat->mtime,
};
return notify_change(upperdentry, &attr, NULL);
}
int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
{
int err = 0;
if (!S_ISLNK(stat->mode)) {
struct iattr attr = {
.ia_valid = ATTR_MODE,
.ia_mode = stat->mode,
};
err = notify_change(upperdentry, &attr, NULL);
}
if (!err) {
struct iattr attr = {
.ia_valid = ATTR_UID | ATTR_GID,
.ia_uid = stat->uid,
.ia_gid = stat->gid,
};
err = notify_change(upperdentry, &attr, NULL);
}
if (!err)
ovl_set_timestamps(upperdentry, stat);
return err;
}
/*
 * Perform the actual copy-up of dentry.
 *
 * The new object is created under a temporary name in workdir, filled with
 * the data (regular files), extended attributes and attributes of the
 * lower object, and finally renamed into upperdir under the name of
 * dentry.
 *
 * workdir/upperdir  work and upper parent directories
 * dentry            overlay dentry being copied up
 * lowerpath         the lower object to copy from
 * stat              attributes of the lower object
 * attr              optional additional attributes to apply (may be NULL)
 * link              symlink target when the object is a symlink
 *
 * Returns 0 on success or a negative errno. On failure after the temporary
 * object has been created, it is removed again with ovl_cleanup().
 */
static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
			      struct dentry *dentry, struct path *lowerpath,
			      struct kstat *stat, struct iattr *attr,
			      const char *link)
{
	struct inode *wdir = workdir->d_inode;
	struct inode *udir = upperdir->d_inode;
	struct dentry *newdentry = NULL;
	struct dentry *upper = NULL;
	umode_t mode = stat->mode;
	int err;

	newdentry = ovl_lookup_temp(workdir, dentry);
	err = PTR_ERR(newdentry);
	if (IS_ERR(newdentry))
		goto out;

	upper = lookup_one_len(dentry->d_name.name, upperdir,
			       dentry->d_name.len);
	err = PTR_ERR(upper);
	if (IS_ERR(upper))
		goto out1;

	/* Can't properly set mode on creation because of the umask */
	stat->mode &= S_IFMT;
	err = ovl_create_real(wdir, newdentry, stat, link, NULL, true);
	stat->mode = mode;
	if (err)
		goto out2;

	if (S_ISREG(stat->mode)) {
		struct path upperpath;

		ovl_path_upper(dentry, &upperpath);
		BUG_ON(upperpath.dentry != NULL);
		upperpath.dentry = newdentry;

		err = ovl_copy_up_data(lowerpath, &upperpath, stat->size);
		if (err)
			goto out_cleanup;
	}

	err = ovl_copy_xattr(lowerpath->dentry, newdentry);
	if (err)
		goto out_cleanup;

	mutex_lock(&newdentry->d_inode->i_mutex);
	err = ovl_set_attr(newdentry, stat);
	if (!err && attr)
		err = notify_change(newdentry, attr, NULL);
	mutex_unlock(&newdentry->d_inode->i_mutex);
	if (err)
		goto out_cleanup;

	err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
	if (err)
		goto out_cleanup;

	ovl_dentry_update(dentry, newdentry);
	/* The reference on newdentry now belongs to the overlay dentry. */
	newdentry = NULL;

	/*
	 * Non-directories become opaque when copied up.
	 */
	if (!S_ISDIR(stat->mode))
		ovl_dentry_set_opaque(dentry, true);
out2:
	dput(upper);
out1:
	dput(newdentry);
out:
	return err;

out_cleanup:
	ovl_cleanup(wdir, newdentry);
	/*
	 * Leave through out2 so the references on upper and newdentry are
	 * dropped; jumping straight to out would leak both.
	 */
	goto out2;
}
/*
* Copy up a single dentry
*
* Directory renames only allowed on "pure upper" (already created on
* upper filesystem, never copied up). Directories which are on lower or
* are merged may not be renamed. For these -EXDEV is returned and
* userspace has to deal with it. This means, when copying up a
* directory we can rely on it and ancestors being stable.
*
* Non-directory renames start with copy up of source if necessary. The
* actual rename will only proceed once the copy up was successful. Copy
* up uses upper parent i_mutex for exclusion. Since rename can change
* d_parent it is possible that the copy up will lock the old parent. At
* that point the file will have already been copied up anyway.
*
* @parent:    overlay parent of @dentry; its upper dentry is the directory
*             the copy is created in
* @dentry:    overlay dentry to copy up
* @lowerpath: lower path used as the source of the copy
* @stat:      attributes of the lower object; uid/gid are used as fsuid/fsgid
*             while the copy is created
* @attr:      optional attribute change to apply to the upper object, may be
*             NULL
*
* Returns 0 on success or a negative errno.
*/
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
struct path *lowerpath, struct kstat *stat,
struct iattr *attr)
{
struct dentry *workdir = ovl_workdir(dentry);
int err;
struct kstat pstat;
struct path parentpath;
struct dentry *upperdir;
struct dentry *upperdentry;
const struct cred *old_cred;
struct cred *override_cred;
char *link = NULL;
/* The copy is handed to ovl_copy_up_locked() together with workdir */
if (WARN_ON(!workdir))
return -EROFS;
ovl_do_check_copy_up(lowerpath->dentry);
ovl_path_upper(parent, &parentpath);
upperdir = parentpath.dentry;
/* Remember the parent's attributes; its timestamps are restored below */
err = vfs_getattr(&parentpath, &pstat);
if (err)
return err;
/* For symlinks fetch the link target from the lower layer up front */
if (S_ISLNK(stat->mode)) {
link = ovl_read_symlink(lowerpath->dentry);
if (IS_ERR(link))
return PTR_ERR(link);
}
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_free_link;
/* Create the copy with the ownership of the lower object */
override_cred->fsuid = stat->uid;
override_cred->fsgid = stat->gid;
/*
* CAP_SYS_ADMIN for copying up extended attributes
* CAP_DAC_OVERRIDE for create
* CAP_FOWNER for chmod, timestamp update
* CAP_FSETID for chmod
* CAP_CHOWN for chown
* CAP_MKNOD for mknod
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
cap_raise(override_cred->cap_effective, CAP_FSETID);
cap_raise(override_cred->cap_effective, CAP_CHOWN);
cap_raise(override_cred->cap_effective, CAP_MKNOD);
old_cred = override_creds(override_cred);
err = -EIO;
/*
* A non-NULL return from lock_rename() is treated as an error here; the
* out_unlock path still calls unlock_rename() for this case.
*/
if (lock_rename(workdir, upperdir) != NULL) {
pr_err("overlayfs: failed to lock workdir+upperdir\n");
goto out_unlock;
}
upperdentry = ovl_dentry_upper(dentry);
if (upperdentry) {
unlock_rename(workdir, upperdir);
err = 0;
/* Raced with another copy-up? Do the setattr here */
if (attr) {
mutex_lock(&upperdentry->d_inode->i_mutex);
err = notify_change(upperdentry, attr, NULL);
mutex_unlock(&upperdentry->d_inode->i_mutex);
}
/* Already unlocked above, so skip out_unlock */
goto out_put_cred;
}
err = ovl_copy_up_locked(workdir, upperdir, dentry, lowerpath,
stat, attr, link);
if (!err) {
/* Restore timestamps on parent (best effort) */
ovl_set_timestamps(upperdir, &pstat);
}
out_unlock:
unlock_rename(workdir, upperdir);
out_put_cred:
revert_creds(old_cred);
put_cred(override_cred);
out_free_link:
/* link is either NULL or a page obtained from ovl_read_symlink() */
if (link)
free_page((unsigned long) link);
return err;
}
/*
 * Copy up @dentry together with every ancestor that has no upper dentry yet.
 *
 * Each pass walks from @dentry towards the root until it finds an entry
 * whose parent already is on the upper layer, copies that entry up and
 * starts over.  The loop ends once @dentry itself is of an upper type or
 * an error occurred.
 *
 * Returns 0 on success or a negative errno.
 */
int ovl_copy_up(struct dentry *dentry)
{
	int err = 0;

	while (!err && !OVL_TYPE_UPPER(ovl_path_type(dentry))) {
		struct path lowerpath;
		struct kstat stat;
		struct dentry *cur = dget(dentry);
		struct dentry *parent = dget_parent(cur);

		/* find the topmost dentry not yet copied up */
		while (!OVL_TYPE_UPPER(ovl_path_type(parent))) {
			dput(cur);
			cur = parent;
			parent = dget_parent(cur);
		}

		ovl_path_lower(cur, &lowerpath);
		err = vfs_getattr(&lowerpath, &stat);
		if (!err)
			err = ovl_copy_up_one(parent, cur, &lowerpath, &stat, NULL);

		dput(parent);
		dput(cur);
	}

	return err;
}

View File

@ -0,0 +1,972 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/xattr.h>
#include <linux/security.h>
#include <linux/cred.h>
#include "overlayfs.h"
/*
 * Remove @wdentry from directory @wdir: rmdir for directories, unlink for
 * everything else.  A failure is logged but not reported to the caller.
 */
void ovl_cleanup(struct inode *wdir, struct dentry *wdentry)
{
	int err;

	/* keep a reference across the removal */
	dget(wdentry);
	err = S_ISDIR(wdentry->d_inode->i_mode) ?
		ovl_do_rmdir(wdir, wdentry) :
		ovl_do_unlink(wdir, wdentry);
	dput(wdentry);

	if (!err)
		return;

	pr_err("overlayfs: cleanup of '%pd2' failed (%i)\n",
	       wdentry, err);
}
/*
 * Look up a temporary name in @workdir.  The name is '#' followed by the
 * address of @dentry in hex.
 *
 * Returns the (negative) dentry on success, the lookup error as ERR_PTR, or
 * ERR_PTR(-EIO) when an entry of that name already exists.
 */
struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry)
{
	char name[20];
	struct dentry *temp;

	snprintf(name, sizeof(name), "#%lx", (unsigned long) dentry);
	temp = lookup_one_len(name, workdir, strlen(name));
	if (IS_ERR(temp) || !temp->d_inode)
		return temp;

	pr_err("overlayfs: workdir/%s already exists\n", name);
	dput(temp);

	return ERR_PTR(-EIO);
}
/*
 * Create a whiteout under a temporary name in @workdir.
 *
 * Caller holds i_mutex on workdir.  Returns the whiteout dentry or an
 * ERR_PTR.
 */
static struct dentry *ovl_whiteout(struct dentry *workdir,
				   struct dentry *dentry)
{
	struct dentry *wh = ovl_lookup_temp(workdir, dentry);
	int err;

	if (IS_ERR(wh))
		return wh;

	err = ovl_do_whiteout(workdir->d_inode, wh);
	if (!err)
		return wh;

	dput(wh);
	return ERR_PTR(err);
}
/*
 * Create the object described by @stat (or a hard link to @hardlink when
 * that is non-NULL) as @newdentry inside @dir.
 *
 * @link is the target used for symlinks; @debug is passed through to the
 * ovl_do_*() helpers.
 *
 * Returns 0 on success, -ESTALE if @newdentry is already positive, -EPERM
 * for an unknown file type, -ENOENT if the dentry is still negative after a
 * successful create, or the error from the underlying operation.
 */
int ovl_create_real(struct inode *dir, struct dentry *newdentry,
		    struct kstat *stat, const char *link,
		    struct dentry *hardlink, bool debug)
{
	int err;

	if (newdentry->d_inode)
		return -ESTALE;

	if (hardlink)
		err = ovl_do_link(hardlink, dir, newdentry, debug);
	else if (S_ISREG(stat->mode))
		err = ovl_do_create(dir, newdentry, stat->mode, debug);
	else if (S_ISDIR(stat->mode))
		err = ovl_do_mkdir(dir, newdentry, stat->mode, debug);
	else if (S_ISCHR(stat->mode) || S_ISBLK(stat->mode) ||
		 S_ISFIFO(stat->mode) || S_ISSOCK(stat->mode))
		err = ovl_do_mknod(dir, newdentry,
				   stat->mode, stat->rdev, debug);
	else if (S_ISLNK(stat->mode))
		err = ovl_do_symlink(dir, newdentry, link, debug);
	else
		err = -EPERM;

	if (err)
		return err;

	/*
	 * Not quite sure if non-instantiated dentry is legal or not.
	 * VFS doesn't seem to care so check and warn here.
	 */
	if (WARN_ON(!newdentry->d_inode))
		return -ENOENT;

	return 0;
}
/* Set the OVL_XATTR_OPAQUE xattr to "y" on @upperdentry. */
static int ovl_set_opaque(struct dentry *upperdentry)
{
	int err;

	err = ovl_do_setxattr(upperdentry, OVL_XATTR_OPAQUE, "y", 1, 0);

	return err;
}
/*
 * Remove the OVL_XATTR_OPAQUE xattr from @upperdentry.  Failure is only
 * logged as a warning.
 */
static void ovl_remove_opaque(struct dentry *upperdentry)
{
	int err = ovl_do_removexattr(upperdentry, OVL_XATTR_OPAQUE);

	if (!err)
		return;

	pr_warn("overlayfs: failed to remove opaque from '%s' (%i)\n",
		upperdentry->d_name.name, err);
}
static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
int err;
enum ovl_path_type type;
struct path realpath;
type = ovl_path_real(dentry, &realpath);
err = vfs_getattr(&realpath, stat);
if (err)
return err;
stat->dev = dentry->d_sb->s_dev;
stat->ino = dentry->d_inode->i_ino;
/*
* It's probably not worth it to count subdirs to get the
* correct link count. nlink=1 seems to pacify 'find' and
* other utilities.
*/
if (OVL_TYPE_MERGE(type))
stat->nlink = 1;
return 0;
}
/*
 * Create a new object directly in the upper directory of @dentry's parent.
 *
 * @dentry:   overlay dentry being created
 * @inode:    overlay inode to instantiate @dentry with on success
 * @stat:     mode/rdev of the object to create
 * @link:     symlink target, passed through to ovl_create_real()
 * @hardlink: if non-NULL, create a hard link to this upper dentry instead
 *
 * Returns 0 on success or a negative errno.
 */
static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
struct kstat *stat, const char *link,
struct dentry *hardlink)
{
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *udir = upperdir->d_inode;
struct dentry *newdentry;
int err;
/* Lookup and create both happen under the upper parent's i_mutex */
mutex_lock_nested(&udir->i_mutex, I_MUTEX_PARENT);
newdentry = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out_unlock;
err = ovl_create_real(udir, newdentry, stat, link, hardlink, false);
if (err)
goto out_dput;
ovl_dentry_version_inc(dentry->d_parent);
ovl_dentry_update(dentry, newdentry);
ovl_copyattr(newdentry->d_inode, inode);
d_instantiate(dentry, inode);
/* newdentry was handed to ovl_dentry_update(); skip the dput below */
newdentry = NULL;
out_dput:
dput(newdentry);
out_unlock:
mutex_unlock(&udir->i_mutex);
return err;
}
/*
 * Take the rename locks on @workdir and @upperdir.
 *
 * Returns 0 with both directories locked, or -EIO (nothing left locked)
 * when the two are identical or lock_rename() returns non-NULL.
 */
static int ovl_lock_rename_workdir(struct dentry *workdir,
				   struct dentry *upperdir)
{
	/* Workdir should not be the same as upperdir */
	if (workdir != upperdir) {
		/* Workdir should not be subdir of upperdir and vice versa */
		if (lock_rename(workdir, upperdir) == NULL)
			return 0;

		unlock_rename(workdir, upperdir);
	}

	pr_err("overlayfs: failed to lock workdir+upperdir\n");
	return -EIO;
}
/*
 * Replace the upper directory of @dentry by a fresh, opaque one.
 *
 * A temporary directory with the attributes and xattrs of the current upper
 * directory is created in the work directory, marked opaque and exchanged
 * with the upper directory (RENAME_EXCHANGE).  The old directory, which now
 * sits in the work directory, has the whiteouts in @list cleaned up and is
 * then removed.  @dentry is unhashed because its upper dentry no longer
 * matches.
 *
 * @dentry: overlay directory dentry
 * @list:   entries passed on to ovl_cleanup_whiteouts()
 *
 * Returns the new opaque directory dentry, or an ERR_PTR on failure
 * (-EROFS when there is no work directory, -ESTALE when the upper dentry is
 * not a directory or is no longer a child of the parent's upper directory).
 */
static struct dentry *ovl_clear_empty(struct dentry *dentry,
				      struct list_head *list)
{
	struct dentry *workdir = ovl_workdir(dentry);
	struct inode *wdir;
	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
	struct inode *udir = upperdir->d_inode;
	struct path upperpath;
	struct dentry *upper;
	struct dentry *opaquedir;
	struct kstat stat;
	int err;

	if (WARN_ON(!workdir))
		return ERR_PTR(-EROFS);

	/* Dereference workdir only after it is known to be non-NULL */
	wdir = workdir->d_inode;

	err = ovl_lock_rename_workdir(workdir, upperdir);
	if (err)
		goto out;

	ovl_path_upper(dentry, &upperpath);
	err = vfs_getattr(&upperpath, &stat);
	if (err)
		goto out_unlock;

	err = -ESTALE;
	if (!S_ISDIR(stat.mode))
		goto out_unlock;
	upper = upperpath.dentry;
	if (upper->d_parent->d_inode != udir)
		goto out_unlock;

	opaquedir = ovl_lookup_temp(workdir, dentry);
	err = PTR_ERR(opaquedir);
	if (IS_ERR(opaquedir))
		goto out_unlock;

	err = ovl_create_real(wdir, opaquedir, &stat, NULL, NULL, true);
	if (err)
		goto out_dput;

	err = ovl_copy_xattr(upper, opaquedir);
	if (err)
		goto out_cleanup;

	err = ovl_set_opaque(opaquedir);
	if (err)
		goto out_cleanup;

	mutex_lock(&opaquedir->d_inode->i_mutex);
	err = ovl_set_attr(opaquedir, &stat);
	mutex_unlock(&opaquedir->d_inode->i_mutex);
	if (err)
		goto out_cleanup;

	err = ovl_do_rename(wdir, opaquedir, udir, upper, RENAME_EXCHANGE);
	if (err)
		goto out_cleanup;

	/* After the exchange the old directory lives in the work directory */
	ovl_cleanup_whiteouts(upper, list);
	ovl_cleanup(wdir, upper);
	unlock_rename(workdir, upperdir);

	/* dentry's upper doesn't match now, get rid of it */
	d_drop(dentry);

	return opaquedir;

out_cleanup:
	ovl_cleanup(wdir, opaquedir);
out_dput:
	dput(opaquedir);
out_unlock:
	unlock_rename(workdir, upperdir);
out:
	return ERR_PTR(err);
}
/*
 * Check that the directory @dentry is empty and, if it has an upper dentry,
 * replace that by an opaque directory via ovl_clear_empty().
 *
 * Returns an ERR_PTR if the emptiness check fails, NULL if there is no
 * upper dentry, otherwise whatever ovl_clear_empty() returns.
 */
static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry)
{
	LIST_HEAD(list);
	struct dentry *ret;
	int err = ovl_check_empty_dir(dentry, &list);

	/*
	 * If no upperdentry then skip clearing whiteouts.
	 *
	 * Can race with copy-up, since we don't hold the upperdir
	 * mutex.  Doesn't matter, since copy-up can't create a
	 * non-empty directory from an empty one.
	 */
	if (err)
		ret = ERR_PTR(err);
	else if (ovl_dentry_upper(dentry))
		ret = ovl_clear_empty(dentry, &list);
	else
		ret = NULL;

	ovl_cache_free(&list);

	return ret;
}
/*
 * Create a new object at a name for which ovl_dentry_is_opaque() is true.
 *
 * The object is first created under a temporary name in the work directory
 * and then renamed over the existing upper entry.  Directories are marked
 * opaque and swapped in with RENAME_EXCHANGE, after which the entry that
 * ended up in the work directory is removed; other types use a plain rename.
 *
 * @dentry:   overlay dentry being created
 * @inode:    overlay inode to instantiate @dentry with on success
 * @stat:     mode/rdev of the object to create
 * @link:     symlink target, passed through to ovl_create_real()
 * @hardlink: if non-NULL, create a hard link to this upper dentry instead
 *
 * Returns 0 on success, -EROFS when there is no work directory, or another
 * negative errno.
 */
static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
				    struct kstat *stat, const char *link,
				    struct dentry *hardlink)
{
	struct dentry *workdir = ovl_workdir(dentry);
	struct inode *wdir;
	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
	struct inode *udir = upperdir->d_inode;
	struct dentry *upper;
	struct dentry *newdentry;
	int err;

	if (WARN_ON(!workdir))
		return -EROFS;

	/* Dereference workdir only after it is known to be non-NULL */
	wdir = workdir->d_inode;

	err = ovl_lock_rename_workdir(workdir, upperdir);
	if (err)
		goto out;

	newdentry = ovl_lookup_temp(workdir, dentry);
	err = PTR_ERR(newdentry);
	if (IS_ERR(newdentry))
		goto out_unlock;

	upper = lookup_one_len(dentry->d_name.name, upperdir,
			       dentry->d_name.len);
	err = PTR_ERR(upper);
	if (IS_ERR(upper))
		goto out_dput;

	err = ovl_create_real(wdir, newdentry, stat, link, hardlink, true);
	if (err)
		goto out_dput2;

	if (S_ISDIR(stat->mode)) {
		err = ovl_set_opaque(newdentry);
		if (err)
			goto out_cleanup;

		err = ovl_do_rename(wdir, newdentry, udir, upper,
				    RENAME_EXCHANGE);
		if (err)
			goto out_cleanup;

		/* The exchanged-out entry now sits in the work directory */
		ovl_cleanup(wdir, upper);
	} else {
		err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
		if (err)
			goto out_cleanup;
	}
	ovl_dentry_version_inc(dentry->d_parent);
	ovl_dentry_update(dentry, newdentry);
	ovl_copyattr(newdentry->d_inode, inode);
	d_instantiate(dentry, inode);
	/* newdentry was handed to ovl_dentry_update(); skip the dput below */
	newdentry = NULL;
out_dput2:
	dput(upper);
out_dput:
	dput(newdentry);
out_unlock:
	unlock_rename(workdir, upperdir);
out:
	return err;

out_cleanup:
	ovl_cleanup(wdir, newdentry);
	goto out_dput2;
}
/*
 * Common backend for create, mkdir, mknod, symlink and link.
 *
 * Allocates the overlay inode, copies up the parent directory and then
 * creates the object either directly in the upper directory or, when
 * ovl_dentry_is_opaque() is true for @dentry, via ovl_create_over_whiteout()
 * with raised capabilities.
 *
 * @dentry:   overlay dentry being created
 * @mode:     file type and permission bits of the new object
 * @rdev:     device number for device nodes
 * @link:     symlink target or NULL
 * @hardlink: upper dentry to link to, or NULL for a fresh object
 *
 * Returns 0 on success or a negative errno.
 */
static int ovl_create_or_link(struct dentry *dentry, int mode, dev_t rdev,
const char *link, struct dentry *hardlink)
{
int err;
struct inode *inode;
struct kstat stat = {
.mode = mode,
.rdev = rdev,
};
err = -ENOMEM;
inode = ovl_new_inode(dentry->d_sb, mode, dentry->d_fsdata);
if (!inode)
goto out;
err = ovl_copy_up(dentry->d_parent);
if (err)
goto out_iput;
if (!ovl_dentry_is_opaque(dentry)) {
err = ovl_create_upper(dentry, inode, &stat, link, hardlink);
} else {
const struct cred *old_cred;
struct cred *override_cred;
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_iput;
/*
* CAP_SYS_ADMIN for setting opaque xattr
* CAP_DAC_OVERRIDE for create in workdir, rename
* CAP_FOWNER for removing whiteout from sticky dir
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
old_cred = override_creds(override_cred);
err = ovl_create_over_whiteout(dentry, inode, &stat, link,
hardlink);
revert_creds(old_cred);
put_cred(override_cred);
}
/* On success the inode was instantiated into dentry; skip the iput */
if (!err)
inode = NULL;
out_iput:
iput(inode);
out:
return err;
}
/*
 * Create a new (non-hardlink) object while holding write access obtained
 * through ovl_want_write().
 */
static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
			     const char *link)
{
	int err = ovl_want_write(dentry);

	if (err)
		return err;

	err = ovl_create_or_link(dentry, mode, rdev, link, NULL);
	ovl_drop_write(dentry);

	return err;
}
/* ->create: make a regular file with the permission bits of @mode. */
static int ovl_create(struct inode *dir, struct dentry *dentry, umode_t mode,
		      bool excl)
{
	int file_mode = (mode & 07777) | S_IFREG;

	return ovl_create_object(dentry, file_mode, 0, NULL);
}
/* ->mkdir: make a directory with the permission bits of @mode. */
static int ovl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
	int dir_mode = (mode & 07777) | S_IFDIR;

	return ovl_create_object(dentry, dir_mode, 0, NULL);
}
/*
 * ->mknod: create a special file.  A character device with WHITEOUT_DEV is
 * refused with -EPERM.
 */
static int ovl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
		     dev_t rdev)
{
	bool is_whiteout = S_ISCHR(mode) && rdev == WHITEOUT_DEV;

	/* Don't allow creation of "whiteout" on overlay */
	if (is_whiteout)
		return -EPERM;

	return ovl_create_object(dentry, mode, rdev, NULL);
}
/* ->symlink: create a symbolic link pointing at @link. */
static int ovl_symlink(struct inode *dir, struct dentry *dentry,
		       const char *link)
{
	const int link_mode = S_IFLNK;

	return ovl_create_object(dentry, link_mode, 0, link);
}
/*
 * ->link: copy up @old, then create @new as a hard link to its upper
 * dentry.
 */
static int ovl_link(struct dentry *old, struct inode *newdir,
		    struct dentry *new)
{
	int err = ovl_want_write(old);

	if (err)
		return err;

	err = ovl_copy_up(old);
	if (!err) {
		struct dentry *upper = ovl_dentry_upper(old);

		err = ovl_create_or_link(new, upper->d_inode->i_mode, 0,
					 NULL, upper);
	}

	ovl_drop_write(old);
	return err;
}
/*
 * Remove @dentry by putting a whiteout in its place on the upper layer.
 *
 * For directories the emptiness is checked first; merged or lower
 * directories additionally get their upper part replaced by an opaque
 * directory.  A whiteout is then created in the work directory and renamed
 * over the upper entry.  If the upper entry is a directory the rename is an
 * exchange and the directory, now in the work directory, is removed.
 *
 * @dentry: overlay dentry to remove
 * @is_dir: true for rmdir, false for unlink
 *
 * Returns 0 on success, -EROFS when there is no work directory, -ESTALE
 * when the upper entry found under the lock is not the expected one, or
 * another negative errno.
 */
static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir)
{
	struct dentry *workdir = ovl_workdir(dentry);
	struct inode *wdir;
	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
	struct inode *udir = upperdir->d_inode;
	struct dentry *whiteout;
	struct dentry *upper;
	struct dentry *opaquedir = NULL;
	int err;
	int flags = 0;

	if (WARN_ON(!workdir))
		return -EROFS;

	/* Dereference workdir only after it is known to be non-NULL */
	wdir = workdir->d_inode;

	if (is_dir) {
		if (OVL_TYPE_MERGE_OR_LOWER(ovl_path_type(dentry))) {
			opaquedir = ovl_check_empty_and_clear(dentry);
			err = PTR_ERR(opaquedir);
			if (IS_ERR(opaquedir))
				goto out;
		} else {
			LIST_HEAD(list);

			/*
			 * When removing an empty opaque directory, then it
			 * makes no sense to replace it with an exact replica of
			 * itself.  But emptiness still needs to be checked.
			 */
			err = ovl_check_empty_dir(dentry, &list);
			ovl_cache_free(&list);
			if (err)
				goto out;
		}
	}

	err = ovl_lock_rename_workdir(workdir, upperdir);
	if (err)
		goto out_dput;

	upper = lookup_one_len(dentry->d_name.name, upperdir,
			       dentry->d_name.len);
	err = PTR_ERR(upper);
	if (IS_ERR(upper))
		goto out_unlock;

	/* The entry found under the lock must be the one we expect */
	err = -ESTALE;
	if ((opaquedir && upper != opaquedir) ||
	    (!opaquedir && ovl_dentry_upper(dentry) &&
	     upper != ovl_dentry_upper(dentry))) {
		goto out_dput_upper;
	}

	whiteout = ovl_whiteout(workdir, dentry);
	err = PTR_ERR(whiteout);
	if (IS_ERR(whiteout))
		goto out_dput_upper;

	if (d_is_dir(upper))
		flags = RENAME_EXCHANGE;

	err = ovl_do_rename(wdir, whiteout, udir, upper, flags);
	if (err)
		goto kill_whiteout;

	/* After an exchange the old directory sits in the work directory */
	if (flags)
		ovl_cleanup(wdir, upper);

	ovl_dentry_version_inc(dentry->d_parent);
out_d_drop:
	d_drop(dentry);
	dput(whiteout);
out_dput_upper:
	dput(upper);
out_unlock:
	unlock_rename(workdir, upperdir);
out_dput:
	dput(opaquedir);
out:
	return err;

kill_whiteout:
	ovl_cleanup(wdir, whiteout);
	goto out_d_drop;
}
/*
 * Remove an entry directly from the upper directory of its parent.
 *
 * @dentry: overlay dentry to remove
 * @is_dir: true to use vfs_rmdir(), false to use vfs_unlink()
 *
 * Returns 0 on success, -ESTALE if the entry looked up under the parent's
 * i_mutex is not @dentry's upper dentry, or the error of the lookup or
 * removal.
 */
static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
{
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *dir = upperdir->d_inode;
struct dentry *upper;
int err;
mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
upper = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto out_unlock;
err = -ESTALE;
if (upper == ovl_dentry_upper(dentry)) {
if (is_dir)
err = vfs_rmdir(dir, upper);
else
err = vfs_unlink(dir, upper, NULL);
/* NOTE(review): the version is bumped even if the removal failed */
ovl_dentry_version_inc(dentry->d_parent);
}
dput(upper);
/*
* Keeping this dentry hashed would mean having to release
* upperpath/lowerpath, which could only be done if we are the
* sole user of this dentry. Too tricky... Just unhash for
* now.
*/
if (!err)
d_drop(dentry);
out_unlock:
mutex_unlock(&dir->i_mutex);
return err;
}
/*
 * Run check_sticky() on the real inodes behind @dentry and its parent.
 * Returns -EPERM if that check is non-zero, 0 otherwise.
 */
static inline int ovl_check_sticky(struct dentry *dentry)
{
	struct inode *realdir = ovl_dentry_real(dentry->d_parent)->d_inode;
	struct inode *realinode = ovl_dentry_real(dentry)->d_inode;

	return check_sticky(realdir, realinode) ? -EPERM : 0;
}
/*
 * Common backend for unlink and rmdir.
 *
 * After the sticky check and copy up of the parent, a pure upper entry is
 * removed directly; anything else is removed via ovl_remove_and_whiteout()
 * with raised capabilities.
 *
 * @dentry: overlay dentry to remove
 * @is_dir: true for rmdir, false for unlink
 *
 * Returns 0 on success or a negative errno.
 */
static int ovl_do_remove(struct dentry *dentry, bool is_dir)
{
enum ovl_path_type type;
int err;
err = ovl_check_sticky(dentry);
if (err)
goto out;
err = ovl_want_write(dentry);
if (err)
goto out;
err = ovl_copy_up(dentry->d_parent);
if (err)
goto out_drop_write;
type = ovl_path_type(dentry);
if (OVL_TYPE_PURE_UPPER(type)) {
err = ovl_remove_upper(dentry, is_dir);
} else {
const struct cred *old_cred;
struct cred *override_cred;
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_drop_write;
/*
* CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
* CAP_DAC_OVERRIDE for create in workdir, rename
* CAP_FOWNER for removing whiteout from sticky dir
* CAP_FSETID for chmod of opaque dir
* CAP_CHOWN for chown of opaque dir
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
cap_raise(override_cred->cap_effective, CAP_FSETID);
cap_raise(override_cred->cap_effective, CAP_CHOWN);
old_cred = override_creds(override_cred);
err = ovl_remove_and_whiteout(dentry, is_dir);
revert_creds(old_cred);
put_cred(override_cred);
}
out_drop_write:
ovl_drop_write(dentry);
out:
return err;
}
static int ovl_unlink(struct inode *dir, struct dentry *dentry)
{
return ovl_do_remove(dentry, false);
}
static int ovl_rmdir(struct inode *dir, struct dentry *dentry)
{
return ovl_do_remove(dentry, true);
}
/*
 * Rename @old to @new on the overlay, or exchange the two.
 *
 * Only RENAME_EXCHANGE and RENAME_NOREPLACE are accepted in @flags; any
 * other bit yields -EINVAL.  A directory source that is merged or lower is
 * refused with -EXDEV, as is a merged or lower directory target of an
 * exchange.  The source, the new parent and (for an exchange) the target
 * are copied up, and the rename itself is carried out between the upper
 * directories of the two parents.
 *
 * Returns 0 on success or a negative errno.
 */
static int ovl_rename2(struct inode *olddir, struct dentry *old,
struct inode *newdir, struct dentry *new,
unsigned int flags)
{
int err;
enum ovl_path_type old_type;
enum ovl_path_type new_type;
struct dentry *old_upperdir;
struct dentry *new_upperdir;
struct dentry *olddentry;
struct dentry *newdentry;
struct dentry *trap;
bool old_opaque;
bool new_opaque;
bool new_create = false;
bool cleanup_whiteout = false;
bool overwrite = !(flags & RENAME_EXCHANGE);
bool is_dir = S_ISDIR(old->d_inode->i_mode);
bool new_is_dir = false;
struct dentry *opaquedir = NULL;
const struct cred *old_cred = NULL;
struct cred *override_cred = NULL;
err = -EINVAL;
if (flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE))
goto out;
/* RENAME_NOREPLACE is accepted but not passed on to the real rename */
flags &= ~RENAME_NOREPLACE;
err = ovl_check_sticky(old);
if (err)
goto out;
/* Don't copy up directory trees */
old_type = ovl_path_type(old);
err = -EXDEV;
if (OVL_TYPE_MERGE_OR_LOWER(old_type) && is_dir)
goto out;
if (new->d_inode) {
err = ovl_check_sticky(new);
if (err)
goto out;
if (S_ISDIR(new->d_inode->i_mode))
new_is_dir = true;
new_type = ovl_path_type(new);
err = -EXDEV;
if (!overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir)
goto out;
/* Source and target backed by the same real inode: return 0 */
err = 0;
if (!OVL_TYPE_UPPER(new_type) && !OVL_TYPE_UPPER(old_type)) {
if (ovl_dentry_lower(old)->d_inode ==
ovl_dentry_lower(new)->d_inode)
goto out;
}
if (OVL_TYPE_UPPER(new_type) && OVL_TYPE_UPPER(old_type)) {
if (ovl_dentry_upper(old)->d_inode ==
ovl_dentry_upper(new)->d_inode)
goto out;
}
} else {
/* Negative target: upper type, pure unless flagged opaque */
if (ovl_dentry_is_opaque(new))
new_type = __OVL_PATH_UPPER;
else
new_type = __OVL_PATH_UPPER | __OVL_PATH_PURE;
}
err = ovl_want_write(old);
if (err)
goto out;
err = ovl_copy_up(old);
if (err)
goto out_drop_write;
err = ovl_copy_up(new->d_parent);
if (err)
goto out_drop_write;
if (!overwrite) {
err = ovl_copy_up(new);
if (err)
goto out_drop_write;
}
/* In this function "opaque" means "not pure upper" */
old_opaque = !OVL_TYPE_PURE_UPPER(old_type);
new_opaque = !OVL_TYPE_PURE_UPPER(new_type);
if (old_opaque || new_opaque) {
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_drop_write;
/*
* CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
* CAP_DAC_OVERRIDE for create in workdir
* CAP_FOWNER for removing whiteout from sticky dir
* CAP_FSETID for chmod of opaque dir
* CAP_CHOWN for chown of opaque dir
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
cap_raise(override_cred->cap_effective, CAP_FSETID);
cap_raise(override_cred->cap_effective, CAP_CHOWN);
old_cred = override_creds(override_cred);
}
/* Overwriting a merged/lower directory: it must be empty */
if (overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir) {
opaquedir = ovl_check_empty_and_clear(new);
err = PTR_ERR(opaquedir);
if (IS_ERR(opaquedir)) {
/* Reset so that the dput() at "out" sees NULL, not an ERR_PTR */
opaquedir = NULL;
goto out_revert_creds;
}
}
if (overwrite) {
if (old_opaque) {
if (new->d_inode || !new_opaque) {
/* Whiteout source */
flags |= RENAME_WHITEOUT;
} else {
/* Switch whiteouts */
flags |= RENAME_EXCHANGE;
}
} else if (is_dir && !new->d_inode && new_opaque) {
flags |= RENAME_EXCHANGE;
cleanup_whiteout = true;
}
}
old_upperdir = ovl_dentry_upper(old->d_parent);
new_upperdir = ovl_dentry_upper(new->d_parent);
trap = lock_rename(new_upperdir, old_upperdir);
/*
 * Look both names up again under the rename lock and make sure they
 * still match what the overlay dentries refer to; otherwise -ESTALE.
 */
olddentry = lookup_one_len(old->d_name.name, old_upperdir,
old->d_name.len);
err = PTR_ERR(olddentry);
if (IS_ERR(olddentry))
goto out_unlock;
err = -ESTALE;
if (olddentry != ovl_dentry_upper(old))
goto out_dput_old;
newdentry = lookup_one_len(new->d_name.name, new_upperdir,
new->d_name.len);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out_dput_old;
err = -ESTALE;
if (ovl_dentry_upper(new)) {
if (opaquedir) {
if (newdentry != opaquedir)
goto out_dput;
} else {
if (newdentry != ovl_dentry_upper(new))
goto out_dput;
}
} else {
new_create = true;
if (!d_is_negative(newdentry) &&
(!new_opaque || !ovl_is_whiteout(newdentry)))
goto out_dput;
}
/* Neither dentry may be the one returned by lock_rename() (-ESTALE) */
if (olddentry == trap)
goto out_dput;
if (newdentry == trap)
goto out_dput;
/* Set the opaque xattr before the rename where it will be needed */
if (is_dir && !old_opaque && new_opaque) {
err = ovl_set_opaque(olddentry);
if (err)
goto out_dput;
}
if (!overwrite && new_is_dir && old_opaque && !new_opaque) {
err = ovl_set_opaque(newdentry);
if (err)
goto out_dput;
}
if (old_opaque || new_opaque) {
err = ovl_do_rename(old_upperdir->d_inode, olddentry,
new_upperdir->d_inode, newdentry,
flags);
} else {
/* No debug for the plain case */
BUG_ON(flags & ~RENAME_EXCHANGE);
err = vfs_rename(old_upperdir->d_inode, olddentry,
new_upperdir->d_inode, newdentry,
NULL, flags);
}
if (err) {
/* Rename failed: undo the opaque xattrs set above */
if (is_dir && !old_opaque && new_opaque)
ovl_remove_opaque(olddentry);
if (!overwrite && new_is_dir && old_opaque && !new_opaque)
ovl_remove_opaque(newdentry);
goto out_dput;
}
/* Rename succeeded: drop opaque xattrs that no longer apply */
if (is_dir && old_opaque && !new_opaque)
ovl_remove_opaque(olddentry);
if (!overwrite && new_is_dir && !old_opaque && new_opaque)
ovl_remove_opaque(newdentry);
if (old_opaque != new_opaque) {
ovl_dentry_set_opaque(old, new_opaque);
if (!overwrite)
ovl_dentry_set_opaque(new, old_opaque);
}
if (cleanup_whiteout)
ovl_cleanup(old_upperdir->d_inode, newdentry);
ovl_dentry_version_inc(old->d_parent);
ovl_dentry_version_inc(new->d_parent);
out_dput:
dput(newdentry);
out_dput_old:
dput(olddentry);
out_unlock:
unlock_rename(new_upperdir, old_upperdir);
out_revert_creds:
/* Credentials were only overridden under this same condition */
if (old_opaque || new_opaque) {
revert_creds(old_cred);
put_cred(override_cred);
}
out_drop_write:
ovl_drop_write(old);
out:
dput(opaquedir);
return err;
}
/* ->rename: plain rename, i.e. ovl_rename2() without any flags. */
static int ovl_rename(struct inode *olddir, struct dentry *old,
		      struct inode *newdir, struct dentry *new)
{
	const unsigned int no_flags = 0;

	return ovl_rename2(olddir, old, newdir, new, no_flags);
}
/*
 * Inode operations for overlay directories.  The regular operations live in
 * .ops; .rename2 is a separate member of the wrapper structure.
 */
const struct inode_operations_wrapper ovl_dir_inode_operations = {
.ops = {
.lookup = ovl_lookup,
.mkdir = ovl_mkdir,
.symlink = ovl_symlink,
.unlink = ovl_unlink,
.rmdir = ovl_rmdir,
.rename = ovl_rename,
.link = ovl_link,
.setattr = ovl_setattr,
.create = ovl_create,
.mknod = ovl_mknod,
.permission = ovl_permission,
.getattr = ovl_dir_getattr,
.setxattr = ovl_setxattr,
.getxattr = ovl_getxattr,
.listxattr = ovl_listxattr,
.removexattr = ovl_removexattr,
},
.rename2 = ovl_rename2,
};

View File

@ -0,0 +1,442 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/xattr.h>
#include "overlayfs.h"
/*
 * Copy up the parent chain of @dentry and then @dentry itself, optionally
 * applying @attr.  With @no_data set the size handed to the copy up is
 * forced to 0.
 *
 * Returns 0 on success or a negative errno.
 */
static int ovl_copy_up_last(struct dentry *dentry, struct iattr *attr,
			    bool no_data)
{
	struct path lowerpath;
	struct kstat stat;
	struct dentry *parent = dget_parent(dentry);
	int err = ovl_copy_up(parent);

	if (!err) {
		ovl_path_lower(dentry, &lowerpath);
		err = vfs_getattr(&lowerpath, &stat);
	}

	if (!err) {
		if (no_data)
			stat.size = 0;
		err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat, attr);
	}

	dput(parent);
	return err;
}
int ovl_setattr(struct dentry *dentry, struct iattr *attr)
{
int err;
struct dentry *upperdentry;
err = ovl_want_write(dentry);
if (err)
goto out;
err = ovl_copy_up(dentry);
if (!err) {
upperdentry = ovl_dentry_upper(dentry);
mutex_lock(&upperdentry->d_inode->i_mutex);
err = notify_change(upperdentry, attr, NULL);
mutex_unlock(&upperdentry->d_inode->i_mutex);
}
ovl_drop_write(dentry);
out:
return err;
}
/* ->getattr for non-directories: report the attributes of the real path. */
static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry,
		       struct kstat *stat)
{
	struct path realpath;
	int err;

	ovl_path_real(dentry, &realpath);
	err = vfs_getattr(&realpath, stat);

	return err;
}
/*
 * ->permission: check @mask against the real inode behind an overlay inode.
 *
 * Directories carry their ovl_entry in i_private.  For other types an alias
 * dentry is looked up to reach the ovl_entry; that is not attempted when
 * MAY_NOT_BLOCK is set, in which case -ECHILD is returned.
 *
 * Returns 0 if access is allowed, otherwise a negative errno.
 */
int ovl_permission(struct inode *inode, int mask)
{
struct ovl_entry *oe;
struct dentry *alias = NULL;
struct inode *realinode;
struct dentry *realdentry;
bool is_upper;
int err;
if (S_ISDIR(inode->i_mode)) {
oe = inode->i_private;
} else if (mask & MAY_NOT_BLOCK) {
return -ECHILD;
} else {
/*
* For non-directories find an alias and get the info
* from there.
*/
alias = d_find_any_alias(inode);
if (WARN_ON(!alias))
return -ENOENT;
oe = alias->d_fsdata;
}
realdentry = ovl_entry_real(oe, &is_upper);
/* Careful in RCU walk mode */
realinode = ACCESS_ONCE(realdentry->d_inode);
if (!realinode) {
WARN_ON(!(mask & MAY_NOT_BLOCK));
err = -ENOENT;
goto out_dput;
}
if (mask & MAY_WRITE) {
umode_t mode = realinode->i_mode;
/*
* Writes will always be redirected to upper layer, so
* ignore lower layer being read-only.
*
* If the overlay itself is read-only then proceed
* with the permission check, don't return EROFS.
* This will only happen if this is the lower layer of
* another overlayfs.
*
* If upper fs becomes read-only after the overlay was
* constructed return EROFS to prevent modification of
* upper layer.
*/
err = -EROFS;
if (is_upper && !IS_RDONLY(inode) && IS_RDONLY(realinode) &&
(S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
goto out_dput;
}
err = __inode_permission(realinode, mask);
out_dput:
/* alias is NULL on the directory path */
dput(alias);
return err;
}
/*
 * Cookie passed from ovl_follow_link() to ovl_put_link(): the real dentry
 * whose ->follow_link was called and the cookie that call returned.
 */
struct ovl_link_data {
struct dentry *realdentry;
void *cookie;
};
/*
 * ->follow_link: forward to the real inode's ->follow_link.
 *
 * If the real inode also has ->put_link, an ovl_link_data is allocated to
 * carry the real dentry and the returned cookie to ovl_put_link().
 *
 * Returns that ovl_link_data (NULL when none was allocated) or an ERR_PTR.
 */
static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd)
{
void *ret;
struct dentry *realdentry;
struct inode *realinode;
struct ovl_link_data *data = NULL;
realdentry = ovl_dentry_real(dentry);
realinode = realdentry->d_inode;
if (WARN_ON(!realinode->i_op->follow_link))
return ERR_PTR(-EPERM);
if (realinode->i_op->put_link) {
data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL);
if (!data)
return ERR_PTR(-ENOMEM);
data->realdentry = realdentry;
}
ret = realinode->i_op->follow_link(realdentry, nd);
if (IS_ERR(ret)) {
/* data may be NULL here */
kfree(data);
return ret;
}
if (data)
data->cookie = ret;
return data;
}
/*
 * ->put_link: hand the saved cookie back to the real inode's ->put_link
 * and free the ovl_link_data.  Does nothing when @c is NULL.
 */
static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
{
	struct ovl_link_data *data = c;

	if (data) {
		struct inode *realinode = data->realdentry->d_inode;

		realinode->i_op->put_link(data->realdentry, nd, data->cookie);
		kfree(data);
	}
}
/*
 * ->readlink: forward to the real inode's ->readlink after touching atime
 * on the real path.  Returns -EINVAL if the real inode has no ->readlink.
 */
static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
{
	struct path realpath;
	const struct inode_operations *real_ops;

	ovl_path_real(dentry, &realpath);
	real_ops = realpath.dentry->d_inode->i_op;
	if (!real_ops->readlink)
		return -EINVAL;

	touch_atime(&realpath);

	return real_ops->readlink(realpath.dentry, buf, bufsiz);
}
/* True if @name starts with the overlay-private prefix OVL_XATTR_PRE_NAME. */
static bool ovl_is_private_xattr(const char *name)
{
	return !strncmp(name, OVL_XATTR_PRE_NAME, OVL_XATTR_PRE_LEN);
}
/*
 * ->setxattr: copy @dentry up and set the xattr on the upper dentry.
 * Overlay-private names are refused with -EPERM.
 *
 * Returns 0 on success or a negative errno.
 */
int ovl_setxattr(struct dentry *dentry, const char *name,
		 const void *value, size_t size, int flags)
{
	int err = ovl_want_write(dentry);

	if (err)
		return err;

	if (ovl_is_private_xattr(name)) {
		err = -EPERM;
	} else {
		err = ovl_copy_up(dentry);
		if (!err)
			err = vfs_setxattr(ovl_dentry_upper(dentry), name,
					   value, size, flags);
	}

	ovl_drop_write(dentry);
	return err;
}
/*
 * Private xattrs are filtered only for directories whose type has the
 * UPPER bit set and the PURE bit clear.
 */
static bool ovl_need_xattr_filter(struct dentry *dentry,
				  enum ovl_path_type type)
{
	if ((type & (__OVL_PATH_PURE | __OVL_PATH_UPPER)) != __OVL_PATH_UPPER)
		return false;

	return S_ISDIR(dentry->d_inode->i_mode);
}
/*
 * ->getxattr: read the xattr from the real dentry.  Private overlay xattrs
 * are reported as absent (-ENODATA) where filtering applies.
 */
ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
		     void *value, size_t size)
{
	struct path realpath;
	enum ovl_path_type type = ovl_path_real(dentry, &realpath);
	bool hidden = ovl_need_xattr_filter(dentry, type) &&
		      ovl_is_private_xattr(name);

	if (hidden)
		return -ENODATA;

	return vfs_getxattr(realpath.dentry, name, value, size);
}
/*
 * ->listxattr: list the xattrs of the real dentry, dropping overlay-private
 * names where filtering applies.
 *
 * @list/@size: caller's buffer; with @size == 0 the result of
 *              vfs_listxattr() is returned unfiltered.
 *
 * Returns the number of bytes in the (filtered) list, the error from
 * vfs_listxattr(), or -EIO if the underlying filesystem returned a list
 * whose last name is not NUL-terminated within the reported length.
 */
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
{
	struct path realpath;
	enum ovl_path_type type = ovl_path_real(dentry, &realpath);
	ssize_t res;
	size_t len;
	char *s;

	res = vfs_listxattr(realpath.dentry, list, size);
	if (res <= 0 || size == 0)
		return res;

	if (!ovl_need_xattr_filter(dentry, type))
		return res;

	/* filter out private xattrs; len is what remains from s onwards */
	for (s = list, len = res; len;) {
		/* never scan past the end of the returned list */
		size_t slen = strnlen(s, len) + 1;

		/* underlying fs providing us with a broken xattr list? */
		if (WARN_ON(slen > len))
			return -EIO;

		len -= slen;
		if (ovl_is_private_xattr(s)) {
			/* close the gap by moving the tail over this name */
			res -= slen;
			memmove(s, s + slen, len);
		} else {
			s += slen;
		}
	}

	return res;
}
/*
 * ->removexattr: remove an xattr from the real dentry.
 *
 * Private overlay xattrs are reported as absent (-ENODATA) where filtering
 * applies.  If the real dentry is not of an upper type, the xattr is first
 * probed there and the dentry is copied up only if it exists; the removal
 * then happens on the upper dentry.
 *
 * Returns 0 on success or a negative errno.
 */
int ovl_removexattr(struct dentry *dentry, const char *name)
{
int err;
struct path realpath;
enum ovl_path_type type = ovl_path_real(dentry, &realpath);
err = ovl_want_write(dentry);
if (err)
goto out;
err = -ENODATA;
if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
goto out_drop_write;
if (!OVL_TYPE_UPPER(type)) {
/* Size-only query: checks for existence without reading the value */
err = vfs_getxattr(realpath.dentry, name, NULL, 0);
if (err < 0)
goto out_drop_write;
err = ovl_copy_up(dentry);
if (err)
goto out_drop_write;
/* Switch realpath to the upper path for the removal */
ovl_path_upper(dentry, &realpath);
}
err = vfs_removexattr(realpath.dentry, name);
out_drop_write:
ovl_drop_write(dentry);
out:
return err;
}
/*
 * Decide whether opening with @flags requires a copy up: only for entries
 * that are not of an upper type, are not special files, and are opened for
 * writing or with O_TRUNC.
 */
static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type,
				  struct dentry *realdentry)
{
	if (OVL_TYPE_UPPER(type) ||
	    special_file(realdentry->d_inode->i_mode))
		return false;

	return (OPEN_FMODE(flags) & FMODE_WRITE) || (flags & O_TRUNC);
}
/*
 * ->dentry_open: open the real file behind @dentry.
 *
 * Unless ovl_is_nocopyupw() is true for @dentry, an open that needs it (see
 * ovl_open_need_copy_up()) first copies the file up and then opens the upper
 * path.  With O_TRUNC the copy up goes through ovl_copy_up_last() with
 * no_data set.
 *
 * Returns 0 on success or a negative errno.
 */
static int ovl_dentry_open(struct dentry *dentry, struct file *file,
const struct cred *cred)
{
int err;
struct path realpath;
enum ovl_path_type type;
bool want_write = false;
type = ovl_path_real(dentry, &realpath);
if (!ovl_is_nocopyupw(dentry)) {
if (ovl_open_need_copy_up(file->f_flags, type,
realpath.dentry)) {
/* Set before ovl_want_write(); on its failure we jump past the drop */
want_write = true;
err = ovl_want_write(dentry);
if (err)
goto out;
if (file->f_flags & O_TRUNC)
err = ovl_copy_up_last(dentry, NULL, true);
else
err = ovl_copy_up(dentry);
if (err)
goto out_drop_write;
ovl_path_upper(dentry, &realpath);
}
}
err = vfs_open(&realpath, file, cred);
out_drop_write:
if (want_write)
ovl_drop_write(dentry);
out:
return err;
}
/*
 * Inode operations for regular and special files (see ovl_new_inode()).
 * .dentry_open is a member of the wrapper, outside .ops.
 */
static const struct inode_operations_wrapper ovl_file_inode_operations = {
.ops = {
.setattr = ovl_setattr,
.permission = ovl_permission,
.getattr = ovl_getattr,
.setxattr = ovl_setxattr,
.getxattr = ovl_getxattr,
.listxattr = ovl_listxattr,
.removexattr = ovl_removexattr,
},
.dentry_open = ovl_dentry_open,
};
/*
 * Inode operations for symlinks.  This is a plain inode_operations, not the
 * wrapper type used for files and directories.
 */
static const struct inode_operations ovl_symlink_inode_operations = {
.setattr = ovl_setattr,
.follow_link = ovl_follow_link,
.put_link = ovl_put_link,
.readlink = ovl_readlink,
.getattr = ovl_getattr,
.setxattr = ovl_setxattr,
.getxattr = ovl_getxattr,
.listxattr = ovl_listxattr,
.removexattr = ovl_removexattr,
};
/*
 * Allocate and set up an overlay inode on @sb.
 *
 * @mode: only the file type bits (S_IFMT) are kept in i_mode
 * @oe:   overlay entry; stored in i_private for directories only
 *
 * Returns the new inode, or NULL if allocation fails or the file type is
 * not one of the handled ones.
 */
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
struct ovl_entry *oe)
{
struct inode *inode;
inode = new_inode(sb);
if (!inode)
return NULL;
mode &= S_IFMT;
inode->i_ino = get_next_ino();
inode->i_mode = mode;
inode->i_flags |= S_NOATIME | S_NOCMTIME;
switch (mode) {
case S_IFDIR:
inode->i_private = oe;
inode->i_op = &ovl_dir_inode_operations.ops;
inode->i_fop = &ovl_dir_operations;
/* i_op points into an inode_operations_wrapper */
inode->i_flags |= S_IOPS_WRAPPER;
break;
case S_IFLNK:
inode->i_op = &ovl_symlink_inode_operations;
break;
case S_IFREG:
case S_IFSOCK:
case S_IFBLK:
case S_IFCHR:
case S_IFIFO:
inode->i_op = &ovl_file_inode_operations.ops;
/* i_op points into an inode_operations_wrapper */
inode->i_flags |= S_IOPS_WRAPPER;
break;
default:
WARN(1, "illegal file type: %i\n", mode);
iput(inode);
inode = NULL;
}
return inode;
}

View File

@ -0,0 +1,200 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/kernel.h>
struct ovl_entry;
/*
 * Bit flags describing an overlay dentry; test them with the OVL_TYPE_*()
 * macros rather than comparing values directly.
 * NOTE(review): exact semantics are defined by ovl_path_type(), which is not
 * visible here; the per-flag notes below follow the macro names — confirm.
 */
enum ovl_path_type {
__OVL_PATH_PURE = (1 << 0), /* tested by OVL_TYPE_PURE_UPPER() */
__OVL_PATH_UPPER = (1 << 1), /* tested by OVL_TYPE_UPPER() */
__OVL_PATH_MERGE = (1 << 2), /* tested by OVL_TYPE_MERGE() */
};
#define OVL_TYPE_UPPER(type) ((type) & __OVL_PATH_UPPER)
#define OVL_TYPE_MERGE(type) ((type) & __OVL_PATH_MERGE)
#define OVL_TYPE_PURE_UPPER(type) ((type) & __OVL_PATH_PURE)
#define OVL_TYPE_MERGE_OR_LOWER(type) \
(OVL_TYPE_MERGE(type) || !OVL_TYPE_UPPER(type))
#define OVL_XATTR_PRE_NAME "trusted.overlay."
#define OVL_XATTR_PRE_LEN 16
#define OVL_XATTR_OPAQUE OVL_XATTR_PRE_NAME"opaque"
/* Call vfs_rmdir() on @dentry in @dir and log the outcome at debug level. */
static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry)
{
	int ret;

	ret = vfs_rmdir(dir, dentry);
	pr_debug("rmdir(%pd2) = %i\n", dentry, ret);

	return ret;
}
/*
 * Call vfs_unlink() on @dentry in @dir (passing NULL as its third argument)
 * and log the outcome at debug level.
 */
static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry)
{
	int ret;

	ret = vfs_unlink(dir, dentry, NULL);
	pr_debug("unlink(%pd2) = %i\n", dentry, ret);

	return ret;
}
/*
 * Hard-link @old_dentry as @new_dentry in @dir via vfs_link(). The result is
 * logged at debug level only when @debug is true.
 */
static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir,
			      struct dentry *new_dentry, bool debug)
{
	int ret = vfs_link(old_dentry, dir, new_dentry, NULL);

	if (!debug)
		return ret;

	pr_debug("link(%pd2, %pd2) = %i\n",
		 old_dentry, new_dentry, ret);
	return ret;
}
/*
 * Create a regular file @dentry in @dir via vfs_create() (last argument
 * true). The result is logged at debug level only when @debug is true.
 */
static inline int ovl_do_create(struct inode *dir, struct dentry *dentry,
				umode_t mode, bool debug)
{
	int ret = vfs_create(dir, dentry, mode, true);

	if (!debug)
		return ret;

	pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, ret);
	return ret;
}
/*
 * Create directory @dentry in @dir via vfs_mkdir(). The result is logged at
 * debug level only when @debug is true.
 */
static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry,
			       umode_t mode, bool debug)
{
	int ret = vfs_mkdir(dir, dentry, mode);

	if (!debug)
		return ret;

	pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, ret);
	return ret;
}
/*
 * Create the node @dentry in @dir via vfs_mknod() with @mode and @dev. The
 * result is logged at debug level only when @debug is true.
 */
static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry,
			       umode_t mode, dev_t dev, bool debug)
{
	int ret = vfs_mknod(dir, dentry, mode, dev);

	if (!debug)
		return ret;

	pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n",
		 dentry, mode, dev, ret);
	return ret;
}
/*
 * Create symlink @dentry in @dir pointing at @oldname via vfs_symlink(). The
 * result is logged at debug level only when @debug is true.
 */
static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry,
				 const char *oldname, bool debug)
{
	int ret = vfs_symlink(dir, dentry, oldname);

	if (!debug)
		return ret;

	pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, ret);
	return ret;
}
/*
 * Set extended attribute @name on @dentry through vfs_setxattr() and log the
 * call at debug level.
 *
 * @value is a blob of @size bytes and need not be NUL-terminated, so it is
 * printed with a precision ("%.*s"), which bounds the read to @size bytes.
 * A field width ("%*s") would only pad the output and keep reading until a
 * NUL byte is found.
 *
 * Returns the vfs_setxattr() result.
 */
static inline int ovl_do_setxattr(struct dentry *dentry, const char *name,
				  const void *value, size_t size, int flags)
{
	int err = vfs_setxattr(dentry, name, value, size, flags);

	pr_debug("setxattr(%pd2, \"%s\", \"%.*s\", 0x%x) = %i\n",
		 dentry, name, (int) size, (const char *) value, flags, err);
	return err;
}
/*
 * Remove extended attribute @name from @dentry via vfs_removexattr() and log
 * the outcome at debug level.
 */
static inline int ovl_do_removexattr(struct dentry *dentry, const char *name)
{
	int ret;

	ret = vfs_removexattr(dentry, name);
	pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, ret);

	return ret;
}
/*
 * Rename @olddentry (in @olddir) to @newdentry (in @newdir) via vfs_rename()
 * with @flags. The attempt is always logged at debug level before the call;
 * a second debug line is emitted only when the rename fails.
 */
static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry,
				struct inode *newdir, struct dentry *newdentry,
				unsigned int flags)
{
	int ret;

	pr_debug("rename2(%pd2, %pd2, 0x%x)\n",
		 olddentry, newdentry, flags);

	ret = vfs_rename(olddir, olddentry, newdir, newdentry, NULL, flags);
	if (!ret)
		return 0;

	pr_debug("...rename2(%pd2, %pd2, ...) = %i\n",
		 olddentry, newdentry, ret);
	return ret;
}
/* Call vfs_whiteout() on @dentry in @dir and log the outcome at debug level. */
static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry)
{
	int ret;

	ret = vfs_whiteout(dir, dentry);
	pr_debug("whiteout(%pd2) = %i\n", dentry, ret);

	return ret;
}
/*
 * Dentry/path accessors, write-access helpers and lookup.
 * NOTE(review): the definitions are not in the sources visible alongside
 * this header; presumably they live in super.c — confirm.
 */
bool ovl_is_nocopyupw(struct dentry *dentry);
enum ovl_path_type ovl_path_type(struct dentry *dentry);
u64 ovl_dentry_version_get(struct dentry *dentry);
void ovl_dentry_version_inc(struct dentry *dentry);
void ovl_path_upper(struct dentry *dentry, struct path *path);
void ovl_path_lower(struct dentry *dentry, struct path *path);
enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path);
int ovl_path_next(int idx, struct dentry *dentry, struct path *path);
struct dentry *ovl_dentry_upper(struct dentry *dentry);
struct dentry *ovl_dentry_lower(struct dentry *dentry);
struct dentry *ovl_dentry_real(struct dentry *dentry);
struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper);
struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry);
void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache);
struct dentry *ovl_workdir(struct dentry *dentry);
int ovl_want_write(struct dentry *dentry);
void ovl_drop_write(struct dentry *dentry);
bool ovl_dentry_is_opaque(struct dentry *dentry);
void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque);
bool ovl_is_whiteout(struct dentry *dentry);
void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry);
struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags);
struct file *ovl_path_open(struct path *path, int flags);
struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry,
struct kstat *stat, const char *link);
/* readdir.c */
extern const struct file_operations ovl_dir_operations;
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list);
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list);
void ovl_cache_free(struct list_head *list);
/* inode.c */
int ovl_setattr(struct dentry *dentry, struct iattr *attr);
int ovl_permission(struct inode *inode, int mask);
int ovl_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags);
ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
void *value, size_t size);
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
int ovl_removexattr(struct dentry *dentry, const char *name);
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
struct ovl_entry *oe);
/* Copy the owner (i_uid) and group (i_gid) of inode @from onto inode @to. */
static inline void ovl_copyattr(struct inode *from, struct inode *to)
{
	to->i_gid = from->i_gid;
	to->i_uid = from->i_uid;
}
/* dir.c */
extern const struct inode_operations_wrapper ovl_dir_inode_operations;
struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry);
int ovl_create_real(struct inode *dir, struct dentry *newdentry,
struct kstat *stat, const char *link,
struct dentry *hardlink, bool debug);
void ovl_cleanup(struct inode *dir, struct dentry *dentry);
/* copy_up.c */
/* Copy @dentry (and any not-yet-copied ancestors) to the upper layer. */
int ovl_copy_up(struct dentry *dentry);
/* Copy up the single @dentry whose parent @parent is already on upper. */
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
struct path *lowerpath, struct kstat *stat,
struct iattr *attr);
/* Copy all extended attributes from @old to @new. */
int ovl_copy_xattr(struct dentry *old, struct dentry *new);
/* Apply mode, ownership and timestamps from @stat to @upper. */
int ovl_set_attr(struct dentry *upper, struct kstat *stat);

View File

@ -0,0 +1,588 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/file.h>
#include <linux/xattr.h>
#include <linux/rbtree.h>
#include <linux/security.h>
#include <linux/cred.h>
#include "overlayfs.h"
/*
 * One directory entry in a merged listing. Allocated by
 * ovl_cache_entry_new() with room for the name and its terminating NUL.
 */
struct ovl_cache_entry {
unsigned int len; /* length of name[], excluding the NUL */
unsigned int type; /* d_type reported by the fill callback */
u64 ino; /* inode number reported by the fill callback */
struct list_head l_node; /* linkage in the entry list */
struct rb_node node; /* linkage in ovl_readdir_data.root */
/* next entry on the ovl_readdir_data.first_maybe_whiteout chain (DT_CHR entries only) */
struct ovl_cache_entry *next_maybe_whiteout;
bool is_whiteout; /* set by ovl_check_whiteouts() */
char name[]; /* NUL-terminated copy of the entry name */
};
/*
 * Reference-counted merged listing of one directory, attached to the dentry
 * through ovl_set_dir_cache() and shared by open files via ovl_cache_get().
 */
struct ovl_dir_cache {
long refcount; /* users of this cache; freed by ovl_cache_put() at 0 */
u64 version; /* ovl_dentry_version_get() value when the cache was built */
struct list_head entries; /* list of struct ovl_cache_entry (via l_node) */
};
/*
 * File-local readdir context carrying only the fill callback.
 * NOTE(review): presumably a stand-in for the kernel's own struct
 * dir_context on kernels that still use the filldir_t-based .readdir
 * interface — confirm. The position member is intentionally disabled.
 */
struct dir_context {
const filldir_t actor; /* callback handed to vfs_readdir() */
//loff_t pos;
};
/*
 * State shared between ovl_dir_read() and its fill callback.
 *
 * ctx must remain the first member: ovl_dir_read() passes a pointer to this
 * structure as the callback's opaque buffer, and ovl_fill_merge() interprets
 * that buffer as a struct dir_context pointer.
 */
struct ovl_readdir_data {
struct dir_context ctx;
bool is_merge; /* selects ovl_fill_lower() over ovl_cache_entry_add_rb() */
struct rb_root root; /* name index of the entries added so far */
struct list_head *list; /* output list of struct ovl_cache_entry */
struct list_head middle; /* insertion point used while is_merge is true */
struct ovl_cache_entry *first_maybe_whiteout; /* head of the DT_CHR candidate chain */
int count; /* callback invocations during the current vfs_readdir() pass */
int err; /* error recorded by the callback */
};
/* Per-open state of an overlay directory, stored in file->private_data. */
struct ovl_dir_file {
bool is_real; /* true: readdir/llseek are forwarded to realfile */
bool is_upper; /* OVL_TYPE_UPPER() result at open time */
struct ovl_dir_cache *cache; /* merged listing, NULL until first needed */
struct list_head *cursor; /* current position within cache->entries */
struct file *realfile; /* real directory opened in ovl_dir_open() */
struct file *upperfile; /* upper directory opened lazily by ovl_dir_fsync() */
};
/* Map an rb-tree node back to the ovl_cache_entry that embeds it. */
static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n)
{
	struct ovl_cache_entry *entry;

	entry = container_of(n, struct ovl_cache_entry, node);
	return entry;
}
/*
 * Look up the entry called @name (of @len bytes) in the rb-tree @root.
 *
 * Ordering is strncmp() over the first @len bytes; on a tie, a stored name
 * longer than @len sorts to the left. Returns the matching entry or NULL.
 */
static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root,
						    const char *name, int len)
{
	struct rb_node *cur;

	for (cur = root->rb_node; cur != NULL; ) {
		struct ovl_cache_entry *entry = ovl_cache_entry_from_node(cur);
		int order = strncmp(name, entry->name, len);

		if (order > 0) {
			cur = entry->node.rb_right;
			continue;
		}
		if (order == 0 && len >= entry->len)
			return entry;

		cur = entry->node.rb_left;
	}

	return NULL;
}
/*
 * Allocate and fill a cache entry for @name (@len bytes, copied and
 * NUL-terminated). Entries of type DT_CHR are additionally pushed onto
 * rdd->first_maybe_whiteout so they can be checked for being whiteouts
 * later. The entry is not linked into any list or tree here.
 *
 * Returns the new entry, or NULL if the allocation fails.
 */
static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd,
						   const char *name, int len,
						   u64 ino, unsigned int d_type)
{
	struct ovl_cache_entry *entry;

	entry = kmalloc(offsetof(struct ovl_cache_entry, name[len + 1]),
			GFP_KERNEL);
	if (entry == NULL)
		return NULL;

	entry->len = len;
	entry->type = d_type;
	entry->ino = ino;
	entry->is_whiteout = false;

	memcpy(entry->name, name, len);
	entry->name[len] = '\0';

	if (d_type == DT_CHR) {
		entry->next_maybe_whiteout = rdd->first_maybe_whiteout;
		rdd->first_maybe_whiteout = entry;
	}

	return entry;
}
/*
 * Add @name to the rb-tree and to the tail of rdd->list unless an entry
 * with that name is already indexed.
 *
 * Returns 0 when the entry was added or already present, -ENOMEM when the
 * new entry cannot be allocated.
 */
static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
				  const char *name, int len, u64 ino,
				  unsigned int d_type)
{
	struct rb_node **link = &rdd->root.rb_node;
	struct rb_node *parent = NULL;
	struct ovl_cache_entry *entry;

	/* Descend to the insertion slot, bailing out on a duplicate. */
	while (*link) {
		struct ovl_cache_entry *cur = ovl_cache_entry_from_node(*link);
		int order = strncmp(name, cur->name, len);

		parent = *link;
		if (order > 0)
			link = &cur->node.rb_right;
		else if (order < 0 || len < cur->len)
			link = &cur->node.rb_left;
		else
			return 0;
	}

	entry = ovl_cache_entry_new(rdd, name, len, ino, d_type);
	if (!entry)
		return -ENOMEM;

	list_add_tail(&entry->l_node, rdd->list);
	rb_link_node(&entry->node, parent, link);
	rb_insert_color(&entry->node, &rdd->root);

	return 0;
}
/*
 * Fill step used while rdd->is_merge is true.
 *
 * A name that is already indexed has its entry moved in front of
 * rdd->middle; an unknown name gets a fresh entry inserted at the same
 * place (it is not added to the rb-tree). An allocation failure is recorded
 * in rdd->err. @offset is unused.
 *
 * Returns rdd->err.
 */
static int ovl_fill_lower(struct ovl_readdir_data *rdd,
			  const char *name, int namelen,
			  loff_t offset, u64 ino, unsigned int d_type)
{
	struct ovl_cache_entry *entry;

	entry = ovl_cache_entry_find(&rdd->root, name, namelen);
	if (!entry) {
		entry = ovl_cache_entry_new(rdd, name, namelen, ino, d_type);
		if (entry != NULL)
			list_add_tail(&entry->l_node, &rdd->middle);
		else
			rdd->err = -ENOMEM;
	} else {
		list_move_tail(&entry->l_node, &rdd->middle);
	}

	return rdd->err;
}
void ovl_cache_free(struct list_head *list)
{
struct ovl_cache_entry *p;
struct ovl_cache_entry *n;
list_for_each_entry_safe(p, n, list, l_node)
kfree(p);
INIT_LIST_HEAD(list);
}
/*
 * Drop one reference on od->cache. When the last reference goes away the
 * cache is detached from @dentry (if it is still the dentry's current
 * cache), its entries are freed and the cache itself is released.
 * od->cache is not cleared here.
 */
static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry)
{
	struct ovl_dir_cache *cache = od->cache;

	WARN_ON(cache->refcount <= 0);

	if (--cache->refcount)
		return;

	if (ovl_dir_cache(dentry) == cache)
		ovl_set_dir_cache(dentry, NULL);

	ovl_cache_free(&cache->entries);
	kfree(cache);
}
/*
 * filldir callback used when reading real directories.
 *
 * @buf is the ovl_readdir_data (reached through its leading dir_context).
 * Counts the call in rdd->count, then dispatches on rdd->is_merge to either
 * ovl_fill_lower() or ovl_cache_entry_add_rb() and returns its result.
 */
static int ovl_fill_merge(void *buf, const char *name, int namelen,
			  loff_t offset, u64 ino, unsigned int d_type)
{
	struct dir_context *ctx = buf;
	struct ovl_readdir_data *rdd;

	rdd = container_of(ctx, struct ovl_readdir_data, ctx);
	rdd->count++;

	if (rdd->is_merge)
		return ovl_fill_lower(rdd, name, namelen, offset, ino, d_type);

	return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type);
}
/*
 * Resolve the whiteout candidates collected during a directory read.
 *
 * Walks the rdd->first_maybe_whiteout chain (consuming it), looks each name
 * up under @dir and stores the ovl_is_whiteout() verdict in the entry's
 * is_whiteout flag. The lookups run under a temporary credential set with
 * CAP_DAC_OVERRIDE raised and with @dir's i_mutex held.
 *
 * Returns 0 on success, -ENOMEM if the credentials cannot be prepared, or
 * the error from mutex_lock_killable(). Names whose lookup fails are
 * skipped and keep their current is_whiteout value.
 */
static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd)
{
int err;
struct ovl_cache_entry *p;
struct dentry *dentry;
const struct cred *old_cred;
struct cred *override_cred;
override_cred = prepare_creds();
if (!override_cred)
return -ENOMEM;
/*
* CAP_DAC_OVERRIDE for lookup
*/
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
old_cred = override_creds(override_cred);
err = mutex_lock_killable(&dir->d_inode->i_mutex);
if (!err) {
while (rdd->first_maybe_whiteout) {
/* pop the head of the candidate chain */
p = rdd->first_maybe_whiteout;
rdd->first_maybe_whiteout = p->next_maybe_whiteout;
dentry = lookup_one_len(p->name, dir, p->len);
if (!IS_ERR(dentry)) {
p->is_whiteout = ovl_is_whiteout(dentry);
dput(dentry);
}
}
mutex_unlock(&dir->d_inode->i_mutex);
}
/* restore the caller's credentials and drop the temporary set */
revert_creds(old_cred);
put_cred(override_cred);
return err;
}
/*
 * Read all entries of the directory at @realpath into @rdd.
 *
 * Opens the directory and calls vfs_readdir() with rdd->ctx.actor as the
 * fill callback, repeating until a pass leaves rdd->count at 0 or an error
 * is reported (by vfs_readdir() itself or through rdd->err). If whiteout
 * candidates were queued, they are resolved with ovl_check_whiteouts()
 * before the file is released.
 *
 * Returns 0 on success or a negative errno.
 */
static inline int ovl_dir_read(struct path *realpath,
struct ovl_readdir_data *rdd)
{
struct file *realfile;
int err;
realfile = ovl_path_open(realpath, O_RDONLY | O_DIRECTORY);
if (IS_ERR(realfile))
return PTR_ERR(realfile);
rdd->first_maybe_whiteout = NULL;
//rdd->ctx.pos = 0;
do {
rdd->count = 0;
rdd->err = 0;
/* rdd itself is the opaque buffer handed to the callback */
err = vfs_readdir(realfile, rdd->ctx.actor, rdd);
if (err >= 0)
err = rdd->err;
} while (!err && rdd->count);
if (!err && rdd->first_maybe_whiteout)
err = ovl_check_whiteouts(realpath->dentry, rdd);
fput(realfile);
return err;
}
static void ovl_dir_reset(struct file *file)
{
struct ovl_dir_file *od = file->private_data;
struct ovl_dir_cache *cache = od->cache;
struct dentry *dentry = file->f_path.dentry;
enum ovl_path_type type = ovl_path_type(dentry);
if (cache && ovl_dentry_version_get(dentry) != cache->version) {
ovl_cache_put(od, dentry);
od->cache = NULL;
od->cursor = NULL;
}
WARN_ON(!od->is_real && !OVL_TYPE_MERGE(type));
if (od->is_real && OVL_TYPE_MERGE(type))
od->is_real = false;
}
/*
 * Build the merged listing of @dentry into @list.
 *
 * Iterates over the real directories yielded by ovl_path_next(). While
 * ovl_path_next() returns something other than -1, the directory is read
 * with is_merge == false, so names are de-duplicated through the rb-tree
 * and appended to @list. The final directory (ovl_path_next() returned -1)
 * is read with is_merge == true, which places its entries at the position
 * of the temporary rdd.middle marker inserted at the head of @list.
 *
 * Returns 0 on success or the first error from ovl_dir_read(). On error the
 * entries collected so far remain on @list for the caller to free.
 */
static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list)
{
int err;
struct path realpath;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_fill_merge,
.list = list,
.root = RB_ROOT,
.is_merge = false,
};
int idx, next;
for (idx = 0; idx != -1; idx = next) {
next = ovl_path_next(idx, dentry, &realpath);
if (next != -1) {
err = ovl_dir_read(&realpath, &rdd);
if (err)
break;
} else {
/*
* Insert lowest layer entries before upper ones, this
* allows offsets to be reasonably constant
*/
list_add(&rdd.middle, rdd.list);
rdd.is_merge = true;
err = ovl_dir_read(&realpath, &rdd);
/* the marker lives on this stack frame; unlink it before returning */
list_del(&rdd.middle);
}
}
return err;
}
/*
 * Point od->cursor at the @pos-th node of od->cache->entries. If the list
 * has fewer than @pos nodes the cursor ends up on the list head itself,
 * which readers treat as end-of-directory.
 */
static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos)
{
	struct list_head *head = &od->cache->entries;
	struct list_head *node = head->next;
	loff_t index;

	for (index = 0; node != head && index < pos; index++)
		node = node->next;

	/* Cursor is safe since the cache is stable */
	od->cursor = node;
}
/*
 * Get a referenced merged-listing cache for @dentry.
 *
 * If the dentry already has a cache whose version matches, its refcount is
 * bumped and it is returned. Otherwise the dentry's cache pointer is
 * cleared, a new cache is built with ovl_dir_read_merged(), stamped with
 * the current version, attached to the dentry and returned with a refcount
 * of 1.
 *
 * Returns the cache or an ERR_PTR() (-ENOMEM or the read error).
 */
static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
{
	struct ovl_dir_cache *cache = ovl_dir_cache(dentry);
	int err;

	if (cache && cache->version == ovl_dentry_version_get(dentry)) {
		cache->refcount++;
		return cache;
	}

	ovl_set_dir_cache(dentry, NULL);

	cache = kzalloc(sizeof(*cache), GFP_KERNEL);
	if (!cache)
		return ERR_PTR(-ENOMEM);

	cache->refcount = 1;
	INIT_LIST_HEAD(&cache->entries);

	err = ovl_dir_read_merged(dentry, &cache->entries);
	if (!err) {
		cache->version = ovl_dentry_version_get(dentry);
		ovl_set_dir_cache(dentry, cache);
		return cache;
	}

	ovl_cache_free(&cache->entries);
	kfree(cache);
	return ERR_PTR(err);
}
/*
 * .readdir handler for overlay directories.
 *
 * At position 0 the per-open state is revalidated with ovl_dir_reset().
 * For a "real" directory the request is forwarded to vfs_readdir() on the
 * underlying file and f_pos mirrors that file's position. Otherwise entries
 * are emitted from the cached merged listing; f_pos then counts list
 * entries, including whiteouts, which are skipped rather than emitted.
 *
 * Returns the vfs_readdir() result for real directories, a negative errno
 * if the cache cannot be obtained, and 0 otherwise.
 */
static int ovl_readdir(struct file *file, void *buf, filldir_t filler)
{
struct ovl_dir_file *od = file->private_data;
struct dentry *dentry = file->f_path.dentry;
struct ovl_cache_entry *p;
int res;
if (!file->f_pos)
ovl_dir_reset(file);
if (od->is_real) {
res = vfs_readdir(od->realfile, filler, buf);
file->f_pos = od->realfile->f_pos;
return res;
}
if (!od->cache) {
struct ovl_dir_cache *cache;
cache = ovl_cache_get(dentry);
if (IS_ERR(cache))
return PTR_ERR(cache);
od->cache = cache;
/* position the cursor to match the current f_pos */
ovl_seek_cursor(od, file->f_pos);
}
while (od->cursor != &od->cache->entries) {
p = list_entry(od->cursor, struct ovl_cache_entry, l_node);
if (!p->is_whiteout)
/* a non-zero return from the filler stops the walk without advancing */
if (filler(buf, p->name, p->len, file->f_pos, p->ino, p->type))
break;
od->cursor = p->l_node.next;
file->f_pos++;
}
return 0;
}
/*
 * .llseek handler for overlay directories, run under the inode's i_mutex.
 *
 * Real directories forward to vfs_llseek() on the underlying file and copy
 * its resulting position. Merged directories accept only SEEK_SET and
 * SEEK_CUR with a non-negative resulting offset; the offset is an index
 * into the cached listing and the cursor is repositioned if a cache exists.
 *
 * Returns the new position, or -EINVAL for an unsupported origin or a
 * negative offset in merged mode.
 */
static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin)
{
loff_t res;
struct ovl_dir_file *od = file->private_data;
mutex_lock(&file_inode(file)->i_mutex);
if (!file->f_pos)
ovl_dir_reset(file);
if (od->is_real) {
res = vfs_llseek(od->realfile, offset, origin);
file->f_pos = od->realfile->f_pos;
} else {
res = -EINVAL;
switch (origin) {
case SEEK_CUR:
offset += file->f_pos;
break;
case SEEK_SET:
break;
default:
goto out_unlock;
}
if (offset < 0)
goto out_unlock;
if (offset != file->f_pos) {
file->f_pos = offset;
/* without a cache the cursor is set when the cache is first built */
if (od->cache)
ovl_seek_cursor(od, offset);
}
res = offset;
}
out_unlock:
mutex_unlock(&file_inode(file)->i_mutex);
return res;
}
/*
 * .fsync handler for overlay directories.
 *
 * Syncs od->realfile, unless the directory was not upper at open time but
 * has the UPPER flag now. In that case the upper directory is opened on
 * demand, remembered in od->upperfile (published under the inode's
 * i_mutex; a racing opener's duplicate is released) and synced instead.
 *
 * Returns the vfs_fsync_range() result, or the error from opening the upper
 * directory when no upperfile could be established.
 */
static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
int datasync)
{
struct ovl_dir_file *od = file->private_data;
struct dentry *dentry = file->f_path.dentry;
struct file *realfile = od->realfile;
/*
* Need to check if we started out being a lower dir, but got copied up
*/
if (!od->is_upper && OVL_TYPE_UPPER(ovl_path_type(dentry))) {
struct inode *inode = file_inode(file);
/* fast path: upperfile already published by an earlier call */
realfile = lockless_dereference(od->upperfile);
if (!realfile) {
struct path upperpath;
ovl_path_upper(dentry, &upperpath);
realfile = ovl_path_open(&upperpath, O_RDONLY);
smp_mb__before_spinlock();
mutex_lock(&inode->i_mutex);
if (!od->upperfile) {
if (IS_ERR(realfile)) {
mutex_unlock(&inode->i_mutex);
return PTR_ERR(realfile);
}
od->upperfile = realfile;
} else {
/* somebody has beaten us to it */
if (!IS_ERR(realfile))
fput(realfile);
realfile = od->upperfile;
}
mutex_unlock(&inode->i_mutex);
}
}
return vfs_fsync_range(realfile, start, end, datasync);
}
static int ovl_dir_release(struct inode *inode, struct file *file)
{
struct ovl_dir_file *od = file->private_data;
if (od->cache) {
mutex_lock(&inode->i_mutex);
ovl_cache_put(od, file->f_path.dentry);
mutex_unlock(&inode->i_mutex);
}
fput(od->realfile);
if (od->upperfile)
fput(od->upperfile);
kfree(od);
return 0;
}
static int ovl_dir_open(struct inode *inode, struct file *file)
{
struct path realpath;
struct file *realfile;
struct ovl_dir_file *od;
enum ovl_path_type type;
od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL);
if (!od)
return -ENOMEM;
type = ovl_path_real(file->f_path.dentry, &realpath);
realfile = ovl_path_open(&realpath, file->f_flags);
if (IS_ERR(realfile)) {
kfree(od);
return PTR_ERR(realfile);
}
od->realfile = realfile;
od->is_real = !OVL_TYPE_MERGE(type);
od->is_upper = OVL_TYPE_UPPER(type);
file->private_data = od;
return 0;
}
/*
 * File operations for overlay directories; installed as i_fop by
 * ovl_new_inode() for S_IFDIR inodes (declared in overlayfs.h).
 */
const struct file_operations ovl_dir_operations = {
.read = generic_read_dir,
.open = ovl_dir_open,
.readdir = ovl_readdir,
.llseek = ovl_dir_llseek,
.fsync = ovl_dir_fsync,
.release = ovl_dir_release,
};
/*
 * Check whether the merged directory @dentry is empty.
 *
 * The merged listing is read into @list (left populated for the caller).
 * Whiteouts and the "." / ".." entries do not count as content.
 *
 * Returns 0 if empty, -ENOTEMPTY if any other entry exists, or the error
 * from reading the directory.
 */
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list)
{
	struct ovl_cache_entry *entry;
	int err;

	err = ovl_dir_read_merged(dentry, list);
	if (err)
		return err;

	list_for_each_entry(entry, list, l_node) {
		bool is_dot;
		bool is_dotdot;

		if (entry->is_whiteout)
			continue;

		is_dot = entry->name[0] == '.' && entry->len == 1;
		is_dotdot = entry->name[0] == '.' && entry->len == 2 &&
			    entry->name[1] == '.';
		if (is_dot || is_dotdot)
			continue;

		return -ENOTEMPTY;
	}

	return 0;
}
/*
 * Remove from directory @upper every entry on @list that is marked as a
 * whiteout. Runs with @upper's i_mutex held (I_MUTEX_CHILD class). A failed
 * lookup is logged and skipped; removal itself goes through ovl_cleanup().
 */
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
{
	struct ovl_cache_entry *entry;

	mutex_lock_nested(&upper->d_inode->i_mutex, I_MUTEX_CHILD);

	list_for_each_entry(entry, list, l_node) {
		struct dentry *victim;

		if (!entry->is_whiteout)
			continue;

		victim = lookup_one_len(entry->name, upper, entry->len);
		if (!IS_ERR(victim)) {
			ovl_cleanup(upper->d_inode, victim);
			dput(victim);
			continue;
		}

		pr_err("overlayfs: lookup '%s/%.*s' failed (%i)\n",
		       upper->d_name.name, entry->len, entry->name,
		       (int) PTR_ERR(victim));
	}

	mutex_unlock(&upper->d_inode->i_mutex);
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,21 @@
# Out-of-tree kbuild makefile for the mcoverlay kernel module.
# NOTE(review): the @...@ tokens look like configure-time substitutions — confirm.
KDIR ?= @KDIR@
ARCH ?= @ARCH@
KMODDIR = @KMODDIR@
src = @abs_srcdir@
# Module to build and the objects linked into it.
obj-m += mcoverlay.o
mcoverlay-y := copy_up.o dir.o inode.o readdir.o super.o
.PHONY: clean install modules
# Build the module against the kernel tree at $(KDIR).
modules:
$(MAKE) -C $(KDIR) M=$(PWD) SUBDIRS=$(PWD) ARCH=$(ARCH) modules
# Remove build products.
clean:
$(RM) .*.cmd *.mod.c *.o *.ko* Module.symvers modules.order -r .tmp*
# Install mcoverlay.ko into $(KMODDIR), creating the directory if needed.
install:
mkdir -p -m 755 $(KMODDIR)
install -m 644 mcoverlay.ko $(KMODDIR)

View File

@ -0,0 +1,416 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/splice.h>
#include <linux/xattr.h>
#include <linux/security.h>
#include <linux/uaccess.h>
#include <linux/sched.h>
#include <linux/namei.h>
#include "overlayfs.h"
/* Upper bound, in bytes (1 MiB), on a single splice in ovl_copy_up_data(). */
#define OVL_COPY_UP_CHUNK_SIZE (1 << 20)
/*
 * Copy every extended attribute of @old onto @new.
 *
 * Returns 0 on success. That includes the cases where either inode has no
 * getxattr operation, where listing is not supported (-EOPNOTSUPP) and
 * where @old has no attributes. Otherwise returns the negative errno from
 * listing, reading or setting an attribute, or -ENOMEM.
 */
int ovl_copy_xattr(struct dentry *old, struct dentry *new)
{
	ssize_t list_size, size;
	char *buf, *name, *value;
	int error;

	if (!old->d_inode->i_op->getxattr ||
	    !new->d_inode->i_op->getxattr)
		return 0;

	/* First call only sizes the name list. */
	list_size = vfs_listxattr(old, NULL, 0);
	if (list_size <= 0) {
		if (list_size == -EOPNOTSUPP)
			return 0;
		return list_size;
	}

	buf = kzalloc(list_size, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	error = -ENOMEM;
	value = kmalloc(XATTR_SIZE_MAX, GFP_KERNEL);
	if (!value)
		goto out;

	list_size = vfs_listxattr(old, buf, list_size);
	if (list_size <= 0) {
		error = list_size;
		goto out_free_value;
	}

	/* buf holds consecutive NUL-terminated attribute names. */
	for (name = buf; name < (buf + list_size); name += strlen(name) + 1) {
		size = vfs_getxattr(old, name, value, XATTR_SIZE_MAX);
		/*
		 * Only a negative result is an error. A zero-length value is
		 * a valid attribute and must be copied like any other;
		 * treating it as a stop condition would return success while
		 * silently dropping it and every attribute after it.
		 */
		if (size < 0) {
			error = size;
			goto out_free_value;
		}
		error = vfs_setxattr(new, name, value, size, 0);
		if (error)
			goto out_free_value;
	}

out_free_value:
	kfree(value);
out:
	kfree(buf);
	return error;
}
/*
 * Copy @len bytes of file data from @old to @new.
 *
 * Data is spliced in chunks of at most OVL_COPY_UP_CHUNK_SIZE bytes; a
 * pending fatal signal aborts the copy with -EINTR between chunks.
 *
 * Returns 0 on success (including @len == 0, where nothing is opened), the
 * error from opening either file, -EINTR, or the non-positive result of
 * do_splice_direct() that ended the loop.
 */
static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
{
	struct file *src;
	struct file *dst;
	loff_t src_pos = 0;
	loff_t dst_pos = 0;
	int error = 0;

	if (!len)
		return 0;

	src = ovl_path_open(old, O_RDONLY);
	if (IS_ERR(src))
		return PTR_ERR(src);

	dst = ovl_path_open(new, O_WRONLY);
	if (IS_ERR(dst)) {
		error = PTR_ERR(dst);
		goto out_put_src;
	}

	/* FIXME: copy up sparse files efficiently */
	while (len != 0) {
		size_t chunk = OVL_COPY_UP_CHUNK_SIZE;
		long copied;

		if (len < chunk)
			chunk = len;

		if (signal_pending_state(TASK_KILLABLE, current)) {
			error = -EINTR;
			break;
		}

		copied = do_splice_direct(src, &src_pos,
					  dst, &dst_pos,
					  chunk, SPLICE_F_MOVE);
		if (copied <= 0) {
			error = copied;
			break;
		}
		WARN_ON(src_pos != dst_pos);

		len -= copied;
	}

	fput(dst);
out_put_src:
	fput(src);
	return error;
}
/*
 * Read the target of the symlink @realdentry into a freshly allocated page.
 *
 * Returns a NUL-terminated string that the caller must release with
 * free_page(), or an ERR_PTR(): -EINVAL if the inode has no readlink
 * operation, -ENOMEM if no page is available, or the readlink error.
 * At most PAGE_SIZE - 1 bytes of the target are read.
 */
static char *ovl_read_symlink(struct dentry *realdentry)
{
int res;
char *buf;
struct inode *inode = realdentry->d_inode;
mm_segment_t old_fs;
res = -EINVAL;
if (!inode->i_op->readlink)
goto err;
res = -ENOMEM;
buf = (char *) __get_free_page(GFP_KERNEL);
if (!buf)
goto err;
/* temporarily widen the address limit so readlink accepts a kernel buffer */
old_fs = get_fs();
set_fs(get_ds());
/* The cast to a user pointer is valid due to the set_fs() */
res = inode->i_op->readlink(realdentry,
(char __user *)buf, PAGE_SIZE - 1);
set_fs(old_fs);
if (res < 0) {
free_page((unsigned long) buf);
goto err;
}
/* res is the number of bytes stored; terminate the string */
buf[res] = '\0';
return buf;
err:
return ERR_PTR(res);
}
static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
{
struct iattr attr = {
.ia_valid =
ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET,
.ia_atime = stat->atime,
.ia_mtime = stat->mtime,
};
return notify_change(upperdentry, &attr, NULL);
}
int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
{
int err = 0;
if (!S_ISLNK(stat->mode)) {
struct iattr attr = {
.ia_valid = ATTR_MODE,
.ia_mode = stat->mode,
};
err = notify_change(upperdentry, &attr, NULL);
}
if (!err) {
struct iattr attr = {
.ia_valid = ATTR_UID | ATTR_GID,
.ia_uid = stat->uid,
.ia_gid = stat->gid,
};
err = notify_change(upperdentry, &attr, NULL);
}
if (!err)
ovl_set_timestamps(upperdentry, stat);
return err;
}
/*
 * Copy @dentry up to the upper layer; the caller has locked @workdir and
 * @upperdir (see ovl_copy_up_one()).
 *
 * The object is first created under a temporary name in @workdir, filled
 * with data (regular files only), extended attributes and the attributes
 * from @stat (plus @attr, if given), and only then renamed into @upperdir
 * under the dentry's own name. On any failure after creation the temporary
 * object is removed again.
 *
 * @link is the symlink target and is only used for symlinks.
 *
 * Returns 0 on success or a negative errno.
 */
static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
			      struct dentry *dentry, struct path *lowerpath,
			      struct kstat *stat, struct iattr *attr,
			      const char *link)
{
	struct inode *wdir = workdir->d_inode;
	struct inode *udir = upperdir->d_inode;
	struct dentry *newdentry = NULL;
	struct dentry *upper = NULL;
	umode_t mode = stat->mode;
	int err;

	newdentry = ovl_lookup_temp(workdir, dentry);
	err = PTR_ERR(newdentry);
	if (IS_ERR(newdentry))
		goto out;

	upper = lookup_one_len(dentry->d_name.name, upperdir,
			       dentry->d_name.len);
	err = PTR_ERR(upper);
	if (IS_ERR(upper))
		goto out1;

	/* Can't properly set mode on creation because of the umask */
	stat->mode &= S_IFMT;
	err = ovl_create_real(wdir, newdentry, stat, link, NULL, true);
	stat->mode = mode;
	if (err)
		goto out2;

	if (S_ISREG(stat->mode)) {
		struct path upperpath;

		/* Borrow the upper mount; the dentry must not exist yet. */
		ovl_path_upper(dentry, &upperpath);
		BUG_ON(upperpath.dentry != NULL);
		upperpath.dentry = newdentry;

		err = ovl_copy_up_data(lowerpath, &upperpath, stat->size);
		if (err)
			goto out_cleanup;
	}

	err = ovl_copy_xattr(lowerpath->dentry, newdentry);
	if (err)
		goto out_cleanup;

	mutex_lock(&newdentry->d_inode->i_mutex);
	err = ovl_set_attr(newdentry, stat);
	if (!err && attr)
		err = notify_change(newdentry, attr, NULL);
	mutex_unlock(&newdentry->d_inode->i_mutex);
	if (err)
		goto out_cleanup;

	err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
	if (err)
		goto out_cleanup;

	/* The overlay dentry takes over the reference on newdentry. */
	ovl_dentry_update(dentry, newdentry);
	newdentry = NULL;

	/*
	 * Non-directories become opaque when copied up.
	 */
	if (!S_ISDIR(stat->mode))
		ovl_dentry_set_opaque(dentry, true);
out2:
	dput(upper);
out1:
	dput(newdentry);
out:
	return err;

out_cleanup:
	ovl_cleanup(wdir, newdentry);
	/*
	 * ovl_cleanup() does not consume our references, so leave through
	 * out2 to drop both 'upper' and 'newdentry'. Jumping straight to
	 * 'out' would leak them.
	 */
	goto out2;
}
/*
* Copy up a single dentry
*
* Directory renames only allowed on "pure upper" (already created on
* upper filesystem, never copied up). Directories which are on lower or
* are merged may not be renamed. For these -EXDEV is returned and
* userspace has to deal with it. This means, when copying up a
* directory we can rely on it and ancestors being stable.
*
* Non-directory renames start with copy up of source if necessary. The
* actual rename will only proceed once the copy up was successful. Copy
* up uses upper parent i_mutex for exclusion. Since rename can change
* d_parent it is possible that the copy up will lock the old parent. At
* that point the file will have already been copied up anyway.
*
* @parent:    overlay parent of @dentry; its upper dentry is the target dir
* @dentry:    overlay dentry to copy up
* @lowerpath: real lower path of @dentry (source of data and xattrs)
* @stat:      attributes of the lower object
* @attr:      optional attribute change applied to the upper object
*
* Returns 0 on success (also when another copy-up won the race) or a
* negative errno; -EROFS if there is no workdir.
*/
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
struct path *lowerpath, struct kstat *stat,
struct iattr *attr)
{
struct dentry *workdir = ovl_workdir(dentry);
int err;
struct kstat pstat;
struct path parentpath;
struct dentry *upperdir;
struct dentry *upperdentry;
const struct cred *old_cred;
struct cred *override_cred;
char *link = NULL;
if (WARN_ON(!workdir))
return -EROFS;
ovl_path_upper(parent, &parentpath);
upperdir = parentpath.dentry;
/* remember the parent's attributes so its timestamps can be restored */
err = vfs_getattr(&parentpath, &pstat);
if (err)
return err;
if (S_ISLNK(stat->mode)) {
link = ovl_read_symlink(lowerpath->dentry);
if (IS_ERR(link))
return PTR_ERR(link);
}
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_free_link;
/* create the upper object as the owner of the lower one */
override_cred->fsuid = stat->uid;
override_cred->fsgid = stat->gid;
/*
* CAP_SYS_ADMIN for copying up extended attributes
* CAP_DAC_OVERRIDE for create
* CAP_FOWNER for chmod, timestamp update
* CAP_FSETID for chmod
* CAP_CHOWN for chown
* CAP_MKNOD for mknod
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
cap_raise(override_cred->cap_effective, CAP_FSETID);
cap_raise(override_cred->cap_effective, CAP_CHOWN);
cap_raise(override_cred->cap_effective, CAP_MKNOD);
old_cred = override_creds(override_cred);
err = -EIO;
if (lock_rename(workdir, upperdir) != NULL) {
pr_err("overlayfs: failed to lock workdir+upperdir\n");
goto out_unlock;
}
upperdentry = ovl_dentry_upper(dentry);
if (upperdentry) {
unlock_rename(workdir, upperdir);
err = 0;
/* Raced with another copy-up? Do the setattr here */
if (attr) {
mutex_lock(&upperdentry->d_inode->i_mutex);
err = notify_change(upperdentry, attr, NULL);
mutex_unlock(&upperdentry->d_inode->i_mutex);
}
/* already unlocked above, so skip out_unlock */
goto out_put_cred;
}
err = ovl_copy_up_locked(workdir, upperdir, dentry, lowerpath,
stat, attr, link);
if (!err) {
/* Restore timestamps on parent (best effort) */
ovl_set_timestamps(upperdir, &pstat);
}
out_unlock:
unlock_rename(workdir, upperdir);
out_put_cred:
revert_creds(old_cred);
put_cred(override_cred);
out_free_link:
if (link)
free_page((unsigned long) link);
return err;
}
/*
 * Copy @dentry up to the upper layer, together with every ancestor that is
 * not there yet.
 *
 * Each pass of the outer loop walks from @dentry towards the root until it
 * finds a dentry whose parent is already upper, copies that one dentry up
 * with ovl_copy_up_one(), and starts over. The loop ends when @dentry
 * itself has the UPPER flag or an error occurs.
 *
 * Returns 0 on success or a negative errno.
 */
int ovl_copy_up(struct dentry *dentry)
{
int err;
err = 0;
while (!err) {
struct dentry *next;
struct dentry *parent;
struct path lowerpath;
struct kstat stat;
enum ovl_path_type type = ovl_path_type(dentry);
if (OVL_TYPE_UPPER(type))
break;
next = dget(dentry);
/* find the topmost dentry not yet copied up */
for (;;) {
parent = dget_parent(next);
type = ovl_path_type(parent);
if (OVL_TYPE_UPPER(type))
break;
/* step up: the reference on parent becomes the reference on next */
dput(next);
next = parent;
}
ovl_path_lower(next, &lowerpath);
err = vfs_getattr(&lowerpath, &stat);
if (!err)
err = ovl_copy_up_one(parent, next, &lowerpath, &stat, NULL);
dput(parent);
dput(next);
}
return err;
}

View File

@ -0,0 +1,951 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/xattr.h>
#include <linux/security.h>
#include <linux/cred.h>
#include "overlayfs.h"
/*
 * Remove @wdentry from directory @wdir, using rmdir for directories and
 * unlink for everything else. An extra reference is held across the
 * removal. Failure is logged, not returned.
 */
void ovl_cleanup(struct inode *wdir, struct dentry *wdentry)
{
	int err;

	dget(wdentry);
	err = d_is_dir(wdentry) ? ovl_do_rmdir(wdir, wdentry)
				: ovl_do_unlink(wdir, wdentry);
	dput(wdentry);

	if (!err)
		return;

	pr_err("overlayfs: cleanup of '%pd2' failed (%i)\n",
	       wdentry, err);
}
/*
 * Look up a temporary name in @workdir derived from the address of
 * @dentry ("#" followed by the pointer value in hex).
 *
 * Returns the (negative) dentry for that name, the ERR_PTR() from the
 * lookup, or ERR_PTR(-EIO) if an object with that name already exists.
 */
struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry)
{
	char name[20];
	struct dentry *temp;

	snprintf(name, sizeof(name), "#%lx", (unsigned long) dentry);

	temp = lookup_one_len(name, workdir, strlen(name));
	if (IS_ERR(temp) || !temp->d_inode)
		return temp;

	pr_err("overlayfs: workdir/%s already exists\n", name);
	dput(temp);
	return ERR_PTR(-EIO);
}
/*
 * Create a whiteout under a temporary name in @workdir.
 *
 * caller holds i_mutex on workdir
 *
 * Returns the whiteout dentry, or an ERR_PTR() from the temporary-name
 * lookup or from creating the whiteout.
 */
static struct dentry *ovl_whiteout(struct dentry *workdir,
				   struct dentry *dentry)
{
	struct dentry *whiteout = ovl_lookup_temp(workdir, dentry);
	int err;

	if (IS_ERR(whiteout))
		return whiteout;

	err = ovl_do_whiteout(workdir->d_inode, whiteout);
	if (!err)
		return whiteout;

	dput(whiteout);
	return ERR_PTR(err);
}
/*
 * Create the real object @newdentry in @dir.
 *
 * With @hardlink set, a hard link to it is made. Otherwise the object type
 * is taken from stat->mode: regular file, directory, device/FIFO/socket
 * node (using stat->rdev) or symlink (to @link). @debug is passed to the
 * ovl_do_*() helpers to enable their debug output.
 *
 * Returns 0 on success, -ESTALE if @newdentry is already positive, -EPERM
 * for an unknown file type, -ENOENT (with a warning) if creation reported
 * success but left the dentry negative, or the error of the create call.
 */
int ovl_create_real(struct inode *dir, struct dentry *newdentry,
		    struct kstat *stat, const char *link,
		    struct dentry *hardlink, bool debug)
{
	int err;

	if (newdentry->d_inode)
		return -ESTALE;

	if (!hardlink) {
		switch (stat->mode & S_IFMT) {
		case S_IFREG:
			err = ovl_do_create(dir, newdentry, stat->mode, debug);
			break;

		case S_IFDIR:
			err = ovl_do_mkdir(dir, newdentry, stat->mode, debug);
			break;

		case S_IFLNK:
			err = ovl_do_symlink(dir, newdentry, link, debug);
			break;

		case S_IFCHR:
		case S_IFBLK:
		case S_IFIFO:
		case S_IFSOCK:
			err = ovl_do_mknod(dir, newdentry,
					   stat->mode, stat->rdev, debug);
			break;

		default:
			err = -EPERM;
		}
	} else {
		err = ovl_do_link(hardlink, dir, newdentry, debug);
	}

	/*
	 * Not quite sure if non-instantiated dentry is legal or not.
	 * VFS doesn't seem to care so check and warn here.
	 */
	if (!err && WARN_ON(!newdentry->d_inode))
		err = -ENOENT;

	return err;
}
/*
 * Mark @upperdentry opaque by setting the OVL_XATTR_OPAQUE xattr to the
 * single byte "y". Returns the setxattr result.
 */
static int ovl_set_opaque(struct dentry *upperdentry)
{
	int err;

	err = ovl_do_setxattr(upperdentry, OVL_XATTR_OPAQUE, "y", 1, 0);
	return err;
}
/*
 * Remove the OVL_XATTR_OPAQUE xattr from @upperdentry. Failure is only
 * logged as a warning.
 */
static void ovl_remove_opaque(struct dentry *upperdentry)
{
	int err = ovl_do_removexattr(upperdentry, OVL_XATTR_OPAQUE);

	if (!err)
		return;

	pr_warn("overlayfs: failed to remove opaque from '%s' (%i)\n",
		upperdentry->d_name.name, err);
}
static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
int err;
enum ovl_path_type type;
struct path realpath;
type = ovl_path_real(dentry, &realpath);
err = vfs_getattr(&realpath, stat);
if (err)
return err;
stat->dev = dentry->d_sb->s_dev;
stat->ino = dentry->d_inode->i_ino;
/*
* It's probably not worth it to count subdirs to get the
* correct link count. nlink=1 seems to pacify 'find' and
* other utilities.
*/
if (OVL_TYPE_MERGE(type))
stat->nlink = 1;
return 0;
}
/*
 * Create a new object directly in the upper directory of @dentry's parent
 * and bind it to the overlay @dentry and @inode.
 *
 * Runs with the upper parent's i_mutex held (I_MUTEX_PARENT class). On
 * success the parent's version is bumped, the new upper dentry is attached
 * to @dentry, ownership is copied onto @inode and @dentry is instantiated.
 *
 * Returns 0 on success, or the error from the lookup or ovl_create_real().
 */
static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
struct kstat *stat, const char *link,
struct dentry *hardlink)
{
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *udir = upperdir->d_inode;
struct dentry *newdentry;
int err;
mutex_lock_nested(&udir->i_mutex, I_MUTEX_PARENT);
newdentry = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out_unlock;
err = ovl_create_real(udir, newdentry, stat, link, hardlink, false);
if (err)
goto out_dput;
ovl_dentry_version_inc(dentry->d_parent);
ovl_dentry_update(dentry, newdentry);
ovl_copyattr(newdentry->d_inode, inode);
d_instantiate(dentry, inode);
/* reference handed to the overlay dentry; make the dput() below a no-op */
newdentry = NULL;
out_dput:
dput(newdentry);
out_unlock:
mutex_unlock(&udir->i_mutex);
return err;
}
/*
 * Take the rename locks on @workdir and @upperdir.
 *
 * Returns 0 with both locked, or -EIO (nothing locked) when the two are the
 * same dentry or lock_rename() returns a non-NULL dentry.
 */
static int ovl_lock_rename_workdir(struct dentry *workdir,
				   struct dentry *upperdir)
{
	/* Workdir should not be the same as upperdir */
	if (workdir != upperdir) {
		/* Workdir should not be subdir of upperdir and vice versa */
		if (lock_rename(workdir, upperdir) == NULL)
			return 0;

		unlock_rename(workdir, upperdir);
	}

	pr_err("overlayfs: failed to lock workdir+upperdir\n");
	return -EIO;
}
/*
 * Swap the upper directory of @dentry for a fresh opaque directory.
 *
 * A new directory with the attributes and xattrs of the current upper
 * directory is created in the work directory, marked opaque, and exchanged
 * with the upper directory (RENAME_EXCHANGE).  The old upper directory,
 * which ends up in the work directory, has the whiteouts on @list removed
 * and is then cleaned up.
 *
 * Returns the new opaque directory dentry (the caller must dput() it) or an
 * ERR_PTR: -EROFS if there is no work directory, -ESTALE if the upper
 * object is no longer a directory under the expected parent, or the error
 * of the failing step.
 */
static struct dentry *ovl_clear_empty(struct dentry *dentry,
				      struct list_head *list)
{
	struct dentry *workdir = ovl_workdir(dentry);
	struct inode *wdir;
	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
	struct inode *udir = upperdir->d_inode;
	struct path upperpath;
	struct dentry *upper;
	struct dentry *opaquedir;
	struct kstat stat;
	int err;

	if (WARN_ON(!workdir))
		return ERR_PTR(-EROFS);

	/* Only dereference workdir once it is known to be non-NULL. */
	wdir = workdir->d_inode;

	err = ovl_lock_rename_workdir(workdir, upperdir);
	if (err)
		goto out;

	ovl_path_upper(dentry, &upperpath);
	err = vfs_getattr(&upperpath, &stat);
	if (err)
		goto out_unlock;

	/* The upper object must still be a directory in the locked parent. */
	err = -ESTALE;
	if (!S_ISDIR(stat.mode))
		goto out_unlock;
	upper = upperpath.dentry;
	if (upper->d_parent->d_inode != udir)
		goto out_unlock;

	opaquedir = ovl_lookup_temp(workdir, dentry);
	err = PTR_ERR(opaquedir);
	if (IS_ERR(opaquedir))
		goto out_unlock;

	err = ovl_create_real(wdir, opaquedir, &stat, NULL, NULL, true);
	if (err)
		goto out_dput;

	err = ovl_copy_xattr(upper, opaquedir);
	if (err)
		goto out_cleanup;

	err = ovl_set_opaque(opaquedir);
	if (err)
		goto out_cleanup;

	mutex_lock(&opaquedir->d_inode->i_mutex);
	err = ovl_set_attr(opaquedir, &stat);
	mutex_unlock(&opaquedir->d_inode->i_mutex);
	if (err)
		goto out_cleanup;

	err = ovl_do_rename(wdir, opaquedir, udir, upper, RENAME_EXCHANGE);
	if (err)
		goto out_cleanup;

	/* After the exchange, 'upper' is the old directory, now in workdir. */
	ovl_cleanup_whiteouts(upper, list);
	ovl_cleanup(wdir, upper);
	unlock_rename(workdir, upperdir);

	/* dentry's upper doesn't match now, get rid of it */
	d_drop(dentry);

	return opaquedir;

out_cleanup:
	ovl_cleanup(wdir, opaquedir);
out_dput:
	dput(opaquedir);
out_unlock:
	unlock_rename(workdir, upperdir);
out:
	return ERR_PTR(err);
}
/*
 * Check that the directory @dentry is empty and, if it has an upper
 * dentry, replace that with a clean opaque directory.
 *
 * Returns an ERR_PTR if the emptiness check or the clearing fails, NULL
 * when there was no upper dentry to clear, otherwise whatever
 * ovl_clear_empty() returns.
 */
static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry)
{
	struct dentry *result = NULL;
	LIST_HEAD(list);
	int rc = ovl_check_empty_dir(dentry, &list);

	/*
	 * If no upperdentry then skip clearing whiteouts.
	 *
	 * Can race with copy-up, since we don't hold the upperdir mutex.
	 * Doesn't matter, since copy-up can't create a non-empty directory
	 * from an empty one.
	 */
	if (rc)
		result = ERR_PTR(rc);
	else if (ovl_dentry_upper(dentry))
		result = ovl_clear_empty(dentry, &list);

	ovl_cache_free(&list);
	return result;
}
/*
 * Create @dentry when its name in the upper directory is already occupied
 * (the caller uses this path when ovl_dentry_is_opaque(dentry) is true).
 *
 * The object is created under a temporary name in the work directory and
 * then renamed over the existing upper entry.  Directories are first marked
 * opaque and exchanged with the existing entry, which is then cleaned up
 * from the work directory; other types replace it with a plain rename.
 *
 * Returns 0 on success (@inode is then attached to @dentry), -EROFS if
 * there is no work directory, or the error of the failing step.
 */
static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
				    struct kstat *stat, const char *link,
				    struct dentry *hardlink)
{
	struct dentry *workdir = ovl_workdir(dentry);
	struct inode *wdir;
	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
	struct inode *udir = upperdir->d_inode;
	struct dentry *upper;
	struct dentry *newdentry;
	int err;

	if (WARN_ON(!workdir))
		return -EROFS;

	/* Only dereference workdir once it is known to be non-NULL. */
	wdir = workdir->d_inode;

	err = ovl_lock_rename_workdir(workdir, upperdir);
	if (err)
		goto out;

	newdentry = ovl_lookup_temp(workdir, dentry);
	err = PTR_ERR(newdentry);
	if (IS_ERR(newdentry))
		goto out_unlock;

	upper = lookup_one_len(dentry->d_name.name, upperdir,
			       dentry->d_name.len);
	err = PTR_ERR(upper);
	if (IS_ERR(upper))
		goto out_dput;

	err = ovl_create_real(wdir, newdentry, stat, link, hardlink, true);
	if (err)
		goto out_dput2;

	if (S_ISDIR(stat->mode)) {
		err = ovl_set_opaque(newdentry);
		if (err)
			goto out_cleanup;

		err = ovl_do_rename(wdir, newdentry, udir, upper,
				    RENAME_EXCHANGE);
		if (err)
			goto out_cleanup;

		/* The exchanged-out entry now sits in workdir; remove it. */
		ovl_cleanup(wdir, upper);
	} else {
		err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
		if (err)
			goto out_cleanup;
	}
	ovl_dentry_version_inc(dentry->d_parent);
	ovl_dentry_update(dentry, newdentry);
	ovl_copyattr(newdentry->d_inode, inode);
	d_instantiate(dentry, inode);
	/* newdentry was handed to ovl_dentry_update(); don't dput it below */
	newdentry = NULL;
out_dput2:
	dput(upper);
out_dput:
	dput(newdentry);
out_unlock:
	unlock_rename(workdir, upperdir);
out:
	return err;

out_cleanup:
	ovl_cleanup(wdir, newdentry);
	goto out_dput2;
}
/*
 * Common backend for create/mkdir/mknod/symlink/link.
 *
 * Allocates the overlay inode, copies up the parent, and creates the
 * object either directly in the upper directory or, when @dentry is opaque,
 * via the work directory with temporarily raised capabilities.
 *
 * Returns 0 on success; -ENOMEM if the inode or credentials cannot be
 * allocated; otherwise the error of the failing step.  The new inode is
 * released on every failure path.
 */
static int ovl_create_or_link(struct dentry *dentry, int mode, dev_t rdev,
			      const char *link, struct dentry *hardlink)
{
	struct kstat stat = {
		.mode = mode,
		.rdev = rdev,
	};
	struct inode *inode;
	int err;

	inode = ovl_new_inode(dentry->d_sb, mode, dentry->d_fsdata);
	if (!inode)
		return -ENOMEM;

	err = ovl_copy_up(dentry->d_parent);
	if (err)
		goto out_iput;

	if (ovl_dentry_is_opaque(dentry)) {
		const struct cred *saved_cred;
		struct cred *raised_cred = prepare_creds();

		if (!raised_cred) {
			err = -ENOMEM;
			goto out_iput;
		}

		/*
		 * CAP_SYS_ADMIN for setting opaque xattr
		 * CAP_DAC_OVERRIDE for create in workdir, rename
		 * CAP_FOWNER for removing whiteout from sticky dir
		 */
		cap_raise(raised_cred->cap_effective, CAP_SYS_ADMIN);
		cap_raise(raised_cred->cap_effective, CAP_DAC_OVERRIDE);
		cap_raise(raised_cred->cap_effective, CAP_FOWNER);

		saved_cred = override_creds(raised_cred);
		err = ovl_create_over_whiteout(dentry, inode, &stat, link,
					       hardlink);
		revert_creds(saved_cred);
		put_cred(raised_cred);
	} else {
		err = ovl_create_upper(dentry, inode, &stat, link, hardlink);
	}

	/* On success the inode has been instantiated into the dentry. */
	if (!err)
		return 0;

out_iput:
	iput(inode);
	return err;
}
/*
 * Create a new object for @dentry, bracketed by ovl_want_write() /
 * ovl_drop_write().
 */
static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
			     const char *link)
{
	int ret = ovl_want_write(dentry);

	if (ret)
		return ret;

	ret = ovl_create_or_link(dentry, mode, rdev, link, NULL);
	ovl_drop_write(dentry);

	return ret;
}
/*
 * ->create: make a regular file.  Only the permission bits (07777) of @mode
 * are kept; @dir and @excl are unused.
 */
static int ovl_create(struct inode *dir, struct dentry *dentry, umode_t mode,
		      bool excl)
{
	int file_mode = (mode & 07777) | S_IFREG;

	return ovl_create_object(dentry, file_mode, 0, NULL);
}
/*
 * ->mkdir: make a directory.  Only the permission bits (07777) of @mode are
 * kept; @dir is unused.
 */
static int ovl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
	int dir_mode = (mode & 07777) | S_IFDIR;

	return ovl_create_object(dentry, dir_mode, 0, NULL);
}
/*
 * ->mknod: make a special file.  A character device with the whiteout
 * device number is refused with -EPERM.  @dir is unused.
 */
static int ovl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
		     dev_t rdev)
{
	bool looks_like_whiteout = S_ISCHR(mode) && rdev == WHITEOUT_DEV;

	/* Don't allow creation of "whiteout" on overlay */
	if (looks_like_whiteout)
		return -EPERM;

	return ovl_create_object(dentry, mode, rdev, NULL);
}
/* ->symlink: make a symbolic link pointing at @link.  @dir is unused. */
static int ovl_symlink(struct inode *dir, struct dentry *dentry,
		       const char *link)
{
	int err;

	err = ovl_create_object(dentry, S_IFLNK, 0, link);

	return err;
}
/*
 * ->link: hard-link @old as @new.  @old is copied up first and the link is
 * made to its upper dentry.  @newdir is unused.
 */
static int ovl_link(struct dentry *old, struct inode *newdir,
		    struct dentry *new)
{
	int ret;

	ret = ovl_want_write(old);
	if (ret)
		return ret;

	ret = ovl_copy_up(old);
	if (!ret) {
		struct dentry *src = ovl_dentry_upper(old);

		ret = ovl_create_or_link(new, src->d_inode->i_mode, 0, NULL,
					 src);
	}

	ovl_drop_write(old);
	return ret;
}
/*
 * Remove @dentry by putting a whiteout in its place in the upper directory.
 *
 * For directories, emptiness is verified first; a merged or lower-only
 * directory is additionally replaced by a clean opaque directory via
 * ovl_check_empty_and_clear().  A whiteout is then created in the work
 * directory and renamed over the upper name; for directories the rename is
 * an exchange and the exchanged-out directory is cleaned up from the work
 * directory.
 *
 * Returns 0 on success, -EROFS if there is no work directory, -ESTALE if
 * the upper dentry is no longer in the expected parent, or the error of the
 * failing step.  @dentry is unhashed once a whiteout has been created, also
 * when a later step fails.
 */
static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir)
{
	struct dentry *workdir = ovl_workdir(dentry);
	struct inode *wdir;
	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
	struct inode *udir = upperdir->d_inode;
	struct dentry *whiteout;
	struct dentry *upper;
	struct dentry *opaquedir = NULL;
	int err;

	if (WARN_ON(!workdir))
		return -EROFS;

	/* Only dereference workdir once it is known to be non-NULL. */
	wdir = workdir->d_inode;

	if (is_dir) {
		if (OVL_TYPE_MERGE_OR_LOWER(ovl_path_type(dentry))) {
			opaquedir = ovl_check_empty_and_clear(dentry);
			err = PTR_ERR(opaquedir);
			if (IS_ERR(opaquedir))
				goto out;
		} else {
			LIST_HEAD(list);

			/*
			 * When removing an empty opaque directory, then it
			 * makes no sense to replace it with an exact replica of
			 * itself.  But emptiness still needs to be checked.
			 */
			err = ovl_check_empty_dir(dentry, &list);
			ovl_cache_free(&list);
			if (err)
				goto out;
		}
	}

	err = ovl_lock_rename_workdir(workdir, upperdir);
	if (err)
		goto out_dput;

	whiteout = ovl_whiteout(workdir, dentry);
	err = PTR_ERR(whiteout);
	if (IS_ERR(whiteout))
		goto out_unlock;

	upper = ovl_dentry_upper(dentry);
	if (!upper) {
		/* Nothing in upper yet: just move the whiteout into place. */
		upper = lookup_one_len(dentry->d_name.name, upperdir,
				       dentry->d_name.len);
		err = PTR_ERR(upper);
		if (IS_ERR(upper))
			goto kill_whiteout;

		err = ovl_do_rename(wdir, whiteout, udir, upper, 0);
		dput(upper);
		if (err)
			goto kill_whiteout;
	} else {
		int flags = 0;

		if (opaquedir)
			upper = opaquedir;
		err = -ESTALE;
		if (upper->d_parent != upperdir)
			goto kill_whiteout;

		if (is_dir)
			flags |= RENAME_EXCHANGE;

		err = ovl_do_rename(wdir, whiteout, udir, upper, flags);
		if (err)
			goto kill_whiteout;

		/* The exchanged-out directory now sits in workdir. */
		if (is_dir)
			ovl_cleanup(wdir, upper);
	}
	ovl_dentry_version_inc(dentry->d_parent);
out_d_drop:
	d_drop(dentry);
	dput(whiteout);
out_unlock:
	unlock_rename(workdir, upperdir);
out_dput:
	dput(opaquedir);
out:
	return err;

kill_whiteout:
	ovl_cleanup(wdir, whiteout);
	goto out_d_drop;
}
/*
 * Remove an object that exists only in the upper layer, under the upper
 * parent's i_mutex.
 *
 * Returns -ESTALE if the upper dentry is no longer a child of the upper
 * parent, otherwise the result of vfs_rmdir()/vfs_unlink().  @dentry is
 * always unhashed.
 */
static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
{
	struct dentry *parent_upper = ovl_dentry_upper(dentry->d_parent);
	struct inode *parent_inode = parent_upper->d_inode;
	struct dentry *victim = ovl_dentry_upper(dentry);
	int ret = -ESTALE;

	mutex_lock_nested(&parent_inode->i_mutex, I_MUTEX_PARENT);

	if (victim->d_parent == parent_upper) {
		/* Don't let d_delete() think it can reset d_inode */
		dget(victim);

		ret = is_dir ? vfs_rmdir(parent_inode, victim)
			     : vfs_unlink(parent_inode, victim, NULL);

		dput(victim);
		ovl_dentry_version_inc(dentry->d_parent);
	}

	/*
	 * Keeping this dentry hashed would mean having to release
	 * upperpath/lowerpath, which could only be done if we are the
	 * sole user of this dentry.  Too tricky...  Just unhash for
	 * now.
	 */
	d_drop(dentry);

	mutex_unlock(&parent_inode->i_mutex);
	return ret;
}
/*
 * Run check_sticky() on the real parent directory and real inode of
 * @dentry.  Returns -EPERM if it reports a violation, 0 otherwise.
 */
static inline int ovl_check_sticky(struct dentry *dentry)
{
	struct inode *real_parent = ovl_dentry_real(dentry->d_parent)->d_inode;
	struct inode *real_inode = ovl_dentry_real(dentry)->d_inode;

	return check_sticky(real_parent, real_inode) ? -EPERM : 0;
}
/*
 * Common backend for unlink and rmdir.
 *
 * Pure-upper objects are removed directly; everything else is replaced by
 * a whiteout, which is done with temporarily raised capabilities.
 */
static int ovl_do_remove(struct dentry *dentry, bool is_dir)
{
	const struct cred *saved_cred;
	struct cred *raised_cred;
	int ret;

	ret = ovl_check_sticky(dentry);
	if (ret)
		return ret;

	ret = ovl_want_write(dentry);
	if (ret)
		return ret;

	ret = ovl_copy_up(dentry->d_parent);
	if (ret)
		goto drop_write;

	if (OVL_TYPE_PURE_UPPER(ovl_path_type(dentry))) {
		ret = ovl_remove_upper(dentry, is_dir);
		goto drop_write;
	}

	raised_cred = prepare_creds();
	if (!raised_cred) {
		ret = -ENOMEM;
		goto drop_write;
	}

	/*
	 * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
	 * CAP_DAC_OVERRIDE for create in workdir, rename
	 * CAP_FOWNER for removing whiteout from sticky dir
	 * CAP_FSETID for chmod of opaque dir
	 * CAP_CHOWN for chown of opaque dir
	 */
	cap_raise(raised_cred->cap_effective, CAP_SYS_ADMIN);
	cap_raise(raised_cred->cap_effective, CAP_DAC_OVERRIDE);
	cap_raise(raised_cred->cap_effective, CAP_FOWNER);
	cap_raise(raised_cred->cap_effective, CAP_FSETID);
	cap_raise(raised_cred->cap_effective, CAP_CHOWN);

	saved_cred = override_creds(raised_cred);
	ret = ovl_remove_and_whiteout(dentry, is_dir);
	revert_creds(saved_cred);
	put_cred(raised_cred);

drop_write:
	ovl_drop_write(dentry);
	return ret;
}
static int ovl_unlink(struct inode *dir, struct dentry *dentry)
{
return ovl_do_remove(dentry, false);
}
static int ovl_rmdir(struct inode *dir, struct dentry *dentry)
{
return ovl_do_remove(dentry, true);
}
/*
 * ->rename2: rename (or, with RENAME_EXCHANGE, swap) @old and @new.
 *
 * Only RENAME_EXCHANGE and RENAME_NOREPLACE are accepted (-EINVAL
 * otherwise); RENAME_NOREPLACE is masked off before the rename on the upper
 * layer.  Directories that are merged or lower-only are not renamed
 * (-EXDEV).  If both names already refer to the same underlying inode the
 * function returns 0 without doing anything.
 *
 * @old, @new's parent and (for exchange) @new are copied up, then the
 * rename is performed between the upper dentries while holding the rename
 * locks of the two upper parent directories.  @olddir and @newdir are
 * unused.
 */
static int ovl_rename2(struct inode *olddir, struct dentry *old,
struct inode *newdir, struct dentry *new,
unsigned int flags)
{
int err;
enum ovl_path_type old_type;
enum ovl_path_type new_type;
struct dentry *old_upperdir;
struct dentry *new_upperdir;
struct dentry *olddentry;
struct dentry *newdentry;
struct dentry *trap;
bool old_opaque;
bool new_opaque;
bool new_create = false;
bool cleanup_whiteout = false;
bool overwrite = !(flags & RENAME_EXCHANGE);
bool is_dir = d_is_dir(old);
bool new_is_dir = false;
struct dentry *opaquedir = NULL;
const struct cred *old_cred = NULL;
struct cred *override_cred = NULL;
err = -EINVAL;
if (flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE))
goto out;
/* NOREPLACE is not passed on to the rename on the upper layer */
flags &= ~RENAME_NOREPLACE;
err = ovl_check_sticky(old);
if (err)
goto out;
/* Don't copy up directory trees */
old_type = ovl_path_type(old);
err = -EXDEV;
if (OVL_TYPE_MERGE_OR_LOWER(old_type) && is_dir)
goto out;
if (new->d_inode) {
err = ovl_check_sticky(new);
if (err)
goto out;
if (d_is_dir(new))
new_is_dir = true;
new_type = ovl_path_type(new);
/* An exchange would have to copy up the target directory tree too */
err = -EXDEV;
if (!overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir)
goto out;
/* Same underlying inode on both sides: nothing to do, return 0 */
err = 0;
if (!OVL_TYPE_UPPER(new_type) && !OVL_TYPE_UPPER(old_type)) {
if (ovl_dentry_lower(old)->d_inode ==
ovl_dentry_lower(new)->d_inode)
goto out;
}
if (OVL_TYPE_UPPER(new_type) && OVL_TYPE_UPPER(old_type)) {
if (ovl_dentry_upper(old)->d_inode ==
ovl_dentry_upper(new)->d_inode)
goto out;
}
} else {
/* Negative target: synthesize a type from its opaque state */
if (ovl_dentry_is_opaque(new))
new_type = __OVL_PATH_UPPER;
else
new_type = __OVL_PATH_UPPER | __OVL_PATH_PURE;
}
err = ovl_want_write(old);
if (err)
goto out;
err = ovl_copy_up(old);
if (err)
goto out_drop_write;
err = ovl_copy_up(new->d_parent);
if (err)
goto out_drop_write;
if (!overwrite) {
err = ovl_copy_up(new);
if (err)
goto out_drop_write;
}
/* Here "opaque" means "not pure upper" for the respective side */
old_opaque = !OVL_TYPE_PURE_UPPER(old_type);
new_opaque = !OVL_TYPE_PURE_UPPER(new_type);
if (old_opaque || new_opaque) {
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_drop_write;
/*
* CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
* CAP_DAC_OVERRIDE for create in workdir
* CAP_FOWNER for removing whiteout from sticky dir
* CAP_FSETID for chmod of opaque dir
* CAP_CHOWN for chown of opaque dir
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
cap_raise(override_cred->cap_effective, CAP_FSETID);
cap_raise(override_cred->cap_effective, CAP_CHOWN);
old_cred = override_creds(override_cred);
}
/* Overwriting a merged/lower directory: it must be empty; clear it */
if (overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir) {
opaquedir = ovl_check_empty_and_clear(new);
err = PTR_ERR(opaquedir);
if (IS_ERR(opaquedir)) {
/* reset so the final dput(opaquedir) sees NULL, not an ERR_PTR */
opaquedir = NULL;
goto out_revert_creds;
}
}
if (overwrite) {
if (old_opaque) {
if (new->d_inode || !new_opaque) {
/* Whiteout source */
flags |= RENAME_WHITEOUT;
} else {
/* Switch whiteouts */
flags |= RENAME_EXCHANGE;
}
} else if (is_dir && !new->d_inode && new_opaque) {
/* exchange with the entry at the target name, removed afterwards */
flags |= RENAME_EXCHANGE;
cleanup_whiteout = true;
}
}
old_upperdir = ovl_dentry_upper(old->d_parent);
new_upperdir = ovl_dentry_upper(new->d_parent);
trap = lock_rename(new_upperdir, old_upperdir);
olddentry = ovl_dentry_upper(old);
newdentry = ovl_dentry_upper(new);
if (newdentry) {
if (opaquedir) {
/* ownership of the opaquedir reference moves to newdentry */
newdentry = opaquedir;
opaquedir = NULL;
} else {
dget(newdentry);
}
} else {
new_create = true;
newdentry = lookup_one_len(new->d_name.name, new_upperdir,
new->d_name.len);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out_unlock;
}
/* Both upper dentries must still be in the locked parents, and not the trap */
err = -ESTALE;
if (olddentry->d_parent != old_upperdir)
goto out_dput;
if (newdentry->d_parent != new_upperdir)
goto out_dput;
if (olddentry == trap)
goto out_dput;
if (newdentry == trap)
goto out_dput;
/* Set the opaque xattr before the rename where the moved dir needs it */
if (is_dir && !old_opaque && new_opaque) {
err = ovl_set_opaque(olddentry);
if (err)
goto out_dput;
}
if (!overwrite && new_is_dir && old_opaque && !new_opaque) {
err = ovl_set_opaque(newdentry);
if (err)
goto out_dput;
}
if (old_opaque || new_opaque) {
err = ovl_do_rename(old_upperdir->d_inode, olddentry,
new_upperdir->d_inode, newdentry,
flags);
} else {
/* No debug for the plain case */
BUG_ON(flags & ~RENAME_EXCHANGE);
err = vfs_rename(old_upperdir->d_inode, olddentry,
new_upperdir->d_inode, newdentry,
NULL, flags);
}
if (err) {
/* Rename failed: undo the opaque xattrs set above */
if (is_dir && !old_opaque && new_opaque)
ovl_remove_opaque(olddentry);
if (!overwrite && new_is_dir && old_opaque && !new_opaque)
ovl_remove_opaque(newdentry);
goto out_dput;
}
/* Rename succeeded: drop opaque xattrs that are no longer wanted */
if (is_dir && old_opaque && !new_opaque)
ovl_remove_opaque(olddentry);
if (!overwrite && new_is_dir && !old_opaque && new_opaque)
ovl_remove_opaque(newdentry);
if (old_opaque != new_opaque) {
ovl_dentry_set_opaque(old, new_opaque);
if (!overwrite)
ovl_dentry_set_opaque(new, old_opaque);
}
if (cleanup_whiteout)
ovl_cleanup(old_upperdir->d_inode, newdentry);
ovl_dentry_version_inc(old->d_parent);
ovl_dentry_version_inc(new->d_parent);
out_dput:
dput(newdentry);
out_unlock:
unlock_rename(new_upperdir, old_upperdir);
out_revert_creds:
if (old_opaque || new_opaque) {
revert_creds(old_cred);
put_cred(override_cred);
}
out_drop_write:
ovl_drop_write(old);
out:
dput(opaquedir);
return err;
}
/*
 * Inode operations for overlay directories; ovl_new_inode() installs this
 * table for S_IFDIR inodes.
 */
const struct inode_operations ovl_dir_inode_operations = {
.lookup = ovl_lookup,
.mkdir = ovl_mkdir,
.symlink = ovl_symlink,
.unlink = ovl_unlink,
.rmdir = ovl_rmdir,
.rename2 = ovl_rename2,
.link = ovl_link,
.setattr = ovl_setattr,
.create = ovl_create,
.mknod = ovl_mknod,
.permission = ovl_permission,
.getattr = ovl_dir_getattr,
.setxattr = ovl_setxattr,
.getxattr = ovl_getxattr,
.listxattr = ovl_listxattr,
.removexattr = ovl_removexattr,
};

View File

@ -0,0 +1,438 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/xattr.h>
#include "overlayfs.h"
/*
 * Copy up @dentry itself, after making sure its parent is copied up.
 *
 * @attr is forwarded to ovl_copy_up_one().  When @no_data is true the
 * stat size handed to ovl_copy_up_one() is forced to 0.
 */
static int ovl_copy_up_last(struct dentry *dentry, struct iattr *attr,
			    bool no_data)
{
	struct dentry *parent = dget_parent(dentry);
	struct path lower;
	struct kstat lower_stat;
	int ret;

	ret = ovl_copy_up(parent);
	if (!ret) {
		ovl_path_lower(dentry, &lower);
		ret = vfs_getattr(&lower, &lower_stat);
	}
	if (!ret) {
		if (no_data)
			lower_stat.size = 0;

		ret = ovl_copy_up_one(parent, dentry, &lower, &lower_stat,
				      attr);
	}

	dput(parent);
	return ret;
}
/*
 * ->setattr: change attributes of an overlay object.
 *
 * The object is always copied up first and the change is then applied to
 * the upper dentry with notify_change() under its i_mutex.  This way a
 * lower-only object goes through exactly the same notify_change() path
 * (including its permission checks) as one that already has an upper
 * dentry, instead of having @attr applied as part of the copy-up itself.
 * This is the same copy-up-then-operate pattern ovl_setxattr() uses.
 *
 * Returns 0 on success or the error from ovl_want_write(), ovl_copy_up()
 * or notify_change().
 */
int ovl_setattr(struct dentry *dentry, struct iattr *attr)
{
	int err;
	struct dentry *upperdentry;

	err = ovl_want_write(dentry);
	if (err)
		goto out;

	err = ovl_copy_up(dentry);
	if (!err) {
		upperdentry = ovl_dentry_upper(dentry);

		mutex_lock(&upperdentry->d_inode->i_mutex);
		err = notify_change(upperdentry, attr, NULL);
		mutex_unlock(&upperdentry->d_inode->i_mutex);
	}
	ovl_drop_write(dentry);
out:
	return err;
}
static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
struct path realpath;
ovl_path_real(dentry, &realpath);
return vfs_getattr(&realpath, stat);
}
int ovl_permission(struct inode *inode, int mask)
{
struct ovl_entry *oe;
struct dentry *alias = NULL;
struct inode *realinode;
struct dentry *realdentry;
bool is_upper;
int err;
if (S_ISDIR(inode->i_mode)) {
oe = inode->i_private;
} else if (mask & MAY_NOT_BLOCK) {
return -ECHILD;
} else {
/*
* For non-directories find an alias and get the info
* from there.
*/
alias = d_find_any_alias(inode);
if (WARN_ON(!alias))
return -ENOENT;
oe = alias->d_fsdata;
}
realdentry = ovl_entry_real(oe, &is_upper);
/* Careful in RCU walk mode */
realinode = ACCESS_ONCE(realdentry->d_inode);
if (!realinode) {
WARN_ON(!(mask & MAY_NOT_BLOCK));
err = -ENOENT;
goto out_dput;
}
if (mask & MAY_WRITE) {
umode_t mode = realinode->i_mode;
/*
* Writes will always be redirected to upper layer, so
* ignore lower layer being read-only.
*
* If the overlay itself is read-only then proceed
* with the permission check, don't return EROFS.
* This will only happen if this is the lower layer of
* another overlayfs.
*
* If upper fs becomes read-only after the overlay was
* constructed return EROFS to prevent modification of
* upper layer.
*/
err = -EROFS;
if (is_upper && !IS_RDONLY(inode) && IS_RDONLY(realinode) &&
(S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
goto out_dput;
}
err = __inode_permission(realinode, mask);
out_dput:
dput(alias);
return err;
}
/*
 * Cookie returned by ovl_follow_link() and consumed by ovl_put_link() when
 * the real inode has a ->put_link operation.
 */
struct ovl_link_data {
/* real dentry whose ->follow_link was called */
struct dentry *realdentry;
/* value returned by the real ->follow_link, handed back to ->put_link */
void *cookie;
};
/*
 * ->follow_link: delegate to the real inode's ->follow_link.
 *
 * If the real inode also has ->put_link, the real dentry and the returned
 * cookie are wrapped in an ovl_link_data for ovl_put_link(); otherwise NULL
 * is returned as the cookie.
 *
 * Returns ERR_PTR(-EPERM) if the real inode has no ->follow_link, the
 * real operation's ERR_PTR on failure, or ERR_PTR(-ENOMEM) if the wrapper
 * cannot be allocated (the real ->put_link is called first in that case).
 */
static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd)
{
	struct dentry *realdentry = ovl_dentry_real(dentry);
	struct inode *realinode = realdentry->d_inode;
	struct ovl_link_data *wrapper;
	void *real_cookie;

	if (WARN_ON(!realinode->i_op->follow_link))
		return ERR_PTR(-EPERM);

	real_cookie = realinode->i_op->follow_link(realdentry, nd);
	if (IS_ERR(real_cookie))
		return real_cookie;

	/* Nothing to release later, so no wrapper is needed. */
	if (!realinode->i_op->put_link)
		return NULL;

	wrapper = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL);
	if (!wrapper) {
		realinode->i_op->put_link(realdentry, nd, real_cookie);
		return ERR_PTR(-ENOMEM);
	}

	wrapper->realdentry = realdentry;
	wrapper->cookie = real_cookie;

	return wrapper;
}
static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
{
struct inode *realinode;
struct ovl_link_data *data = c;
if (!data)
return;
realinode = data->realdentry->d_inode;
realinode->i_op->put_link(data->realdentry, nd, data->cookie);
kfree(data);
}
/*
 * ->readlink: delegate to the real inode's ->readlink after touching the
 * real path's atime.  Returns -EINVAL if the real inode has no ->readlink.
 */
static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
{
	struct path real;
	const struct inode_operations *real_ops;

	ovl_path_real(dentry, &real);
	real_ops = real.dentry->d_inode->i_op;

	if (!real_ops->readlink)
		return -EINVAL;

	touch_atime(&real);

	return real_ops->readlink(real.dentry, buf, bufsiz);
}
/* True if @name starts with the overlay-private prefix OVL_XATTR_PRE_NAME. */
static bool ovl_is_private_xattr(const char *name)
{
	if (strncmp(name, OVL_XATTR_PRE_NAME, OVL_XATTR_PRE_LEN) != 0)
		return false;

	return true;
}
/*
 * ->setxattr: set an xattr on the upper dentry, copying up first.
 * Overlay-private names are refused with -EPERM.
 */
int ovl_setxattr(struct dentry *dentry, const char *name,
		 const void *value, size_t size, int flags)
{
	int ret = ovl_want_write(dentry);

	if (ret)
		return ret;

	if (ovl_is_private_xattr(name)) {
		ret = -EPERM;
	} else {
		ret = ovl_copy_up(dentry);
		if (!ret) {
			struct dentry *upper = ovl_dentry_upper(dentry);

			ret = vfs_setxattr(upper, name, value, size, flags);
		}
	}

	ovl_drop_write(dentry);
	return ret;
}
/*
 * Private xattrs are hidden only for directories whose type has
 * __OVL_PATH_UPPER set and __OVL_PATH_PURE clear.
 */
static bool ovl_need_xattr_filter(struct dentry *dentry,
				  enum ovl_path_type type)
{
	const unsigned int relevant = __OVL_PATH_PURE | __OVL_PATH_UPPER;

	if ((type & relevant) != __OVL_PATH_UPPER)
		return false;

	return S_ISDIR(dentry->d_inode->i_mode);
}
/*
 * ->getxattr: read an xattr from the real dentry.  Where filtering applies,
 * overlay-private names are reported as absent (-ENODATA).
 */
ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
		     void *value, size_t size)
{
	struct path real;
	enum ovl_path_type kind = ovl_path_real(dentry, &real);

	if (ovl_need_xattr_filter(dentry, kind)) {
		if (ovl_is_private_xattr(name))
			return -ENODATA;
	}

	return vfs_getxattr(real.dentry, name, value, size);
}
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
{
struct path realpath;
enum ovl_path_type type = ovl_path_real(dentry, &realpath);
ssize_t res;
int off;
res = vfs_listxattr(realpath.dentry, list, size);
if (res <= 0 || size == 0)
return res;
if (!ovl_need_xattr_filter(dentry, type))
return res;
/* filter out private xattrs */
for (off = 0; off < res;) {
char *s = list + off;
size_t slen = strlen(s) + 1;
BUG_ON(off + slen > res);
if (ovl_is_private_xattr(s)) {
res -= slen;
memmove(s, s + slen, res - off);
} else {
off += slen;
}
}
return res;
}
int ovl_removexattr(struct dentry *dentry, const char *name)
{
int err;
struct path realpath;
enum ovl_path_type type = ovl_path_real(dentry, &realpath);
err = ovl_want_write(dentry);
if (err)
goto out;
err = -ENODATA;
if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
goto out_drop_write;
if (!OVL_TYPE_UPPER(type)) {
err = vfs_getxattr(realpath.dentry, name, NULL, 0);
if (err < 0)
goto out_drop_write;
err = ovl_copy_up(dentry);
if (err)
goto out_drop_write;
ovl_path_upper(dentry, &realpath);
}
err = vfs_removexattr(realpath.dentry, name);
out_drop_write:
ovl_drop_write(dentry);
out:
return err;
}
/*
 * Decide whether opening with @flags requires a copy-up: only for objects
 * without an upper dentry, that are not special files, and that are opened
 * for write or with O_TRUNC.
 */
static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type,
				  struct dentry *realdentry)
{
	bool modifies = (OPEN_FMODE(flags) & FMODE_WRITE) || (flags & O_TRUNC);

	if (OVL_TYPE_UPPER(type) || special_file(realdentry->d_inode->i_mode))
		return false;

	return modifies;
}
static int ovl_dentry_open(struct dentry *dentry, struct file *file,
const struct cred *cred)
{
int err;
struct path realpath;
enum ovl_path_type type;
bool want_write = false;
type = ovl_path_real(dentry, &realpath);
if (!ovl_is_nocopyupw(dentry)) {
if (ovl_open_need_copy_up(file->f_flags, type, realpath.dentry)) {
want_write = true;
err = ovl_want_write(dentry);
if (err)
goto out;
if (file->f_flags & O_TRUNC)
err = ovl_copy_up_last(dentry, NULL, true);
else
err = ovl_copy_up(dentry);
if (err)
goto out_drop_write;
ovl_path_upper(dentry, &realpath);
}
}
err = vfs_open(&realpath, file, cred);
out_drop_write:
if (want_write)
ovl_drop_write(dentry);
out:
return err;
}
/*
 * Inode operations that ovl_new_inode() installs for regular files,
 * sockets, block/character devices and FIFOs.
 */
static const struct inode_operations ovl_file_inode_operations = {
.setattr = ovl_setattr,
.permission = ovl_permission,
.getattr = ovl_getattr,
.setxattr = ovl_setxattr,
.getxattr = ovl_getxattr,
.listxattr = ovl_listxattr,
.removexattr = ovl_removexattr,
.dentry_open = ovl_dentry_open,
};
/*
 * Inode operations that ovl_new_inode() installs for symlinks.  Unlike the
 * file table this one sets no .permission and no .dentry_open.
 */
static const struct inode_operations ovl_symlink_inode_operations = {
.setattr = ovl_setattr,
.follow_link = ovl_follow_link,
.put_link = ovl_put_link,
.readlink = ovl_readlink,
.getattr = ovl_getattr,
.setxattr = ovl_setxattr,
.getxattr = ovl_getxattr,
.listxattr = ovl_listxattr,
.removexattr = ovl_removexattr,
};
/*
 * Allocate an overlay inode on @sb.
 *
 * Only the file-type bits of @mode are stored in i_mode.  The inode gets
 * the operations table matching its type; directories additionally store
 * @oe in i_private.
 *
 * Returns the new inode, or NULL if allocation fails or the file type is
 * not recognised (a warning is emitted in the latter case).
 */
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
			    struct ovl_entry *oe)
{
	struct inode *inode = new_inode(sb);

	if (!inode)
		return NULL;

	mode &= S_IFMT;

	inode->i_ino = get_next_ino();
	inode->i_mode = mode;
	inode->i_flags |= S_NOATIME | S_NOCMTIME;

	switch (mode) {
	case S_IFREG:
	case S_IFSOCK:
	case S_IFBLK:
	case S_IFCHR:
	case S_IFIFO:
		inode->i_op = &ovl_file_inode_operations;
		break;

	case S_IFLNK:
		inode->i_op = &ovl_symlink_inode_operations;
		break;

	case S_IFDIR:
		inode->i_private = oe;
		inode->i_op = &ovl_dir_inode_operations;
		inode->i_fop = &ovl_dir_operations;
		break;

	default:
		WARN(1, "illegal file type: %i\n", mode);
		iput(inode);
		return NULL;
	}

	return inode;
}

View File

@ -0,0 +1,200 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/kernel.h>
struct ovl_entry;
/*
 * Bit flags describing where an overlay dentry's real object lives.  Test
 * them with the OVL_TYPE_*() macros.
 * NOTE(review): the exact meaning of each bit is set by ovl_path_type(),
 * which is not in this header; the per-flag notes are inferred from
 * callers - confirm.
 */
enum ovl_path_type {
/* presumably: the object exists only in the upper layer */
__OVL_PATH_PURE = (1 << 0),
/* presumably: the object has an upper dentry */
__OVL_PATH_UPPER = (1 << 1),
/* presumably: directory merged from upper and lower layers */
__OVL_PATH_MERGE = (1 << 2),
};
#define OVL_TYPE_UPPER(type) ((type) & __OVL_PATH_UPPER)
#define OVL_TYPE_MERGE(type) ((type) & __OVL_PATH_MERGE)
#define OVL_TYPE_PURE_UPPER(type) ((type) & __OVL_PATH_PURE)
#define OVL_TYPE_MERGE_OR_LOWER(type) \
(OVL_TYPE_MERGE(type) || !OVL_TYPE_UPPER(type))
#define OVL_XATTR_PRE_NAME "trusted.overlay."
#define OVL_XATTR_PRE_LEN 16
#define OVL_XATTR_OPAQUE OVL_XATTR_PRE_NAME"opaque"
/* vfs_rmdir() with a pr_debug trace of the result. */
static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry)
{
	int ret;

	ret = vfs_rmdir(dir, dentry);
	pr_debug("rmdir(%pd2) = %i\n", dentry, ret);

	return ret;
}
/* vfs_unlink() with a pr_debug trace of the result. */
static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry)
{
	int ret;

	ret = vfs_unlink(dir, dentry, NULL);
	pr_debug("unlink(%pd2) = %i\n", dentry, ret);

	return ret;
}
/* vfs_link(); the result is traced with pr_debug only when @debug is set. */
static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir,
			      struct dentry *new_dentry, bool debug)
{
	int ret;

	ret = vfs_link(old_dentry, dir, new_dentry, NULL);
	if (!debug)
		return ret;

	pr_debug("link(%pd2, %pd2) = %i\n",
		 old_dentry, new_dentry, ret);
	return ret;
}
/*
 * vfs_create() with want_excl = true; the result is traced with pr_debug
 * only when @debug is set.
 */
static inline int ovl_do_create(struct inode *dir, struct dentry *dentry,
				umode_t mode, bool debug)
{
	int ret;

	ret = vfs_create(dir, dentry, mode, true);
	if (!debug)
		return ret;

	pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, ret);
	return ret;
}
/* vfs_mkdir(); the result is traced with pr_debug only when @debug is set. */
static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry,
			       umode_t mode, bool debug)
{
	int ret;

	ret = vfs_mkdir(dir, dentry, mode);
	if (!debug)
		return ret;

	pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, ret);
	return ret;
}
/* vfs_mknod(); the result is traced with pr_debug only when @debug is set. */
static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry,
			       umode_t mode, dev_t dev, bool debug)
{
	int ret;

	ret = vfs_mknod(dir, dentry, mode, dev);
	if (!debug)
		return ret;

	pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n",
		 dentry, mode, dev, ret);
	return ret;
}
/* vfs_symlink(); the result is traced with pr_debug only when @debug is set. */
static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry,
				 const char *oldname, bool debug)
{
	int ret;

	ret = vfs_symlink(dir, dentry, oldname);
	if (!debug)
		return ret;

	pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, ret);
	return ret;
}
/*
 * vfs_setxattr() with a pr_debug trace of the arguments and result.
 *
 * @value is not required to be NUL-terminated, so it is printed with a
 * precision ("%.*s"), which limits the output to at most @size bytes.  A
 * plain "%*s" would only set a minimum field width and let the formatter
 * read on until it finds a NUL byte.
 */
static inline int ovl_do_setxattr(struct dentry *dentry, const char *name,
				  const void *value, size_t size, int flags)
{
	int err = vfs_setxattr(dentry, name, value, size, flags);

	pr_debug("setxattr(%pd2, \"%s\", \"%.*s\", 0x%x) = %i\n",
		 dentry, name, (int) size, (char *) value, flags, err);
	return err;
}
/* vfs_removexattr() with a pr_debug trace of the result. */
static inline int ovl_do_removexattr(struct dentry *dentry, const char *name)
{
	int ret;

	ret = vfs_removexattr(dentry, name);
	pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, ret);

	return ret;
}
/*
 * vfs_rename() with pr_debug tracing: the arguments are logged before the
 * call, and the error code is logged afterwards only on failure.
 */
static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry,
				struct inode *newdir, struct dentry *newdentry,
				unsigned int flags)
{
	int ret;

	pr_debug("rename2(%pd2, %pd2, 0x%x)\n",
		 olddentry, newdentry, flags);

	ret = vfs_rename(olddir, olddentry, newdir, newdentry, NULL, flags);
	if (!ret)
		return 0;

	pr_debug("...rename2(%pd2, %pd2, ...) = %i\n",
		 olddentry, newdentry, ret);
	return ret;
}
/* vfs_whiteout() with a pr_debug trace of the result. */
static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry)
{
	int ret;

	ret = vfs_whiteout(dir, dentry);
	pr_debug("whiteout(%pd2) = %i\n", dentry, ret);

	return ret;
}
/*
 * Helpers shared between the overlayfs source files.
 * NOTE(review): this first group has no file label; it is presumably
 * defined in super.c - confirm.
 */
bool ovl_is_nocopyupw(struct dentry *dentry);
enum ovl_path_type ovl_path_type(struct dentry *dentry);
u64 ovl_dentry_version_get(struct dentry *dentry);
void ovl_dentry_version_inc(struct dentry *dentry);
void ovl_path_upper(struct dentry *dentry, struct path *path);
void ovl_path_lower(struct dentry *dentry, struct path *path);
enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path);
int ovl_path_next(int idx, struct dentry *dentry, struct path *path);
struct dentry *ovl_dentry_upper(struct dentry *dentry);
struct dentry *ovl_dentry_lower(struct dentry *dentry);
struct dentry *ovl_dentry_real(struct dentry *dentry);
struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper);
struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry);
void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache);
struct dentry *ovl_workdir(struct dentry *dentry);
/* Callers bracket modifications with ovl_want_write() / ovl_drop_write() */
int ovl_want_write(struct dentry *dentry);
void ovl_drop_write(struct dentry *dentry);
bool ovl_dentry_is_opaque(struct dentry *dentry);
void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque);
bool ovl_is_whiteout(struct dentry *dentry);
void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry);
struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags);
struct file *ovl_path_open(struct path *path, int flags);
struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry,
struct kstat *stat, const char *link);
/* readdir.c */
extern const struct file_operations ovl_dir_operations;
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list);
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list);
void ovl_cache_free(struct list_head *list);
/* inode.c */
int ovl_setattr(struct dentry *dentry, struct iattr *attr);
int ovl_permission(struct inode *inode, int mask);
int ovl_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags);
ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
void *value, size_t size);
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
int ovl_removexattr(struct dentry *dentry, const char *name);
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
struct ovl_entry *oe);
/* Copy the owner (uid and gid) from @from to @to. */
static inline void ovl_copyattr(struct inode *from, struct inode *to)
{
	to->i_gid = from->i_gid;
	to->i_uid = from->i_uid;
}
/* dir.c */
extern const struct inode_operations ovl_dir_inode_operations;
struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry);
/* @debug enables the pr_debug trace in the ovl_do_*() helper that is used */
int ovl_create_real(struct inode *dir, struct dentry *newdentry,
struct kstat *stat, const char *link,
struct dentry *hardlink, bool debug);
void ovl_cleanup(struct inode *dir, struct dentry *dentry);
/* copy_up.c */
int ovl_copy_up(struct dentry *dentry);
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
struct path *lowerpath, struct kstat *stat,
struct iattr *attr);
int ovl_copy_xattr(struct dentry *old, struct dentry *new);
int ovl_set_attr(struct dentry *upper, struct kstat *stat);

View File

@ -0,0 +1,557 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/file.h>
#include <linux/xattr.h>
#include <linux/rbtree.h>
#include <linux/security.h>
#include <linux/cred.h>
#include "overlayfs.h"
/* One directory entry collected while reading the layers of a directory. */
struct ovl_cache_entry {
/* length of name, not counting the terminating NUL */
unsigned int len;
/* d_type value handed to the fill callback for this entry */
unsigned int type;
/* inode number handed to the fill callback for this entry */
u64 ino;
/* linkage in the ordered entry list (cache list or caller-supplied list) */
struct list_head l_node;
/* linkage in the name-keyed rb-tree used to detect duplicate names */
struct rb_node node;
/* set from ovl_is_whiteout() for DT_CHR entries; false otherwise */
bool is_whiteout;
/* NUL-terminated copy of the entry name (flexible array member) */
char name[];
};
/* Merged listing of a directory, shared between open files of that dentry. */
struct ovl_dir_cache {
/* number of users; bumped in ovl_cache_get(), dropped in ovl_cache_put() */
long refcount;
/* value of ovl_dentry_version_get() when the cache was filled */
u64 version;
/* list of struct ovl_cache_entry, linked through l_node */
struct list_head entries;
};
/* State carried through iterate_dir() while collecting one layer's entries. */
struct ovl_readdir_data {
/* embedded context; ovl_fill_merge() recovers this struct via container_of */
struct dir_context ctx;
/* false: insert via the rb-tree; true: use the ovl_fill_lower() path */
bool is_merge;
/* rb-tree of names seen so far, keyed by name */
struct rb_root root;
/* destination list for new entries */
struct list_head *list;
/* temporary list anchor that ovl_fill_lower() appends to */
struct list_head middle;
/* dentry of the directory currently being read (used for whiteout lookup) */
struct dentry *dir;
/* number of entries reported during the current iterate_dir() call */
int count;
/* error recorded by the fill callback, 0 if none */
int err;
};
/* Per-open-file state of an overlay directory (stored in file->private_data). */
struct ovl_dir_file {
/* true: readdir/llseek are passed straight through to realfile */
bool is_real;
/* set at open time from OVL_TYPE_UPPER() of the path type */
bool is_upper;
/* merged listing in use by this file, or NULL if not built yet */
struct ovl_dir_cache *cache;
/* current position inside cache->entries */
struct list_head *cursor;
/* real directory opened by ovl_dir_open() */
struct file *realfile;
/* upper directory opened lazily by ovl_dir_fsync(); NULL until then */
struct file *upperfile;
};
/* Map an rb-tree node back to the cache entry that embeds it. */
static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n)
{
	return rb_entry(n, struct ovl_cache_entry, node);
}
/*
 * Look up @name (of length @len) in the name-keyed rb-tree @root.
 *
 * Returns the matching entry, or NULL when the tree has no entry with
 * that name.
 */
static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root,
						    const char *name, int len)
{
	struct rb_node *walk;

	for (walk = root->rb_node; walk; ) {
		struct ovl_cache_entry *entry = ovl_cache_entry_from_node(walk);
		int order = strncmp(name, entry->name, len);

		/*
		 * Equal prefix is only a hit when the stored name is not
		 * longer than the one we are looking for.
		 */
		if (order == 0 && !(len < entry->len))
			return entry;

		walk = (order > 0) ? entry->node.rb_right
				   : entry->node.rb_left;
	}

	return NULL;
}
/*
 * Allocate a cache entry for @name/@len with the given @ino and @d_type.
 *
 * For character-device entries the name is additionally looked up under
 * @dir (with CAP_DAC_OVERRIDE raised) to record whether it is a whiteout.
 *
 * Returns the new entry, or NULL if an allocation failed.
 */
static struct ovl_cache_entry *ovl_cache_entry_new(struct dentry *dir,
						   const char *name, int len,
						   u64 ino, unsigned int d_type)
{
	struct ovl_cache_entry *entry;
	struct dentry *child;
	const struct cred *saved_cred;
	struct cred *lookup_cred;
	size_t bytes = offsetof(struct ovl_cache_entry, name[len + 1]);

	entry = kmalloc(bytes, GFP_KERNEL);
	if (!entry)
		return NULL;

	memcpy(entry->name, name, len);
	entry->name[len] = '\0';
	entry->len = len;
	entry->type = d_type;
	entry->ino = ino;
	entry->is_whiteout = false;

	/* Only character devices can be whiteouts; nothing more to do. */
	if (d_type != DT_CHR)
		return entry;

	lookup_cred = prepare_creds();
	if (!lookup_cred) {
		kfree(entry);
		return NULL;
	}

	/* CAP_DAC_OVERRIDE for lookup */
	cap_raise(lookup_cred->cap_effective, CAP_DAC_OVERRIDE);
	saved_cred = override_creds(lookup_cred);

	child = lookup_one_len(name, dir, len);
	if (!IS_ERR(child)) {
		entry->is_whiteout = ovl_is_whiteout(child);
		dput(child);
	}

	revert_creds(saved_cred);
	put_cred(lookup_cred);

	return entry;
}
/*
 * Insert @name into the rb-tree and the tail of the entry list of @rdd,
 * unless an entry with that name is already present.
 *
 * Returns 0 on success (including "already present"), -ENOMEM if the new
 * entry could not be allocated.
 */
static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
				  const char *name, int len, u64 ino,
				  unsigned int d_type)
{
	struct rb_node **link = &rdd->root.rb_node;
	struct rb_node *parent = NULL;
	struct ovl_cache_entry *fresh;

	while (*link) {
		struct ovl_cache_entry *cur = ovl_cache_entry_from_node(*link);
		int order = strncmp(name, cur->name, len);

		parent = *link;

		/* Name already known: keep the existing entry. */
		if (order == 0 && !(len < cur->len))
			return 0;

		link = (order > 0) ? &cur->node.rb_right
				   : &cur->node.rb_left;
	}

	fresh = ovl_cache_entry_new(rdd->dir, name, len, ino, d_type);
	if (fresh == NULL)
		return -ENOMEM;

	list_add_tail(&fresh->l_node, rdd->list);
	rb_link_node(&fresh->node, parent, link);
	rb_insert_color(&fresh->node, &rdd->root);

	return 0;
}
/*
 * Fill callback body used once rdd->is_merge is set.
 *
 * A name already present in the rb-tree is moved to the tail of the
 * "middle" list; an unknown name gets a new entry appended there.
 *
 * Returns rdd->err, which is set to -ENOMEM if allocation failed.
 */
static int ovl_fill_lower(struct ovl_readdir_data *rdd,
			  const char *name, int namelen,
			  loff_t offset, u64 ino, unsigned int d_type)
{
	struct ovl_cache_entry *entry;

	entry = ovl_cache_entry_find(&rdd->root, name, namelen);
	if (entry) {
		list_move_tail(&entry->l_node, &rdd->middle);
		return rdd->err;
	}

	entry = ovl_cache_entry_new(rdd->dir, name, namelen, ino, d_type);
	if (entry != NULL)
		list_add_tail(&entry->l_node, &rdd->middle);
	else
		rdd->err = -ENOMEM;

	return rdd->err;
}
void ovl_cache_free(struct list_head *list)
{
struct ovl_cache_entry *p;
struct ovl_cache_entry *n;
list_for_each_entry_safe(p, n, list, l_node)
kfree(p);
INIT_LIST_HEAD(list);
}
/*
 * Drop the reference @od holds on its directory cache.  When the last
 * reference goes away the cache is detached from @dentry (if it is still
 * the one installed there) and freed together with its entries.
 */
static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry)
{
	struct ovl_dir_cache *cache = od->cache;

	WARN_ON(cache->refcount <= 0);

	if (--cache->refcount)
		return;

	if (ovl_dir_cache(dentry) == cache)
		ovl_set_dir_cache(dentry, NULL);

	ovl_cache_free(&cache->entries);
	kfree(cache);
}
/*
 * dir_context actor: count the entry and hand it to the handler selected
 * by rdd->is_merge.
 */
static int ovl_fill_merge(struct dir_context *ctx, const char *name,
			  int namelen, loff_t offset, u64 ino,
			  unsigned int d_type)
{
	struct ovl_readdir_data *rdd;

	rdd = container_of(ctx, struct ovl_readdir_data, ctx);
	rdd->count++;

	if (rdd->is_merge)
		return ovl_fill_lower(rdd, name, namelen, offset, ino, d_type);

	return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type);
}
/*
 * Open the directory at @realpath and feed all of its entries through
 * @rdd's actor.  iterate_dir() is called repeatedly until it reports an
 * error, the actor records one, or a pass yields no entries.
 *
 * Returns 0 on success or a negative errno.
 */
static inline int ovl_dir_read(struct path *realpath,
			       struct ovl_readdir_data *rdd)
{
	struct file *dirfile;
	int err;

	dirfile = ovl_path_open(realpath, O_RDONLY | O_DIRECTORY);
	if (IS_ERR(dirfile))
		return PTR_ERR(dirfile);

	rdd->dir = realpath->dentry;
	rdd->ctx.pos = 0;

	for (;;) {
		rdd->count = 0;
		rdd->err = 0;

		err = iterate_dir(dirfile, &rdd->ctx);
		if (err >= 0)
			err = rdd->err;

		if (err || !rdd->count)
			break;
	}

	fput(dirfile);
	return err;
}
/*
 * Called when a directory file is at position 0: drop a cache whose
 * version no longer matches the dentry, and stop passing through to the
 * real file if the directory has become a merged one.
 */
static void ovl_dir_reset(struct file *file)
{
	struct ovl_dir_file *od = file->private_data;
	struct dentry *dentry = file->f_path.dentry;
	struct ovl_dir_cache *cache = od->cache;
	enum ovl_path_type type = ovl_path_type(dentry);

	if (cache && cache->version != ovl_dentry_version_get(dentry)) {
		ovl_cache_put(od, dentry);
		od->cursor = NULL;
		od->cache = NULL;
	}

	WARN_ON(!od->is_real && !OVL_TYPE_MERGE(type));

	if (OVL_TYPE_MERGE(type) && od->is_real)
		od->is_real = false;
}
/*
 * Read every layer of @dentry, as enumerated by ovl_path_next(), into
 * @list.  All layers but the last are inserted through the rb-tree; the
 * last one is read in merge mode.
 *
 * Returns 0 on success or the first error reported by ovl_dir_read().
 */
static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list)
{
	struct ovl_readdir_data rdd = {
		.ctx.actor = ovl_fill_merge,
		.list = list,
		.root = RB_ROOT,
		.is_merge = false,
	};
	struct path realpath;
	int err = 0;
	int idx = 0;

	while (idx != -1) {
		int next = ovl_path_next(idx, dentry, &realpath);

		if (next == -1) {
			/*
			 * Insert lowest layer entries before upper ones, this
			 * allows offsets to be reasonably constant
			 */
			list_add(&rdd.middle, rdd.list);
			rdd.is_merge = true;
			err = ovl_dir_read(&realpath, &rdd);
			list_del(&rdd.middle);
		} else {
			err = ovl_dir_read(&realpath, &rdd);
			if (err)
				break;
		}

		idx = next;
	}

	return err;
}
/*
 * Point od->cursor at the @pos-th node of the cached entry list, or at
 * the list head itself when the list has fewer than @pos nodes.
 */
static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos)
{
	struct list_head *head = &od->cache->entries;
	struct list_head *cur = head->next;
	loff_t skipped = 0;

	while (cur != head && skipped < pos) {
		cur = cur->next;
		skipped++;
	}

	/* Cursor is safe since the cache is stable */
	od->cursor = cur;
}
/*
 * Get a referenced directory cache for @dentry.
 *
 * The cache installed on the dentry is reused when its version is still
 * current; otherwise a new one is built from all layers and installed.
 *
 * Returns the cache or an ERR_PTR() on failure.
 */
static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
{
	struct ovl_dir_cache *cache = ovl_dir_cache(dentry);
	int err;

	if (cache && cache->version == ovl_dentry_version_get(dentry)) {
		cache->refcount++;
		return cache;
	}

	/* Stale or missing: detach whatever is installed and rebuild. */
	ovl_set_dir_cache(dentry, NULL);

	cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL);
	if (!cache)
		return ERR_PTR(-ENOMEM);

	cache->refcount = 1;
	INIT_LIST_HEAD(&cache->entries);

	err = ovl_dir_read_merged(dentry, &cache->entries);
	if (err) {
		ovl_cache_free(&cache->entries);
		kfree(cache);
		return ERR_PTR(err);
	}

	cache->version = ovl_dentry_version_get(dentry);
	ovl_set_dir_cache(dentry, cache);

	return cache;
}
/*
 * ->iterate handler for overlay directories.
 *
 * Pass-through directories are delegated to the real file.  Merged ones
 * are served from the directory cache, skipping whiteouts; ctx->pos
 * counts cache entries, including the skipped ones.
 */
static int ovl_iterate(struct file *file, struct dir_context *ctx)
{
	struct ovl_dir_file *od = file->private_data;
	struct dentry *dentry = file->f_path.dentry;
	struct ovl_cache_entry *entry;

	if (!ctx->pos)
		ovl_dir_reset(file);

	if (od->is_real)
		return iterate_dir(od->realfile, ctx);

	if (!od->cache) {
		struct ovl_dir_cache *cache = ovl_cache_get(dentry);

		if (IS_ERR(cache))
			return PTR_ERR(cache);

		od->cache = cache;
		ovl_seek_cursor(od, ctx->pos);
	}

	for (; od->cursor != &od->cache->entries; ctx->pos++) {
		entry = list_entry(od->cursor, struct ovl_cache_entry, l_node);

		/* Stop without advancing when the caller's buffer is full. */
		if (!entry->is_whiteout &&
		    !dir_emit(ctx, entry->name, entry->len, entry->ino,
			      entry->type))
			break;

		od->cursor = entry->l_node.next;
	}

	return 0;
}
/*
 * ->llseek handler for overlay directories.
 *
 * Pass-through directories seek on the real file and mirror its position.
 * Merged directories accept only SEEK_SET and SEEK_CUR with a non-negative
 * result, and reposition the cache cursor when a cache exists.
 *
 * Returns the new position or a negative errno.
 */
static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin)
{
	struct ovl_dir_file *od = file->private_data;
	struct inode *inode = file_inode(file);
	loff_t res;

	mutex_lock(&inode->i_mutex);

	if (!file->f_pos)
		ovl_dir_reset(file);

	if (od->is_real) {
		res = vfs_llseek(od->realfile, offset, origin);
		file->f_pos = od->realfile->f_pos;
		goto out_unlock;
	}

	res = -EINVAL;

	if (origin == SEEK_CUR)
		offset += file->f_pos;
	else if (origin != SEEK_SET)
		goto out_unlock;

	if (offset < 0)
		goto out_unlock;

	if (offset != file->f_pos) {
		file->f_pos = offset;
		if (od->cache)
			ovl_seek_cursor(od, offset);
	}
	res = offset;

out_unlock:
	mutex_unlock(&inode->i_mutex);
	return res;
}
/*
 * ->fsync handler for overlay directories.
 *
 * Syncs od->realfile, unless the file was not opened as upper and the
 * dentry now has an upper type; in that case the upper directory is
 * opened on first use, remembered in od->upperfile and synced instead.
 *
 * Returns the result of vfs_fsync_range(), or the error from opening the
 * upper directory.
 */
static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
int datasync)
{
struct ovl_dir_file *od = file->private_data;
struct dentry *dentry = file->f_path.dentry;
struct file *realfile = od->realfile;
/*
* Need to check if we started out being a lower dir, but got copied up
*/
if (!od->is_upper && OVL_TYPE_UPPER(ovl_path_type(dentry))) {
struct inode *inode = file_inode(file);
/* Unlocked fast path: reuse an upper file that was already opened. */
realfile = lockless_dereference(od->upperfile);
if (!realfile) {
struct path upperpath;
ovl_path_upper(dentry, &upperpath);
/* Opened before taking i_mutex; the result is checked under the lock. */
realfile = ovl_path_open(&upperpath, O_RDONLY);
/* NOTE(review): presumably orders the open against the od->upperfile store below - confirm */
smp_mb__before_spinlock();
mutex_lock(&inode->i_mutex);
if (!od->upperfile) {
if (IS_ERR(realfile)) {
mutex_unlock(&inode->i_mutex);
return PTR_ERR(realfile);
}
od->upperfile = realfile;
} else {
/* somebody has beaten us to it */
if (!IS_ERR(realfile))
fput(realfile);
realfile = od->upperfile;
}
mutex_unlock(&inode->i_mutex);
}
}
return vfs_fsync_range(realfile, start, end, datasync);
}
/*
 * ->release handler: drop the cache reference (under i_mutex), release
 * the real file and, if one was opened, the upper file, then free the
 * per-file state.  Always returns 0.
 */
static int ovl_dir_release(struct inode *inode, struct file *file)
{
	struct ovl_dir_file *dirfile = file->private_data;

	if (dirfile->cache) {
		mutex_lock(&inode->i_mutex);
		ovl_cache_put(dirfile, file->f_path.dentry);
		mutex_unlock(&inode->i_mutex);
	}

	fput(dirfile->realfile);

	if (dirfile->upperfile)
		fput(dirfile->upperfile);

	kfree(dirfile);

	return 0;
}
/*
 * ->open handler: allocate the per-file state, open the real directory
 * behind the overlay dentry and record whether the directory is served
 * pass-through and whether it is an upper one.
 *
 * Returns 0 on success, -ENOMEM or the error from opening the real file.
 */
static int ovl_dir_open(struct inode *inode, struct file *file)
{
	struct ovl_dir_file *dirfile;
	struct file *real;
	struct path realpath;
	enum ovl_path_type type;

	dirfile = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL);
	if (!dirfile)
		return -ENOMEM;

	type = ovl_path_real(file->f_path.dentry, &realpath);

	real = ovl_path_open(&realpath, file->f_flags);
	if (IS_ERR(real)) {
		kfree(dirfile);
		return PTR_ERR(real);
	}

	dirfile->realfile = real;
	dirfile->is_real = !OVL_TYPE_MERGE(type);
	dirfile->is_upper = OVL_TYPE_UPPER(type);
	file->private_data = dirfile;

	return 0;
}
/*
 * File operations for overlay directories.  Everything except .read is
 * implemented in this file; .read uses generic_read_dir.
 */
const struct file_operations ovl_dir_operations = {
.read = generic_read_dir,
.open = ovl_dir_open,
.iterate = ovl_iterate,
.llseek = ovl_dir_llseek,
.fsync = ovl_dir_fsync,
.release = ovl_dir_release,
};
/*
 * Read the merged contents of @dentry into @list and check that nothing
 * but whiteouts, "." and ".." is present.
 *
 * Returns 0 if the directory is empty, -ENOTEMPTY if it is not, or the
 * error from reading the directory.  @list keeps the entries that were
 * read.
 */
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list)
{
	struct ovl_cache_entry *entry;
	int err;

	err = ovl_dir_read_merged(dentry, list);
	if (err)
		return err;

	list_for_each_entry(entry, list, l_node) {
		if (entry->is_whiteout)
			continue;

		/* "." */
		if (entry->name[0] == '.' && entry->len == 1)
			continue;

		/* ".." */
		if (entry->name[0] == '.' && entry->len == 2 &&
		    entry->name[1] == '.')
			continue;

		return -ENOTEMPTY;
	}

	return 0;
}
/*
 * For every whiteout entry on @list, look the name up under @upper and
 * pass it to ovl_cleanup().  Lookup failures are logged and skipped.
 * Runs with the i_mutex of @upper held (I_MUTEX_CHILD nesting).
 */
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
{
	struct ovl_cache_entry *entry;

	mutex_lock_nested(&upper->d_inode->i_mutex, I_MUTEX_CHILD);

	list_for_each_entry(entry, list, l_node) {
		struct dentry *victim;

		if (!entry->is_whiteout)
			continue;

		victim = lookup_one_len(entry->name, upper, entry->len);
		if (!IS_ERR(victim)) {
			ovl_cleanup(upper->d_inode, victim);
			dput(victim);
			continue;
		}

		pr_err("overlayfs: lookup '%s/%.*s' failed (%i)\n",
		       upper->d_name.name, entry->len, entry->name,
		       (int) PTR_ERR(victim));
	}

	mutex_unlock(&upper->d_inode->i_mutex);
}

File diff suppressed because it is too large Load Diff

View File

@ -1,488 +0,0 @@
/**
* \file procfs.c
* License details are found in the file LICENSE.
* \brief
* mcctrl procfs
* \author Naoki Hamada <nao@axe.bz> \par
* Copyright (C) 2014 AXE, Inc.
*/
/*
* HISTORY:
*/
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/proc_fs.h>
#include <linux/list.h>
#include <linux/uaccess.h>
#include <linux/fs.h>
#include <linux/resource.h>
#include "mcctrl.h"
#include <linux/version.h>
//#define PROCFS_DEBUG
#ifdef PROCFS_DEBUG
#define dprintk(...) printk(__VA_ARGS__)
#else
#define dprintk(...)
#endif
static DECLARE_WAIT_QUEUE_HEAD(procfsq);
static ssize_t mckernel_procfs_read(struct file *file, char __user *buf,
size_t nbytes, loff_t *ppos);
/* A private data for the procfs driver. */
/* Forward declaration: the structure refers to itself through parent. */
struct procfs_list_entry;
/* Bookkeeping record for one procfs file or directory created by mcctrl. */
struct procfs_list_entry {
/* linkage in procfs_file_list */
struct list_head list;
/* the proc entry created for this record */
struct proc_dir_entry *entry;
/* record of the containing directory, NULL if the name has no '/' */
struct procfs_list_entry *parent;
/* OS handle stored by procfs_create(); passed to mcctrl_ikc_send() on read */
ihk_os_t os;
/* OS number given at creation; matched by procfs_delete()/procfs_exit() */
int osnum;
/* pid stored by procfs_create() */
int pid;
/* CPU the read request is sent to; updated when a read is redirected */
int cpu;
/* full path name of the entry; used as the lookup key */
char fname[PROCFS_NAME_MAX];
};
/*
* In the procfs_file_list, mckernel procfs files are
* listed in such a manner that a leaf file is always
* located nearer to the top of the list than its parent
* node file.
*/
LIST_HEAD(procfs_file_list);
static ihk_spinlock_t procfs_file_list_lock;
/*
 * llseek handler for McKernel procfs files.
 *
 * Supports whence 0 (absolute) and 1 (relative to the current position);
 * anything else yields -EINVAL.  Returns the new file position.
 */
loff_t mckernel_procfs_lseek(struct file *file, loff_t offset, int orig)
{
	if (orig == 0)
		file->f_pos = offset;
	else if (orig == 1)
		file->f_pos += offset;
	else
		return -EINVAL;

	return file->f_pos;
}
/* File operations for McKernel procfs files: seek and read only, no write handler. */
static const struct file_operations mckernel_procfs_file_operations = {
.llseek = mckernel_procfs_lseek,
.read = mckernel_procfs_read,
.write = NULL,
};
/**
* \brief Return specified procfs entry.
*
* \param p a name of the procfs file
* \param osnum os number
* \param mode if zero create a directory otherwise a file
*
* return value: NULL: Something wrong has occurred.
* otherwise: address of the proc_dir_entry structure of the procfs file
*
* p should not be NULL nor terminated by "/".
*
* We create a procfs entry if there is not already one.
* This process is recursive to the root of the procfs tree.
*/
/*
* XXX: Two or more entries which have same name can be created.
*
* get_procfs_list_entry() avoids creating an entry which has already been created.
* But, it allows creating an entry which is being created by another thread.
*
* This problem occurred when two requests which created files with a common
* ancestor directory which was not explicitly created were racing.
*/
static struct procfs_list_entry *get_procfs_list_entry(char *p, int osnum, int mode)
{
char *r;
struct proc_dir_entry *pde = NULL;
struct procfs_list_entry *e, *ret = NULL, *parent = NULL;
char name[PROCFS_NAME_MAX];
unsigned long irqflags;
dprintk("get_procfs_list_entry: %s for osnum %d mode %o\n", p, osnum, mode);
irqflags = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
list_for_each_entry(e, &procfs_file_list, list) {
if (e == NULL) {
kprintf("ERROR: The procfs_file_list has a null entry.\n");
return NULL;
}
if (strncmp(e->fname, p, PROCFS_NAME_MAX) == 0) {
/* We found the entry */
ret = e;
break;
}
}
ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflags);
if (ret != NULL) {
return ret;
}
r = strrchr(p, '/');
if (r != NULL) {
/* We have non-null parent dir. */
strncpy(name, p, r - p);
name[r - p] = '\0';
parent = get_procfs_list_entry(name, osnum, 0);
if (parent == NULL) {
/* We counld not get a parent procfs entry. Give up.*/
return NULL;
}
}
ret = kmalloc(sizeof(struct procfs_list_entry), GFP_KERNEL);
if (ret == NULL) {
kprintf("ERROR: not enough memory to create PROCFS entry.\n");
return NULL;
}
/* Fill the fname field of the entry */
strncpy(ret->fname, p, PROCFS_NAME_MAX);
if (r != NULL) {
strncpy(name, r + 1, p + PROCFS_NAME_MAX - r - 1);
} else {
strncpy(name, p, PROCFS_NAME_MAX);
}
if (mode == 0) {
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
pde = proc_mkdir(name, parent ? parent->entry : NULL);
#else
pde = proc_mkdir_data(name, 0555, parent ? parent->entry : NULL, ret);
#endif
} else {
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
pde = create_proc_entry(name, mode, parent->entry);
if (pde)
pde->proc_fops = &mckernel_procfs_file_operations;
#else
pde = proc_create_data(name, mode, parent->entry,
&mckernel_procfs_file_operations, ret);
#endif
}
if (pde == NULL) {
kprintf("ERROR: cannot create a PROCFS entry for %s.\n", p);
kfree(ret);
return NULL;
}
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
pde->data = ret;
#endif
ret->osnum = osnum;
ret->entry = pde;
ret->parent = parent;
irqflags = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
list_add(&(ret->list), &procfs_file_list);
ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflags);
dprintk("get_procfs_list_entry: %s done\n", p);
return ret;
}
/**
 * \brief Create a procfs entry on request of McKernel.
 *
 * The request (file name and mode) is read from the procfs_file structure
 * at \a arg.  Its status field is set to 1 once the request has been
 * handled, whether or not the entry could be created.
 *
 * \param __os (opaque) os variable
 * \param ref cpuid of the requesting mckernel process
 * \param osnum osnum of the requesting mckernel process
 * \param pid pid of the requesting mckernel process
 * \param arg sent argument
 */
void procfs_create(void *__os, int ref, int osnum, int pid, unsigned long arg)
{
	ihk_device_t dev = ihk_os_to_dev(__os);
	struct procfs_list_entry *entry;
	struct procfs_file *req;
	unsigned long phys;
	char path[PROCFS_NAME_MAX];
	int mode;

	dprintk("procfs_create: osnum: %d, cpu: %d, pid: %d\n", osnum, ref, pid);

	phys = ihk_device_map_memory(dev, arg, sizeof(struct procfs_file));
	req = ihk_device_map_virtual(dev, phys, sizeof(struct procfs_file), NULL, 0);

	dprintk("name: %s mode: %o\n", req->fname, req->mode);

	/* Work on a private copy of the request. */
	strncpy(path, req->fname, PROCFS_NAME_MAX);
	mode = req->mode;

	if (path[PROCFS_NAME_MAX - 1] != '\0') {
		printk("ERROR: procfs_creat: file name not properly terminated.\n");
	} else {
		entry = get_procfs_list_entry(path, osnum, mode);
		if (entry == NULL) {
			printk("ERROR: could not create a procfs entry for %s.\n", path);
		} else {
			entry->os = __os;
			entry->cpu = ref;
			entry->pid = pid;
		}
	}

	req->status = 1; /* Now the peer can free the data. */
	ihk_device_unmap_virtual(dev, req, sizeof(struct procfs_file));
	ihk_device_unmap_memory(dev, phys, sizeof(struct procfs_file));

	dprintk("procfs_create: done\n");
}
/**
 * \brief Delete a procfs entry.
 *
 * The name of the entry is read from the procfs_file structure at \a arg.
 * The first list entry with that name and \a osnum is unlinked, its proc
 * node is removed and the record is freed.  The status field of the
 * request is set to 1 afterwards, whether or not an entry was found.
 *
 * \param __os (opaque) os variable
 * \param osnum os number
 * \param arg sent argument
 */
void procfs_delete(void *__os, int osnum, unsigned long arg)
{
	ihk_device_t dev = ihk_os_to_dev(__os);
	unsigned long parg;
	struct procfs_file *f;
	struct procfs_list_entry *e;
	struct procfs_list_entry *parent = NULL;
	char *leaf;
	unsigned long irqflags;

	dprintk("procfs_delete: \n");
	parg = ihk_device_map_memory(dev, arg, sizeof(struct procfs_file));
	f = ihk_device_map_virtual(dev, parg, sizeof(struct procfs_file), NULL, 0);
	dprintk("fname: %s.\n", f->fname);

	/*
	 * NOTE(review): remove_proc_entry() is called with the list spinlock
	 * held, as before; confirm that it cannot sleep in this context.
	 */
	irqflags = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
	list_for_each_entry(e, &procfs_file_list, list) {
		if ((strncmp(e->fname, f->fname, PROCFS_NAME_MAX) == 0) &&
		    (e->osnum == osnum)) {
			list_del(&e->list);
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
			e->entry->read_proc = NULL;
			e->entry->data = NULL;
#endif
			parent = e->parent;

			/*
			 * Take the last path component from the record's own
			 * name, which equals the requested one.
			 */
			leaf = strrchr(e->fname, '/');
			if (leaf == NULL) {
				leaf = e->fname;
			} else {
				leaf += 1;
			}
			dprintk("found and remove %s from the list.\n", leaf);

			/* Entries without a parent sit under the procfs root. */
			remove_proc_entry(leaf, parent ? parent->entry : NULL);

			/* Free the record only after its proc node is gone. */
			kfree(e);
			break;
		}
	}
	ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflags);

	f->status = 1; /* Now the peer can free the data. */
	ihk_device_unmap_virtual(dev, f, sizeof(struct procfs_file));
	ihk_device_unmap_memory(dev, parg, sizeof(struct procfs_file));
	dprintk("procfs_delete: done\n");
}
/**
* \brief Process SCD_MSG_PROCFS_ANSWER message.
*
* Wakes up the readers sleeping on the procfsq wait queue.
*
* \param arg sent argument (not used here)
* \param err error info (redundant; only printed in debug builds)
*/
void procfs_answer(unsigned int arg, int err)
{
dprintk("procfs: received SCD_MSG_PROCFS_ANSWER message(err = %d).\n", err);
wake_up_interruptible(&procfsq);
}
/**
 * \brief The callback function for McKernel procfs
 *
 * Forwards the read to McKernel: a request describing the file, offset,
 * count and a physically contiguous bounce buffer is sent over IKC, the
 * caller sleeps until the status field of the request is set (or one
 * second passes), and the returned data is copied to user space.
 *
 * \return number of bytes read, 0 for an empty or negative-offset
 *         request, or a negative errno
 *
 * This function conforms to the 2) way of fs/proc/generic.c
 * from linux-2.6.39.4.
 */
static ssize_t
mckernel_procfs_read(struct file *file, char __user *buf, size_t nbytes,
		     loff_t *ppos)
{
	struct inode * inode = file->f_path.dentry->d_inode;
	char *kern_buffer;
	int order = 0;
	volatile struct procfs_read *r;
	struct ikc_scd_packet isp;
	int ret, retrycount = 0;
	unsigned long pbuf;
	unsigned long count = nbytes;
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
	struct proc_dir_entry *dp = PDE(inode);
	struct procfs_list_entry *e = dp->data;
#else
	struct procfs_list_entry *e = PDE_DATA(inode);
#endif
	loff_t offset = *ppos;

	dprintk("mckernel_procfs_read: invoked for %s, offset: %lu, count: %d\n",
		e->fname, offset, count);

	if (count == 0 || offset < 0) {
		return 0;
	}

	/*
	 * Smallest power of two covering count.  The shift is done in
	 * unsigned long and bounded so that it cannot overflow.
	 */
	while (order < (int)(sizeof(count) * 8 - 1) &&
	       (1UL << order) < count)
		++order;

	/* Convert from a byte order to a page order (4 KiB pages assumed). */
	if (order > 12) {
		order -= 12;
	}
	else {
		order = 1;
	}

	/* NOTE: we need physically contiguous memory to pass through IKC */
	kern_buffer = (char *)__get_free_pages(GFP_KERNEL, order);
	if (!kern_buffer) {
		printk("mckernel_procfs_read(): ERROR: allocating kernel buffer\n");
		return -ENOMEM;
	}
	pbuf = virt_to_phys(kern_buffer);

	r = kmalloc(sizeof(struct procfs_read), GFP_KERNEL);
	if (r == NULL) {
		/* Do not leak the bounce buffer. */
		free_pages((uintptr_t)kern_buffer, order);
		return -ENOMEM;
	}
retry:
	dprintk("offset: %lx, count: %d, cpu: %d\n", offset, count, e->cpu);

	r->pbuf = pbuf;
	r->eof = 0;
	r->ret = -EIO; /* default */
	r->status = 0;
	r->offset = offset;
	r->count = count;
	strncpy((char *)r->fname, e->fname, PROCFS_NAME_MAX);
	isp.msg = SCD_MSG_PROCFS_REQUEST;
	isp.ref = e->cpu;
	isp.arg = virt_to_phys(r);

	ret = mcctrl_ikc_send(e->os, e->cpu, &isp);
	if (ret < 0) {
		goto out; /* error */
	}

	/* Wait for a reply. */
	ret = -EIO; /* default exit code */
	dprintk("now wait for a relpy\n");

	/*
	 * Wait for the status field of the procfs_read structure set ready.
	 * NOTE(review): an interrupted wait (negative return) is not treated
	 * specially; r->ret still holds -EIO then, which is what is returned.
	 */
	if (wait_event_interruptible_timeout(procfsq, r->status != 0, HZ) == 0) {
		kprintf("ERROR: mckernel_procfs_read: timeout (1 sec).\n");
		goto out;
	}

	/* Wake up and check the result. */
	dprintk("mckernel_procfs_read: woke up. ret: %d, eof: %d\n", r->ret, r->eof);
	if ((r->ret == 0) && (r->eof != 1)) {
		/* A miss-hit caused by migration has occurred.
		 * We simply retry the query with a new CPU.
		 */
		if (retrycount++ > 10) {
			kprintf("ERROR: mckernel_procfs_read: excessive retry.\n");
			goto out;
		}
		e->cpu = r->newcpu;
		dprintk("retry\n");
		goto retry;
	}

	if (r->ret > 0) {
		if (copy_to_user(buf, kern_buffer, r->ret)) {
			kprintf("ERROR: mckernel_procfs_read: copy_to_user failed.\n");
			ret = -EFAULT;
			goto out;
		}
		*ppos += r->ret;
	}
	ret = r->ret;

out:
	free_pages((uintptr_t)kern_buffer, order);
	kfree((void *)r);
	return ret;
}
/**
* \brief Initialization for procfs
*
* Currently does nothing.
*
* \param osnum os number (unused)
*/
void procfs_init(int osnum) {
}
/**
 * \brief Finalization for procfs
 *
 * Removes every list entry that belongs to \a osnum together with its
 * proc node (entries without a parent record are only unlinked and
 * freed), then removes /proc/mcos<osnum> if it still exists.
 *
 * \param osnum os number
 */
void procfs_exit(int osnum) {
	/* Large enough for "/proc/mcos" plus any int value and the NUL. */
	char buf[32], *r;
	int error;
	mm_segment_t old_fs = get_fs();
	struct kstat stat;
	struct procfs_list_entry *parent;
	struct procfs_list_entry *e, *temp = NULL;
	unsigned long irqflags;

	dprintk("remove remaining mckernel procfs files.\n");

	irqflags = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
	list_for_each_entry_safe(e, temp, &procfs_file_list, list) {
		if (e->osnum == osnum) {
			dprintk("found entry for %s.\n", e->fname);
			list_del(&e->list);
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
			e->entry->read_proc = NULL;
			e->entry->data = NULL;
#endif
			parent = e->parent;

			/* Last path component of the entry name. */
			r = strrchr(e->fname, '/');
			if (r == NULL) {
				r = e->fname;
			} else {
				r += 1;
			}
			if (parent) {
				remove_proc_entry(r, parent->entry);
			}
			dprintk("free the entry\n");
			kfree(e);
		}
		dprintk("iterate it.\n");
	}
	ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflags);

	/* Bounded formatting: buf can never be overrun. */
	snprintf(buf, sizeof(buf), "/proc/mcos%d", osnum);

	/* Check whether the top-level directory is still there. */
	set_fs(KERNEL_DS);
	error = vfs_stat (buf, &stat);
	set_fs(old_fs);
	if (error != 0) {
		return;
	}

	printk("procfs_exit: We have to remove unexpectedly remaining %s.\n", buf);

	/* remove remnant of previous mcos%d ("buf + 6" skips "/proc/") */
	remove_proc_entry(buf + 6, NULL);
}

View File

@ -1,13 +1,19 @@
CC=@CC@
BINDIR=@BINDIR@
CFLAGS=-Wall -O -fPIE -pie
KDIR ?= @KDIR@
CFLAGS=-Wall -O -I.
VPATH=@abs_srcdir@
TARGET=mcexec
@uncomment_if_ENABLE_MEMDUMP@TARGET+=eclair
LIBS=@LIBS@
all: $(TARGET)
mcexec: mcexec.c
$(CC) $(CFLAGS) $(EXTRA_CFLAGS) -pthread -o $@ $^ $(EXTRA_OBJS)
$(CC) -I${KDIR} $(CFLAGS) $(EXTRA_CFLAGS) -fPIE -pie -lrt -pthread -o $@ $^ $(EXTRA_OBJS)
eclair: eclair.c
$(CC) $(CFLAGS) -o $@ $^ $(LIBS)
clean:
$(RM) $(TARGET) *.o
@ -17,4 +23,5 @@ clean:
install:
mkdir -p -m 755 $(BINDIR)
install -m 755 mcexec $(BINDIR)
@uncomment_if_ENABLE_MEMDUMP@install -m 755 eclair $(BINDIR)

1070
executer/user/eclair.c Normal file

File diff suppressed because it is too large Load Diff

View File

@ -59,7 +59,12 @@
#include <semaphore.h>
#include <signal.h>
#include <sys/signalfd.h>
#include <sys/mount.h>
#include <include/generated/uapi/linux/version.h>
#include <sys/user.h>
#include "../include/uprotocol.h"
#include <getopt.h>
#include "../config.h"
//#define DEBUG
@ -96,6 +101,19 @@ int __glob_argc = -1;
char **__glob_argv = 0;
#endif
#ifdef ENABLE_MCOVERLAYFS
#undef ENABLE_MCOVERLAYFS
#ifndef RHEL_RELEASE_CODE
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,0,0) && LINUX_VERSION_CODE < KERNEL_VERSION(4,1,0)
#define ENABLE_MCOVERLAYFS 1
#endif // LINUX_VERSION_CODE == 4.0
#else
#if RHEL_RELEASE_CODE == RHEL_RELEASE_VERSION(7,2)
#define ENABLE_MCOVERLAYFS 1
#endif // RHEL_RELEASE_CODE == 7.2
#endif // RHEL_RELEASE_CODE
#endif // ENABLE_MCOVERLAYFS
typedef unsigned char cc_t;
typedef unsigned int speed_t;
typedef unsigned int tcflag_t;
@ -129,6 +147,7 @@ static char *exec_path = NULL;
static char *altroot;
static const char rlimit_stack_envname[] = "MCKERNEL_RLIMIT_STACK";
static int ischild;
static int enable_vdso = 1;
struct fork_sync {
pid_t pid;
@ -183,6 +202,9 @@ struct program_load_desc *load_elf(FILE *fp, char **interp_pathp)
desc = malloc(sizeof(struct program_load_desc)
+ sizeof(struct program_image_section) * nhdrs);
memset(desc, '\0', sizeof(struct program_load_desc)
+ sizeof(struct program_image_section) * nhdrs);
desc->shell_path[0] = '\0';
fseek(fp, hdr.e_phoff, SEEK_SET);
j = 0;
desc->num_sections = nhdrs;
@ -242,6 +264,8 @@ struct program_load_desc *load_elf(FILE *fp, char **interp_pathp)
}
desc->pid = getpid();
desc->pgid = getpgid(0);
if(*interp_pathp)
desc->reloc = hdr.e_type == ET_DYN;
desc->entry = hdr.e_entry;
ioctl(fd, MCEXEC_UP_GET_CREDV, desc->cred);
desc->at_phdr = load_addr + hdr.e_phoff;
@ -364,7 +388,7 @@ struct program_load_desc *load_interp(struct program_load_desc *desc0, FILE *fp)
unsigned char *dma_buf;
int lookup_exec_path(char *filename, char *path, int max_len)
int lookup_exec_path(char *filename, char *path, int max_len, int execvp)
{
int found;
int error;
@ -382,28 +406,27 @@ retry:
char *token, *string, *tofree;
char *PATH = getenv("COKERNEL_PATH");
if (!PATH) {
if (!execvp) {
if (strlen(filename) + 1 > max_len) {
return ENAMETOOLONG;
}
strcpy(path, filename);
error = access(path, X_OK);
if (error) {
return errno;
}
found = 1;
break;
}
if (!(PATH = getenv("COKERNEL_PATH"))) {
PATH = getenv("PATH");
}
if (strlen(filename) >= 255) {
return ENAMETOOLONG;
}
/* See first whether file is available in current working dir */
error = access(filename, X_OK);
if (error == 0) {
__dprintf("lookup_exec_path(): found %s in cwd\n", filename);
error = snprintf(path, max_len, "%s", filename);
if (error < 0 || error >= max_len) {
fprintf(stderr, "lookup_exec_path(): array too small?\n");
return ENOMEM;
}
found = 1;
break;
}
__dprintf("PATH: %s\n", PATH);
@ -431,6 +454,9 @@ retry:
}
free(tofree);
if(!found){
return ENOENT;
}
break;
}
@ -477,7 +503,7 @@ retry:
}
if ((sb.st_mode & S_IFMT) == S_IFLNK) {
char *link_path = malloc(max_len);
link_path = malloc(max_len);
if (!link_path) {
fprintf(stderr, "lookup_exec_path(): error allocating\n");
return ENOMEM;
@ -488,9 +514,18 @@ retry:
fprintf(stderr, "lookup_exec_path(): error readlink\n");
return EINVAL;
}
link_path[error] = '\0';
__dprintf("lookup_exec_path(): %s is link -> %s\n", path, link_path);
if(link_path[0] != '/'){
char *t = strrchr(path, '/');
if(t){
t++;
strcpy(t, link_path);
strcpy(link_path, path);
}
}
filename = link_path;
goto retry;
}
@ -634,10 +669,7 @@ int load_elf_desc(char *filename, struct program_load_desc **desc_p,
return 0;
}
#define PAGE_SIZE 4096
#define PAGE_MASK ~((unsigned long)PAGE_SIZE - 1)
void transfer_image(int fd, struct program_load_desc *desc)
int transfer_image(int fd, struct program_load_desc *desc)
{
struct remote_transfer pt;
unsigned long s, e, flen, rpa;
@ -651,13 +683,17 @@ void transfer_image(int fd, struct program_load_desc *desc)
+ PAGE_SIZE - 1) & PAGE_MASK;
rpa = desc->sections[i].remote_pa;
fseek(fp, desc->sections[i].offset, SEEK_SET);
if (fseek(fp, desc->sections[i].offset, SEEK_SET) != 0) {
fprintf(stderr, "transfer_image(): error: seeking file position\n");
return -1;
}
flen = desc->sections[i].filesz;
__dprintf("seeked to %lx | size %ld\n",
desc->sections[i].offset, flen);
while (s < e) {
memset(&pt, '\0', sizeof pt);
pt.rphys = rpa;
pt.userp = dma_buf;
pt.size = PAGE_SIZE;
@ -672,7 +708,20 @@ void transfer_image(int fd, struct program_load_desc *desc)
if (lr > flen) {
lr = flen;
}
fread(dma_buf + l, 1, lr, fp);
if (fread(dma_buf + l, 1, lr, fp) != lr) {
if (ferror(fp) > 0) {
fprintf(stderr, "transfer_image(): error: accessing file\n");
return -EINVAL;
}
else if (feof(fp) > 0) {
fprintf(stderr, "transfer_image(): file too short?\n");
return -EINVAL;
}
else {
/* TODO: handle smaller reads.. */
return -EINVAL;
}
}
flen -= lr;
}
else if (flen > 0) {
@ -681,7 +730,20 @@ void transfer_image(int fd, struct program_load_desc *desc)
} else {
lr = flen;
}
fread(dma_buf, 1, lr, fp);
if (fread(dma_buf, 1, lr, fp) != lr) {
if (ferror(fp) > 0) {
fprintf(stderr, "transfer_image(): error: accessing file\n");
return -EINVAL;
}
else if (feof(fp) > 0) {
fprintf(stderr, "transfer_image(): file too short?\n");
return -EINVAL;
}
else {
/* TODO: handle smaller reads.. */
return -EINVAL;
}
}
flen -= lr;
}
s += PAGE_SIZE;
@ -697,6 +759,8 @@ void transfer_image(int fd, struct program_load_desc *desc)
}
}
}
return 0;
}
void print_desc(struct program_load_desc *desc)
@ -761,7 +825,7 @@ int flatten_strings(int nr_strings, char *first, char **strings, char **flat)
}
/* Count full length */
full_len = sizeof(int) + sizeof(char *); // Counter and terminating NULL
full_len = sizeof(long) + sizeof(char *); // Counter and terminating NULL
if (first) {
full_len += sizeof(char *) + strlen(first) + 1;
}
@ -771,6 +835,8 @@ int flatten_strings(int nr_strings, char *first, char **strings, char **flat)
full_len += sizeof(char *) + strlen(strings[string_i]) + 1;
}
full_len = (full_len + sizeof(long) - 1) & ~(sizeof(long) - 1);
_flat = (char *)malloc(full_len);
if (!_flat) {
return 0;
@ -779,14 +845,14 @@ int flatten_strings(int nr_strings, char *first, char **strings, char **flat)
memset(_flat, 0, full_len);
/* Number of strings */
*((int*)_flat) = nr_strings + (first ? 1 : 0);
*((long *)_flat) = nr_strings + (first ? 1 : 0);
// Actual offset
flat_offset = sizeof(int) + sizeof(char *) * (nr_strings + 1 +
flat_offset = sizeof(long) + sizeof(char *) * (nr_strings + 1 +
(first ? 1 : 0));
if (first) {
*((char **)(_flat + sizeof(int))) = (void *)flat_offset;
*((char **)(_flat + sizeof(long))) = (void *)flat_offset;
memcpy(_flat + flat_offset, first, strlen(first) + 1);
flat_offset += strlen(first) + 1;
}
@ -794,7 +860,7 @@ int flatten_strings(int nr_strings, char *first, char **strings, char **flat)
for (string_i = 0; string_i < nr_strings; ++string_i) {
/* Fabricate the string */
*((char **)(_flat + sizeof(int) + (string_i + (first ? 1 : 0))
*((char **)(_flat + sizeof(long) + (string_i + (first ? 1 : 0))
* sizeof(char *))) = (void *)flat_offset;
memcpy(_flat + flat_offset, strings[string_i], strlen(strings[string_i]) + 1);
flat_offset += strlen(strings[string_i]) + 1;
@ -817,7 +883,10 @@ struct thread_data_s {
pthread_mutex_t *lock;
pthread_barrier_t *init_ready;
} *thread_data;
int ncpu;
int n_threads;
pid_t master_tid;
pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
@ -828,7 +897,7 @@ static void *main_loop_thread_func(void *arg)
struct thread_data_s *td = (struct thread_data_s *)arg;
td->tid = gettid();
td->remote_tid = (int)td->tid;
td->remote_tid = -1;
pthread_barrier_wait(&init_ready);
td->ret = main_loop(td->fd, td->cpu, td->lock);
@ -877,6 +946,7 @@ sendsig(int sig, siginfo_t *siginfo, void *context)
remote_tid = -1;
}
memset(&sigdesc, '\0', sizeof sigdesc);
sigdesc.cpu = cpu;
sigdesc.pid = (int)pid;
sigdesc.tid = remote_tid;
@ -903,13 +973,17 @@ act_signalfd4(struct syscall_wait_desc *w)
switch(mode){
case 0: /* new signalfd */
sfd = malloc(sizeof(struct sigfd));
memset(sfd, '\0', sizeof(struct sigfd));
tmp = w->sr.args[1];
flags = 0;
if(tmp & SFD_NONBLOCK)
flags |= O_NONBLOCK;
if(tmp & SFD_CLOEXEC)
flags |= O_CLOEXEC;
pipe2(sfd->sigpipe, flags);
if (pipe2(sfd->sigpipe, flags) < 0) {
perror("pipe2 failed:");
return -1;
}
sfd->next = sigfdtop;
sigfdtop = sfd;
rc = sfd->sigpipe[0];
@ -940,7 +1014,11 @@ act_signalfd4(struct syscall_wait_desc *w)
rc = -EBADF;
else{
info = (struct signalfd_siginfo *)w->sr.args[2];
write(sfd->sigpipe[1], info, sizeof(struct signalfd_siginfo));
if (write(sfd->sigpipe[1], info, sizeof(struct signalfd_siginfo))
!= sizeof(struct signalfd_siginfo)) {
fprintf(stderr, "error: writing sigpipe\n");
rc = -EBADF;
}
}
break;
}
@ -1046,9 +1124,9 @@ void init_worker_threads(int fd)
int i;
pthread_mutex_init(&lock, NULL);
pthread_barrier_init(&init_ready, NULL, ncpu + 2);
pthread_barrier_init(&init_ready, NULL, n_threads + 2);
for (i = 0; i <= ncpu; ++i) {
for (i = 0; i <= n_threads; ++i) {
int ret;
thread_data[i].fd = fd;
@ -1068,6 +1146,80 @@ void init_worker_threads(int fd)
pthread_barrier_wait(&init_ready);
}
#ifdef ENABLE_MCOVERLAYFS
#define READ_BUFSIZE 1024
/*
 * Check whether this process already sees an "mcoverlay /proc" mount.
 *
 * Reads /proc/<pid>/mounts in READ_BUFSIZE chunks, reassembles each line
 * (a line may span several read() calls) and compares the start of every
 * complete line against "mcoverlay /proc ".
 *
 * Returns:
 *    1  a matching mount entry was found
 *    0  the whole file was read and no entry matched
 *   -1  the mounts file could not be opened or read
 */
static int isunshare(void)
{
	int err = 0;
	int fd;
	char proc_path[PATH_MAX];
	ssize_t len_read;
	char buf_read[READ_BUFSIZE + 1];
	char *buf_read_off;
	char *buf_find;
	/* Current (possibly partial) line, always kept NUL-terminated */
	char buf_cmp[READ_BUFSIZE + 1];
	size_t cmp_len = 0;
	size_t len_remain;
	size_t len_line;
	size_t len_copy;

	snprintf(proc_path, sizeof(proc_path), "/proc/%d/mounts", getpid());

	fd = open(proc_path, O_RDONLY);
	if (fd < 0) {
		fprintf(stderr, "Error: Failed to open %s.\n", proc_path);
		return -1;
	}

	while (1) {
		len_read = read(fd, buf_read, READ_BUFSIZE);
		if (len_read == -1) {
			fprintf(stderr, "Error: Failed to read.\n");
			err = -1;
			break;
		}

		buf_read_off = buf_read;
		while (buf_read_off < buf_read + len_read) {
			len_remain = len_read - (buf_read_off - buf_read);
			buf_find = memchr(buf_read_off, '\n', len_remain);
			len_line = buf_find ?
				(size_t)(buf_find - buf_read_off) : len_remain;

			/*
			 * Never write past buf_cmp: a mounts line may be
			 * longer than READ_BUFSIZE once the pieces from
			 * several reads are joined. Only the first 16 bytes
			 * are compared, so dropping the tail of an over-long
			 * line does not change the result.
			 */
			len_copy = READ_BUFSIZE - cmp_len;
			if (len_line < len_copy) {
				len_copy = len_line;
			}
			memcpy(buf_cmp + cmp_len, buf_read_off, len_copy);
			cmp_len += len_copy;
			buf_cmp[cmp_len] = '\0';

			buf_read_off += len_line;
			if (!buf_find) {
				/* Partial line: the rest comes with the next read */
				break;
			}

			/* Complete line: skip the '\n' and start a new line */
			buf_read_off++;
			cmp_len = 0;

			if (!strncmp(buf_cmp, "mcoverlay /proc ", 16)) {
				err = 1;
				break;
			}
		}

		/* Stop on a match or at end of file */
		if (err == 1 || len_read == 0) {
			break;
		}
	}

	close(fd);

	__dprintf("err=%d\n", err);
	return err;
}
#endif // ENABLE_MCOVERLAYFS
#define MCK_RLIMIT_AS 0
#define MCK_RLIMIT_CORE 1
#define MCK_RLIMIT_CPU 2
@ -1138,6 +1290,24 @@ static int rlimits[] = {
char dev[64];
/*
 * Long options accepted by mcexec, passed to getopt_long() in main().
 * Both entries use the "flag" form: getopt_long() stores .val into
 * enable_vdso and returns 0, so no per-option case is needed in the
 * option switch.
 */
static struct option mcexec_options[] = {
{
/* --disable-vdso: set enable_vdso to 0 */
.name = "disable-vdso",
.has_arg = no_argument,
.flag = &enable_vdso,
.val = 0,
},
{
/* --enable-vdso: set enable_vdso to 1 */
.name = "enable-vdso",
.has_arg = no_argument,
.flag = &enable_vdso,
.val = 1,
},
/* end */
/* all-zero terminator entry required by getopt_long() */
{ NULL, 0, NULL, 0, },
};
int main(int argc, char **argv)
{
// int fd;
@ -1189,12 +1359,15 @@ int main(int argc, char **argv)
}
/* Parse options ("+" denotes stop at the first non-option) */
while ((opt = getopt(argc, argv, "+c:")) != -1) {
while ((opt = getopt_long(argc, argv, "+c:", mcexec_options, NULL)) != -1) {
switch (opt) {
case 'c':
target_core = atoi(optarg);
break;
case 0: /* long opt */
break;
default: /* '?' */
print_usage(argv);
exit(EXIT_FAILURE);
@ -1233,7 +1406,59 @@ int main(int argc, char **argv)
return 1;
}
if (lookup_exec_path(argv[optind], path, sizeof(path)) != 0) {
#ifdef ENABLE_MCOVERLAYFS
__dprintf("mcoverlay enable\n");
char mcos_procdir[PATH_MAX];
char mcos_sysdir[PATH_MAX];
error = isunshare();
if (error == 0) {
struct sys_unshare_desc unshare_desc;
struct sys_mount_desc mount_desc;
memset(&unshare_desc, '\0', sizeof unshare_desc);
memset(&mount_desc, '\0', sizeof mount_desc);
unshare_desc.unshare_flags = CLONE_NEWNS;
if (ioctl(fd, MCEXEC_UP_SYS_UNSHARE,
(unsigned long)&unshare_desc) != 0) {
fprintf(stderr, "Error: Failed to unshare. (%s)\n",
strerror(errno));
return 1;
}
sprintf(mcos_procdir, "/tmp/mcos/mcos%d_proc", mcosid);
mount_desc.dev_name = mcos_procdir;
mount_desc.dir_name = "/proc";
mount_desc.type = NULL;
mount_desc.flags = MS_BIND;
mount_desc.data = NULL;
if (ioctl(fd, MCEXEC_UP_SYS_MOUNT,
(unsigned long)&mount_desc) != 0) {
fprintf(stderr, "Error: Failed to mount /proc. (%s)\n",
strerror(errno));
return 1;
}
sprintf(mcos_sysdir, "/tmp/mcos/mcos%d_sys", mcosid);
mount_desc.dev_name = mcos_sysdir;
mount_desc.dir_name = "/sys";
mount_desc.type = NULL;
mount_desc.flags = MS_BIND;
mount_desc.data = NULL;
if (ioctl(fd, MCEXEC_UP_SYS_MOUNT,
(unsigned long)&mount_desc) != 0) {
fprintf(stderr, "Error: Failed to mount /sys. (%s)\n",
strerror(errno));
return 1;
}
} else if (error == -1) {
return 1;
}
#else
__dprintf("mcoverlay disable\n");
#endif // ENABLE_MCOVERLAYFS
if (lookup_exec_path(argv[optind], path, sizeof(path), 1) != 0) {
fprintf(stderr, "error: finding file: %s\n", argv[optind]);
return 1;
}
@ -1245,7 +1470,7 @@ int main(int argc, char **argv)
/* Check whether shell script */
if (shell) {
if (lookup_exec_path(shell, shell_path, sizeof(shell_path)) != 0) {
if (lookup_exec_path(shell, shell_path, sizeof(shell_path), 0) != 0) {
fprintf(stderr, "error: finding file: %s\n", shell);
return 1;
}
@ -1271,6 +1496,8 @@ int main(int argc, char **argv)
//print_flat(args);
desc->cpu = target_core;
desc->enable_vdso = enable_vdso;
p = getenv(rlimit_stack_envname);
if (p) {
errno = 0;
@ -1305,6 +1532,19 @@ int main(int argc, char **argv)
return 1;
}
n_threads = ncpu;
if (ncpu > 16) {
n_threads = 16;
}
/*
* XXX: keep thread_data ncpu sized despite that there are only
* n_threads worker threads in the pool so that signaling code
* keeps working.
*
* TODO: fix signaling code to be independent of TIDs.
* TODO: implement dynamic thread pool resizing.
*/
thread_data = (struct thread_data_s *)malloc(sizeof(struct thread_data_s) * (ncpu + 1));
memset(thread_data, '\0', sizeof(struct thread_data_s) * (ncpu + 1));
@ -1347,7 +1587,10 @@ int main(int argc, char **argv)
}
print_desc(desc);
transfer_image(fd, desc);
if (transfer_image(fd, desc) < 0) {
fprintf(stderr, "error: transferring image\n");
return -1;
}
fflush(stdout);
fflush(stderr);
@ -1386,7 +1629,7 @@ int main(int argc, char **argv)
return 1;
}
for (i = 0; i <= ncpu; ++i) {
for (i = 0; i <= n_threads; ++i) {
pthread_join(thread_data[i].thread_id, NULL);
}
@ -1400,6 +1643,7 @@ void do_syscall_return(int fd, int cpu,
{
struct syscall_ret_desc desc;
memset(&desc, '\0', sizeof desc);
desc.cpu = cpu;
desc.ret = ret;
desc.src = src;
@ -1416,6 +1660,7 @@ void do_syscall_load(int fd, int cpu, unsigned long dest, unsigned long src,
{
struct syscall_load_desc desc;
memset(&desc, '\0', sizeof desc);
desc.cpu = cpu;
desc.src = src;
desc.dest = dest;
@ -1446,16 +1691,14 @@ do_generic_syscall(
}
static void
kill_thread(unsigned long cpu)
kill_thread(unsigned long tid)
{
if(cpu >= 0 && cpu < ncpu){
pthread_kill(thread_data[cpu].thread_id, LOCALSIG);
}
else{
int i;
int i;
for (i = 0; i < ncpu; ++i) {
for (i = 0; i <= n_threads; ++i) {
if(thread_data[i].remote_tid == tid){
pthread_kill(thread_data[i].thread_id, LOCALSIG);
break;
}
}
}
@ -1465,6 +1708,7 @@ static long do_strncpy_from_user(int fd, void *dest, void *src, unsigned long n)
struct strncpy_from_user_desc desc;
int ret;
memset(&desc, '\0', sizeof desc);
desc.dest = dest;
desc.src = src;
desc.n = n;
@ -1559,6 +1803,9 @@ int close_cloexec_fds(int mcos_fd)
char *
chgpath(char *in, char *buf)
{
#ifdef ENABLE_MCOVERLAYFS
return in;
#endif // ENABLE_MCOVERLAYFS
char *fn = in;
struct stat sb;
@ -1588,10 +1835,11 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock)
char *fn;
int sig;
int term;
struct timeval tv;
struct timespec tv;
char pathbuf[PATH_MAX];
char tmpbuf[PATH_MAX];
memset(&w, '\0', sizeof w);
w.cpu = cpu;
w.pid = getpid();
@ -1607,6 +1855,8 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock)
//pthread_mutex_lock(lock);
thread_data[cpu].remote_tid = w.sr.rtid;
switch (w.sr.number) {
case __NR_open:
ret = do_strncpy_from_user(fd, pathbuf, (void *)w.sr.args[0], PATH_MAX);
@ -1627,13 +1877,13 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock)
break;
case __NR_futex:
ret = gettimeofday(&tv, NULL);
ret = clock_gettime(w.sr.args[1], &tv);
SET_ERR(ret);
__dprintf("gettimeofday=%016ld,%09ld\n",
__dprintf("clock_gettime=%016ld,%09ld\n",
tv.tv_sec,
tv.tv_usec);
tv.tv_nsec);
do_syscall_return(fd, cpu, ret, 1, (unsigned long)&tv,
w.sr.args[0], sizeof(struct timeval));
w.sr.args[0], sizeof(struct timespec));
break;
case __NR_kill: // interrupt syscall
@ -1645,13 +1895,13 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock)
sig = 0;
term = 0;
do_syscall_return(fd, cpu, 0, 0, 0, 0, 0);
/* Drop executable file */
if ((ret = ioctl(fd, MCEXEC_UP_CLOSE_EXEC)) != 0) {
fprintf(stderr, "WARNING: close_exec() couldn't find exec file?\n");
}
do_syscall_return(fd, cpu, 0, 0, 0, 0, 0);
__dprintf("__NR_exit/__NR_exit_group: %ld (cpu_id: %d)\n",
w.sr.args[0], cpu);
if(w.sr.number == __NR_exit_group){
@ -1689,17 +1939,12 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock)
return w.sr.args[0];
case __NR_mmap:
case __NR_munmap:
case __NR_mprotect:
/* reserved for internal use */
do_syscall_return(fd, cpu, -ENOSYS, 0, 0, 0, 0);
break;
case __NR_munmap:
ret = madvise((void *)w.sr.args[0], w.sr.args[1], MADV_DONTNEED);
SET_ERR(ret);
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;
#ifdef USE_SYSCALL_MOD_CALL
case 303:{
__dprintf("mcexec.c,mod_cal,mod=%ld,cmd=%ld\n", w.sr.args[0], w.sr.args[1]);
@ -1709,22 +1954,40 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock)
#endif
case __NR_gettid:{
int mode = w.sr.args[0];
int remote_pid = w.sr.args[1];
int newcpuid = w.sr.args[2];
int oldcpuid = w.sr.args[3];
int wtid = thread_data[newcpuid].remote_tid;
/*
* Number of TIDs and the remote physical address where TIDs are
* expected are passed in arg 4 and 5, respectively.
*/
if (w.sr.args[4] > 0) {
struct remote_transfer trans;
int i = 0;
int *tids = malloc(sizeof(int) * w.sr.args[4]);
if (!tids) {
fprintf(stderr, "__NR_gettid(): error allocating TIDs\n");
goto gettid_out;
}
if(mode == 0){
thread_data[ncpu].remote_tid = wtid;
thread_data[newcpuid].remote_tid = remote_pid;
}
else if(mode == 2){
thread_data[newcpuid].remote_tid = thread_data[oldcpuid].remote_tid;
thread_data[oldcpuid].remote_tid = wtid;
}
for (i = 0; i < ncpu && i < w.sr.args[4]; ++i) {
tids[i] = thread_data[i].tid;
}
do_syscall_return(fd, cpu, thread_data[newcpuid].remote_tid, 0, 0, 0, 0);
for (; i < ncpu; ++i) {
tids[i] = 0;
}
trans.userp = (void*)tids;
trans.rphys = w.sr.args[5];
trans.size = sizeof(int) * w.sr.args[4];
trans.direction = MCEXEC_UP_TRANSFER_TO_REMOTE;
if (ioctl(fd, MCEXEC_UP_TRANSFER, &trans) != 0) {
fprintf(stderr, "__NR_gettid(): error transfering TIDs\n");
}
free(tids);
}
gettid_out:
do_syscall_return(fd, cpu, 0, 0, 0, 0, 0);
break;
}
@ -1733,6 +1996,7 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock)
struct fork_sync_container *fsc;
struct fork_sync_container *fp;
struct fork_sync_container *fb;
int flag = w.sr.args[0];
int rc = -1;
pid_t pid;
@ -1752,7 +2016,45 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock)
memset(fs, '\0', sizeof(struct fork_sync));
sem_init(&fs->sem, 1, 0);
pid = fork();
if(flag){
int pipefds[2];
if(pipe(pipefds) == -1){
rc = -errno;
sem_destroy(&fs->sem);
goto fork_err;
}
pid = fork();
if(pid == 0){
close(pipefds[0]);
pid = fork();
if(pid != 0){
if (write(pipefds[1], &pid, sizeof pid) != sizeof(pid)) {
fprintf(stderr, "error: writing pipefds\n");
}
exit(0);
}
}
else if(pid != -1){
int npid;
int st;
close(pipefds[1]);
if (read(pipefds[0], &npid, sizeof npid) != sizeof(npid)) {
fprintf(stderr, "error: reading pipefds\n");
}
close(pipefds[0]);
waitpid(pid, &st, 0);
pid = npid;
}
else{
rc = -errno;
sem_destroy(&fs->sem);
goto fork_err;
}
}
else
pid = fork();
switch (pid) {
/* Error */
@ -1780,7 +2082,6 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock)
/* Reinit signals and syscall threads */
init_sigaction();
init_worker_threads(fd);
__dprintf("pid(%d): signals and syscall threads OK\n",
getpid());
@ -1794,6 +2095,8 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock)
goto fork_child_sync_pipe;
}
init_worker_threads(fd);
fork_child_sync_pipe:
sem_post(&fs->sem);
if (fs->status)
@ -1879,7 +2182,7 @@ fork_err:
}
if(ret != pid) {
fprintf(stderr, "ERROR: waiting for %lu\n", w.sr.args[0]);
fprintf(stderr, "ERROR: waiting for %lu rc=%d errno=%d\n", w.sr.args[0], ret, errno);
}
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
@ -1895,15 +2198,16 @@ fork_err:
char path[1024];
char *filename;
int ret;
char *shell = NULL;
char *shell;
char shell_path[1024];
/* Load descriptor phase */
case 1:
shell = NULL;
filename = (char *)w.sr.args[1];
if ((ret = lookup_exec_path(filename, path, sizeof(path)))
if ((ret = lookup_exec_path(filename, path, sizeof(path), 0))
!= 0) {
goto return_execve1;
}
@ -1917,7 +2221,7 @@ fork_err:
/* Check whether shell script */
if (shell) {
if ((ret = lookup_exec_path(shell, shell_path,
sizeof(shell_path))) != 0) {
sizeof(shell_path), 0)) != 0) {
fprintf(stderr, "execve(): error: finding file: %s\n", shell);
goto return_execve1;
}
@ -1938,6 +2242,7 @@ fork_err:
strcpy(desc->shell_path, shell_path);
}
desc->enable_vdso = enable_vdso;
__dprintf("execve(): load_elf_desc() for %s OK, num sections: %d\n",
path, desc->num_sections);
@ -1978,6 +2283,7 @@ return_execve1:
fprintf(stderr, "execve(): error allocating desc\n");
goto return_execve2;
}
memset(desc, '\0', w.sr.args[2]);
/* Copy descriptor from co-kernel side */
trans.userp = (void*)desc;
@ -1994,7 +2300,10 @@ return_execve1:
__dprintf("%s", "execve(): transfer ELF desc OK\n");
transfer_image(fd, desc);
if (transfer_image(fd, desc) != 0) {
fprintf(stderr, "error: transferring image\n");
return -1;
}
__dprintf("%s", "execve(): image transferred\n");
if (close_cloexec_fds(fd) < 0) {
@ -2020,6 +2329,11 @@ return_execve2:
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;
case __NR_perf_event_open:
ret = open("/dev/null", O_RDONLY);
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;
case __NR_rt_sigaction:
act_sigaction(&w);
do_syscall_return(fd, cpu, 0, 0, 0, 0, 0);
@ -2041,6 +2355,53 @@ return_execve2:
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;
case __NR_setresuid:
ret = setresuid(w.sr.args[0], w.sr.args[1], w.sr.args[2]);
if(ret == -1)
ret = -errno;
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;
case __NR_setreuid:
ret = setreuid(w.sr.args[0], w.sr.args[1]);
if(ret == -1)
ret = -errno;
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;
case __NR_setuid:
ret = setuid(w.sr.args[0]);
if(ret == -1)
ret = -errno;
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;
case __NR_setresgid:
ret = setresgid(w.sr.args[0], w.sr.args[1], w.sr.args[2]);
if(ret == -1)
ret = -errno;
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;
case __NR_setregid:
ret = setregid(w.sr.args[0], w.sr.args[1]);
if(ret == -1)
ret = -errno;
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;
case __NR_setgid:
ret = setgid(w.sr.args[0]);
if(ret == -1)
ret = -errno;
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;
case __NR_setfsgid:
ret = setfsgid(w.sr.args[0]);
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;
case __NR_close:
if(w.sr.args[0] == fd)
ret = -EBADF;
@ -2049,13 +2410,34 @@ return_execve2:
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;
case __NR_readlink:
ret = do_strncpy_from_user(fd, pathbuf, (void *)w.sr.args[0], PATH_MAX);
if (ret >= PATH_MAX) {
ret = -ENAMETOOLONG;
}
if (ret < 0) {
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;
}
fn = chgpath(pathbuf, tmpbuf);
ret = readlink(fn, (char *)w.sr.args[1], w.sr.args[2]);
__dprintf("readlink: path=%s, buf=%s, ret=%ld\n",
fn, (char *)w.sr.args[1], ret);
SET_ERR(ret);
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;
default:
ret = do_generic_syscall(&w);
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;
}
thread_data[cpu].remote_tid = -1;
//pthread_mutex_unlock(lock);
}
__dprint("timed out.\n");

View File

@ -3,7 +3,7 @@ OBJS = init.o mem.o debug.o mikc.o listeners.o ap.o syscall.o cls.o host.o
OBJS += process.o copy.o waitq.o futex.o timer.o plist.o fileobj.o
DEPSRCS=$(wildcard $(SRC)/*.c)
CFLAGS += -I$(SRC)/include -mcmodel=kernel -D__KERNEL__
CFLAGS += -I$(SRC)/include -D__KERNEL__
CFLAGS += -DKNC_MAP_MICPA $(EXTRA_CFLAGS)
ifeq ("$(DCFA_MODE)", "kmod")

View File

@ -3,10 +3,10 @@ SRC=$(VPATH)
IHKDIR=$(IHKBASE)/$(TARGETDIR)
OBJS = init.o mem.o debug.o mikc.o listeners.o ap.o syscall.o cls.o host.o
OBJS += process.o copy.o waitq.o futex.o timer.o plist.o fileobj.o shmobj.o
OBJS += zeroobj.o procfs.o devobj.o
OBJS += zeroobj.o procfs.o devobj.o sysfs.o
DEPSRCS=$(wildcard $(SRC)/*.c)
CFLAGS += -I$(SRC)/include -mcmodel=kernel -D__KERNEL__ -g
CFLAGS += -I$(SRC)/include -D__KERNEL__ -g
LDFLAGS += -e arch_start
IHKOBJ = ihk/ihk.o

View File

@ -24,20 +24,23 @@
#include <process.h>
#include <init.h>
#include <march.h>
#include <cls.h>
int num_processors = 1;
static volatile int ap_stop = 1;
static void ap_wait(void)
{
wrmsr(MSR_IA32_TIME_STAMP_COUNTER, 0);
init_tick();
while (ap_stop) {
barrier();
cpu_pause();
}
sync_tick();
kmalloc_init();
sched_init();
arch_start_pvclock();
if (find_command_line("hidos")) {
init_host_syscall_channel();
@ -53,19 +56,20 @@ static void ap_wait(void)
void ap_start(void)
{
init_tick();
ap_stop = 0;
sync_tick();
}
void ap_init(void)
{
struct ihk_mc_cpu_info *cpu_info;
int i;
int bsp_hw_id;
int bsp_hw_id, bsp_cpu_id;
ihk_mc_init_ap();
init_delay();
wrmsr(MSR_IA32_TIME_STAMP_COUNTER, 0);
cpu_info = ihk_mc_get_cpu_info();
bsp_hw_id = ihk_mc_get_hardware_processor_id();
@ -74,18 +78,164 @@ void ap_init(void)
return;
}
kprintf("BSP HW ID = %d, ", bsp_hw_id);
kprintf("AP Booting :");
bsp_cpu_id = 0;
for (i = 0; i < cpu_info->ncpus; ++i) {
if (cpu_info->hw_ids[i] == bsp_hw_id) {
bsp_cpu_id = i;
break;
}
}
kprintf("BSP: %d (HW ID: %d @ NUMA %d)\n", bsp_cpu_id,
bsp_hw_id, cpu_info->nodes[0]);
for (i = 0; i < cpu_info->ncpus; i++) {
if (cpu_info->hw_ids[i] == bsp_hw_id) {
continue;
}
kprintf("AP Booting: %d (HW ID: %d @ NUMA %d)\n", i,
cpu_info->hw_ids[i], cpu_info->nodes[i]);
ihk_mc_boot_cpu(cpu_info->hw_ids[i], (unsigned long)ap_wait);
kprintf(" %d", cpu_info->hw_ids[i]);
num_processors++;
}
kprintf(" .. Done\n");
kprintf("AP Booting: Done\n");
}
#include <sysfs.h>
#include <kmalloc.h>
#include <string.h>
#include <vsprintf.h>
/*
 * sysfs show handler for a plain int.
 *
 * instance points to the int to display; it is formatted as decimal
 * followed by a newline into buf (at most size bytes). The snprintf()
 * result is returned unchanged. ops is not used.
 */
static ssize_t
show_int(struct sysfs_ops *ops, void *instance, void *buf, size_t size)
{
	const int *value = instance;
	int written;

	written = snprintf(buf, size, "%d\n", *value);
	return written;
}/* show_int() */
/* Read-only sysfs operations for an int instance: only .show is set. */
struct sysfs_ops show_int_ops = {
.show = &show_int,
};
/* Per-CPU sample data exported through sysfs by cpu_sysfs_setup(). */
struct fake_cpu_info {
int online;
};
/* Table of fake_cpu_info entries; NULL until cpu_sysfs_setup() assigns it. */
static struct fake_cpu_info *fake_cpu_infos = NULL;
/* Selects which member of struct fake_cpu_info a sysfs entry refers to. */
enum fake_cpu_info_member {
ONLINE,
};
/*
 * Pairs a member selector with its sysfs_ops. The show/store handlers
 * recover this structure from the embedded ops via container_of().
 */
struct fake_cpu_info_ops {
enum fake_cpu_info_member member;
struct sysfs_ops ops;
};
/*
 * sysfs show handler for one member of struct fake_cpu_info.
 *
 * ops0 must be the .ops field embedded in a struct fake_cpu_info_ops;
 * its .member selects which field of the instance is printed.
 * instance points to the struct fake_cpu_info to display.
 *
 * Returns the number of bytes formatted into buf, -ENOSPC when the
 * output did not fit into size bytes, or -EINVAL for an unknown member.
 */
static ssize_t
show_fake_cpu_info(struct sysfs_ops *ops0, void *instance, void *buf,
		size_t size)
{
	struct fake_cpu_info_ops *ops
		= container_of(ops0, struct fake_cpu_info_ops, ops);
	struct fake_cpu_info *info = instance;
	ssize_t n;

	switch (ops->member) {
	case ONLINE:
		n = snprintf(buf, size, "%d\n", info->online);
		break;
	default:
		n = -EINVAL;
		break;
	}

	/*
	 * Only a non-negative length can mean truncation. Comparing a
	 * negative error code directly against the unsigned size would
	 * convert it to a huge value and turn -EINVAL into -ENOSPC.
	 */
	if (n >= 0 && (size_t)n >= size) {
		n = -ENOSPC;
	}

	return n;
} /* show_fake_cpu_info() */
/*
 * sysfs store handler for one member of struct fake_cpu_info.
 *
 * Storing is not implemented: for the ONLINE member the request is only
 * logged and reported as fully consumed (returns size); any other member
 * yields -EIO. The instance itself is never modified.
 */
static ssize_t
store_fake_cpu_info(struct sysfs_ops *ops0, void *instance, void *buf,
		size_t size)
{
	struct fake_cpu_info_ops *member_ops
		= container_of(ops0, struct fake_cpu_info_ops, ops);
	struct fake_cpu_info *cpu = instance;

	if (member_ops->member != ONLINE) {
		return -EIO;
	}

	kprintf("NYI:store_fake_cpu_info(%p,%p,%p,%ld): "
			"online %d --> \"%.*s\"\n",
			ops0, instance, buf, size, cpu->online,
			(int)size, buf);
	return size;
} /* store_fake_cpu_info() */
/*
 * Operations for the "online" sysfs entry: selects the ONLINE member and
 * wires both the show and the store handler.
 */
static struct fake_cpu_info_ops show_fci_online = {
.member = ONLINE,
.ops.show = &show_fake_cpu_info,
.ops.store = &store_fake_cpu_info,
};
/*
 * Create the CPU-related sample entries in the sysfs tree:
 *   /sys/devices/system/cpu/num_processors   (read-only int)
 *   /sys/devices/system/cpu/cpu<N>/online    (per-CPU fake_cpu_info)
 *   /sys/bus/cpu/devices/cpu<N>              (symlink to the cpu<N> dir)
 *
 * Any failure, including failure to allocate the per-CPU table, ends in
 * panic().
 */
void
cpu_sysfs_setup(void)
{
	int error;
	int cpu;
	sysfs_handle_t targeth;
	struct fake_cpu_info *info;

	/* sample of simple variable **********************************/
	error = sysfs_createf(&show_int_ops, &num_processors, 0444,
			"/sys/devices/system/cpu/num_processors");
	if (error) {
		panic("cpu_sysfs_setup:sysfs_createf(num_processors) failed\n");
	}

	/* sample of more complex variable ****************************/
	/* setup table */
	info = kmalloc(sizeof(*info) * num_processors, IHK_MC_AP_CRITICAL);
	if (!info) {
		/* The table is filled in right below; do not write through NULL */
		panic("cpu_sysfs_setup:kmalloc failed\n");
	}
	for (cpu = 0; cpu < num_processors; ++cpu) {
		info[cpu].online = 10+cpu;
	}
	fake_cpu_infos = info;

	/* setup sysfs tree */
	for (cpu = 0; cpu < num_processors; ++cpu) {
		/* online */
		error = sysfs_createf(&show_fci_online.ops,
				&fake_cpu_infos[cpu], 0644,
				"/sys/devices/system/cpu/cpu%d/online", cpu);
		if (error) {
			panic("cpu_sysfs_setup:sysfs_createf failed\n");
		}

		/* link to cpu%d */
		error = sysfs_lookupf(&targeth,
				"/sys/devices/system/cpu/cpu%d", cpu);
		if (error) {
			panic("cpu_sysfs_setup:sysfs_lookupf failed\n");
		}

		error = sysfs_symlinkf(targeth, "/sys/bus/cpu/devices/cpu%d",
				cpu);
		if (error) {
			panic("cpu_sysfs_setup:sysfs_symlinkf failed\n");
		}
	}

	return;
} /* cpu_sysfs_setup() */

View File

@ -23,6 +23,7 @@
extern int num_processors;
struct cpu_local_var *clv;
static int cpu_local_var_initialized = 0;
void cpu_local_var_init(void)
{
@ -31,11 +32,24 @@ void cpu_local_var_init(void)
z = sizeof(struct cpu_local_var) * num_processors;
z = (z + PAGE_SIZE - 1) >> PAGE_SHIFT;
clv = allocate_pages(z, IHK_MC_AP_CRITICAL);
clv = ihk_mc_alloc_pages(z, IHK_MC_AP_CRITICAL);
memset(clv, 0, z * PAGE_SIZE);
cpu_local_var_initialized = 1;
}
/* Return the address of entry 'id' in the clv array; id is not range-checked. */
struct cpu_local_var *get_cpu_local_var(int id)
{
	return &clv[id];
}
/*
 * Decrement the no_preempt counter accessed through cpu_local_var().
 * Does nothing until cpu_local_var_init() has set
 * cpu_local_var_initialized.
 * NOTE(review): cpu_local_var() presumably resolves to the calling CPU's
 * entry - confirm against its definition.
 */
void preempt_enable(void)
{
if (cpu_local_var_initialized)
--cpu_local_var(no_preempt);
}
/*
 * Increment the no_preempt counter accessed through cpu_local_var().
 * Does nothing until cpu_local_var_init() has set
 * cpu_local_var_initialized.
 * NOTE(review): cpu_local_var() presumably resolves to the calling CPU's
 * entry - confirm against its definition.
 */
void preempt_disable(void)
{
if (cpu_local_var_initialized)
++cpu_local_var(no_preempt);
}

View File

@ -26,6 +26,7 @@ SECTIONS
. = vsyscall_page + 0x000;
*(.vsyscall.gettimeofday)
*(.vsyscall.gettimeofday.*)
. = vsyscall_page + 0x400;
*(.vsyscall.time)

View File

@ -26,6 +26,7 @@ SECTIONS
. = vsyscall_page + 0x000;
*(.vsyscall.gettimeofday)
*(.vsyscall.gettimeofday.*)
. = vsyscall_page + 0x400;
*(.vsyscall.time)

View File

@ -26,6 +26,7 @@ SECTIONS
. = vsyscall_page + 0x000;
*(.vsyscall.gettimeofday)
*(.vsyscall.gettimeofday.*)
. = vsyscall_page + 0x400;
*(.vsyscall.time)

View File

@ -1,6 +1,5 @@
CC = /usr/linux-k1om-4.7/bin/x86_64-k1om-linux-gcc
LD = /usr/linux-k1om-4.7/bin/x86_64-k1om-linux-ld
CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow
LDFLAGS += -m elf_k1om -T $(SRC)/config/attached-mic.lds
LDFLAGS_MKIMAGE = -m elf_k1om

View File

@ -3,6 +3,5 @@ LD = /usr/linux-k1om-4.7/bin/x86_64-k1om-linux-ld
OBJDUMP = /usr/linux-k1om-4.7/bin/x86_64-k1om-linux-objdump
OBJCOPY = /usr/linux-k1om-4.7/bin/x86_64-k1om-linux-objcopy
CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow
LDFLAGS += -m elf_k1om -T $(SRC)/config/builtin-mic.lds
LDFLAGS_MKIMAGE = -m elf_k1om

View File

@ -1,2 +1 @@
CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow
LDFLAGS += -T $(SRC)/config/builtin-x86.lds

View File

@ -1,2 +1 @@
CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow
LDFLAGS += -T $(SRC)/config/smp-x86.lds

View File

@ -26,6 +26,7 @@ SECTIONS
. = vsyscall_page + 0x000;
*(.vsyscall.gettimeofday)
*(.vsyscall.gettimeofday.*)
. = vsyscall_page + 0x400;
*(.vsyscall.time)
@ -41,9 +42,4 @@ SECTIONS
}
. = ALIGN(4096);
_end = .;
/DISCARD/ : {
*(.eh_frame)
*(.note.gnu.build-id)
}
}

View File

@ -22,13 +22,46 @@ extern int vsnprintf(char *buf, size_t size, const char *fmt, va_list args);
extern int sprintf(char * buf, const char *fmt, ...);
static ihk_spinlock_t kmsg_lock;
static unsigned long kprintf_lock_head(void);
static void kprintf_unlock_head(unsigned long irqflags);
/*
 * Decide whether appending len bytes at kmsg_buf.tail would run into
 * kmsg_buf.head, and react according to kmsg_buf.mode.
 *
 * len:        number of bytes the caller is about to append
 * flags_head: flags returned by kprintf_lock_head(); updated when the
 *             head lock is dropped and re-taken while waiting
 * slide:      set to 1 when the write will overrun head and the caller
 *             has to move head past the new tail afterwards
 *
 * The caller calls this with the head lock held (see kputs()/kprintf()).
 * In mode 1 the function loops, releasing and re-acquiring the head lock
 * on every iteration, until the overrun condition no longer holds. In any
 * other mode it returns immediately with *slide = 1.
 */
static void kprintf_wait(int len, unsigned long *flags_head, int *slide) {
int head, tail, buf_len, mode, adj;
/* mode is sampled once; head, tail and len are re-read every iteration */
mode = kmsg_buf.mode;
while (1) {
adj = 0;
tail = kmsg_buf.tail;
buf_len = kmsg_buf.len;
head = kmsg_buf.head;
/* Unwrap head so it can be compared linearly against tail */
if (head < tail) head += buf_len;
/* The message does not fit before the end of the buffer: count the
   unused bytes up to the end as well (the writers restart at offset 0
   in that case) */
if (tail + len > buf_len) adj = buf_len - tail;
if (head > tail && head <= tail + len + adj) {
/* Advancing tail (producer pointer) by len would
cross head (consumer pointer) in the ring buffer */
if (mode != 1) {
*slide = 1;
break;
} else {
/* Drop and re-take the head lock before checking again */
kprintf_unlock_head(*flags_head);
*flags_head = kprintf_lock_head();
}
} else {
break;
}
}
}
/* TODO: lock */
void kputs(char *buf)
{
int len = strlen(buf);
unsigned long flags;
int slide = 0;
unsigned long flags_tail, flags_head;
flags = ihk_mc_spinlock_lock(&kmsg_lock);
flags_tail = kprintf_lock();
flags_head = kprintf_lock_head();
kprintf_wait(len, &flags_head, &slide);
if (len + kmsg_buf.tail > kmsg_buf.len) {
kmsg_buf.tail = 0;
@ -39,60 +72,57 @@ void kputs(char *buf)
memcpy(kmsg_buf.str + kmsg_buf.tail, buf, len);
kmsg_buf.tail += len;
ihk_mc_spinlock_unlock(&kmsg_lock, flags);
/* When proceeding tail (producer pointer) by len would
cross head (consumer pointer) in ring-buffer, give up
[head, tail] because the range is overwritten */
if (slide == 1) {
kmsg_buf.head = kmsg_buf.tail + 1;
if (kmsg_buf.head >= kmsg_buf.len) kmsg_buf.head = 0;
}
kprintf_unlock_head(flags_head);
kprintf_unlock(flags_tail);
}
#define KPRINTF_LOCAL_BUF_LEN 1024
unsigned long kprintf_lock(void)
{
return ihk_mc_spinlock_lock(&kmsg_lock);
return __ihk_mc_spinlock_lock(&kmsg_lock);
}
void kprintf_unlock(unsigned long irqflags)
{
ihk_mc_spinlock_unlock(&kmsg_lock, irqflags);
__ihk_mc_spinlock_unlock(&kmsg_lock, irqflags);
}
/*
 * Acquire kmsg_buf.lock and return the value handed back by
 * __ihk_mc_spinlock_lock(), to be passed to kprintf_unlock_head().
 */
static unsigned long kprintf_lock_head(void)
{
return __ihk_mc_spinlock_lock(&kmsg_buf.lock);
}
/*
 * Release kmsg_buf.lock; irqflags is the value previously returned by
 * kprintf_lock_head().
 */
static void kprintf_unlock_head(unsigned long irqflags)
{
__ihk_mc_spinlock_unlock(&kmsg_buf.lock, irqflags);
}
/* Caller must hold kmsg_lock! */
int __kprintf(const char *format, ...)
{
int len = 0;
int slide = 0;
va_list va;
unsigned long flags_head;
char buf[KPRINTF_LOCAL_BUF_LEN];
/* Copy into the local buf */
va_start(va, format);
len += vsnprintf(buf + len, KPRINTF_LOCAL_BUF_LEN - len - 2, format, va);
va_end(va);
/* Append to kmsg buffer */
if (kmsg_buf.tail + len > kmsg_buf.len) {
kmsg_buf.tail = 0;
}
memcpy(kmsg_buf.str + kmsg_buf.tail, buf, len);
kmsg_buf.tail += len;
return len;
}
int kprintf(const char *format, ...)
{
int len = 0;
va_list va;
unsigned long flags;
char buf[KPRINTF_LOCAL_BUF_LEN];
flags = ihk_mc_spinlock_lock(&kmsg_lock);
/* Copy into the local buf */
len = sprintf(buf, "[%3d]: ", ihk_mc_get_processor_id());
va_start(va, format);
len += vsnprintf(buf + len, KPRINTF_LOCAL_BUF_LEN - len - 2, format, va);
va_end(va);
flags_head = kprintf_lock_head();
kprintf_wait(len, &flags_head, &slide);
/* Append to kmsg buffer */
if (kmsg_buf.tail + len > kmsg_buf.len) {
kmsg_buf.tail = 0;
@ -100,16 +130,69 @@ int kprintf(const char *format, ...)
memcpy(kmsg_buf.str + kmsg_buf.tail, buf, len);
kmsg_buf.tail += len;
if (slide == 1) {
kmsg_buf.head = kmsg_buf.tail + 1;
if (kmsg_buf.head >= kmsg_buf.len) kmsg_buf.head = 0;
}
ihk_mc_spinlock_unlock(&kmsg_lock, flags);
kprintf_unlock_head(flags_head);
return len;
}
/*
 * Format a message, prefix it with "[<processor id>]: " and append it to
 * the kmsg ring buffer.
 *
 * Locking: takes the lock behind kprintf_lock() first, then the head lock,
 * and releases them in reverse order. Formatting into the local buffer is
 * done before any lock is taken.
 *
 * Returns the length computed from the sprintf()/vsnprintf() results.
 */
int kprintf(const char *format, ...)
{
int len = 0;
int slide = 0;
va_list va;
unsigned long flags_tail, flags_head;
char buf[KPRINTF_LOCAL_BUF_LEN];
/* Copy into the local buf */
len = sprintf(buf, "[%3d]: ", ihk_mc_get_processor_id());
va_start(va, format);
/* NOTE(review): if this vsnprintf() returns the untruncated length, len
   can exceed what was actually stored in buf and the memcpy() below
   would read past it - confirm the return-value semantics */
len += vsnprintf(buf + len, KPRINTF_LOCAL_BUF_LEN - len - 2, format, va);
va_end(va);
flags_tail = kprintf_lock();
flags_head = kprintf_lock_head();
/* May wait for space (mode 1) or request that head be moved (slide) */
kprintf_wait(len, &flags_head, &slide);
/* Append to kmsg buffer */
/* Not enough room before the end of the buffer: restart at offset 0 */
if (kmsg_buf.tail + len > kmsg_buf.len) {
kmsg_buf.tail = 0;
}
memcpy(kmsg_buf.str + kmsg_buf.tail, buf, len);
kmsg_buf.tail += len;
/* The write overran head: move head just past the new tail, wrapping
   to 0 at the end of the buffer */
if (slide == 1) {
kmsg_buf.head = kmsg_buf.tail + 1;
if (kmsg_buf.head >= kmsg_buf.len) kmsg_buf.head = 0;
}
kprintf_unlock_head(flags_head);
kprintf_unlock(flags_tail);
return len;
}
void kmsg_init(void)
/* mode:
0: mcklogd is not running.
When kmsg buffer is full, writer doesn't block
and overwrites the buffer.
1: mcklogd periodically retrieves kmsg.
When kmsg buffer is full, writer blocks until
someone retrieves kmsg.
2: mcklogd periodically retrieves kmsg.
When kmsg buffer is full, writer doesn't block
and overwrites the buffer.
*/
void kmsg_init(int mode)
{
ihk_mc_spinlock_init(&kmsg_lock);
kmsg_buf.tail = 0;
kmsg_buf.len = sizeof(kmsg_buf.str);
kmsg_buf.head = 0;
kmsg_buf.mode = mode;
ihk_mc_spinlock_init(&kmsg_buf.lock);
memset(kmsg_buf.str, 0, kmsg_buf.len);
}

View File

@ -78,51 +78,52 @@ static struct memobj *to_memobj(struct devobj *devobj)
/***********************************************************************
* devobj
*/
int devobj_create(int fd, size_t len, off_t off, struct memobj **objp, int *maxprotp)
int devobj_create(int fd, size_t len, off_t off, struct memobj **objp, int *maxprotp,
int prot, int populate_flags)
{
ihk_mc_user_context_t ctx;
struct pager_map_result result; // XXX: assumes contiguous physical
int error;
struct devobj *obj = NULL;
const size_t npages = (len + PAGE_SIZE - 1) / PAGE_SIZE;
const size_t pfn_npages = (npages / (PAGE_SIZE / sizeof(uintptr_t))) + 1;
dkprintf("devobj_create(%d,%lx,%lx)\n", fd, len, off);
#define MAX_PAGES_IN_DEVOBJ (PAGE_SIZE / sizeof(uintptr_t))
if (npages > MAX_PAGES_IN_DEVOBJ) {
error = -EFBIG;
kprintf("devobj_create(%d,%lx,%lx):too large len. %d\n", fd, len, off, error);
goto out;
}
dkprintf("%s: fd: %d, len: %lu, off: %lu \n", __FUNCTION__, fd, len, off);
obj = kmalloc(sizeof(*obj), IHK_MC_AP_NOWAIT);
if (!obj) {
error = -ENOMEM;
kprintf("devobj_create(%d,%lx,%lx):kmalloc failed. %d\n", fd, len, off, error);
kprintf("%s: error: fd: %d, len: %lu, off: %lu kmalloc failed.\n",
__FUNCTION__, fd, len, off);
goto out;
}
memset(obj, 0, sizeof(*obj));
obj->pfn_table = allocate_pages(1, IHK_MC_AP_NOWAIT);
obj->pfn_table = ihk_mc_alloc_pages(pfn_npages, IHK_MC_AP_NOWAIT);
if (!obj->pfn_table) {
error = -ENOMEM;
kprintf("devobj_create(%d,%lx,%lx):allocate_pages failed. %d\n", fd, len, off, error);
kprintf("%s: error: fd: %d, len: %lu, off: %lu allocating PFN failed.\n",
__FUNCTION__, fd, len, off);
goto out;
}
memset(obj->pfn_table, 0, 1*PAGE_SIZE);
memset(obj->pfn_table, 0, pfn_npages * PAGE_SIZE);
ihk_mc_syscall_arg0(&ctx) = PAGER_REQ_MAP;
ihk_mc_syscall_arg1(&ctx) = fd;
ihk_mc_syscall_arg2(&ctx) = len;
ihk_mc_syscall_arg3(&ctx) = off;
ihk_mc_syscall_arg4(&ctx) = virt_to_phys(&result);
ihk_mc_syscall_arg5(&ctx) = prot | populate_flags;
error = syscall_generic_forwarding(__NR_mmap, &ctx);
if (error) {
kprintf("devobj_create(%d,%lx,%lx):map failed. %d\n", fd, len, off, error);
kprintf("%s: error: fd: %d, len: %lu, off: %lu map failed.\n",
__FUNCTION__, fd, len, off);
goto out;
}
dkprintf("devobj_create:handle: %lx\n", result.handle);
dkprintf("devobj_create:maxprot: %x\n", result.maxprot);
dkprintf("%s: fd: %d, len: %lu, off: %lu, handle: %p, maxprot: %x\n",
__FUNCTION__, fd, len, off, result.handle, result.maxprot);
obj->memobj.ops = &devobj_ops;
obj->memobj.flags = MF_HAS_PAGER;
@ -140,11 +141,12 @@ int devobj_create(int fd, size_t len, off_t off, struct memobj **objp, int *maxp
out:
if (obj) {
if (obj->pfn_table) {
free_pages(obj->pfn_table, 1);
ihk_mc_free_pages(obj->pfn_table, pfn_npages);
}
kfree(obj);
}
dkprintf("devobj_create(%d,%lx,%lx): %d %p %x%d\n", fd, len, off, error, *objp, *maxprotp);
dkprintf("%s: ret: %d, fd: %d, len: %lu, off: %lu, handle: %p, maxprot: %x \n",
__FUNCTION__, error, fd, len, off, result.handle, result.maxprot);
return error;
}
@ -164,6 +166,8 @@ static void devobj_release(struct memobj *memobj)
struct devobj *obj = to_devobj(memobj);
struct devobj *free_obj = NULL;
uintptr_t handle;
const size_t pfn_npages =
(obj->npages / (PAGE_SIZE / sizeof(uintptr_t))) + 1;
dkprintf("devobj_release(%p %lx)\n", obj, obj->handle);
@ -192,7 +196,7 @@ static void devobj_release(struct memobj *memobj)
}
if (obj->pfn_table) {
free_pages(obj->pfn_table, 1);
ihk_mc_free_pages(obj->pfn_table, pfn_npages);
}
kfree(free_obj);
}
@ -204,7 +208,7 @@ static void devobj_release(struct memobj *memobj)
static int devobj_get_page(struct memobj *memobj, off_t off, int p2align, uintptr_t *physp, unsigned long *flag)
{
const off_t pgoff = off >> PAGE_SHIFT;
const off_t pgoff = off / PAGE_SIZE;
struct devobj *obj = to_devobj(memobj);
int error;
uintptr_t pfn;
@ -216,7 +220,7 @@ static int devobj_get_page(struct memobj *memobj, off_t off, int p2align, uintpt
if ((pgoff < obj->pfn_pgoff) || ((obj->pfn_pgoff + obj->npages) <= pgoff)) {
error = -EFBIG;
kprintf("devobj_get_page(%p %lx,%lx,%d): out of range. %d\n", memobj, obj->handle, off, p2align, error);
kprintf("%s: error: out of range: off: %lu, page off: %lu obj->npages: %d\n", __FUNCTION__, off, pgoff, obj->npages);
goto out;
}
ix = pgoff - obj->pfn_pgoff;

View File

@ -47,6 +47,7 @@ static memobj_get_page_func_t fileobj_get_page;
static memobj_copy_page_func_t fileobj_copy_page;
static memobj_flush_page_func_t fileobj_flush_page;
static memobj_invalidate_page_func_t fileobj_invalidate_page;
static memobj_lookup_page_func_t fileobj_lookup_page;
static struct memobj_ops fileobj_ops = {
.release = &fileobj_release,
@ -55,6 +56,7 @@ static struct memobj_ops fileobj_ops = {
.copy_page = &fileobj_copy_page,
.flush_page = &fileobj_flush_page,
.invalidate_page = &fileobj_invalidate_page,
.lookup_page = &fileobj_lookup_page,
};
static struct fileobj *to_fileobj(struct memobj *memobj)
@ -180,7 +182,7 @@ int fileobj_create(int fd, struct memobj **objp, int *maxprotp)
error = syscall_generic_forwarding(__NR_mmap, &ctx);
if (error) {
kprintf("fileobj_create(%d):create failed. %d\n", fd, error);
dkprintf("fileobj_create(%d):create failed. %d\n", fd, error);
goto out;
}
@ -256,13 +258,24 @@ static void fileobj_release(struct memobj *memobj)
/* zap page_list */
for (;;) {
struct page *page;
int count;
void *page_va;
page = page_list_first(obj);
if (!page) {
break;
}
page_list_remove(obj, page);
page_va = phys_to_virt(page_to_phys(page));
if (ihk_atomic_read(&page->count) != 1) {
kprintf("%s: WARNING: page count for phys 0x%lx is invalid\n",
__FUNCTION__, page->phys);
}
if (page_unmap(page)) {
ihk_mc_free_pages(page_va, 1);
}
#if 0
count = ihk_atomic_sub_return(1, &page->count);
if (!((page->mode == PM_WILL_PAGEIO)
@ -279,7 +292,7 @@ static void fileobj_release(struct memobj *memobj)
}
page->mode = PM_NONE;
free_pages(phys_to_virt(page_to_phys(page)), 1);
#endif
}
obj_list_remove(free_obj);
ihk_mc_spinlock_unlock_noirq(&fileobj_list_lock);
@ -387,7 +400,7 @@ out:
static int fileobj_get_page(struct memobj *memobj, off_t off, int p2align, uintptr_t *physp, unsigned long *pflag)
{
struct process *proc = cpu_local_var(current);
struct thread *proc = cpu_local_var(current);
struct fileobj *obj = to_fileobj(memobj);
int error;
void *virt = NULL;
@ -428,7 +441,7 @@ static int fileobj_get_page(struct memobj *memobj, off_t off, int p2align, uintp
goto out;
}
phys = virt_to_phys(virt);
page = phys_to_page(phys);
page = phys_to_page_insert_hash(phys);
if (page->mode != PM_NONE) {
panic("fileobj_get_page:invalid new page");
}
@ -500,10 +513,10 @@ static uintptr_t fileobj_copy_page(
memobj_lock(memobj);
for (;;) {
if (orgpage->mode != PM_MAPPED) {
if (!orgpage || orgpage->mode != PM_MAPPED) {
kprintf("fileobj_copy_page(%p,%lx,%d):"
"invalid cow page. %x\n",
memobj, orgpa, p2align, orgpage->mode);
memobj, orgpa, p2align, orgpage ? orgpage->mode : 0);
panic("fileobj_copy_page:invalid cow page");
}
count = ihk_atomic_read(&orgpage->count);
@ -525,7 +538,9 @@ static uintptr_t fileobj_copy_page(
memcpy(newkva, orgkva, pgsize);
ihk_atomic_dec(&orgpage->count);
newpa = virt_to_phys(newkva);
page_map(phys_to_page(newpa));
if (phys_to_page(newpa)) {
page_map(phys_to_page(newpa));
}
newkva = NULL; /* avoid ihk_mc_free_pages() */
break;
}
@ -561,6 +576,11 @@ static int fileobj_flush_page(struct memobj *memobj, uintptr_t phys,
ssize_t ss;
page = phys_to_page(phys);
if (!page) {
kprintf("%s: warning: tried to flush non-existing page for phys addr: 0x%lx\n",
__FUNCTION__, phys);
return 0;
}
memobj_unlock(&obj->memobj);
ihk_mc_syscall_arg0(&ctx) = PAGER_REQ_WRITE;
@ -609,3 +629,37 @@ out:
memobj, phys, pgsize, error);
return error;
}
/*
 * Look up the page registered for offset `off` in a file-backed memory object
 * without creating it.
 *
 * The memobj lock is held for the duration of the lookup.
 *
 * Returns 0 and stores the page's physical address through `physp` (when
 * `physp` is non-NULL) if page_list_lookup() finds a page; -ENOMEM if
 * `p2align` is not PAGE_P2ALIGN; -ENOENT if no page is found.
 * `pflag` is accepted for signature compatibility and is not written.
 */
static int fileobj_lookup_page(struct memobj *memobj, off_t off, int p2align, uintptr_t *physp, unsigned long *pflag)
{
	struct fileobj *obj = to_fileobj(memobj);
	struct page *page;
	uintptr_t phys = -1;	/* stays -1 on every failure path (debug output only) */
	int error = 0;

	dkprintf("fileobj_lookup_page(%p,%lx,%x,%p)\n", obj, off, p2align, physp);

	memobj_lock(&obj->memobj);

	if (p2align != PAGE_P2ALIGN) {
		/* Only base-page alignment is handled here. */
		error = -ENOMEM;
	}
	else {
		page = page_list_lookup(obj, off);
		if (page) {
			phys = page_to_phys(page);
			if (physp) {
				*physp = phys;
			}
		}
		else {
			error = -ENOENT;
			dkprintf("fileobj_lookup_page(%p,%lx,%x,%p): page not found. %d\n", obj, off, p2align, physp, error);
		}
	}

	memobj_unlock(&obj->memobj);

	dkprintf("fileobj_lookup_page(%p,%lx,%x,%p): %d %lx\n",
			obj, off, p2align, physp, error, phys);
	return error;
}

View File

@ -103,7 +103,7 @@ int futex_cmpxchg_enabled;
struct futex_q {
struct plist_node list;
struct process *task;
struct thread *task;
ihk_spinlock_t *lock_ptr;
union futex_key key;
union futex_key *requeue_pi_key;
@ -151,7 +151,7 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
*/
static void get_futex_key_refs(union futex_key *key)
{
/* RIKEN: only !fshared futexes... */
/* RIKEN: no swapping in McKernel */
return;
}
@ -161,7 +161,7 @@ static void get_futex_key_refs(union futex_key *key)
*/
static void drop_futex_key_refs(union futex_key *key)
{
/* RIKEN: only !fshared futexes... */
/* RIKEN: no swapping in McKernel */
return;
}
/**
@ -183,6 +183,7 @@ static int
get_futex_key(uint32_t *uaddr, int fshared, union futex_key *key)
{
unsigned long address = (unsigned long)uaddr;
unsigned long phys;
struct process_vm *mm = cpu_local_var(current)->vm;
/*
@ -201,15 +202,31 @@ get_futex_key(uint32_t *uaddr, int fshared, union futex_key *key)
* but access_ok() should be faster than find_vma()
*/
if (!fshared) {
key->private.mm = mm;
key->private.address = address;
get_futex_key_refs(key);
return 0;
}
/* RIKEN: No shared futex support... */
return -EFAULT;
key->both.offset |= FUT_OFF_MMSHARED;
retry_v2p:
/* Just use physical address of page, McKernel does not do swapping */
if (ihk_mc_pt_virt_to_phys(mm->address_space->page_table,
(void *)uaddr, &phys)) {
/* Check if we can fault in page */
if (page_fault_process_vm(mm, uaddr, PF_POPULATE | PF_WRITE | PF_USER)) {
kprintf("error: get_futex_key() virt to phys translation failed\n");
return -EFAULT;
}
goto retry_v2p;
}
key->shared.phys = (void *)phys;
key->shared.pgoff = 0;
return 0;
}
@ -232,7 +249,7 @@ static int cmpxchg_futex_value_locked(uint32_t __user *uaddr, uint32_t uval, uin
static int get_futex_value_locked(uint32_t *dest, uint32_t *from)
{
/* RIKEN: futexes are always on not swappable pages */
*dest = *from;
*dest = getint_user((int *)from);
return 0;
}
@ -243,7 +260,7 @@ static int get_futex_value_locked(uint32_t *dest, uint32_t *from)
*/
static void wake_futex(struct futex_q *q)
{
struct process *p = q->task;
struct thread *p = q->task;
/*
* We set q->lock_ptr = NULL _before_ we wake up the task. If
@ -263,7 +280,8 @@ static void wake_futex(struct futex_q *q)
barrier();
q->lock_ptr = NULL;
sched_wakeup_process(p, PS_NORMAL);
dkprintf("wake_futex(): waking up tid %d\n", p->tid);
sched_wakeup_thread(p, PS_NORMAL);
}
/*
@ -658,23 +676,27 @@ static uint64_t futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q
* queue_me() calls spin_unlock() upon completion, both serializing
* access to the hash list and forcing another memory barrier.
*/
xchg4(&(cpu_local_var(current)->ftn->status), PS_INTERRUPTIBLE);
xchg4(&(cpu_local_var(current)->status), PS_INTERRUPTIBLE);
queue_me(q, hb);
if (!plist_node_empty(&q->list)) {
/* RIKEN: use mcos timers */
if (timeout) {
dkprintf("futex_wait_queue_me(): tid: %d schedule_timeout()\n", cpu_local_var(current)->tid);
time_remain = schedule_timeout(timeout);
}
else {
dkprintf("futex_wait_queue_me(): tid: %d schedule()\n", cpu_local_var(current)->tid);
schedule();
time_remain = 0;
}
dkprintf("futex_wait_queue_me(): tid: %d woken up\n", cpu_local_var(current)->tid);
}
/* This does not need to be serialized */
cpu_local_var(current)->ftn->status = PS_RUNNING;
cpu_local_var(current)->status = PS_RUNNING;
return time_remain;
}
@ -775,6 +797,11 @@ retry:
if (timeout && !time_remain)
goto out_put_key;
if (hassigpending(cpu_local_var(current))) {
ret = -EINTR;
goto out_put_key;
}
/* RIKEN: no signals */
put_futex_key(fshared, &q.key);
goto retry;
@ -786,17 +813,10 @@ out:
}
int futex(uint32_t *uaddr, int op, uint32_t val, uint64_t timeout,
uint32_t *uaddr2, uint32_t val2, uint32_t val3)
uint32_t *uaddr2, uint32_t val2, uint32_t val3, int fshared)
{
int clockrt, ret = -ENOSYS;
int cmd = op & FUTEX_CMD_MASK;
int fshared = 0;
/* RIKEN: Assume address space private futexes.
if (!(op & FUTEX_PRIVATE_FLAG)) {
fshared = 1;
}
*/
clockrt = op & FUTEX_CLOCK_REALTIME;
if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
@ -817,8 +837,7 @@ int futex(uint32_t *uaddr, int op, uint32_t val, uint64_t timeout,
ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0);
break;
case FUTEX_CMP_REQUEUE:
ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3,
0);
ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, 0);
break;
case FUTEX_WAKE_OP:
ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3);

View File

@ -30,6 +30,7 @@
#include <mman.h>
#include <init.h>
#include <kmalloc.h>
#include <sysfs.h>
//#define DEBUG_PRINT_HOST
@ -39,11 +40,11 @@
#define dkprintf(...) do { if (0) kprintf(__VA_ARGS__); } while (0)
#endif
void check_mapping_for_proc(struct process *proc, unsigned long addr)
void check_mapping_for_proc(struct thread *thread, unsigned long addr)
{
unsigned long __phys;
if (ihk_mc_pt_virt_to_phys(proc->vm->page_table, (void*)addr, &__phys)) {
if (ihk_mc_pt_virt_to_phys(thread->vm->address_space->page_table, (void*)addr, &__phys)) {
kprintf("check_map: no mapping for 0x%lX\n", addr);
}
else {
@ -60,7 +61,7 @@ void check_mapping_for_proc(struct process *proc, unsigned long addr)
* NOTE: if args, args_len, envs, envs_len are zero,
* the function constructs them based on the descriptor
*/
int prepare_process_ranges_args_envs(struct process *proc,
int prepare_process_ranges_args_envs(struct thread *thread,
struct program_load_desc *pn,
struct program_load_desc *p,
enum ihk_mc_pt_attribute attr,
@ -80,15 +81,21 @@ int prepare_process_ranges_args_envs(struct process *proc,
unsigned long flags;
uintptr_t interp_obase = -1;
uintptr_t interp_nbase = -1;
size_t map_size;
struct process *proc = thread->proc;
struct process_vm *vm = proc->vm;
struct address_space *as = vm->address_space;
long aout_base;
int error;
n = p->num_sections;
aout_base = (pn->reloc)? vm->region.map_end: 0;
for (i = 0; i < n; i++) {
if (pn->sections[i].interp && (interp_nbase == (uintptr_t)-1)) {
interp_obase = pn->sections[i].vaddr;
interp_obase -= (interp_obase % pn->interp_align);
interp_nbase = proc->vm->region.map_start;
interp_nbase = vm->region.map_end;
interp_nbase = (interp_nbase + pn->interp_align - 1)
& ~(pn->interp_align - 1);
}
@ -98,6 +105,10 @@ int prepare_process_ranges_args_envs(struct process *proc,
pn->sections[i].vaddr += interp_nbase;
p->sections[i].vaddr = pn->sections[i].vaddr;
}
else{
pn->sections[i].vaddr += aout_base;
p->sections[i].vaddr = pn->sections[i].vaddr;
}
s = (pn->sections[i].vaddr) & PAGE_MASK;
e = (pn->sections[i].vaddr + pn->sections[i].len
+ PAGE_SIZE - 1) & PAGE_MASK;
@ -113,7 +124,8 @@ int prepare_process_ranges_args_envs(struct process *proc,
}
up = virt_to_phys(up_v);
if (add_process_memory_range(proc, s, e, up, flags, NULL, 0) != 0) {
if (add_process_memory_range(vm, s, e, up, flags, NULL, 0,
PAGE_SHIFT) != 0) {
ihk_mc_free_pages(up_v, range_npages);
kprintf("ERROR: adding memory range for ELF section %i\n", i);
goto err;
@ -122,14 +134,14 @@ int prepare_process_ranges_args_envs(struct process *proc,
{
void *_virt = (void *)s;
unsigned long _phys;
if (ihk_mc_pt_virt_to_phys(proc->vm->page_table,
if (ihk_mc_pt_virt_to_phys(as->page_table,
_virt, &_phys)) {
kprintf("ERROR: no mapping for 0x%lX\n", _virt);
}
for (_virt = (void *)s + PAGE_SIZE;
(unsigned long)_virt < e; _virt += PAGE_SIZE) {
unsigned long __phys;
if (ihk_mc_pt_virt_to_phys(proc->vm->page_table,
if (ihk_mc_pt_virt_to_phys(as->page_table,
_virt, &__phys)) {
kprintf("ERROR: no mapping for 0x%lX\n", _virt);
panic("mapping");
@ -148,23 +160,27 @@ int prepare_process_ranges_args_envs(struct process *proc,
/* TODO: Maybe we need flag */
if (pn->sections[i].interp) {
proc->vm->region.map_end = e;
vm->region.map_end = e;
}
else if (i == 0) {
proc->vm->region.text_start = s;
proc->vm->region.text_end = e;
vm->region.text_start = s;
vm->region.text_end = e;
}
else if (i == 1) {
proc->vm->region.data_start = s;
proc->vm->region.data_end = e;
vm->region.data_start = s;
vm->region.data_end = e;
}
else {
proc->vm->region.data_start =
(s < proc->vm->region.data_start ?
s : proc->vm->region.data_start);
proc->vm->region.data_end =
(e > proc->vm->region.data_end ?
e : proc->vm->region.data_end);
vm->region.data_start =
(s < vm->region.data_start ?
s : vm->region.data_start);
vm->region.data_end =
(e > vm->region.data_end ?
e : vm->region.data_end);
}
if (aout_base) {
vm->region.map_end = e;
}
}
@ -172,17 +188,22 @@ int prepare_process_ranges_args_envs(struct process *proc,
pn->entry -= interp_obase;
pn->entry += interp_nbase;
p->entry = pn->entry;
ihk_mc_modify_user_context(proc->uctx, IHK_UCR_PROGRAM_COUNTER,
pn->entry);
ihk_mc_modify_user_context(thread->uctx,
IHK_UCR_PROGRAM_COUNTER,
pn->entry);
}
proc->vm->region.brk_start = proc->vm->region.brk_end =
proc->vm->region.data_end;
if (aout_base) {
pn->at_phdr += aout_base;
pn->at_entry += aout_base;
}
vm->region.brk_start = vm->region.brk_end = vm->region.data_end;
/* Map, copy and update args and envs */
flags = VR_PROT_READ | VR_PROT_WRITE;
flags |= VRFLAG_PROT_TO_MAXPROT(flags);
addr = proc->vm->region.map_start - PAGE_SIZE * SCD_RESERVED_COUNT;
addr = vm->region.map_start - PAGE_SIZE * SCD_RESERVED_COUNT;
e = addr + PAGE_SIZE * ARGENV_PAGE_COUNT;
if((args_envs = ihk_mc_alloc_pages(ARGENV_PAGE_COUNT, IHK_MC_AP_NOWAIT)) == NULL){
@ -191,8 +212,8 @@ int prepare_process_ranges_args_envs(struct process *proc,
}
args_envs_p = virt_to_phys(args_envs);
if(add_process_memory_range(proc, addr, e, args_envs_p,
flags, NULL, 0) != 0){
if(add_process_memory_range(vm, addr, e, args_envs_p,
flags, NULL, 0, PAGE_SHIFT) != 0){
ihk_mc_free_pages(args_envs, ARGENV_PAGE_COUNT);
kprintf("ERROR: adding memory range for args/envs\n");
goto err;
@ -205,7 +226,8 @@ int prepare_process_ranges_args_envs(struct process *proc,
/* Only map remote address if it wasn't specified as an argument */
if (!args) {
// Map in remote physical addr of args and copy it
args_envs_npages = (p->args_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
map_size = ((uintptr_t)p->args & (PAGE_SIZE - 1)) + p->args_len;
args_envs_npages = (map_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
dkprintf("args_envs_npages: %d\n", args_envs_npages);
args_envs_rp = ihk_mc_map_memory(NULL,
(unsigned long)p->args, p->args_len);
@ -222,9 +244,9 @@ int prepare_process_ranges_args_envs(struct process *proc,
p->args_len = args_len;
}
dkprintf("args copy, nr: %d\n", *((int*)args_envs_r));
dkprintf("args copy, nr: %d\n", *((long *)args_envs_r));
memcpy_long(args_envs, args_envs_r, p->args_len + 8);
memcpy_long(args_envs, args_envs_r, p->args_len + sizeof(long) - 1);
/* Only unmap remote address if it wasn't specified as an argument */
if (!args) {
@ -238,7 +260,8 @@ int prepare_process_ranges_args_envs(struct process *proc,
/* Only map remote address if it wasn't specified as an argument */
if (!envs) {
// Map in remote physical addr of envs and copy it after args
args_envs_npages = (p->envs_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
map_size = ((uintptr_t)p->envs & (PAGE_SIZE - 1)) + p->envs_len;
args_envs_npages = (map_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
dkprintf("args_envs_npages: %d\n", args_envs_npages);
args_envs_rp = ihk_mc_map_memory(NULL, (unsigned long)p->envs,
p->envs_len);
@ -256,9 +279,9 @@ int prepare_process_ranges_args_envs(struct process *proc,
p->envs_len = envs_len;
}
dkprintf("envs copy, nr: %d\n", *((int*)args_envs_r));
dkprintf("envs copy, nr: %d\n", *((long *)args_envs_r));
memcpy_long(args_envs + p->args_len, args_envs_r, p->envs_len + 8);
memcpy_long(args_envs + p->args_len, args_envs_r, p->envs_len + sizeof(long) - 1);
/* Only unmap remote address if it wasn't specified as an argument */
if (!envs) {
@ -268,10 +291,10 @@ int prepare_process_ranges_args_envs(struct process *proc,
flush_tlb();
// Update variables
argc = *((int*)(args_envs));
argc = *((long *)(args_envs));
dkprintf("argc: %d\n", argc);
argv = (char **)(args_envs + (sizeof(int)));
argv = (char **)(args_envs + (sizeof(long)));
if(proc->saved_cmdline){
kfree(proc->saved_cmdline);
proc->saved_cmdline_len = 0;
@ -288,24 +311,35 @@ int prepare_process_ranges_args_envs(struct process *proc,
*a = (char *)addr + (unsigned long)*a; // Process' address space!
}
envc = *((int*)(args_envs + p->args_len));
envc = *((long *)(args_envs + p->args_len));
dkprintf("envc: %d\n", envc);
env = (char **)(args_envs + p->args_len + sizeof(int));
env = (char **)(args_envs + p->args_len + sizeof(long));
while (*env) {
char **_env = env;
//dkprintf("%s\n", args_envs + p->args_len + (unsigned long)*env);
*env = (char *)addr + p->args_len + (unsigned long)*env;
env = ++_env;
}
env = (char **)(args_envs + p->args_len + sizeof(int));
env = (char **)(args_envs + p->args_len + sizeof(long));
dkprintf("env OK\n");
p->rprocess = (unsigned long)proc;
p->rpgtable = virt_to_phys(proc->vm->page_table);
if (pn->enable_vdso) {
error = arch_map_vdso(vm);
if (error) {
kprintf("ERROR: mapping vdso pages. %d\n", error);
goto err;
}
}
else {
vm->vdso_addr = NULL;
}
if (init_process_stack(proc, pn, argc, argv, envc, env) != 0) {
p->rprocess = (unsigned long)thread;
p->rpgtable = virt_to_phys(as->page_table);
if (init_process_stack(thread, pn, argc, argv, envc, env) != 0) {
goto err;
}
@ -324,7 +358,9 @@ static int process_msg_prepare_process(unsigned long rphys)
unsigned long phys, sz;
struct program_load_desc *p, *pn;
int npages, n;
struct thread *thread;
struct process *proc;
struct process_vm *vm;
enum ihk_mc_pt_attribute attr;
attr = PTATTR_NO_EXECUTE | PTATTR_WRITABLE | PTATTR_FOR_USER;
@ -340,10 +376,16 @@ static int process_msg_prepare_process(unsigned long rphys)
}
n = p->num_sections;
if (n > 16) {
kprintf("%s: ERROR: more ELF sections than 16??\n",
__FUNCTION__);
return -ENOMEM;
}
dkprintf("# of sections: %d\n", n);
if((pn = ihk_mc_allocate(sizeof(struct program_load_desc)
+ sizeof(struct program_image_section) * n, IHK_MC_AP_NOWAIT)) == NULL){
if((pn = kmalloc(sizeof(struct program_load_desc)
+ sizeof(struct program_image_section) * n,
IHK_MC_AP_NOWAIT)) == NULL){
ihk_mc_unmap_virtual(p, npages, 0);
ihk_mc_unmap_memory(NULL, phys, sz);
return -ENOMEM;
@ -351,43 +393,58 @@ static int process_msg_prepare_process(unsigned long rphys)
memcpy_long(pn, p, sizeof(struct program_load_desc)
+ sizeof(struct program_image_section) * n);
if((proc = create_process(p->entry)) == NULL){
ihk_mc_free(pn);
if((thread = create_thread(p->entry)) == NULL){
kfree(pn);
ihk_mc_unmap_virtual(p, npages, 1);
ihk_mc_unmap_memory(NULL, phys, sz);
return -ENOMEM;
}
proc->ftn->pid = pn->pid;
proc->ftn->pgid = pn->pgid;
proc = thread->proc;
vm = thread->vm;
proc->ftn->ruid = pn->cred[0];
proc->ftn->euid = pn->cred[1];
proc->ftn->suid = pn->cred[2];
proc->ftn->fsuid = pn->cred[3];
proc->ftn->rgid = pn->cred[4];
proc->ftn->egid = pn->cred[5];
proc->ftn->sgid = pn->cred[6];
proc->ftn->fsgid = pn->cred[7];
proc->pid = pn->pid;
proc->vm->address_space->pids[0] = pn->pid;
proc->pgid = pn->pgid;
proc->ruid = pn->cred[0];
proc->euid = pn->cred[1];
proc->suid = pn->cred[2];
proc->fsuid = pn->cred[3];
proc->rgid = pn->cred[4];
proc->egid = pn->cred[5];
proc->sgid = pn->cred[6];
proc->fsgid = pn->cred[7];
proc->termsig = SIGCHLD;
proc->vm->region.user_start = pn->user_start;
proc->vm->region.user_end = pn->user_end;
proc->vm->region.map_start = (USER_END / 3) & LARGE_PAGE_MASK;
proc->vm->region.map_end = proc->vm->region.map_start;
vm->region.user_start = pn->user_start;
vm->region.user_end = pn->user_end;
if(vm->region.user_end > USER_END)
vm->region.user_end = USER_END;
if(vm->region.user_start != 0UL ||
vm->region.user_end < TASK_UNMAPPED_BASE){
vm->region.map_start =
(vm->region.user_start +
(vm->region.user_end - vm->region.user_start) / 3) &
LARGE_PAGE_MASK;
}
else{
vm->region.map_start = TASK_UNMAPPED_BASE;
}
vm->region.map_end = vm->region.map_start;
memcpy(proc->rlimit, pn->rlimit, sizeof(struct rlimit) * MCK_RLIM_MAX);
/* TODO: Clear it at the proper timing */
cpu_local_var(scp).post_idx = 0;
if (prepare_process_ranges_args_envs(proc, pn, p, attr,
if (prepare_process_ranges_args_envs(thread, pn, p, attr,
NULL, 0, NULL, 0) != 0) {
kprintf("error: preparing process ranges, args, envs, stack\n");
goto err;
}
dkprintf("new process : %p [%d] / table : %p\n", proc, proc->ftn->pid,
proc->vm->page_table);
dkprintf("new process : %p [%d] / table : %p\n", proc, proc->pid,
vm->address_space->page_table);
ihk_mc_free(pn);
kfree(pn);
ihk_mc_unmap_virtual(p, npages, 1);
ihk_mc_unmap_memory(NULL, phys, sz);
@ -395,17 +452,16 @@ static int process_msg_prepare_process(unsigned long rphys)
return 0;
err:
ihk_mc_free(pn);
kfree(pn);
ihk_mc_unmap_virtual(p, npages, 1);
ihk_mc_unmap_memory(NULL, phys, sz);
free_process_memory(proc);
destroy_process(proc);
destroy_thread(thread);
return -ENOMEM;
}
static void process_msg_init(struct ikc_scd_init_param *pcp, struct syscall_params *lparam)
{
lparam->response_va = allocate_pages(RESPONSE_PAGE_COUNT, 0);
lparam->response_va = ihk_mc_alloc_pages(RESPONSE_PAGE_COUNT, 0);
lparam->response_pa = virt_to_phys(lparam->response_va);
pcp->request_page = 0;
@ -415,7 +471,7 @@ static void process_msg_init(struct ikc_scd_init_param *pcp, struct syscall_para
static void process_msg_init_acked(struct ihk_ikc_channel_desc *c, unsigned long pphys)
{
struct ikc_scd_init_param *param = (void *)pphys;
struct ikc_scd_init_param *param = phys_to_virt(pphys);
struct syscall_params *lparam;
enum ihk_mc_pt_attribute attr;
@ -473,13 +529,8 @@ static void syscall_channel_send(struct ihk_ikc_channel_desc *c,
ihk_ikc_send(c, packet, 0);
}
extern unsigned long do_kill(int, int, int, struct siginfo *, int ptracecont);
extern void settid(struct process *proc, int mode, int newcpuid, int oldcpuid);
extern unsigned long do_kill(struct thread *, int, int, int, struct siginfo *, int ptracecont);
extern void process_procfs_request(unsigned long rarg);
extern int memcheckall();
extern int freecheck(int runcount);
extern int runcount;
extern void terminate_host(int pid);
extern void debug_log(long);
@ -489,6 +540,8 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
struct ikc_scd_packet *packet = __packet;
struct ikc_scd_packet pckt;
int rc;
struct mcs_rwlock_node_irqsave lock;
struct thread *thread;
struct process *proc;
struct mcctrl_signal {
int cond;
@ -499,22 +552,17 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
} *sp, info;
unsigned long pp;
int cpuid;
int ret = 0;
switch (packet->msg) {
case SCD_MSG_INIT_CHANNEL_ACKED:
dkprintf("SCD_MSG_INIT_CHANNEL_ACKED\n");
process_msg_init_acked(c, packet->arg);
return 0;
ret = 0;
break;
case SCD_MSG_PREPARE_PROCESS:
if (find_command_line("memdebug")) {
memcheckall();
if (runcount)
freecheck(runcount);
runcount++;
}
if((rc = process_msg_prepare_process(packet->arg)) == 0){
pckt.msg = SCD_MSG_PREPARE_PROCESS_ACKED;
pckt.err = 0;
@ -527,23 +575,50 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
pckt.arg = packet->arg;
syscall_channel_send(c, &pckt);
return 0;
ret = 0;
break;
case SCD_MSG_SCHEDULE_PROCESS:
cpuid = obtain_clone_cpuid();
if(cpuid == -1){
kprintf("No CPU available\n");
return -1;
ret = -1;
break;
}
dkprintf("SCD_MSG_SCHEDULE_PROCESS: %lx\n", packet->arg);
proc = (struct process *)packet->arg;
settid(proc, 0, cpuid, -1);
proc->ftn->status = PS_RUNNING;
runq_add_proc(proc, cpuid);
thread = (struct thread *)packet->arg;
proc = thread->proc;
thread->tid = proc->pid;
proc->status = PS_RUNNING;
thread->status = PS_RUNNING;
chain_thread(thread);
chain_process(proc);
runq_add_thread(thread, cpuid);
//cpu_local_var(next) = (struct process *)packet->arg;
return 0;
//cpu_local_var(next) = (struct thread *)packet->arg;
ret = 0;
break;
/*
* Used for syscall offload reply message to explicitly schedule in
* the waiting thread
*/
case SCD_MSG_WAKE_UP_SYSCALL_THREAD:
thread = find_thread(0, packet->ttid, &lock);
if (!thread) {
kprintf("%s: WARNING: no thread for SCD reply? TID: %d\n",
__FUNCTION__, packet->ttid);
ret = -EINVAL;
break;
}
thread_unlock(thread, &lock);
dkprintf("%s: SCD_MSG_WAKE_UP_SYSCALL_THREAD: waking up tid %d\n",
__FUNCTION__, packet->ttid);
waitq_wakeup(&thread->scd_wq);
ret = 0;
break;
case SCD_MSG_SEND_SIGNAL:
pp = ihk_mc_map_memory(NULL, packet->arg, sizeof(struct mcctrl_signal));
sp = (struct mcctrl_signal *)ihk_mc_map_virtual(pp, 1, PTATTR_WRITABLE | PTATTR_ACTIVE);
@ -556,22 +631,49 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
pckt.arg = packet->arg;
syscall_channel_send(c, &pckt);
rc = do_kill(info.pid, info.tid, info.sig, &info.info, 0);
rc = do_kill(NULL, info.pid, info.tid, info.sig, &info.info, 0);
kprintf("SCD_MSG_SEND_SIGNAL: do_kill(pid=%d, tid=%d, sig=%d)=%d\n", info.pid, info.tid, info.sig, rc);
return 0;
ret = 0;
break;
case SCD_MSG_PROCFS_REQUEST:
process_procfs_request(packet->arg);
return 0;
ret = 0;
break;
case SCD_MSG_CLEANUP_PROCESS:
dkprintf("SCD_MSG_CLEANUP_PROCESS pid=%d\n", packet->pid);
terminate_host(packet->pid);
return 0;
ret = 0;
break;
case SCD_MSG_DEBUG_LOG:
dkprintf("SCD_MSG_DEBUG_LOG code=%lx\n", packet->arg);
debug_log(packet->arg);
return 0;
ret = 0;
break;
case SCD_MSG_SYSFS_REQ_SHOW:
case SCD_MSG_SYSFS_REQ_STORE:
case SCD_MSG_SYSFS_REQ_RELEASE:
sysfss_packet_handler(c, packet->msg, packet->err,
packet->sysfs_arg1, packet->sysfs_arg2,
packet->sysfs_arg3);
ret = 0;
break;
default:
kprintf("syscall_pakcet_handler:unknown message "
"(%d.%d.%d.%d.%d.%#lx)\n",
packet->msg, packet->ref, packet->osnum,
packet->pid, packet->err, packet->arg);
ret = 0;
break;
}
return 0;
ihk_ikc_release_packet((struct ihk_ikc_free_packet *)packet, c);
return ret;
}
void init_host_syscall_channel(void)

View File

@ -1,6 +1,8 @@
#ifndef _LINUX_AUXVEC_H
#define _LINUX_AUXVEC_H
#include <arch/auxvec.h>
/* Symbolic values for the entries in the auxiliary table
put on the initial stack */
#define AT_NULL 0 /* end of vector */

View File

@ -19,11 +19,13 @@
* CPU Local Storage (cls)
*/
struct malloc_header {
unsigned int check;
struct kmalloc_header {
unsigned int front_magic;
unsigned int cpu_id;
struct malloc_header *next;
unsigned long size;
struct list_head list;
int size; /* The size of this chunk without the header */
unsigned int end_magic;
/* 32 bytes */
};
#include <ihk/lock.h>
@ -38,15 +40,18 @@ extern ihk_spinlock_t cpu_status_lock;
struct cpu_local_var {
/* malloc */
struct malloc_header free_list;
ihk_spinlock_t free_list_lock;
struct list_head free_list;
struct list_head remote_free_list;
ihk_spinlock_t remote_free_list_lock;
struct process idle;
struct fork_tree_node idle_ftn;
struct thread idle;
struct process idle_proc;
struct process_vm idle_vm;
struct address_space idle_asp;
ihk_spinlock_t runq_lock;
struct process *current;
unsigned long runq_irqstate;
struct thread *current;
struct list_head runq;
size_t runq_len;
@ -57,6 +62,7 @@ struct cpu_local_var {
struct ihk_ikc_channel_desc *syscall_channel2;
struct syscall_params scp2;
struct ikc_scd_init_param iip2;
struct resource_set *resource_set;
int status;
int fs;
@ -67,6 +73,10 @@ struct cpu_local_var {
ihk_spinlock_t migq_lock;
struct list_head migq;
int in_interrupt;
int no_preempt;
int timer_enabled;
int kmalloc_initialized;
} __attribute__((aligned(64)));

View File

@ -99,6 +99,8 @@
#ifdef __KERNEL__
#define __user
/* We don't deal with uaccess at the moment, because x86 can access
* userspace directly, we rely on glibc and the app developers.
*/
@ -106,42 +108,14 @@
#include <arch/uaccess.h>
#endif
#include <asm.h>
#include <errno.h>
#define __user
#include <arch-futex.h>
#if 0
#include <arch/processor.h>
#include <arch/system.h>
#endif
#define __futex_atomic_op1(insn, ret, oldval, uaddr, oparg) \
asm volatile("1:\t" insn "\n" \
"2:\t.section .fixup,\"ax\"\n" \
"3:\tmov\t%3, %1\n" \
"\tjmp\t2b\n" \
"\t.previous\n" \
_ASM_EXTABLE(1b, 3b) \
: "=r" (oldval), "=r" (ret), "+m" (*uaddr) \
: "i" (-EFAULT), "0" (oparg), "1" (0))
#define __futex_atomic_op2(insn, ret, oldval, uaddr, oparg) \
asm volatile("1:\tmovl %2, %0\n" \
"\tmovl\t%0, %3\n" \
"\t" insn "\n" \
"2:\tlock; cmpxchgl %3, %2\n" \
"\tjnz\t1b\n" \
"3:\t.section .fixup,\"ax\"\n" \
"4:\tmov\t%5, %1\n" \
"\tjmp\t3b\n" \
"\t.previous\n" \
_ASM_EXTABLE(1b, 4b) \
_ASM_EXTABLE(2b, 4b) \
: "=&a" (oldval), "=&r" (ret), \
"+m" (*uaddr), "=&r" (tem) \
: "r" (oparg), "i" (-EFAULT), "1" (0))
static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
{
int op = (encoded_op >> 28) & 7;
@ -206,28 +180,6 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
return ret;
}
static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval,
int newval)
{
#ifdef __UACCESS__
if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
return -EFAULT;
#endif
asm volatile("1:\tlock; cmpxchgl %3, %1\n"
"2:\t.section .fixup, \"ax\"\n"
"3:\tmov %2, %0\n"
"\tjmp 2b\n"
"\t.previous\n"
_ASM_EXTABLE(1b, 3b)
: "=a" (oldval), "+m" (*uaddr)
: "i" (-EFAULT), "r" (newval), "0" (oldval)
: "memory"
);
return oldval;
}
#endif // __KERNEL__
#endif // _ASM_X86_FUTEX_H
@ -241,13 +193,11 @@ static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval,
struct process_vm;
union futex_key {
#if 0
struct {
unsigned long pgoff;
struct inode *inode;
void *phys;
int offset;
} shared;
#endif
struct {
unsigned long address;
struct process_vm *mm;
@ -261,6 +211,7 @@ union futex_key {
};
#define FUTEX_KEY_INIT (union futex_key) { .both = { .ptr = NULL } }
#define FUT_OFF_MMSHARED 2
extern int futex_init(void);
@ -272,7 +223,8 @@ futex(
uint64_t timeout,
uint32_t __user * uaddr2,
uint32_t val2,
uint32_t val3
uint32_t val3,
int fshared
);

View File

@ -0,0 +1,23 @@
/**
* \file rlimit.h
* License details are found in the file LICENSE.
* \brief
* Kinds of resource limit
* \author Taku Shimosawa <shimosawa@is.s.u-tokyo.ac.jp> \par
* Copyright (C) 2011 - 2012 Taku Shimosawa
*/
/*
* HISTORY
*/
#ifndef __GENERIC_RLIMIT_H
#define __GENERIC_RLIMIT_H
/* Type of a single resource limit value (unsigned 64-bit integer). */
typedef uint64_t rlim_t;
/*
 * A soft/hard pair of values for one resource limit.
 * NOTE(review): field names and meaning appear to mirror the POSIX
 * struct rlimit -- confirm against the users of this header.
 */
struct rlimit {
rlim_t rlim_cur; /* Soft limit */
rlim_t rlim_max; /* Hard limit (ceiling for rlim_cur) */
};
#endif

View File

@ -14,7 +14,7 @@
#define INIT_H
extern void arch_init(void);
extern void kmsg_init(void);
extern void kmsg_init(int);
extern void mem_init(void);
extern void ikc_master_init(void);
extern void ap_init(void);
@ -28,6 +28,7 @@ extern void init_host_syscall_channel(void);
extern void init_host_syscall_channel2(void);
extern void sched_init(void);
extern void pc_ap_init(void);
extern void cpu_sysfs_setup(void);
extern char *find_command_line(char *name);

View File

@ -14,19 +14,28 @@
#define __HEADER_KMALLOC_H
#include <ihk/mm.h>
#include <cls.h>
#define kmalloc(size, flag) _kmalloc(size, flag, __FILE__, __LINE__)
void panic(const char *);
int kprintf(const char *format, ...);
/*
 * kmalloc(size, flag): allocate through _kmalloc(), passing the call site
 * (__FILE__/__LINE__), and print a diagnostic when the result is NULL.
 * The expression evaluates to the pointer returned by _kmalloc().
 *
 * Each argument is evaluated exactly once.  The temporary carries a
 * macro-private name so that an argument expression that mentions a caller
 * variable (for instance one named `r`) is not captured by the temporary's
 * own declaration, and the arguments are parenthesized so that any
 * expression can be passed safely.
 */
#define kmalloc(size, flag) ({\
	void *kmalloc_ret_ = _kmalloc((size), (flag), __FILE__, __LINE__);\
	if (kmalloc_ret_ == NULL) {\
		kprintf("kmalloc: out of memory %s:%d no_preempt=%d\n", __FILE__, __LINE__, cpu_local_var(no_preempt)); \
	}\
	kmalloc_ret_;\
})
#define kfree(ptr) _kfree(ptr, __FILE__, __LINE__)
#define memcheck(ptr, msg) _memcheck(ptr, msg, __FILE__, __LINE__, 0)
void *_kmalloc(int size, enum ihk_mc_ap_flag flag, char *file, int line);
void _kfree(void *ptr, char *file, int line);
void *__kmalloc(int size, enum ihk_mc_ap_flag flag);
void __kfree(void *ptr);
void *___kmalloc(int size, enum ihk_mc_ap_flag flag);
void ___kfree(void *ptr);
int _memcheck(void *ptr, char *msg, char *file, int line, int free);
int memcheckall();
int freecheck(int runcount);
void kmalloc_consolidate_free_list(void);
#endif

View File

@ -16,6 +16,6 @@
void kputs(char *buf);
int kprintf(const char *format, ...);
void kmsg_init(void);
void kmsg_init(int);
#endif

View File

@ -92,7 +92,8 @@ futex(
uint64_t timeout,
uint32_t __user * uaddr2,
uint32_t val2,
uint32_t val3
uint32_t val3,
int fshared
);
extern long

View File

@ -31,6 +31,7 @@ enum {
/* for memobj.flags */
MF_HAS_PAGER = 0x0001,
MF_SHMDT_OK = 0x0002,
MF_IS_REMOVABLE = 0x0004,
};
struct memobj {
@ -46,6 +47,7 @@ typedef int memobj_get_page_func_t(struct memobj *obj, off_t off, int p2align, u
typedef uintptr_t memobj_copy_page_func_t(struct memobj *obj, uintptr_t orgphys, int p2align);
typedef int memobj_flush_page_func_t(struct memobj *obj, uintptr_t phys, size_t pgsize);
typedef int memobj_invalidate_page_func_t(struct memobj *obj, uintptr_t phys, size_t pgsize);
typedef int memobj_lookup_page_func_t(struct memobj *obj, off_t off, int p2align, uintptr_t *physp, unsigned long *flag);
struct memobj_ops {
memobj_release_func_t * release;
@ -54,6 +56,7 @@ struct memobj_ops {
memobj_copy_page_func_t * copy_page;
memobj_flush_page_func_t * flush_page;
memobj_invalidate_page_func_t * invalidate_page;
memobj_lookup_page_func_t * lookup_page;
};
static inline void memobj_release(struct memobj *obj)
@ -105,6 +108,15 @@ static inline int memobj_invalidate_page(struct memobj *obj, uintptr_t phys,
return 0;
}
/*
 * Dispatch a page lookup to the memobj's lookup_page operation.
 * All arguments are forwarded unchanged and the operation's return value is
 * passed back.  Returns -ENXIO when the memobj has no lookup_page operation.
 */
static inline int memobj_lookup_page(struct memobj *obj, off_t off,
		int p2align, uintptr_t *physp, unsigned long *pflag)
{
	if (!obj->ops->lookup_page) {
		return -ENXIO;
	}

	return obj->ops->lookup_page(obj, off, p2align, physp, pflag);
}
static inline void memobj_lock(struct memobj *obj)
{
ihk_mc_spinlock_lock_noirq(&obj->lock);
@ -120,10 +132,16 @@ static inline int memobj_has_pager(struct memobj *obj)
return !!(obj->flags & MF_HAS_PAGER);
}
/*
 * Report whether MF_IS_REMOVABLE is set in the memobj's flags.
 * Returns 1 when the flag is set, 0 otherwise.
 */
static inline int memobj_is_removable(struct memobj *obj)
{
	return (obj->flags & MF_IS_REMOVABLE) != 0;
}
int fileobj_create(int fd, struct memobj **objp, int *maxprotp);
struct shmid_ds;
int shmobj_create(struct shmid_ds *ds, struct memobj **objp);
int zeroobj_create(struct memobj **objp);
int devobj_create(int fd, size_t len, off_t off, struct memobj **objp, int *maxprotp);
int devobj_create(int fd, size_t len, off_t off, struct memobj **objp, int *maxprotp,
int prot, int populate_flags);
#endif /* HEADER_MEMOBJ_H */

View File

@ -17,8 +17,9 @@
struct page {
struct list_head list;
struct list_head hash;
uint8_t mode;
uint8_t padding[3];
uint64_t phys;
ihk_atomic_t count;
off_t offset;
};
@ -38,9 +39,8 @@ enum page_mode {
struct page *phys_to_page(uintptr_t phys);
uintptr_t page_to_phys(struct page *page);
int page_unmap(struct page *page);
struct page *phys_to_page_insert_hash(uint64_t phys);
void *allocate_pages(int npages, enum ihk_mc_ap_flag flag);
void free_pages(void *va, int npages);
void begin_free_pages_pending(void);
void finish_free_pages_pending(void);
@ -66,4 +66,6 @@ static inline int page_is_multi_mapped(struct page *page)
return (ihk_atomic_read(&page->count) > 1);
}
/* Should we take page faults on ANONYMOUS mappings? */
extern int anon_on_demand;
#endif

View File

@ -22,6 +22,7 @@
#include <memobj.h>
#include <affinity.h>
#include <syscall.h>
#include <bitops.h>
#define VR_NONE 0x0
#define VR_STACK 0x1
@ -29,6 +30,7 @@
#define VR_IO_NOCACHE 0x100
#define VR_REMOTE 0x200
#define VR_WRITE_COMBINED 0x400
#define VR_DONTFORK 0x800
#define VR_DEMAND_PAGING 0x1000
#define VR_PRIVATE 0x2000
#define VR_LOCKED 0x4000
@ -51,6 +53,7 @@
#define VRFLAG_PROT_TO_MAXPROT(vrflag) (((vrflag) & VR_PROT_MASK) << 4)
#define VRFLAG_MAXPROT_TO_PROT(vrflag) (((vrflag) & VR_MAXPROT_MASK) >> 4)
// struct process.status, struct thread.status
#define PS_RUNNING 0x1
#define PS_INTERRUPTIBLE 0x2
#define PS_UNINTERRUPTIBLE 0x4
@ -58,15 +61,19 @@
#define PS_EXITED 0x10
#define PS_STOPPED 0x20
#define PS_TRACED 0x40 /* Set to "not running" by a ptrace related event */
#define PS_STOPPING 0x80
#define PS_TRACING 0x100
#define PS_NORMAL (PS_INTERRUPTIBLE | PS_UNINTERRUPTIBLE)
// struct process.ptrace
#define PT_TRACED 0x80 /* The process is ptraced */
#define PT_TRACE_EXEC 0x100 /* Trace execve(2) */
#define PT_TRACE_SYSCALL_ENTER 0x200 /* Trace syscall enter */
#define PT_TRACE_SYSCALL_EXIT 0x400 /* Trace syscall exit */
#define PT_TRACE_SYSCALL_MASK (PT_TRACE_SYSCALL_ENTER | PT_TRACE_SYSCALL_EXIT)
// ptrace(2) request
#define PTRACE_TRACEME 0
#define PTRACE_PEEKTEXT 1
#define PTRACE_PEEKDATA 2
@ -95,6 +102,7 @@
#define PTRACE_GETREGSET 0x4204
#define PTRACE_SETREGSET 0x4205
// ptrace(2) options
#define PTRACE_O_TRACESYSGOOD 1
#define PTRACE_O_TRACEFORK 2
#define PTRACE_O_TRACEVFORK 4
@ -104,6 +112,7 @@
#define PTRACE_O_TRACEEXIT 0x40
#define PTRACE_O_MASK 0x7f
// ptrace(2) events
#define PTRACE_EVENT_FORK 1
#define PTRACE_EVENT_VFORK 2
#define PTRACE_EVENT_CLONE 3
@ -153,11 +162,136 @@
#endif
#define USER_STACK_NR_PAGES 8192
#define KERNEL_STACK_NR_PAGES 25
#define KERNEL_STACK_NR_PAGES 32
#define NOPHYS ((uintptr_t)-1)
#define PROCESS_NUMA_MASK_BITS 64
/*
* Both the MPOL_* mempolicy mode and the MPOL_F_* optional mode flags are
* passed by the user to either set_mempolicy() or mbind() in an 'int' actual.
* The MPOL_MODE_FLAGS macro determines the legal set of optional mode flags.
*/
/* Policies */
/*
 * Memory policy modes.  The enumerators are implicitly numbered from 0, so
 * MPOL_MAX equals the number of modes listed before it.
 */
enum {
MPOL_DEFAULT,
MPOL_PREFERRED,
MPOL_BIND,
MPOL_INTERLEAVE,
MPOL_LOCAL,
MPOL_MAX, /* always last member of enum */
};
/*
 * Steps of a memory policy rebind.  The enumerators are implicitly numbered
 * from 0, so MPOL_REBIND_NSTEP equals the number of steps listed before it.
 */
enum mpol_rebind_step {
MPOL_REBIND_ONCE, /* do rebind work at once(not by two step) */
MPOL_REBIND_STEP1, /* first step(set all the newly nodes) */
MPOL_REBIND_STEP2, /* second step(clean all the disallowed nodes)*/
MPOL_REBIND_NSTEP,
};
/* Flags for set_mempolicy */
#define MPOL_F_STATIC_NODES (1 << 15)
#define MPOL_F_RELATIVE_NODES (1 << 14)
/*
* MPOL_MODE_FLAGS is the union of all possible optional mode flags passed to
* either set_mempolicy() or mbind().
*/
#define MPOL_MODE_FLAGS (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES)
/* Flags for get_mempolicy */
#define MPOL_F_NODE (1<<0) /* return next IL mode instead of node mask */
#define MPOL_F_ADDR (1<<1) /* look up vma using address */
#define MPOL_F_MEMS_ALLOWED (1<<2) /* return allowed memories */
/* Flags for mbind */
#define MPOL_MF_STRICT (1<<0) /* Verify existing pages in the mapping */
#define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform
to policy */
#define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to policy */
#define MPOL_MF_LAZY (1<<3) /* Modifies '_MOVE: lazy migrate on fault */
#define MPOL_MF_INTERNAL (1<<4) /* Internal flags start here */
#define MPOL_MF_VALID (MPOL_MF_STRICT | \
MPOL_MF_MOVE | \
MPOL_MF_MOVE_ALL)
/*
* Internal flags that share the struct mempolicy flags word with
* "mode flags". These flags are allocated from bit 0 up, as they
* are never OR'ed into the mode in mempolicy API arguments.
*/
#define MPOL_F_SHARED (1 << 0) /* identify shared policies */
#define MPOL_F_LOCAL (1 << 1) /* preferred local allocation */
#define MPOL_F_REBINDING (1 << 2) /* identify policies in rebinding */
#define MPOL_F_MOF (1 << 3) /* this policy wants migrate on fault */
#define MPOL_F_MORON (1 << 4) /* Migrate On pte_numa Reference On Node */
#include <waitq.h>
#include <futex.h>
struct resource_set;
struct process_hash;
struct thread_hash;
struct address_space;
struct process;
struct thread;
struct process_vm;
struct vm_regions;
struct vm_range;
#define HASH_SIZE 73
/*
 * A resource set: groups the process/thread hash tables, a physical memory
 * list and a CPU set, each paired with its own lock field.
 * NOTE(review): field roles below are inferred from names and types only;
 * confirm against the code that populates this structure.
 */
struct resource_set {
/* List linkage; presumably chains this set on resource_set_list -- confirm. */
struct list_head list;
char *path;
/* Hash tables of processes and threads (see struct process_hash / struct thread_hash). */
struct process_hash *process_hash;
struct thread_hash *thread_hash;
/* Presumably phys_mem_lock protects phys_mem_list -- confirm. */
struct list_head phys_mem_list;
mcs_rwlock_lock_t phys_mem_lock;
/* Presumably cpu_set_lock protects cpu_set -- confirm. */
cpu_set_t cpu_set;
mcs_rwlock_lock_t cpu_set_lock;
struct process *pid1;
};
extern struct list_head resource_set_list;
extern mcs_rwlock_lock_t resource_set_lock;
/*
 * Hash table of processes with HASH_SIZE buckets.  Each bucket has its own
 * list head and its own lock at the same index.
 * NOTE(review): buckets are presumably selected with process_hash(pid) --
 * confirm against the lookup code.
 */
struct process_hash {
struct list_head list[HASH_SIZE];
mcs_rwlock_lock_t lock[HASH_SIZE];
};
/*
 * Map a PID to a bucket index in the range [0, HASH_SIZE).
 *
 * In C the result of % takes the sign of the dividend, so a negative pid
 * would otherwise produce a negative index and address memory before the
 * start of a HASH_SIZE-element bucket array.  Negative remainders are folded
 * back into range; results for non-negative pids are unchanged.
 */
static inline int
process_hash(int pid)
{
	int bucket = pid % HASH_SIZE;

	if (bucket < 0) {
		bucket += HASH_SIZE;
	}
	return bucket;
}
/*
 * Map a TID to a bucket index in the range [0, HASH_SIZE).
 *
 * In C the result of % takes the sign of the dividend, so a negative tid
 * would otherwise produce a negative index and address memory before the
 * start of a HASH_SIZE-element bucket array.  Negative remainders are folded
 * back into range; results for non-negative tids are unchanged.
 */
static inline int
thread_hash(int tid)
{
	int bucket = tid % HASH_SIZE;

	if (bucket < 0) {
		bucket += HASH_SIZE;
	}
	return bucket;
}
/*
 * Hash table of threads with HASH_SIZE buckets.  Each bucket has its own
 * list head and its own lock at the same index.
 * NOTE(review): buckets are presumably selected with thread_hash(tid) --
 * confirm against the lookup code.
 */
struct thread_hash {
struct list_head list[HASH_SIZE];
mcs_rwlock_lock_t lock[HASH_SIZE];
};
/*
 * An address space: a page table plus a reference count, a CPU set with its
 * lock, and a trailing variable-length array of PIDs.
 * NOTE(review): field roles below are inferred from names and types only;
 * confirm against hold_address_space()/release_address_space() and callers.
 */
struct address_space {
struct page_table *page_table;
/* Opaque pointer; presumably the second argument handed to free_cb -- confirm. */
void *opt;
/* Callback taking the address space and a void pointer. */
void (*free_cb)(struct address_space *, void *);
ihk_atomic_t refcount;
/* Presumably cpu_set_lock protects cpu_set -- confirm. */
cpu_set_t cpu_set;
ihk_spinlock_t cpu_set_lock;
/* Presumably the number of entries in pids[] -- confirm. */
int nslots;
/* Flexible array member: storage must be allocated past the end of the struct. */
int pids[];
};
struct user_fpregs_struct
{
unsigned short cwd;
@ -223,7 +357,7 @@ struct user
unsigned long int u_debugreg [8];
};
#define AUXV_LEN 16
#define AUXV_LEN 18
struct vm_range {
struct list_head list;
@ -231,9 +365,12 @@ struct vm_range {
unsigned long flag;
struct memobj *memobj;
off_t objoff;
int pgshift; /* page size. 0 means THP */
int padding;
};
struct vm_regions {
unsigned long vm_start, vm_end;
unsigned long text_start, text_end;
unsigned long data_start, data_end;
unsigned long brk_start, brk_end;
@ -244,19 +381,27 @@ struct vm_regions {
struct process_vm;
struct sigfd {
struct sigfd *next;
struct mckfd {
struct mckfd *next;
int fd;
__sigset_t mask;
int sig_no;
long data;
void *opt;
long (*read_cb)(struct mckfd *, ihk_mc_user_context_t *);
int (*ioctl_cb)(struct mckfd *, ihk_mc_user_context_t *);
long (*mmap_cb)(struct mckfd *, ihk_mc_user_context_t *);
int (*close_cb)(struct mckfd *, ihk_mc_user_context_t *);
int (*fcntl_cb)(struct mckfd *, ihk_mc_user_context_t *);
};
#define SFD_CLOEXEC 02000000
#define SFD_NONBLOCK 04000
struct sig_handler {
struct sig_common {
ihk_spinlock_t lock;
ihk_atomic_t use;
struct sigfd *sigfd;
ihk_atomic_t use;
struct k_sigaction action[_NSIG];
struct list_head sigpending;
};
struct sig_pending {
@ -266,27 +411,68 @@ struct sig_pending {
int ptracecont;
};
struct sig_shared {
ihk_spinlock_t lock;
ihk_atomic_t use;
struct list_head sigpending;
};
typedef void pgio_func_t(void *arg);
/*
 * Pairs a TID with a thread pointer.  struct process keeps an array of these
 * (tids / nr_tids), described there as the TID set of the proxy process.
 * NOTE(review): presumably thread is NULL while the TID is unassigned --
 * confirm against the code that manages the array.
 */
struct mcexec_tid {
int tid;
struct thread *thread;
};
/* Represents a node in the process fork tree, it may exist even after the
* corresponding process exited due to references from the parent and/or
* children and is used for implementing wait/waitpid without having a
* special "init" process */
struct fork_tree_node {
ihk_spinlock_t lock;
ihk_atomic_t refcount;
int exit_status;
int status;
struct process {
struct list_head hash_list;
mcs_rwlock_lock_t update_lock; // lock for parent, status, cpu time...
struct process *owner;
// process vm
struct process_vm *vm;
// threads and children
struct list_head threads_list;
mcs_rwlock_lock_t threads_lock; // lock for threads_list
/* TID set of proxy process */
struct mcexec_tid *tids;
int nr_tids;
/* The ptracing process behaves as the parent of the ptraced process
after using PTRACE_ATTACH, except for getppid. So we save it here. */
struct process *parent;
struct process *ppid_parent;
struct list_head children_list;
struct list_head ptraced_children_list;
mcs_rwlock_lock_t children_lock; // lock for children_list and ptraced_children_list
struct list_head siblings_list; // lock parent
struct list_head ptraced_siblings_list; // lock ppid_parent
ihk_atomic_t refcount;
// process status and exit status
int status; // PS_RUNNING -> PS_EXITED -> PS_ZOMBIE
// | ^ ^
// | |---+ |
// V | |
// PS_STOPPING-)---+
// (PS_TRACING)| |
// | | |
// V +---- |
// PS_STOPPED -----+
// (PS_TRACED)
int exit_status;
/* Store exit_status for a group of threads when stopped by SIGSTOP.
exit_status can't be used because values of exit_status of threads
might diverge while the threads are exiting by group_exit(). */
int group_exit_status;
/* Manage ptraced processes in a separate list to make it easy to
restore the original parent-child relationship when
performing PTRACE_DETACH */
struct waitq waitpid_q;
// process info and credentials etc.
int pid;
int tid;
int pgid;
int ruid;
int euid;
@ -296,50 +482,58 @@ struct fork_tree_node {
int egid;
int sgid;
int fsgid;
struct fork_tree_node *parent;
struct list_head children;
struct list_head siblings_list;
/* The ptracing process behaves as the parent of the ptraced process
after using PTRACE_ATTACH, except for getppid. So we save it here. */
struct fork_tree_node *ppid_parent;
int execed;
int nohost;
int nowait;
struct rlimit rlimit[MCK_RLIM_MAX];
unsigned long saved_auxv[AUXV_LEN];
char *saved_cmdline;
long saved_cmdline_len;
/* Manage ptraced processes in a separate list to make it easy to
restore the original parent-child relationship when
performing PTRACE_DETACH */
struct list_head ptrace_children;
struct list_head ptrace_siblings_list;
/* Store ptrace flags.
* The lower 8 bits are PTRACE_O_xxx of the PTRACE_SETOPTIONS request.
* Other bits are for inner use of the McKernel.
*/
int ptrace;
struct waitq waitpid_q;
/* Store ptrace event message.
* PTRACE_O_xxx will store event message here.
* PTRACE_GETEVENTMSG will get from here.
*/
unsigned long ptrace_eventmsg;
/* Store exit_status for a group of threads when stopped by SIGSTOP.
exit_status can't be used because values of exit_status of threads
might diverge while the threads are exiting by group_exit(). */
int group_exit_status;
/* Store event related to signal. For example,
it represents that the process has been resumed by SIGCONT. */
int signal_flags;
/* Store ptrace flags.
* The lower 8 bits are PTRACE_O_xxx of the PTRACE_SETOPTIONS request.
* Other bits are for inner use of the McKernel.
*/
int ptrace;
/* Store signal sent to parent when the process terminates. */
int termsig;
/* Store ptrace event message.
PTRACE_O_xxx will store event message here.
PTRACE_GETEVENTMSG will get from here.
*/
unsigned long ptrace_eventmsg;
ihk_spinlock_t mckfd_lock;
struct mckfd *mckfd;
/* Store event related to signal. For example,
it represents that the process has been resumed by SIGCONT. */
int signal_flags;
// cpu time (summary)
struct timespec stime;
struct timespec utime;
/* Store signal sent to parent when the process terminates. */
int termsig;
// cpu time (children)
struct timespec stime_children;
struct timespec utime_children;
long maxrss;
long maxrss_children;
// perf_event
int perf_status;
#define PP_NONE 0
#define PP_RESET 1
#define PP_COUNT 2
#define PP_STOP 3
struct mc_perf_event *monitoring_event;
};
void hold_fork_tree_node(struct fork_tree_node *ftn);
void release_fork_tree_node(struct fork_tree_node *ftn);
void hold_thread(struct thread *ftn);
void release_thread(struct thread *ftn);
/*
* Scheduling policies
@ -364,100 +558,150 @@ struct sched_param {
int sched_priority;
};
struct process {
struct thread {
struct list_head hash_list;
// thread info
int cpu_id;
int tid;
int status; // PS_RUNNING -> PS_EXITED
// | ^ ^
// | | |
// V | |
// PS_STOPPED------+
// PS_TRACED
// PS_INTERRUPTIBLE
// PS_UNINTERRUPTIBLE
ihk_atomic_t refcount;
// process vm
struct process_vm *vm;
// context
ihk_mc_kernel_context_t ctx;
ihk_mc_user_context_t *uctx;
// sibling
struct process *proc;
struct list_head siblings_list; // lock process
// Runqueue list entry
struct list_head sched_list;
struct list_head sched_list; // lock cls
int sched_policy;
struct sched_param sched_param;
ihk_spinlock_t spin_sleep_lock;
int spin_sleep;
struct thread {
int *clear_child_tid;
unsigned long tlsblock_base, tlsblock_limit;
} thread;
ihk_atomic_t refcount;
volatile int sigevent;
int nohost;
int execed;
int *clear_child_tid;
unsigned long tlsblock_base, tlsblock_limit;
// thread info
cpu_set_t cpu_set;
fp_regs_struct *fp_regs;
int in_syscall_offload;
// signal
struct sig_common *sigcommon;
sigset_t sigmask;
stack_t sigstack;
ihk_spinlock_t sigpendinglock;
struct list_head sigpending;
struct sig_shared *sigshared;
struct sig_handler *sighandler;
ihk_spinlock_t sigpendinglock;
volatile int sigevent;
struct rlimit rlimit[MCK_RLIM_MAX];
// gpio
pgio_func_t *pgio_fp;
void *pgio_arg;
struct fork_tree_node *ftn;
cpu_set_t cpu_set;
unsigned long saved_auxv[AUXV_LEN];
// for ptrace
unsigned long *ptrace_debugreg; /* debug registers for ptrace */
struct sig_pending *ptrace_recvsig;
struct sig_pending *ptrace_sendsig;
fp_regs_struct *fp_regs;
char *saved_cmdline;
long saved_cmdline_len;
// cpu time
struct timespec stime;
struct timespec utime;
struct timespec btime;
int times_update;
int in_kernel;
// interval timers
int itimer_enabled;
struct itimerval itimer_virtual;
struct itimerval itimer_prof;
struct timespec itimer_virtual_value;
struct timespec itimer_prof_value;
/* Syscall offload wait queue head */
struct waitq scd_wq;
};
struct process_vm {
ihk_atomic_t refcount;
struct page_table *page_table;
struct address_space *address_space;
struct list_head vm_range_list;
struct vm_regions region;
struct process *owner_process; /* process that reside on the same page */
struct process *proc; /* process that reside on the same page */
void *opt;
void (*free_cb)(struct process_vm *, void *);
void *vdso_addr;
void *vvar_addr;
ihk_spinlock_t page_table_lock;
ihk_spinlock_t memory_range_lock;
ihk_spinlock_t page_table_lock;
ihk_spinlock_t memory_range_lock;
// to protect the followings:
// 1. addition of process "memory range" (extend_process_region, add_process_memory_range)
// 2. addition of process page table (allocate_pages, update_process_page_table)
// note that physical memory allocator (ihk_mc_alloc_pages, ihk_pagealloc_alloc)
// is protected by its own lock (see ihk/manycore/generic/page_alloc.c)
cpu_set_t cpu_set;
ihk_spinlock_t cpu_set_lock;
ihk_atomic_t refcount;
int exiting;
long currss;
DECLARE_BITMAP(numa_mask, PROCESS_NUMA_MASK_BITS);
int numa_mem_policy;
};
/*
 * Stand-in check for CAP_IPC_LOCK (= 14): evaluates to 1 when the effective
 * UID of the thread's process is 0, and to 0 otherwise.
 */
static inline int has_cap_ipc_lock(struct thread *th)
{
	return th->proc->euid == 0;
}
struct process *create_process(unsigned long user_pc);
struct process *clone_process(struct process *org, unsigned long pc,
/*
 * Stand-in check for CAP_SYS_ADMIN (= 21): evaluates to 1 when the effective
 * UID of the thread's process is 0, and to 0 otherwise.
 */
static inline int has_cap_sys_admin(struct thread *th)
{
	return th->proc->euid == 0;
}
void hold_address_space(struct address_space *);
void release_address_space(struct address_space *);
struct thread *create_thread(unsigned long user_pc);
struct thread *clone_thread(struct thread *org, unsigned long pc,
unsigned long sp, int clone_flags);
void destroy_process(struct process *proc);
void hold_process(struct process *proc);
void release_process(struct process *proc);
void flush_process_memory(struct process *proc);
void free_process_memory(struct process *proc);
void free_process_memory_ranges(struct process *proc);
int populate_process_memory(struct process *proc, void *start, size_t len);
void destroy_thread(struct thread *thread);
void hold_thread(struct thread *thread);
void release_thread(struct thread *thread);
void flush_process_memory(struct process_vm *vm);
void hold_process_vm(struct process_vm *vm);
void release_process_vm(struct process_vm *vm);
void hold_process(struct process *);
void release_process(struct process *);
void free_process_memory_ranges(struct process_vm *vm);
int populate_process_memory(struct process_vm *vm, void *start, size_t len);
int add_process_memory_range(struct process *process,
int add_process_memory_range(struct process_vm *vm,
unsigned long start, unsigned long end,
unsigned long phys, unsigned long flag,
struct memobj *memobj, off_t objoff);
int remove_process_memory_range(struct process *process, unsigned long start,
struct memobj *memobj, off_t objoff, int pgshift);
int remove_process_memory_range(struct process_vm *vm, unsigned long start,
unsigned long end, int *ro_freedp);
int split_process_memory_range(struct process *process,
int split_process_memory_range(struct process_vm *vm,
struct vm_range *range, uintptr_t addr, struct vm_range **splitp);
int join_process_memory_range(struct process *process, struct vm_range *surviving,
int join_process_memory_range(struct process_vm *vm, struct vm_range *surviving,
struct vm_range *merging);
int change_prot_process_memory_range(
struct process *process, struct vm_range *range,
struct process_vm *vm, struct vm_range *range,
unsigned long newflag);
int remap_process_memory_range(struct process_vm *vm, struct vm_range *range,
uintptr_t start, uintptr_t end, off_t off);
@ -476,31 +720,41 @@ int extend_up_process_memory_range(struct process_vm *vm,
int page_fault_process_vm(struct process_vm *fault_vm, void *fault_addr,
uint64_t reason);
int remove_process_region(struct process *proc,
int remove_process_region(struct process_vm *vm,
unsigned long start, unsigned long end);
struct program_load_desc;
int init_process_stack(struct process *process, struct program_load_desc *pn,
int init_process_stack(struct thread *thread, struct program_load_desc *pn,
int argc, char **argv,
int envc, char **env);
unsigned long extend_process_region(struct process *proc,
unsigned long extend_process_region(struct process_vm *vm,
unsigned long start, unsigned long end,
unsigned long address, unsigned long flag);
extern enum ihk_mc_pt_attribute arch_vrflag_to_ptattr(unsigned long flag, uint64_t fault, pte_t *ptep);
enum ihk_mc_pt_attribute common_vrflag_to_ptattr(unsigned long flag, uint64_t fault, pte_t *ptep);
void schedule(void);
void runq_add_proc(struct process *proc, int cpu_id);
void runq_del_proc(struct process *proc, int cpu_id);
int sched_wakeup_process(struct process *proc, int valid_states);
void runq_add_thread(struct thread *thread, int cpu_id);
void runq_del_thread(struct thread *thread, int cpu_id);
int sched_wakeup_thread(struct thread *thread, int valid_states);
void sched_request_migrate(int cpu_id, struct process *proc);
void sched_request_migrate(int cpu_id, struct thread *thread);
void check_need_resched(void);
void cpu_set(int cpu, cpu_set_t *cpu_set, ihk_spinlock_t *lock);
void cpu_clear(int cpu, cpu_set_t *cpu_set, ihk_spinlock_t *lock);
void cpu_clear_and_set(int c_cpu, int s_cpu,
cpu_set_t *cpu_set, ihk_spinlock_t *lock);
struct process *findthread_and_lock(int pid, int tid, ihk_spinlock_t **savelock, unsigned long *irqstate);
void process_unlock(void *savelock, unsigned long irqstate);
void release_cpuid(int cpuid);
struct thread *find_thread(int pid, int tid, struct mcs_rwlock_node_irqsave *lock);
void thread_unlock(struct thread *thread, struct mcs_rwlock_node_irqsave *lock);
struct process *find_process(int pid, struct mcs_rwlock_node_irqsave *lock);
void process_unlock(struct process *proc, struct mcs_rwlock_node_irqsave *lock);
void chain_process(struct process *);
void chain_thread(struct thread *);
void proc_init();
void set_timer();
struct sig_pending *hassigpending(struct thread *thread);
#endif

View File

@ -25,6 +25,7 @@ enum {
IPC_CREAT = 01000,
IPC_EXCL = 02000,
SHM_HUGETLB = 04000,
SHM_RDONLY = 010000,
SHM_RND = 020000,
SHM_REMAP = 040000,
@ -46,11 +47,14 @@ enum {
SHM_INFO = 14,
};
struct shmlock_user;
struct shmobj {
struct memobj memobj; /* must be first */
int index;
uint8_t padding[4];
int pgshift;
size_t real_segsz;
struct shmlock_user * user;
struct shmid_ds ds;
struct list_head page_list;
struct list_head chain; /* shmobj_list */
@ -75,9 +79,33 @@ struct shm_info {
uint64_t swap_successes;
};
/*
 * Per-user record used by the shared-memory locking code; obtained with
 * shmlock_user_get() and released with shmlock_user_free().
 * NOTE(review): field roles below are inferred from names and types only.
 */
struct shmlock_user {
/* Real UID this record belongs to (shmlock_user_get() takes a ruid). */
uid_t ruid;
int padding;
/* Presumably the amount of memory currently locked by this user -- confirm units. */
size_t locked;
/* List linkage; presumably guarded by shmlock_users_lock() -- confirm. */
struct list_head chain;
};
extern ihk_spinlock_t shmlock_users_lock_body;
/* Acquire the global shmlock_users_lock_body spinlock (noirq variant). */
static inline void shmlock_users_lock(void)
{
	ihk_mc_spinlock_lock_noirq(&shmlock_users_lock_body);
}
/* Release the global shmlock_users_lock_body spinlock (noirq variant). */
static inline void shmlock_users_unlock(void)
{
	ihk_mc_spinlock_unlock_noirq(&shmlock_users_lock_body);
}
void shmobj_list_lock(void);
void shmobj_list_unlock(void);
int shmobj_create_indexed(struct shmid_ds *ds, struct shmobj **objp);
void shmobj_destroy(struct shmobj *obj);
void shmlock_user_free(struct shmlock_user *user);
int shmlock_user_get(uid_t ruid, struct shmlock_user **userp);
#endif /* HEADER_SHM_H */

Some files were not shown because too many files have changed in this diff Show More