Compare commits

...

260 Commits
1.0.0 ... 1.1.2

Author SHA1 Message Date
8d21846562 mcoverlayfs: supported Linux kernel 4.0 or rhel kernel 3.10.0-327
add mcoverlayfs(linux-3.10.0-327.36.1.el7 base)
2016-09-30 14:55:36 +09:00
3e1367caa1 mcoverlayfs: move mcoverlayfs(linux-4.0.9 base) to executer/kernel/mcoverlayfs/linux-4.0.9 2016-09-30 13:48:55 +09:00
02536b7724 Merge remote-tracking branch 'remotes/origin/ikc2'
Conflicts:
	executer/kernel/mcctrl/syscall.c
It is resolved.
2016-09-27 11:48:12 +09:00
e28725884f fix debug print 2016-09-19 17:29:41 +09:00
c2b3fb7236 Modify interrupt load balancing policy on reboot/stop
* Fix the timing of stopping irqbalance when booting McKernel
2016-09-16 19:07:07 +09:00
2f95f7cda8 Modify interrupt load balancing policy on reboot/stop
When rebooting:
1. Stop irqbalance
2. Modify /proc/irq/*/smp_affinity so that McKernel cores are not
   included
3. Start irqbalance with McKernel cores and IHK IRQ banned from
   load balancing

When stopping:
1. Stop irqbalance
2. Restore /proc/irq/*/smp_affinity
3. Restart irqbalance with the system default settings

refs #760
2016-09-16 13:04:24 +09:00
e551aa17ed execve: do not search command PATH 2016-09-14 22:22:18 +09:00
e6d4c160cd mcexec: fix how to look for command
refs #754
2016-09-13 15:56:58 +09:00
9390fe5d2c signal: send signal to thread using thread-id. not cpu-id 2016-09-12 15:43:29 +09:00
419f5e495b set*[ug]id: propagate credentials to thread pool 2016-09-12 15:40:33 +09:00
673deadf37 fix syscall return type 2016-09-12 15:40:06 +09:00
20ea65b38c fix some vDSO bugs.
- vDSO sometimes becomes invalid.
- vDSO is not succeeded for child process.
- vDSO becomes invalid when execve.
refs #744
2016-09-04 23:13:00 +09:00
84665ff699 do_page_fault_process_vm(): fix error msg format that could cause another PF 2016-09-04 10:59:50 +09:00
bfbc94dfb0 mcctrl+mcexec: fix per-proc data allocation for fork() 2016-09-02 15:08:00 +09:00
f74dcfc2a1 Modify mcreboot.sh for job scheduler
1. Don't complain when logname command doesn't exist
2016-09-01 19:27:18 +09:00
7c562d0539 support madvise(MADV_DONTFORK) 2016-09-01 11:22:53 +09:00
b5e4459a34 support AVX-512 registers 2016-08-30 18:39:33 +09:00
782122b681 mcctrl: fix to rus_vm_fault() call by kworker process 2016-08-22 13:00:28 +09:00
d550bced78 kmalloc(): use macros to define size alignment 2016-08-19 12:51:28 +09:00
a7ee3f531b sched_setaffinity(): error handling for invalid input 2016-08-19 11:52:44 +09:00
b9439947a7 kmalloc(): re-implementation of memory leak tracking 2016-08-19 11:52:00 +09:00
3b60a95f13 kmalloc()/kfree() re-implementation 2016-08-18 21:51:36 +09:00
82ae6d7458 query_free_mem_interrupt_handler(): report number of free pages as kmsg 2016-08-18 14:52:05 +09:00
7ebc34ddcc do_fork(): fix tids memory leak; additional sanity checks 2016-08-18 14:31:52 +09:00
bd6a2c2311 sys_mmap(): correct initial address check 2016-08-18 07:32:31 +09:00
5fd68eae54 PF handler: fix up various error msgs 2016-08-18 07:31:25 +09:00
f5857cfc9e MM: use ihk_mc_{alloc/free}_pages() everywhere and fix free_pages() on kmalloc()ed object bug 2016-08-17 18:02:05 +09:00
1ce1b17a85 Specify facility used by mcklogd via option
1. You can specify facility through -f option of mcreboot.sh.
   Example:
   mcreboot.sh -k 1 -f LOG_LOCAL6
   Note that you need to specify "-k 1" or "-k 2" to start mcklogd.
2. Kill mcklogd if needed in mcreboot.sh and mcstop+release.sh.
2016-08-17 17:52:44 +09:00
a2456c3ed2 Modify mcstop+release.sh for job scheduler
1. Remove ihk.ko
2. Output message to stderr and return one on error
2016-08-17 17:32:06 +09:00
01d2ea1605 do_munmap(): do TLB flush per address in remote_tlb_flush_cpu_mask() 2016-08-17 15:08:30 +09:00
15783f09a0 Modify mcreboot.sh for job scheduler
1. Add an option to specify owner of device files
2. Output message to stderr and return one on error
2016-08-17 15:07:13 +09:00
9efd568e07 do_mmap(): simplify demand paging flags; avoid zeroobj and allocate pages directly 2016-08-17 14:00:05 +09:00
1a207e19c2 clean up a couple of debug messages 2016-08-17 13:55:36 +09:00
73cf93727b clone(): use CAS for TID allocation 2016-08-16 14:18:58 +09:00
4410e702d9 devobj: fix memory leak for device file mapping 2016-08-16 14:17:59 +09:00
f584e2ec25 increase kernel stack size and eliminate unused waitq declaration in do_syscall() 2016-08-16 09:20:55 +09:00
3aa06444f4 do_syscall(): allow descheduling threads in offloaded syscalls if CPU core oversubscribed 2016-08-16 08:58:22 +09:00
c897a56c34 __notify_syscall_requester(): use CAS or IKC to notify syscall completion 2016-08-16 08:56:05 +09:00
5e9957da0f syscall_response: introduction of req_thread_status field 2016-08-16 08:53:41 +09:00
6ff2d4abe7 mcctrl: store per-process data in hash table 2016-08-15 13:47:57 +09:00
e4239f1885 mcexec: use 16 threads initially in offload handler pool 2016-08-14 14:29:10 +09:00
fbbaaf5b54 mcctrl: use GFP_ATOMIC in atomic context 2016-08-14 14:28:21 +09:00
3fa3920bb3 fix a couple of debug msgs 2016-08-14 11:30:17 +09:00
45e51fcc07 mcctrl: fix padding for 128bytes SCD message 2016-08-14 11:29:02 +09:00
0884e3d543 IHK-IKC: map queue in McKernel as cacheable 2016-08-14 11:16:40 +09:00
e3c7c9b890 mcctrl: separate waiting threads and pending requests 2016-08-12 21:52:13 +09:00
f4155cc9e8 mcstop+release-smp-x86.sh: fix OS instance discovery bug 2016-08-12 12:27:04 +09:00
a01ae91051 mcctrl: use IKC packet pools 2016-08-12 12:26:14 +09:00
daca522d25 mcctrl: move kmalloc/kfree of wait queue head out of fast path 2016-08-12 10:18:58 +09:00
ec521feb15 do_syscall(): remove invalid reference 2016-08-09 17:16:47 +09:00
d7bc947a02 mcctrl: redesign mcctrl_channels for IKC packet based syscall offloading 2016-08-09 16:49:42 +09:00
fb84d4ef11 mcctrl: thread pool based system call offload handling 2016-08-08 19:43:05 +09:00
5fbeee953a mcctrl: clean up syscall offload wait code 2016-08-07 20:55:36 +09:00
4cefb4333f mcctrl: use atomic malloc in IRQ context 2016-08-06 08:54:55 +09:00
689da07ac6 ihk_mc_ikc_init_first_local(): hold ref to master channel 2016-08-06 08:52:14 +09:00
76981bcc18 mcctrl: move procfs TID processing into dedicated work queue 2016-08-04 15:22:40 +09:00
6aae35cb3d process: transfer TIDs in bulk and reuse them locally 2016-08-02 16:59:04 +09:00
dac6f2883e mcctrl procfs: use semaphores instead of spinlocks to avoid sleeping in GFP_KERNEL kmalloc() in atomic context 2016-08-01 20:33:51 +09:00
c484f766fa schedule(): schedule a sleeping processes if it has pending signals 2016-07-28 11:42:00 +09:00
57690479bd read/patch_process_vm(): map non-LWK physical addresses properly 2016-07-22 20:48:54 +09:00
d0539a9cac eclair: make idle threads visible 2016-07-22 18:06:11 +09:00
4c8f583c0c split_large_page(): avoid panic when splitting "non-mapped" large pages 2016-07-14 17:11:52 +09:00
6118faffa9 pager_req_pfn(): use FAULT_FLAG_USER only if defined 2016-07-13 18:05:31 +09:00
dad6470c60 clone_thread: fork(2) copy sigstack infos from parent 2016-07-13 16:15:01 +09:00
46c37fc8f3 setfsgid: fix to didn't change fsgid 2016-07-13 15:54:52 +09:00
f6908f21a8 do_kill: wake PS_INTERRUPTIBLE process when send SIGKILL
sched_wakeup_thread: don't change process status if process status is PS_EXITED
2016-07-13 14:06:32 +09:00
01d9d9a5ba devobj: allow arbitrary size device file mappings 2016-07-12 17:02:19 +09:00
c43d993a4d mcstop+release-smp-x86.sh.in: unload mcctrl after OS shutdown 2016-07-11 16:40:06 +09:00
7d9bbecd7a mcctrl: use IHK OS notifiers to establish/tear down syscall channels
This patch eliminates the need for rmmod/insmod the mcctrl module
every time an OS instance is rebooted.
2016-07-11 16:22:50 +09:00
d135731398 do_syscall(): allow schedule for another thread (Intel MPI+OpenMP issue) 2016-07-05 18:54:51 +09:00
5c190beb04 save fpregs when to call sighandler
refs #50
2016-07-05 15:26:00 +09:00
fc66556f9f mcexec: error handling and propagation 2016-06-24 15:35:38 -07:00
648bacc90f device file mappings: communicate map flags and fault missing translations 2016-06-24 12:44:59 -07:00
dd37443fc7 PAPI support: performance counter's overflow.
and support mckfd fcntl.
2016-06-24 13:50:12 +09:00
e34322702a x86_init_perfctr: discover perf counters dynamically from MSRs 2016-06-22 10:47:57 -07:00
e12997e6a9 mcreboot: support for CPU cores (-c) and memory (-m) arguments 2016-06-21 09:10:06 -07:00
fabaa806d3 Revert "Make executor code include executer/config.h": breaks out-of-tree compile
This reverts commit d90900b6e6.
2016-06-21 08:51:45 +09:00
a83ad620c8 devobj: allow read only device file mappings (OFED 3.3 support) 2016-06-21 06:57:59 +09:00
d90900b6e6 Make executor code include executer/config.h
Make the code "executer/kernel/mcctrl/arch/x86_64/archdeps.c"
to include "executer/config.h" instead of
non-existent "executer/kernel/mcctrl/config.h".
2016-06-09 18:40:39 +09:00
6d9a88e9f4 binfmt_mcexec: support post-K specification 2016-06-08 09:53:39 +09:00
d0ee60f9e3 mcoverlayfs: supported only Linux kernel 4.0 2016-06-03 18:36:55 +09:00
14ec92518e KVM support: detect KVM and avoid touching unimplemented MSRs 2016-05-26 01:11:08 +09:00
435e2bdeb4 support for Linux 4.6: use get_user_pages_remote() 2016-05-24 09:39:04 +09:00
f06d8041e3 don't send SIGCONT when sending SIGSTOP derived from PTRACE_ATTACH
refs #747
2016-05-19 10:54:12 +09:00
9b35eaca42 remote_flush_tlb_cpumask() dead locking
refs #728
2016-05-10 14:02:25 +09:00
130b1f4327 update PAPI support. other process and child process monitoring. 2016-04-26 19:01:47 +09:00
921280f85c Docker support: use task_XX_vnr() functions for accessing correct namespace 2016-04-21 09:59:49 -07:00
d4a0b32f06 support large pages 2016-04-21 23:22:55 +09:00
b3bec32e99 update_process_page_table: refactor 2016-04-21 23:22:55 +09:00
2048980820 remove ihk_mc_pt_alloc_range() 2016-04-21 23:22:54 +09:00
176f6d23a9 ihk_mc_pt_virt_to_pagemap: refactor 2016-04-21 23:22:54 +09:00
328175547f Revert "fix REQ-37: remap_one_page: remove to check page size"
This reverts commit 6790126a23.

- reverted commit should remove a 'pgsize' check in remap_one_page()
  instead of a 'pgsize' check in pte_make_fileoff().
- In IA-32e, PTE format varies with page size. Therefore 'pgsize'
  parameter of pte_make_fileoff() is preferable.
2016-04-21 23:22:54 +09:00
e2e0fad849 arch_clear_host_user_space: set zero to args[2]
to avoid duplicated per_proc_list entry.
2016-04-21 23:22:54 +09:00
397bf3f4a6 wait_zombie: don't wait attached process
refs #726
2016-04-21 20:28:36 +09:00
aa77228453 resupport ptrace(PTRACE_ATTACH)
refs #733
2016-04-21 20:13:27 +09:00
82cb8f95ed update PAPI support. 2016-04-18 13:07:45 +09:00
3f2b4e7282 do_wait: unlink child from children_list if child terminated
refs #724
2016-04-14 10:25:12 +09:00
d6784bb4a5 update auto-generated files 2016-04-11 22:25:53 +09:00
1bb948f43b hwloc support 2016-04-11 22:25:27 +09:00
2a1823d52c vdso: set enable bit of pvti_msr 2016-04-11 22:20:39 +09:00
89943dc5ba vdso: set physical address at pvti_msr 2016-04-11 22:20:39 +09:00
fceb02a44a vdso: add zero clear for pvti 2016-04-11 22:20:38 +09:00
7298d8e179 vdso: correct pvti array element type
struct pvclock_vsyscall_time_info <-- struct pvclock_vcpu_time_info
2016-04-11 22:20:38 +09:00
6f32544dde vdso: add static cast 2016-04-11 22:20:38 +09:00
10d248b3cc mcexec: include config.h 2016-04-11 22:20:38 +09:00
fb32120659 make mcoverlayfs optional (default: enabled) 2016-04-02 15:43:35 -04:00
73de203c16 update auto-generated files 2016-03-28 22:57:45 +09:00
41bb2ab5e6 support vdso which borrows clocksource from linux 2016-03-28 22:57:44 +09:00
a587c8f5e5 x86: encode cpu# in IA32_TSC_AUX and size of GDTe#15 2016-03-28 22:57:44 +09:00
0c53a5ca35 add NOPHYS which means no physical memory 2016-03-28 22:57:44 +09:00
c760a01a79 add pte_get_attr() 2016-03-28 22:57:44 +09:00
a2c29e8abf correct the value of tod_data.origin
tod_data.origin should hold a time when TSC is zero.
2016-03-28 22:57:39 +09:00
18add6a9bd shmctl(IPC_RMID): fix wrong owner/creator checking (revised)
Don't check owner/creator of the segment in case of superuser.
2016-03-28 16:02:24 +09:00
a083e6c2bf Revert "shmctl(IPC_RMID): fix wrong owner/creator checking"
This reverts commit 8b5b075f4c.

The reverted commit modifies IPC_SET instead of IPC_RMID.
2016-03-28 16:00:39 +09:00
a2548f5421 Revert "fix REQ-42"
This reverts commit 4a0682bbc1.

The reverted commit appears to be wrong, for example:
- arch_range_check()'s arguments and parameters are mismatch.
- arch_range_check() implementation is not checking range.

Conflicts:
	kernel/syscall.c
2016-03-28 13:51:57 +09:00
6790126a23 fix REQ-37: remap_one_page: remove to check page size 2016-03-27 14:05:00 +09:00
1195549f41 fix REQ-19: some syscalls change how to access user space 2016-03-27 11:43:53 +09:00
b0096a2740 fix REQ-51 2016-03-26 12:23:51 +09:00
a11479eba8 fix REQ-48 2016-03-25 13:05:53 +09:00
12eaea401e fix REQ-46 2016-03-25 12:59:18 +09:00
31595b7409 fix REQ-43 2016-03-25 12:57:31 +09:00
4a0682bbc1 fix REQ-42 2016-03-24 19:14:50 +09:00
932a287437 fix REQ-40 2016-03-24 13:46:13 +09:00
670741ae40 fix REQ-39 2016-03-24 13:45:15 +09:00
70b27e06ff eclair: change default kernel to ./mckernel.img 2016-03-23 20:00:57 +09:00
4c38ddb623 update auto-generated files 2016-03-23 20:00:57 +09:00
6f00ddced6 move eclair from ihk repository 2016-03-23 20:00:57 +09:00
c0eecd63c9 update auto-generated files 2016-03-23 20:00:57 +09:00
1fd0b03e78 move config.h.in
from executer/kernel/mcctrl/config.h.in
to   executer/config.h.in
2016-03-23 20:00:57 +09:00
6c59de9300 expand AC_PROT_CC only once 2016-03-23 20:00:57 +09:00
b1309a5d53 map PIE at map_end instead of at user_start 2016-03-23 19:14:28 +09:00
489cd6d1a2 refactor prepare_process_ranges_args_envs() 2016-03-23 19:14:28 +09:00
c9cc4330c8 mincore: take into account pages cached in memobj 2016-03-23 19:14:28 +09:00
604f846cd2 mincore: check [start..start+len) is in user region 2016-03-23 19:14:28 +09:00
e939cf6862 mincore: cosmetic changes 2016-03-23 19:14:28 +09:00
72f2e5ebe0 shmobj: implement lookup_page method 2016-03-23 19:14:28 +09:00
bd7dddd415 fileobj: implement lookup_page method 2016-03-23 19:14:28 +09:00
fbd9dc878b memobj: add lookup_page method 2016-03-23 19:14:28 +09:00
d6c51ff997 treat memory devices as regular files,
to enable processes to mmap() /dev/zero
2016-03-23 19:14:27 +09:00
86ac51157c add error checks to shmctl(SHM_UNLOCK) 2016-03-23 19:14:27 +09:00
b73fa2b972 add error checks to shmctl(SHM_LOCK) 2016-03-23 19:14:27 +09:00
798f69bceb add has_cap_ipc_lock() 2016-03-23 19:14:27 +09:00
e8be52a1ff shm: trace the amount of locked segment per user 2016-03-23 19:14:27 +09:00
8b5b075f4c shmctl(IPC_RMID): fix wrong owner/creator checking
Don't check owner/creator of the segment in case of superuser.
2016-03-23 19:14:27 +09:00
b214fc278a add has_cap_sys_admin() 2016-03-23 19:14:27 +09:00
b3ae7f46bd add rlim_t (a type of rlim_cur and rlim_max) 2016-03-23 19:14:27 +09:00
48167d3223 shmget: add "shmflg" checks for SHM_HUGE* 2016-03-23 19:14:27 +09:00
d65135c040 move sys_shmget() into arch-dependent code 2016-03-23 19:14:27 +09:00
1761acc4c3 eliminate geteuid(), getegid() and getpid() 2016-03-23 19:04:32 +09:00
d4d93df032 mmap: add "flags" checks for MAP_HUGE* 2016-03-23 19:04:32 +09:00
261bddb999 add a member pgshift into struct vm_range
pgshift indicates a page size in the range.
2016-03-23 19:04:32 +09:00
1a3bc851af mprotect: return -ENOMEM if speicified range is out of range 2016-03-23 19:04:32 +09:00
15f572ef9c mmap: return -ENOMEM if speicified range is out of range 2016-03-23 19:04:32 +09:00
81690c5b5a mmap: cosmetic changes 2016-03-23 19:04:32 +09:00
832c0f9afd refactor copy_user_ranges() 2016-03-23 19:04:32 +09:00
f92cac7751 add type casting to the argument of getlong_user() 2016-03-23 19:04:32 +09:00
e74eb1dd51 add some prototypes to <memory.h> 2016-03-23 19:04:32 +09:00
8f7b9072ea refactor some copyin/copyout functions
- copy_from_user()
- getlong_user()
- getint_user()
- copy_to_user()
- setlong_user()
- setint_user()
2016-03-23 19:04:32 +09:00
4595aa3079 pte_visitor_t(): change "pgsize" into "pgshift" 2016-03-23 19:04:32 +09:00
807d294ac4 signalfd4: fix initialize 2016-06-03 20:58:02 +09:00
c947dd0d49 sysfs: support /sys/devices/system/cpu/online 2016-03-22 20:25:34 +09:00
d192e6c0fe modify PAPI support 2016-03-22 15:52:59 +09:00
7dbbcb362f add PAPI support 2016-03-22 15:27:19 +09:00
593cf98015 add ACSL annotation 2016-03-16 15:42:32 +09:00
8dd9f5ef3f support profil 2016-03-12 16:47:19 +09:00
0eaf058a4f mcexec: -lrt to Makefile.in for supporting clock_gettime() on SUSE 2016-03-12 05:24:14 +09:00
1aac2c8e23 add CPU timer initialization (refs #402)
There is no actual initialization in x86 now.
The initialization rely on hardware reset and Linux initialization.
2016-03-11 19:20:37 +09:00
70e8dd7979 remove initialization of TSC (refs #362) 2016-03-11 19:17:29 +09:00
eb0700359b fix REQ-36 2016-03-10 10:33:38 +09:00
3f16a9443e ptrace_report_signal: save debug regs before to send SIGCHLD to tracer 2016-03-09 22:29:51 +09:00
bf0cf0a346 fix REQ-31 2016-03-08 15:19:03 +09:00
14b868907b fix REQ-27 2016-03-07 18:52:08 +09:00
dbc778e4fa support getrusage (work in progress) 2016-03-07 17:06:44 +09:00
7fac03d4de sysfs: support /sys/devices/system/cpu/offline,online,possible,present 2016-03-04 13:48:06 +09:00
26c0180374 rwlock_reader_lock: fix lock list jammed up 2016-03-03 22:47:48 +09:00
8ebb3a4231 schedule: migration free last thread if terminated 2016-03-03 22:44:44 +09:00
f1f1ba9c8c mcs_rwlock_reader_lock: temporary fix 2016-03-01 19:11:42 +09:00
6ce00b5f0f sysfs: samples of snooping ops 2016-02-29 19:59:04 +09:00
4ec0e02a89 sysfs: add snooping ops 2016-02-29 19:23:01 +09:00
8f9192ac36 mcctrl: workaround for out-of-tree build (2/2)
- update auto-generated file
2016-02-29 19:18:08 +09:00
80ce123ab6 mcctrl: workaround for out-of-tree build (1/2) 2016-02-29 19:18:08 +09:00
1dc8513cd3 fix REQ-20 2016-02-26 16:18:30 +09:00
b0054643c0 REQ-18 2016-02-26 16:17:23 +09:00
972ff73ecf mcexec: fix readlink
refs #692
2016-02-25 16:08:42 +09:00
1f8a859b47 mcctrl: update auto-generated files 2016-02-24 21:34:48 +09:00
2601d8a36f mcctrl: use zap_page_range() instead of madvise() 2016-02-24 21:34:48 +09:00
a713c2fcaa fix REQ-16 2016-02-24 20:58:07 +09:00
c4c5e435cc fix REQ-12 2016-02-24 20:57:45 +09:00
853b56c784 mcreboot-smp-x86.sh: add mount to ceate /tmp/mcos/linux_proc from /proc 2016-02-24 19:24:37 +09:00
863a5c5e5f fix REQ-2, REQ-6, REQ-8 2016-02-23 16:32:17 +09:00
ebce1cb031 Merge branch 'master' of postpeta.pccluster.org:mckernel 2016-02-22 13:34:00 +09:00
fff7744907 mcklogd support 2016-02-22 13:32:20 +09:00
27c3ed7e96 remove debug print 2016-02-21 15:17:42 +09:00
e2b28da32f signal handler support gdb stepi command 2016-02-21 14:55:34 +09:00
2c50b716fd support setitimer/getitimer 2016-02-19 15:25:05 +09:00
307b2b8da5 clock_gettime: support clock_id CLOCK_PROCESS_CPUTIME_ID and CLOCK_THREAD_CPUTIME_ID 2016-02-18 17:43:13 +09:00
eba2be8a35 support times 2016-02-18 13:14:18 +09:00
a997af71be support tkill
refs #664
2016-02-17 12:48:12 +09:00
e7c37b8000 mcreboot-smp-x86.sh: fix Failed to mount /sys/devices/virtual/mcos/mcos0/sys 2016-02-16 16:05:40 +09:00
8c40f94aa8 /proc/<PID>/mem: support read/write 2016-02-16 13:21:29 +09:00
da13bd408a mcexec: add to initialize some structures (REQ-56)
refs #718
2016-02-15 18:20:58 +09:00
c328d26b8d procfs(/proc/<PID>/task/<TID>/stat): fix memory corruption
refs #722
2016-02-15 15:10:00 +09:00
6cda6792a9 process_msg_init_acked: don't use PA 2016-02-14 22:47:52 +09:00
2d3fda1d0b flatten_strings: fix align (REQ-1) 2016-02-14 22:36:58 +09:00
5d43c135db procfs: (temporary fix) unsupported files are closed 2016-02-10 17:10:54 +09:00
a866192db7 refactoring /proc 2016-02-10 08:11:02 +09:00
c0cc6ac6db Add skeleton for perf_event_open. 2016-02-09 14:54:53 +09:00
14c5bc08c2 mcexec: check Linux version from actual kernel tree instead of system wide include 2016-02-09 14:07:08 +09:00
7f01d273d0 mcctrl: fix out-of-tree build (not finding config.h) 2016-02-09 12:45:58 +09:00
137e0a799c mcexec: unshare and mount request through mcctrl 2016-02-08 16:27:03 +09:00
f214ff1b57 mcctrl: add MCEXEC_UP_SYS_MOUNT, MCEXEC_UP_SYS_UNSHARE 2016-02-08 16:00:52 +09:00
0ce698eb1f mcexec: support for /sys mounted by mcoverlayfs 2016-02-08 11:36:03 +09:00
e601248bdc procfs: fix mcos%d/PID/auxv size 2016-02-08 09:38:27 +09:00
d8eeab9b89 mcoverlayfs: enable out of tree compilation 2016-02-01 00:35:53 +09:00
fdf031ac16 procfs: chown procfs entries (temporary hack)
refs #651
refs #699
2016-01-28 16:29:46 +09:00
1ffe740153 sysfs sample 2016-01-26 18:08:25 +09:00
72968d613e support sysfs interface for mcctrl 2016-01-26 18:08:25 +09:00
2e98f875c3 sysfs: attempt to remove empty directories only 2016-01-26 18:08:25 +09:00
a6cb9a6b93 sysfs: lookup_i(): refactoring 2016-01-26 18:08:25 +09:00
da0a91b9f7 mcctrl: denote full path in /proc/PID/exe 2016-01-26 16:21:52 +09:00
f093786bec x86: populating PML4e and PDPTe is now lock-free 2016-01-25 09:17:06 +09:00
368f155328 sigaction: support SA_NODEFER
refs #698
2016-01-21 18:48:10 +09:00
425f920013 mcctrl: delete procfs entries recursively to avoid leaking 2016-01-21 18:15:59 +09:00
dbddf37579 set termsig to mcexec spawned process 2016-01-21 12:08:47 +09:00
fa7a5ccd11 support /proc/self/exe (needed for GDB to attach to an existing process) 2016-01-19 18:23:02 +09:00
172bf0a389 sched_setaffinity: add permission check 2016-01-15 12:05:18 +09:00
9bafd166e3 futex: support FUTEX_CLOCK_REALTIME 2016-01-14 16:18:49 +09:00
2e31b8abd1 clock_gettime: clock_id != CLOCK_REALTIME -> offload to linux 2016-01-13 14:04:06 +09:00
a42ee00101 NR_execve: initialize local variable 'shell'
refs #696
2016-01-13 11:16:19 +09:00
f6935b0869 ptrace_setsiginfo: update recieved siginfo 2016-01-11 17:37:29 +09:00
03a7763a5e ptrace_conf: set received siginfo to default siginfo 2016-01-11 17:10:30 +09:00
3a2f7b0106 clone: support CLONE_PARENT 2016-01-11 16:49:02 +09:00
2819ec2197 fix extra copy which might cause page faults 2016-01-06 21:12:57 +09:00
f7d81a9281 fix typo 2016-01-06 21:12:57 +09:00
914faf042d add missing kfree() for channel lookup table 2016-01-06 21:12:57 +09:00
75c6a94839 delete struct member 'type' from address_space structure 2016-01-06 20:17:00 +09:00
f7b5b48266 support x2apic 2016-01-06 13:53:02 +09:00
f9bd83c726 ptrace: fix PTRACE_GETREGSET, PTRACE_SETREGSET bug
refs #608
2015-12-28 19:45:50 +09:00
edc275ce4f delete free_list_lock 2015-12-28 11:31:42 +09:00
d00ea61d1a ptrace_wakeup_sig: fix thread lock 2015-12-28 10:33:07 +09:00
01117e92c9 append file path to symlink if link path is absolute
refs #643
2015-12-25 15:50:39 +09:00
d477096cb0 getrlimit, setrlimit: offload to linux when an unknown parameter was specified
refs #660
2015-12-25 15:35:33 +09:00
f44ddfa3b3 support sigtimedwait 2015-12-24 12:35:45 +09:00
e0acd254b1 do_process_vm_read_writev: use process hash for remote process search 2015-12-22 09:47:00 +09:00
d0507f7e9f process_read/write_vm(): fix LTP bugs 2015-12-18 15:58:51 +09:00
0f8b2aba22 reset signal handlers when execve called 2015-12-18 12:46:53 +09:00
7e5c7445e2 fix ptrace_detach bug
refs #662
2015-12-16 17:41:57 +09:00
a055fb525d sysfs sample 2015-12-16 13:42:30 +09:00
8cb72df663 support McKernel's sysfs tree 2015-12-16 13:42:30 +09:00
e805249651 add strrchr() 2015-12-16 13:42:30 +09:00
06a7889e1f chown root mcexec 2015-12-15 16:22:14 +09:00
20deed09f0 mcexec: support for /proc mounted by mcoverlayfs 2015-12-14 14:47:05 +09:00
bb81f84709 support PIE executable for PVAS 2015-12-14 11:05:28 +09:00
5c1dad1660 GDB: async-shell.exp
refs #650
2015-11-26 17:07:13 +09:00
7f2220b8e9 set '\0' termination to readlink result.
refs #643
2015-11-26 16:58:15 +09:00
65dda3f24e mcoverlayfs: support mount options(nocopyupw, nofscheck) 2015-11-25 15:34:58 +09:00
544971d665 modify for PVAS 2015-11-25 14:27:20 +09:00
dbddab4356 mcoverlayfs: add overlayfs of the original(kernel 4.0.9) 2015-11-25 13:23:49 +09:00
12eb8a9bb0 mcctrl: move mcctrl to executer/kernel/mcctrl 2015-11-24 15:42:04 +09:00
828a3ea57a futex(): support for cross address-space futexes 2015-11-24 14:58:04 +09:00
124 changed files with 25390 additions and 5575 deletions

View File

@ -1,9 +1,11 @@
TARGET = @TARGET@
SBINDIR = @SBINDIR@
ETCDIR = @ETCDIR@
MANDIR = @MANDIR@
all::
@(cd executer/kernel; make modules)
@(cd executer/kernel/mcctrl; make modules)
@(cd executer/kernel/mcoverlayfs; make modules)
@(cd executer/user; make)
@case "$(TARGET)" in \
attached-mic | builtin-x86 | builtin-mic | smp-x86) \
@ -16,7 +18,8 @@ all::
esac
install::
@(cd executer/kernel; make install)
@(cd executer/kernel/mcctrl; make install)
@(cd executer/kernel/mcoverlayfs; make install)
@(cd executer/user; make install)
@case "$(TARGET)" in \
attached-mic | builtin-x86 | builtin-mic | smp-x86) \
@ -46,6 +49,9 @@ install::
mkdir -p -m 755 $(SBINDIR); \
install -m 755 arch/x86/tools/mcreboot-smp-x86.sh $(SBINDIR)/mcreboot.sh; \
install -m 755 arch/x86/tools/mcstop+release-smp-x86.sh $(SBINDIR)/mcstop+release.sh; \
mkdir -p -m 755 $(ETCDIR); \
install -m 644 arch/x86/tools/irqbalance_mck.service $(ETCDIR)/irqbalance_mck.service; \
install -m 644 arch/x86/tools/irqbalance_mck.in $(ETCDIR)/irqbalance_mck.in; \
mkdir -p -m 755 $(MANDIR)/man1; \
install -m 644 arch/x86/tools/mcreboot.1 $(MANDIR)/man1/mcreboot.1; \
;; \
@ -56,7 +62,8 @@ install::
esac
clean::
@(cd executer/kernel; make clean)
@(cd executer/kernel/mcctrl; make clean)
@(cd executer/kernel/mcoverlayfs; make clean)
@(cd executer/user; make clean)
@case "$(TARGET)" in \
attached-mic | builtin-x86 | builtin-mic | smp-x86) \

View File

@ -10,7 +10,7 @@
* HISTORY
*/
#define X86_CPU_LOCAL_OFFSET_TSS 128
#define X86_CPU_LOCAL_OFFSET_TSS 176
#define X86_TSS_OFFSET_SP0 4
#define X86_CPU_LOCAL_OFFSET_SP0 \
(X86_CPU_LOCAL_OFFSET_TSS + X86_TSS_OFFSET_SP0)

View File

@ -28,9 +28,12 @@
#include <signal.h>
#include <process.h>
#include <cls.h>
#include <prctl.h>
#include <page.h>
#define LAPIC_ID 0x020
#define LAPIC_TIMER 0x320
#define LAPIC_LVTPC 0x340
#define LAPIC_TIMER_INITIAL 0x380
#define LAPIC_TIMER_CURRENT 0x390
#define LAPIC_TIMER_DIVIDE 0x3e0
@ -40,6 +43,7 @@
#define LAPIC_ICR2 0x310
#define LAPIC_ESR 0x280
#define LOCAL_TIMER_VECTOR 0xef
#define LOCAL_PERF_VECTOR 0xf0
#define APIC_INT_LEVELTRIG 0x08000
#define APIC_INT_ASSERT 0x04000
@ -52,15 +56,30 @@
#define APIC_DIVISOR 16
#define APIC_LVT_TIMER_PERIODIC (1 << 17)
#define APIC_BASE_MSR 0x800
#define IA32_X2APIC_APICID 0x802
#define IA32_X2APIC_ICR 0x830
#define X2APIC_ENABLE (1UL << 10)
#define NMI_VECTOR 0x02
//#define DEBUG_PRINT_CPU
#ifdef DEBUG_PRINT_CPU
#define dkprintf kprintf
#define ekprintf kprintf
#else
#define dkprintf(...) do { if (0) kprintf(__VA_ARGS__); } while (0)
#define ekprintf kprintf
#endif
static void *lapic_vp;
static int x2apic;
static void (*lapic_write)(int reg, unsigned int value);
static unsigned int (*lapic_read)(int reg);
static void (*lapic_icr_write)(unsigned int h, unsigned int l);
static void (*lapic_wait_icr_idle)(void);
void (*x86_issue_ipi)(unsigned int apicid, unsigned int low);
int running_on_kvm(void);
void init_processors_local(int max_id);
void assign_processor_id(void);
@ -69,7 +88,9 @@ void x86_set_warm_reset(unsigned long ip, char *first_page_va);
void x86_init_perfctr(void);
int gettime_local_support = 0;
extern int ihk_mc_pt_print_pte(struct page_table *pt, void *virt);
extern int kprintf(const char *format, ...);
extern int interrupt_from_user(void *);
static struct idt_entry{
uint32_t desc[4];
@ -88,6 +109,12 @@ static uint64_t gdt[] __attribute__((aligned(16))) = {
0x00aff3000000ffff, /* 56 : USER_DS */
0x0000890000000067, /* 64 : TSS */
0, /* (72: TSS) */
0, /* 80 */
0, /* 88 */
0, /* 96 */
0, /* 104 */
0, /* 112 */
0x0000f10000000000, /* 120 : GETCPU */
};
struct tss64 tss __attribute__((aligned(16)));
@ -123,6 +150,12 @@ extern char debug_exception[], int3_exception[];
uint64_t boot_pat_state = 0;
int no_turbo = 0; /* May be updated by early parsing of kargs */
extern int num_processors; /* kernel/ap.c */
struct pvclock_vsyscall_time_info *pvti = NULL;
int pvti_npages;
static long pvti_msr = -1;
static void init_idt(void)
{
int i;
@ -148,6 +181,8 @@ static void init_idt(void)
}
static int xsave_available = 0;
static int xsave_size = 0;
static uint64_t xsave_mask = 0x0;
void init_fpu(void)
{
@ -191,6 +226,26 @@ void init_fpu(void)
xsetbv(0, reg);
dkprintf("init_fpu(): AVX init: XCR0 = 0x%016lX\n", reg);
}
if(xsave_available){
unsigned long eax;
unsigned long ebx;
unsigned long ecx;
unsigned long edx;
asm volatile("cpuid" : "=a"(eax),"=b"(ebx),"=c"(ecx),"=d"(edx) : "a" (0x0d), "c" (0x00));
xsave_size = ecx;
dkprintf("init_fpu(): xsave_size = %d\n", xsave_size);
if ((eax & (1 << 5)) && (eax & (1 << 6)) && (eax & (1 << 7))) {
/* Set xcr0[7:5] to enable avx-512 ops */
reg = xgetbv(0);
reg |= 0xe6;
xsetbv(0, reg);
dkprintf("init_fpu(): AVX-512 init: XCR0 = 0x%016lX\n", reg);
}
}
xsave_mask = xgetbv(0);
dkprintf("init_fpu(): xsave_mask = 0x%016lX\n", xsave_mask);
/* TODO: set MSR_IA32_XSS to enable xsaves/xrstors */
@ -201,6 +256,17 @@ void init_fpu(void)
asm volatile("finit");
}
int
get_xsave_size()
{
return xsave_size;
}
uint64_t get_xsave_mask()
{
return xsave_mask;
}
void reload_gdt(struct x86_desc_ptr *gdt_ptr)
{
asm volatile("pushq %1\n"
@ -238,25 +304,39 @@ void init_gdt(void)
reload_gdt(&gdt_desc);
}
static void *lapic_vp;
void lapic_write(int reg, unsigned int value)
static void
apic_write(int reg, unsigned int value)
{
*(volatile unsigned int *)((char *)lapic_vp + reg) = value;
}
unsigned int lapic_read(int reg)
static void
x2apic_write(int reg, unsigned int value)
{
reg >>= 4;
reg |= APIC_BASE_MSR;
wrmsr(reg, value);
}
static unsigned int
apic_read(int reg)
{
return *(volatile unsigned int *)((char *)lapic_vp + reg);
}
void lapic_icr_write(unsigned int h, unsigned int l)
static unsigned int
x2apic_read(int reg)
{
lapic_write(LAPIC_ICR2, (unsigned int)h);
lapic_write(LAPIC_ICR0, l);
unsigned long value;
reg >>= 4;
reg |= APIC_BASE_MSR;
value = rdmsr(reg);
return (int)value;
}
void lapic_timer_enable(unsigned int clocks)
void
lapic_timer_enable(unsigned int clocks)
{
unsigned int lvtt_value;
@ -268,11 +348,117 @@ void lapic_timer_enable(unsigned int clocks)
lapic_write(LAPIC_TIMER, lvtt_value);
}
void lapic_timer_disable()
void
lapic_timer_disable()
{
lapic_write(LAPIC_TIMER_INITIAL, 0);
}
void
lapic_ack(void)
{
lapic_write(LAPIC_EOI, 0);
}
static void
x2apic_wait_icr_idle(void)
{
}
static void
apic_wait_icr_idle(void)
{
while (lapic_read(LAPIC_ICR0) & APIC_ICR_BUSY) {
cpu_pause();
}
}
static void
x2apic_icr_write(unsigned int low, unsigned int apicid)
{
wrmsr(IA32_X2APIC_ICR, (((unsigned long)apicid) << 32) | low);
}
static void
apic_icr_write(unsigned int h, unsigned int l)
{
lapic_write(LAPIC_ICR2, (unsigned int)h);
lapic_write(LAPIC_ICR0, l);
}
static void
x2apic_x86_issue_ipi(unsigned int apicid, unsigned int low)
{
unsigned long icr = low;
unsigned long flags;
ihk_mc_mb();
flags = cpu_disable_interrupt_save();
x2apic_icr_write(icr, apicid);
cpu_restore_interrupt(flags);
}
static void
apic_x86_issue_ipi(unsigned int apicid, unsigned int low)
{
unsigned long flags;
flags = cpu_disable_interrupt_save();
apic_wait_icr_idle();
apic_icr_write(apicid << LAPIC_ICR_ID_SHIFT, low);
cpu_restore_interrupt(flags);
}
unsigned long
x2apic_is_enabled()
{
unsigned long msr;
msr = rdmsr(MSR_IA32_APIC_BASE);
return (msr & X2APIC_ENABLE);
}
void init_lapic_bsp(void)
{
if(x2apic_is_enabled()){
x2apic = 1;
lapic_write = x2apic_write;
lapic_read = x2apic_read;
lapic_icr_write = x2apic_icr_write;
lapic_wait_icr_idle = x2apic_wait_icr_idle;
x86_issue_ipi = x2apic_x86_issue_ipi;
}
else{
x2apic = 0;
lapic_write = apic_write;
lapic_read = apic_read;
lapic_icr_write = apic_icr_write;
lapic_wait_icr_idle = apic_wait_icr_idle;
x86_issue_ipi = apic_x86_issue_ipi;
}
}
void
init_lapic()
{
if(!x2apic){
unsigned long baseaddr;
/* Enable Local APIC */
baseaddr = rdmsr(MSR_IA32_APIC_BASE);
if (!lapic_vp) {
lapic_vp = map_fixed_area(baseaddr & PAGE_MASK, PAGE_SIZE, 1);
}
baseaddr |= 0x800;
wrmsr(MSR_IA32_APIC_BASE, baseaddr);
}
lapic_write(LAPIC_SPURIOUS, 0x1ff);
lapic_write(LAPIC_LVTPC, LOCAL_PERF_VECTOR);
}
void print_msr(int idx)
{
int bit;
@ -302,6 +488,8 @@ void init_pstate_and_turbo(void)
uint64_t value;
uint64_t eax, ecx;
if (running_on_kvm()) return;
asm volatile("cpuid" : "=a" (eax), "=c" (ecx) : "a" (0x6) : "%rbx", "%rdx");
if (!(ecx & 0x01)) {
/* P-states and/or Turbo Boost are not supported. */
@ -423,26 +611,6 @@ void init_pat(void)
dkprintf("PAT support detected and reconfigured.\n");
}
void init_lapic(void)
{
unsigned long baseaddr;
/* Enable Local APIC */
baseaddr = rdmsr(MSR_IA32_APIC_BASE);
if (!lapic_vp) {
lapic_vp = map_fixed_area(baseaddr & PAGE_MASK, PAGE_SIZE, 1);
}
baseaddr |= 0x800;
wrmsr(MSR_IA32_APIC_BASE, baseaddr);
lapic_write(LAPIC_SPURIOUS, 0x1ff);
}
void lapic_ack(void)
{
lapic_write(LAPIC_EOI, 0);
}
static void set_kstack(unsigned long ptr)
{
struct x86_cpu_local_variables *v;
@ -456,11 +624,17 @@ static void init_smp_processor(void)
{
struct x86_cpu_local_variables *v;
unsigned long tss_addr;
unsigned node_cpu;
v = get_x86_this_cpu_local();
tss_addr = (unsigned long)&v->tss;
v->apic_id = lapic_read(LAPIC_ID) >> LAPIC_ID_SHIFT;
if(x2apic_is_enabled()){
v->apic_id = rdmsr(IA32_X2APIC_APICID);
}
else{
v->apic_id = lapic_read(LAPIC_ID) >> LAPIC_ID_SHIFT;
}
memcpy(v->gdt, gdt, sizeof(v->gdt));
@ -471,6 +645,9 @@ static void init_smp_processor(void)
| (0x89UL << 40) | ((tss_addr & 0xff000000) << 32);
v->gdt[GLOBAL_TSS_ENTRY + 1] = (tss_addr >> 32);
node_cpu = v->processor_id; /* assumes NUMA node 0 */
v->gdt[GETCPU_ENTRY] |= node_cpu;
v->gdt_ptr.size = sizeof(v->gdt) - 1;
v->gdt_ptr.address = (unsigned long)v->gdt;
@ -478,6 +655,11 @@ static void init_smp_processor(void)
reload_gdt(&v->gdt_ptr);
set_kstack((unsigned long)get_x86_this_cpu_kstack());
/* MSR_IA32_TSC_AUX on KVM seems broken */
if (running_on_kvm()) return;
#define MSR_IA32_TSC_AUX 0xc0000103
wrmsr(MSR_IA32_TSC_AUX, node_cpu);
}
static char *trampoline_va, *first_page_va;
@ -497,9 +679,6 @@ void ihk_mc_init_ap(void)
kprintf("# of cpus : %d\n", cpu_info->ncpus);
init_processors_local(cpu_info->ncpus);
kprintf("IKC IRQ vector: %d, IKC target CPU APIC: %d\n",
ihk_ikc_irq, ihk_ikc_irq_apicid);
/* Do initialization for THIS cpu (BSP) */
assign_processor_id();
@ -621,6 +800,8 @@ void setup_x86(void)
check_no_execute();
init_lapic_bsp();
init_cpu();
init_gettime_support();
@ -671,6 +852,8 @@ void handle_interrupt(int vector, struct x86_user_context *regs)
lapic_ack();
++v->in_interrupt;
set_cputime(interrupt_from_user(regs)? 1: 2);
dkprintf("CPU[%d] got interrupt, vector: %d, RIP: 0x%lX\n",
ihk_mc_get_processor_id(), vector, regs->gpr.rip);
@ -732,6 +915,38 @@ void handle_interrupt(int vector, struct x86_user_context *regs)
ihk_mc_spinlock_unlock(&v->runq_lock, irqstate);
dkprintf("timer[%lu]: CPU_FLAG_NEED_RESCHED \n", rdtsc());
}
else if (vector == LOCAL_PERF_VECTOR) {
struct siginfo info;
unsigned long value;
struct thread *thread = cpu_local_var(current);
struct process *proc = thread->proc;
long irqstate;
struct mckfd *fdp;
lapic_write(LAPIC_LVTPC, LOCAL_PERF_VECTOR);
value = rdmsr(MSR_PERF_GLOBAL_STATUS);
wrmsr(MSR_PERF_GLOBAL_OVF_CTRL, value);
wrmsr(MSR_PERF_GLOBAL_OVF_CTRL, 0);
irqstate = ihk_mc_spinlock_lock(&proc->mckfd_lock);
for(fdp = proc->mckfd; fdp; fdp = fdp->next) {
if(fdp->sig_no > 0)
break;
}
ihk_mc_spinlock_unlock(&proc->mckfd_lock, irqstate);
if(fdp) {
memset(&info, '\0', sizeof info);
info.si_signo = fdp->sig_no;
info._sifields._sigfault.si_addr = (void *)regs->gpr.rip;
info._sifields._sigpoll.si_fd = fdp->fd;
set_signal(fdp->sig_no, regs, &info);
}
else {
set_signal(SIGIO, regs, NULL);
}
}
else if (vector >= IHK_TLB_FLUSH_IRQ_VECTOR_START &&
vector < IHK_TLB_FLUSH_IRQ_VECTOR_END) {
@ -745,14 +960,19 @@ void handle_interrupt(int vector, struct x86_user_context *regs)
}
}
check_signal(0, regs, 0);
check_need_resched();
if(interrupt_from_user(regs)){
cpu_enable_interrupt();
check_signal(0, regs, 0);
check_need_resched();
}
set_cputime(0);
--v->in_interrupt;
}
void gpe_handler(struct x86_user_context *regs)
{
set_cputime(interrupt_from_user(regs)? 1: 2);
kprintf("General protection fault (err: %lx, %lx:%lx)\n",
regs->gpr.error, regs->gpr.cs, regs->gpr.rip);
arch_show_interrupt_context(regs);
@ -760,8 +980,12 @@ void gpe_handler(struct x86_user_context *regs)
panic("gpe_handler");
}
set_signal(SIGSEGV, regs, NULL);
check_signal(0, regs, 0);
check_need_resched();
if(interrupt_from_user(regs)){
cpu_enable_interrupt();
check_signal(0, regs, 0);
check_need_resched();
}
set_cputime(0);
// panic("GPF");
}
@ -771,6 +995,7 @@ void debug_handler(struct x86_user_context *regs)
int si_code = 0;
struct siginfo info;
set_cputime(interrupt_from_user(regs)? 1: 2);
#ifdef DEBUG_PRINT_CPU
kprintf("debug exception (err: %lx, %lx:%lx)\n",
regs->gpr.error, regs->gpr.cs, regs->gpr.rip);
@ -788,14 +1013,19 @@ void debug_handler(struct x86_user_context *regs)
memset(&info, '\0', sizeof info);
info.si_code = si_code;
set_signal(SIGTRAP, regs, &info);
check_signal(0, regs, 0);
check_need_resched();
if(interrupt_from_user(regs)){
cpu_enable_interrupt();
check_signal(0, regs, 0);
check_need_resched();
}
set_cputime(0);
}
void int3_handler(struct x86_user_context *regs)
{
struct siginfo info;
set_cputime(interrupt_from_user(regs)? 1: 2);
#ifdef DEBUG_PRINT_CPU
kprintf("int3 exception (err: %lx, %lx:%lx)\n",
regs->gpr.error, regs->gpr.cs, regs->gpr.rip);
@ -805,25 +1035,67 @@ void int3_handler(struct x86_user_context *regs)
memset(&info, '\0', sizeof info);
info.si_code = TRAP_BRKPT;
set_signal(SIGTRAP, regs, &info);
check_signal(0, regs, 0);
check_need_resched();
}
static void wait_icr_idle(void)
{
while (lapic_read(LAPIC_ICR0) & APIC_ICR_BUSY) {
cpu_pause();
if(interrupt_from_user(regs)){
cpu_enable_interrupt();
check_signal(0, regs, 0);
check_need_resched();
}
set_cputime(0);
}
void x86_issue_ipi(unsigned int apicid, unsigned int low)
void
unhandled_page_fault(struct thread *thread, void *fault_addr, void *regs)
{
unsigned long flags;
const uintptr_t address = (uintptr_t)fault_addr;
struct process_vm *vm = thread->vm;
struct vm_range *range;
char found;
unsigned long irqflags;
unsigned long error = ((struct x86_user_context *)regs)->gpr.error;
flags = cpu_disable_interrupt_save();
wait_icr_idle();
lapic_icr_write(apicid << LAPIC_ICR_ID_SHIFT, low);
cpu_restore_interrupt(flags);
irqflags = kprintf_lock();
__kprintf("Page fault for 0x%lx\n", address);
__kprintf("%s for %s access in %s mode (reserved bit %s set), "
"it %s an instruction fetch\n",
(error & PF_PROT ? "protection fault" : "no page found"),
(error & PF_WRITE ? "write" : "read"),
(error & PF_USER ? "user" : "kernel"),
(error & PF_RSVD ? "was" : "wasn't"),
(error & PF_INSTR ? "was" : "wasn't"));
found = 0;
list_for_each_entry(range, &vm->vm_range_list, list) {
if (range->start <= address && range->end > address) {
found = 1;
__kprintf("address is in range, flag: 0x%lx\n",
range->flag);
ihk_mc_pt_print_pte(vm->address_space->page_table, (void*)address);
break;
}
}
if (!found) {
__kprintf("address is out of range! \n");
}
kprintf_unlock(irqflags);
/* TODO */
ihk_mc_debug_show_interrupt_context(regs);
//dkprintf("now dump a core file\n");
//coredump(proc, regs);
#ifdef DEBUG_PRINT_MEM
{
uint64_t *sp = (void *)REGS_GET_STACK_POINTER(regs);
kprintf("*rsp:%lx,*rsp+8:%lx,*rsp+16:%lx,*rsp+24:%lx,\n",
sp[0], sp[1], sp[2], sp[3]);
}
#endif
return;
}
static void outb(uint8_t v, uint16_t port)
@ -852,12 +1124,12 @@ static void __x86_wakeup(int apicid, unsigned long ip)
x86_issue_ipi(apicid,
APIC_INT_LEVELTRIG | APIC_DM_INIT);
wait_icr_idle();
lapic_wait_icr_idle();
while (retry--) {
lapic_read(LAPIC_ESR);
x86_issue_ipi(apicid, APIC_DM_STARTUP | (ip >> 12));
wait_icr_idle();
lapic_wait_icr_idle();
arch_delay(200);
@ -868,6 +1140,10 @@ static void __x86_wakeup(int apicid, unsigned long ip)
/** IHK Functions **/
/*@
@ assigns \nothing;
@ ensures \interrupt_disabled == 0;
@*/
void cpu_halt(void)
{
asm volatile("hlt");
@ -1170,6 +1446,10 @@ void arch_show_extended_context(void)
}
#endif
/*@
@ requires \valid(reg);
@ assigns \nothing;
@*/
void arch_show_interrupt_context(const void *reg)
{
const struct x86_user_context *uctx = reg;
@ -1258,8 +1538,8 @@ int ihk_mc_interrupt_cpu(int cpu, int vector)
}
/*@
@ requires \valid(proc);
@ ensures proc->fp_regs == NULL;
@ requires \valid(thread);
@ ensures thread->fp_regs == NULL;
@*/
void
release_fp_regs(struct thread *thread)
@ -1269,18 +1549,23 @@ release_fp_regs(struct thread *thread)
if (thread && !thread->fp_regs)
return;
pages = (sizeof(fp_regs_struct) + 4095) >> 12;
pages = (xsave_size + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
dkprintf("release_fp_regs: pages=%d\n", pages);
ihk_mc_free_pages(thread->fp_regs, pages);
thread->fp_regs = NULL;
}
/*@
@ requires \valid(thread);
@*/
void
save_fp_regs(struct thread *thread)
{
int pages;
if (!thread->fp_regs) {
pages = (sizeof(fp_regs_struct) + 4095) >> 12;
pages = (xsave_size + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
dkprintf("save_fp_regs: pages=%d\n", pages);
thread->fp_regs = ihk_mc_alloc_pages(pages, IHK_MC_AP_NOWAIT);
if (!thread->fp_regs) {
@ -1289,14 +1574,15 @@ save_fp_regs(struct thread *thread)
}
memset(thread->fp_regs, 0, sizeof(fp_regs_struct));
memset(thread->fp_regs, 0, pages * PAGE_SIZE);
}
if (xsave_available) {
unsigned int low, high;
/* Request full save of x87, SSE and AVX states */
low = 0x7;
high = 0;
/* Request full save of x87, SSE, AVX and AVX-512 states */
low = (unsigned int)xsave_mask;
high = (unsigned int)(xsave_mask >> 32);
asm volatile("xsave %0" : : "m" (*thread->fp_regs), "a" (low), "d" (high)
: "memory");
@ -1305,6 +1591,10 @@ save_fp_regs(struct thread *thread)
}
}
/*@
@ requires \valid(thread);
@ assigns thread->fp_regs;
@*/
void
restore_fp_regs(struct thread *thread)
{
@ -1314,9 +1604,9 @@ restore_fp_regs(struct thread *thread)
if (xsave_available) {
unsigned int low, high;
/* Request full restore of x87, SSE and AVX states */
low = 0x7;
high = 0;
/* Request full restore of x87, SSE, AVX and AVX-512 states */
low = (unsigned int)xsave_mask;
high = (unsigned int)(xsave_mask >> 32);
asm volatile("xrstor %0" : : "m" (*thread->fp_regs),
"a" (low), "d" (high));
@ -1353,8 +1643,186 @@ ihk_mc_user_context_t *lookup_user_context(struct thread *thread)
return uctx;
} /* lookup_user_context() */
void zero_tsc(void)
extern long do_arch_prctl(unsigned long code, unsigned long address);
void
ihk_mc_init_user_tlsbase(ihk_mc_user_context_t *ctx,
unsigned long tls_base_addr)
{
wrmsr(MSR_IA32_TIME_STAMP_COUNTER, 0);
do_arch_prctl(ARCH_SET_FS, tls_base_addr);
}
/*@
@ assigns \nothing;
@*/
void init_tick(void)
{
dkprintf("init_tick():\n");
return;
}
/*@
@ assigns \nothing;
@*/
void init_delay(void)
{
dkprintf("init_delay():\n");
return;
}
/*@
@ assigns \nothing;
@*/
void sync_tick(void)
{
dkprintf("sync_tick():\n");
return;
}
static int is_pvclock_available(void)
{
uint32_t eax;
uint32_t ebx;
uint32_t ecx;
uint32_t edx;
dkprintf("is_pvclock_available()\n");
#define KVM_CPUID_SIGNATURE 0x40000000
asm ("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
: "a" (KVM_CPUID_SIGNATURE));
if ((eax && (eax < 0x40000001))
|| (ebx != 0x4b4d564b)
|| (ecx != 0x564b4d56)
|| (edx != 0x0000004d)) {
dkprintf("is_pvclock_available(): false (not kvm)\n");
return 0;
}
#define KVM_CPUID_FEATURES 0x40000001
asm ("cpuid" : "=a"(eax)
: "a"(KVM_CPUID_FEATURES)
: "%ebx", "%ecx", "%edx");
#define KVM_FEATURE_CLOCKSOURCE2 3
if (eax & (1 << KVM_FEATURE_CLOCKSOURCE2)) {
#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01
pvti_msr = MSR_KVM_SYSTEM_TIME_NEW;
dkprintf("is_pvclock_available(): true (new)\n");
return 1;
}
#define KVM_FEATURE_CLOCKSOURCE 0
else if (eax & (1 << KVM_FEATURE_CLOCKSOURCE)) {
#define MSR_KVM_SYSTEM_TIME 0x12
pvti_msr = MSR_KVM_SYSTEM_TIME;
dkprintf("is_pvclock_available(): true (old)\n");
return 1;
}
dkprintf("is_pvclock_available(): false (not supported)\n");
return 0;
} /* is_pvclock_available() */
int arch_setup_pvclock(void)
{
size_t size;
int npages;
dkprintf("arch_setup_pvclock()\n");
if (!is_pvclock_available()) {
dkprintf("arch_setup_pvclock(): not supported\n");
return 0;
}
size = num_processors * sizeof(*pvti);
npages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
pvti_npages = npages;
pvti = allocate_pages(npages, IHK_MC_AP_NOWAIT);
if (!pvti) {
ekprintf("arch_setup_pvclock: allocate_pages failed.\n");
return -ENOMEM;
}
memset(pvti, 0, PAGE_SIZE*npages);
dkprintf("arch_setup_pvclock(): ok\n");
return 0;
} /* arch_setup_pvclock() */
void arch_start_pvclock(void)
{
int cpu;
intptr_t phys;
dkprintf("arch_start_pvclock()\n");
if (!pvti) {
dkprintf("arch_start_pvclock(): not supported\n");
return;
}
cpu = ihk_mc_get_processor_id();
phys = virt_to_phys(&pvti[cpu]);
#define KVM_SYSTEM_TIME_ENABLE 0x1
wrmsr(pvti_msr, phys|KVM_SYSTEM_TIME_ENABLE);
dkprintf("arch_start_pvclock(): ok\n");
return;
} /* arch_start_pvclock() */
static struct cpu_mapping *cpu_mapping = NULL;
int arch_get_cpu_mapping(struct cpu_mapping **buf, int *nelemsp)
{
int error;
size_t size;
int npages;
struct cpu_mapping *mapping;
int cpu;
struct x86_cpu_local_variables *v;
if (!cpu_mapping) {
size = sizeof(*mapping) * num_processors;
npages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
mapping = allocate_pages(npages, IHK_MC_AP_NOWAIT);
if (!mapping) {
error = -ENOMEM;
ekprintf("arch_get_cpu_mapping:allocate_pages failed. %d\n", error);
goto out;
}
for (cpu = 0; cpu < num_processors; ++cpu) {
v = get_x86_cpu_local_variable(cpu);
mapping[cpu].cpu_number = cpu;
mapping[cpu].hw_id = v->apic_id;
}
cpu_mapping = mapping;
}
error = 0;
*buf = cpu_mapping;
*nelemsp = num_processors;
out:
return error;
} /* arch_get_cpu_mapping() */
#define KVM_CPUID_SIGNATURE 0x40000000
int running_on_kvm(void) {
static const char signature[12] = "KVMKVMKVM\0\0";
const uint32_t *sigptr = (const uint32_t *)signature;
uint64_t op;
uint64_t eax;
uint64_t ebx;
uint64_t ecx;
uint64_t edx;
op = KVM_CPUID_SIGNATURE;
asm volatile("cpuid" : "=a"(eax),"=b"(ebx),"=c"(ecx),"=d"(edx) : "a" (op));
if (ebx == sigptr[0] && ecx == sigptr[1] && edx == sigptr[2]) {
return 1;
}
return 0;
}
/*** end of file ***/

View File

@ -271,6 +271,17 @@ void fill_note(void *note, struct thread *thread, void *regs)
* should be zero.
*/
/*@
@ requires \valid(thread);
@ requires \valid(regs);
@ requires \valid(coretable);
@ requires \valid(chunks);
@ behavior success:
@ ensures \result == 0;
@ assigns coretable;
@ behavior failure:
@ ensures \result == -1;
@*/
int gencore(struct thread *thread, void *regs,
struct coretable **coretable, int *chunks)
{
@ -510,6 +521,10 @@ int gencore(struct thread *thread, void *regs,
* \param coretable An array of core chunks.
*/
/*@
@ requires \valid(coretable);
@ assigns \nothing;
@*/
void freecore(struct coretable **coretable)
{
struct coretable *ct = *coretable;

View File

@ -0,0 +1,96 @@
/**
* \file arch-bitops.h
* License details are found in the file LICENSE.
* \brief
* Find last set bit in word.
* \author Taku Shimosawa <shimosawa@is.s.u-tokyo.ac.jp> \par
* Copyright (C) 2011 - 2012 Taku Shimosawa
*/
/*
* HISTORY
*/
#ifndef HEADER_X86_COMMON_ARCH_BITOPS_H
#define HEADER_X86_COMMON_ARCH_BITOPS_H
static inline int fls(int x)
{
int r;
asm("bsrl %1,%0\n\t"
"jnz 1f\n\t"
"movl $-1,%0\n"
"1:" : "=r" (r) : "rm" (x));
return r + 1;
}
/**
* ffs - find first set bit in word
* @x: the word to search
*
* This is defined the same way as the libc and compiler builtin ffs
* routines, therefore differs in spirit from the other bitops.
*
* ffs(value) returns 0 if value is 0 or the position of the first
* set bit if value is nonzero. The first (least significant) bit
* is at position 1.
*/
static inline int ffs(int x)
{
int r;
asm("bsfl %1,%0\n\t"
"jnz 1f\n\t"
"movl $-1,%0\n"
"1:" : "=r" (r) : "rm" (x));
return r + 1;
}
/**
* __ffs - find first set bit in word
* @word: The word to search
*
* Undefined if no bit exists, so code should check against 0 first.
*/
static inline unsigned long __ffs(unsigned long word)
{
asm("bsf %1,%0"
: "=r" (word)
: "rm" (word));
return word;
}
/**
* ffz - find first zero bit in word
* @word: The word to search
*
* Undefined if no zero exists, so code should check against ~0UL first.
*/
static inline unsigned long ffz(unsigned long word)
{
asm("bsf %1,%0"
: "=r" (word)
: "r" (~word));
return word;
}
#define ADDR (*(volatile long *)addr)
static inline void set_bit(int nr, volatile unsigned long *addr)
{
asm volatile("lock; btsl %1,%0"
: "+m" (ADDR)
: "Ir" (nr)
: "memory");
}
static inline void clear_bit(int nr, volatile unsigned long *addr)
{
asm volatile("lock; btrl %1,%0"
: "+m" (ADDR)
: "Ir" (nr)
: "memory");
}
#endif

View File

@ -0,0 +1,67 @@
/**
* \file futex.h
* Licence details are found in the file LICENSE.
*
* \brief
* Futex adaptation to McKernel
*
* \author Balazs Gerofi <bgerofi@riken.jp> \par
* Copyright (C) 2012 RIKEN AICS
*
*
* HISTORY:
*
*/
#ifndef _ARCH_FUTEX_H
#define _ARCH_FUTEX_H
#include <asm.h>
#define __futex_atomic_op1(insn, ret, oldval, uaddr, oparg) \
asm volatile("1:\t" insn "\n" \
"2:\t.section .fixup,\"ax\"\n" \
"3:\tmov\t%3, %1\n" \
"\tjmp\t2b\n" \
"\t.previous\n" \
_ASM_EXTABLE(1b, 3b) \
: "=r" (oldval), "=r" (ret), "+m" (*uaddr) \
: "i" (-EFAULT), "0" (oparg), "1" (0))
#define __futex_atomic_op2(insn, ret, oldval, uaddr, oparg) \
asm volatile("1:\tmovl %2, %0\n" \
"\tmovl\t%0, %3\n" \
"\t" insn "\n" \
"2:\tlock; cmpxchgl %3, %2\n" \
"\tjnz\t1b\n" \
"3:\t.section .fixup,\"ax\"\n" \
"4:\tmov\t%5, %1\n" \
"\tjmp\t3b\n" \
"\t.previous\n" \
_ASM_EXTABLE(1b, 4b) \
_ASM_EXTABLE(2b, 4b) \
: "=&a" (oldval), "=&r" (ret), \
"+m" (*uaddr), "=&r" (tem) \
: "r" (oparg), "i" (-EFAULT), "1" (0))
static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval,
int newval)
{
#ifdef __UACCESS__
if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
return -EFAULT;
#endif
asm volatile("1:\tlock; cmpxchgl %3, %1\n"
"2:\t.section .fixup, \"ax\"\n"
"3:\tmov %2, %0\n"
"\tjmp 2b\n"
"\t.previous\n"
_ASM_EXTABLE(1b, 3b)
: "=a" (oldval), "+m" (*uaddr)
: "i" (-EFAULT), "r" (newval), "0" (oldval)
: "memory"
);
return oldval;
}
#endif

View File

@ -248,6 +248,7 @@ mcs_rwlock_unlock_readers(struct mcs_rwlock_lock *lock)
struct mcs_rwlock_node *p;
struct mcs_rwlock_node *f = NULL;
struct mcs_rwlock_node *n;
int breakf = 0;
ihk_atomic_inc(&lock->reader.count); // protect to unlock reader
for(p = &lock->reader; p->next; p = n){
@ -268,6 +269,9 @@ mcs_rwlock_unlock_readers(struct mcs_rwlock_lock *lock)
}
p->next = n->next;
}
else{
breakf = 1;
}
}
else if(p->next == NULL){
while (n->next == NULL) {
@ -282,6 +286,8 @@ mcs_rwlock_unlock_readers(struct mcs_rwlock_lock *lock)
else
f = n;
n = p;
if(breakf)
break;
}
if(n->next == NULL && lock->node != n){
while (n->next == NULL && lock->node != n) {
@ -340,6 +346,24 @@ __kprintf("[%d] ret mcs_rwlock_reader_lock_noirq\n", ihk_mc_get_processor_id());
#else
#define mcs_rwlock_reader_lock_noirq __mcs_rwlock_reader_lock_noirq
#endif
static inline unsigned int
atomic_inc_ifnot0(ihk_atomic_t *v)
{
unsigned int *p = (unsigned int *)(&(v)->counter);
unsigned int old;
unsigned int new;
unsigned int val;
do{
if(!(old = *p))
break;
new = old + 1;
val = atomic_cmpxchg4(p, old, new);
}while(val != old);
return old;
}
static void
__mcs_rwlock_reader_lock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node *node)
{
@ -356,7 +380,7 @@ __mcs_rwlock_reader_lock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_n
if (pred) {
if(pred == &lock->reader){
if(ihk_atomic_inc_return(&pred->count) != 1){
if(atomic_inc_ifnot0(&pred->count)){
struct mcs_rwlock_node *old;
old = (struct mcs_rwlock_node *)atomic_cmpxchg8(
@ -372,12 +396,12 @@ __mcs_rwlock_reader_lock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_n
cpu_pause();
}
pred->next = node->next;
if(node->next->type == MCS_RWLOCK_TYPE_READER)
mcs_rwlock_unlock_readers(lock);
node->locked = MCS_RWLOCK_LOCKED;
lock->reader.next = node;
mcs_rwlock_unlock_readers(lock);
ihk_atomic_dec(&pred->count);
goto out;
}
ihk_atomic_dec(&pred->count);
}
node->locked = MCS_RWLOCK_LOCKED;
pred->next = node;

View File

@ -22,6 +22,7 @@
#define USER_CS_ENTRY 6
#define USER_DS_ENTRY 7
#define GLOBAL_TSS_ENTRY 8
#define GETCPU_ENTRY 15
#define KERNEL_CS (KERNEL_CS_ENTRY * 8)
#define KERNEL_DS (KERNEL_DS_ENTRY * 8)
@ -40,10 +41,12 @@
#define LARGE_PAGE_P2ALIGN (LARGE_PAGE_SHIFT - PAGE_SHIFT)
#define USER_END 0x0000800000000000UL
#define TASK_UNMAPPED_BASE 0x00002AAAAAA00000UL
#define MAP_ST_START 0xffff800000000000UL
#define MAP_VMAP_START 0xfffff00000000000UL
#define MAP_FIXED_START 0xffffffff70000000UL
#define MAP_KERNEL_START 0xffffffff80000000UL
#define STACK_TOP(region) ((region)->user_end)
#define MAP_VMAP_SIZE 0x0000000100000000UL
@ -65,6 +68,8 @@
#define PF_PRESENT ((pte_t)0x01) /* entry is valid */
#define PF_WRITABLE ((pte_t)0x02)
#define PFLX_PWT ((pte_t)0x08)
#define PFLX_PCD ((pte_t)0x10)
#define PF_SIZE ((pte_t)0x80) /* entry points large page */
#define PFL4_PRESENT ((pte_t)0x01)
@ -74,8 +79,8 @@
#define PFL3_PRESENT ((pte_t)0x01)
#define PFL3_WRITABLE ((pte_t)0x02)
#define PFL3_USER ((pte_t)0x04)
#define PFL3_PWT ((pte_t)0x08)
#define PFL3_PCD ((pte_t)0x10)
#define PFL3_PWT PFLX_PWT
#define PFL3_PCD PFLX_PCD
#define PFL3_ACCESSED ((pte_t)0x20)
#define PFL3_DIRTY ((pte_t)0x40)
#define PFL3_SIZE ((pte_t)0x80) /* Used in 1G page */
@ -86,8 +91,8 @@
#define PFL2_PRESENT ((pte_t)0x01)
#define PFL2_WRITABLE ((pte_t)0x02)
#define PFL2_USER ((pte_t)0x04)
#define PFL2_PWT ((pte_t)0x08)
#define PFL2_PCD ((pte_t)0x10)
#define PFL2_PWT PFLX_PWT
#define PFL2_PCD PFLX_PCD
#define PFL2_ACCESSED ((pte_t)0x20)
#define PFL2_DIRTY ((pte_t)0x40)
#define PFL2_SIZE ((pte_t)0x80) /* Used in 2M page */
@ -98,8 +103,8 @@
#define PFL1_PRESENT ((pte_t)0x01)
#define PFL1_WRITABLE ((pte_t)0x02)
#define PFL1_USER ((pte_t)0x04)
#define PFL1_PWT ((pte_t)0x08)
#define PFL1_PCD ((pte_t)0x10)
#define PFL1_PWT PFLX_PWT
#define PFL1_PCD PFLX_PCD
#define PFL1_ACCESSED ((pte_t)0x20)
#define PFL1_DIRTY ((pte_t)0x40)
#define PFL1_IGNORED_11 ((pte_t)1 << 11)
@ -152,6 +157,8 @@ enum ihk_mc_pt_attribute {
PTATTR_WRITE_COMBINED = 0x40000,
};
enum ihk_mc_pt_attribute attr_mask;
static inline int pte_is_null(pte_t *ptep)
{
return (*ptep == PTE_NULL);
@ -207,6 +214,27 @@ static inline off_t pte_get_off(pte_t *ptep, size_t pgsize)
return (off_t)(*ptep & PAGE_MASK);
}
static inline enum ihk_mc_pt_attribute pte_get_attr(pte_t *ptep, size_t pgsize)
{
enum ihk_mc_pt_attribute attr;
attr = *ptep & attr_mask;
if (*ptep & PFLX_PWT) {
if (*ptep & PFLX_PCD) {
attr |= PTATTR_UNCACHABLE;
}
else {
attr |= PTATTR_WRITE_COMBINED;
}
}
if (((pgsize == PTL2_SIZE) && (*ptep & PFL2_SIZE))
|| ((pgsize == PTL3_SIZE) && (*ptep & PFL3_SIZE))) {
attr |= PTATTR_LARGEPAGE;
}
return attr;
} /* pte_get_attr() */
static inline void pte_make_null(pte_t *ptep, size_t pgsize)
{
*ptep = PTE_NULL;
@ -290,5 +318,5 @@ extern unsigned long ap_trampoline;
#define AP_TRAMPOLINE_SIZE 0x2000
/* Local is cachable */
#define IHK_IKC_QUEUE_PT_ATTR (PTATTR_NO_EXECUTE | PTATTR_WRITABLE | PTATTR_UNCACHABLE)
#define IHK_IKC_QUEUE_PT_ATTR (PTATTR_NO_EXECUTE | PTATTR_WRITABLE)
#endif

View File

@ -0,0 +1,18 @@
/**
* \file auxvec.h
* License details are found in the file LICENSE.
* \brief
* Declare architecture-dependent constants for auxiliary vector
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com>
* Copyright (C) 2016 RIKEN AICS
*/
/*
* HISTORY
*/
#ifndef ARCH_AUXVEC_H
#define ARCH_AUXVEC_H
#define AT_SYSINFO_EHDR 33
#endif

View File

@ -25,4 +25,13 @@ static inline void wmb(void)
barrier();
}
static unsigned long read_tsc(void)
{
unsigned int low, high;
asm volatile("rdtsc" : "=a"(low), "=d"(high));
return (low | ((unsigned long)high << 32));
}
#endif /* ARCH_CPU_H */

View File

@ -0,0 +1,16 @@
#ifndef __ARCH_MM_H
#define __ARCH_MM_H
struct process_vm;
static inline void
flush_nfo_tlb()
{
}
static inline void
flush_nfo_tlb_mm(struct process_vm *vm)
{
}
#endif

View File

@ -27,6 +27,10 @@
#define MAP_STACK 0x00020000
#define MAP_HUGETLB 0x00040000
#define MAP_HUGE_SHIFT 26
#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT)
#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT)
/*
* for mlockall()
*/

View File

@ -13,6 +13,11 @@
#ifndef HEADER_ARCH_SHM_H
#define HEADER_ARCH_SHM_H
/* shmflg */
#define SHM_HUGE_SHIFT 26
#define SHM_HUGE_2MB (21 << SHM_HUGE_SHIFT)
#define SHM_HUGE_1GB (30 << SHM_HUGE_SHIFT)
struct ipc_perm {
key_t key;
uid_t uid;
@ -34,7 +39,8 @@ struct shmid_ds {
pid_t shm_cpid;
pid_t shm_lpid;
uint64_t shm_nattch;
uint8_t padding[16];
uint8_t padding[12];
int init_pgshift;
};
#endif /* HEADER_ARCH_SHM_H */

View File

@ -22,7 +22,7 @@
* - 4096 : kernel stack
*/
#define X86_CPU_LOCAL_OFFSET_TSS 128
#define X86_CPU_LOCAL_OFFSET_TSS 176
#define X86_CPU_LOCAL_OFFSET_KSTACK 16
#define X86_CPU_LOCAL_OFFSET_USTACK 24
@ -39,13 +39,13 @@ struct x86_cpu_local_variables {
struct x86_desc_ptr gdt_ptr;
unsigned short pad[3];
/* 48 */
uint64_t gdt[10];
/* 128 */
uint64_t gdt[16];
/* 176 */
struct tss64 tss;
/* 232 */
/* 280 */
unsigned long paniced;
uint64_t panic_regs[21];
/* 408 */
/* 456 */
} __attribute__((packed));
struct x86_cpu_local_variables *get_x86_cpu_local_variable(int id);

View File

@ -1,40 +1,7 @@
#ifndef _ASM_GENERIC_ERRNO_BASE_H
#define _ASM_GENERIC_ERRNO_BASE_H
#ifndef _ERRNO_BASE_H
#define _ERRNO_BASE_H
#define EPERM 1 /* Operation not permitted */
#define ENOENT 2 /* No such file or directory */
#define ESRCH 3 /* No such process */
#define EINTR 4 /* Interrupted system call */
#define EIO 5 /* I/O error */
#define ENXIO 6 /* No such device or address */
#define E2BIG 7 /* Argument list too long */
#define ENOEXEC 8 /* Exec format error */
#define EBADF 9 /* Bad file number */
#define ECHILD 10 /* No child processes */
#define EAGAIN 11 /* Try again */
#define ENOMEM 12 /* Out of memory */
#define EACCES 13 /* Permission denied */
#define EFAULT 14 /* Bad address */
#define ENOTBLK 15 /* Block device required */
#define EBUSY 16 /* Device or resource busy */
#define EEXIST 17 /* File exists */
#define EXDEV 18 /* Cross-device link */
#define ENODEV 19 /* No such device */
#define ENOTDIR 20 /* Not a directory */
#define EISDIR 21 /* Is a directory */
#define EINVAL 22 /* Invalid argument */
#define ENFILE 23 /* File table overflow */
#define EMFILE 24 /* Too many open files */
#define ENOTTY 25 /* Not a typewriter */
#define ETXTBSY 26 /* Text file busy */
#define EFBIG 27 /* File too large */
#define ENOSPC 28 /* No space left on device */
#define ESPIPE 29 /* Illegal seek */
#define EROFS 30 /* Read-only file system */
#define EMLINK 31 /* Too many links */
#define EPIPE 32 /* Broken pipe */
#define EDOM 33 /* Math argument out of domain of func */
#define ERANGE 34 /* Math result not representable */
#include <generic-errno.h>
#define EDEADLK 35 /* Resource deadlock would occur */
#define ENAMETOOLONG 36 /* File name too long */
@ -141,29 +108,4 @@
#define ERFKILL 132 /* Operation not possible due to RF-kill */
#ifdef __KERNEL__
/* Should never be seen by user programs */
#define ERESTARTSYS 512
#define ERESTARTNOINTR 513
#define ERESTARTNOHAND 514 /* restart if no handler.. */
#define ENOIOCTLCMD 515 /* No ioctl command */
#define ERESTART_RESTARTBLOCK 516 /* restart by calling sys_restart_syscall */
/* Defined for the NFSv3 protocol */
#define EBADHANDLE 521 /* Illegal NFS file handle */
#define ENOTSYNC 522 /* Update synchronization mismatch */
#define EBADCOOKIE 523 /* Cookie is stale */
#define ENOTSUPP 524 /* Operation is not supported */
#define ETOOSMALL 525 /* Buffer or request is too small */
#define ESERVERFAULT 526 /* An untranslatable error occurred */
#define EBADTYPE 527 /* Type not supported by server */
#define EJUKEBOX 528 /* Request initiated, but will not complete before timeout */
#define EIOCBQUEUED 529 /* iocb queued, will get completion event */
#define EIOCBRETRY 530 /* iocb queued, will trigger a retry */
#endif
#endif

View File

@ -202,4 +202,17 @@ static inline unsigned long atomic_cmpxchg8(unsigned long *addr,
return oldval;
}
static inline unsigned long atomic_cmpxchg4(unsigned int *addr,
unsigned int oldval,
unsigned int newval)
{
asm volatile("lock; cmpxchgl %2, %1\n"
: "=a" (oldval), "+m" (*addr)
: "r" (newval), "0" (oldval)
: "memory"
);
return oldval;
}
#endif

View File

@ -31,9 +31,5 @@ typedef int64_t off_t;
#define NULL ((void *)0)
#define BITS_PER_LONG_SHIFT 6
#define BITS_PER_LONG (1 << BITS_PER_LONG_SHIFT)
#endif

View File

@ -0,0 +1,17 @@
/**
* \file prctl.h
* License details are found in the file LICENSE.
*/
/*
* HISTORY
*/
#ifndef __ARCH_PRCTL_H
#define __ARCH_PRCTL_H
#define ARCH_SET_GS 0x1001
#define ARCH_SET_FS 0x1002
#define ARCH_GET_FS 0x1003
#define ARCH_GET_GS 0x1004
#endif

View File

@ -90,10 +90,6 @@ enum __rlimit_resource
#define RLIM_NLIMITS __RLIM_NLIMITS
};
struct rlimit {
uint64_t rlim_cur; /* Soft limit */
uint64_t rlim_max; /* Hard limit (ceiling for rlim_cur) */
};
#include <generic-rlimit.h>
#endif

View File

@ -20,7 +20,7 @@
* syscall_name[] only, no handler exists.
*/
SYSCALL_DELEGATED(0, read)
SYSCALL_HANDLED(0, read)
SYSCALL_DELEGATED(1, write)
SYSCALL_DELEGATED(2, open)
SYSCALL_HANDLED(3, close)
@ -35,7 +35,7 @@ SYSCALL_HANDLED(12, brk)
SYSCALL_HANDLED(13, rt_sigaction)
SYSCALL_HANDLED(14, rt_sigprocmask)
SYSCALL_HANDLED(15, rt_sigreturn)
SYSCALL_DELEGATED(16, ioctl)
SYSCALL_HANDLED(16, ioctl)
SYSCALL_DELEGATED(17, pread64)
SYSCALL_DELEGATED(18, pwrite64)
SYSCALL_DELEGATED(20, writev)
@ -51,6 +51,8 @@ SYSCALL_HANDLED(30, shmat)
SYSCALL_HANDLED(31, shmctl)
SYSCALL_HANDLED(34, pause)
SYSCALL_HANDLED(35, nanosleep)
SYSCALL_HANDLED(36, getitimer)
SYSCALL_HANDLED(38, setitimer)
SYSCALL_HANDLED(39, getpid)
SYSCALL_HANDLED(56, clone)
SYSCALL_DELEGATED(57, fork)
@ -64,11 +66,13 @@ SYSCALL_DELEGATED(65, semop)
SYSCALL_HANDLED(67, shmdt)
SYSCALL_DELEGATED(69, msgsnd)
SYSCALL_DELEGATED(70, msgrcv)
SYSCALL_DELEGATED(72, fcntl)
SYSCALL_HANDLED(72, fcntl)
SYSCALL_DELEGATED(79, getcwd)
SYSCALL_DELEGATED(89, readlink)
SYSCALL_HANDLED(96, gettimeofday)
SYSCALL_HANDLED(97, getrlimit)
SYSCALL_HANDLED(98, getrusage)
SYSCALL_HANDLED(100, times)
SYSCALL_HANDLED(101, ptrace)
SYSCALL_HANDLED(102, getuid)
SYSCALL_HANDLED(104, getgid)
@ -107,6 +111,7 @@ SYSCALL_HANDLED(158, arch_prctl)
SYSCALL_HANDLED(160, setrlimit)
SYSCALL_HANDLED(164, settimeofday)
SYSCALL_HANDLED(186, gettid)
SYSCALL_HANDLED(200, tkill)
SYSCALL_DELEGATED(201, time)
SYSCALL_HANDLED(202, futex)
SYSCALL_HANDLED(203, sched_setaffinity)
@ -116,6 +121,7 @@ SYSCALL_HANDLED(216, remap_file_pages)
SYSCALL_DELEGATED(217, getdents64)
SYSCALL_HANDLED(218, set_tid_address)
SYSCALL_DELEGATED(220, semtimedop)
SYSCALL_HANDLED(228, clock_gettime)
SYSCALL_DELEGATED(230, clock_nanosleep)
SYSCALL_HANDLED(231, exit_group)
SYSCALL_DELEGATED(232, epoll_wait)
@ -132,6 +138,7 @@ SYSCALL_HANDLED(279, move_pages)
SYSCALL_DELEGATED(281, epoll_pwait)
SYSCALL_HANDLED(282, signalfd)
SYSCALL_HANDLED(289, signalfd4)
SYSCALL_HANDLED(298, perf_event_open)
#ifdef DCFA_KMOD
SYSCALL_HANDLED(303, mod_call)
#endif

View File

@ -13,7 +13,7 @@
* 2013/?? - bgerofi + shimosawa: handle rsp correctly for nested interrupts
*/
#define X86_CPU_LOCAL_OFFSET_TSS 128
#define X86_CPU_LOCAL_OFFSET_TSS 176
#define X86_TSS_OFFSET_SP0 4
#define X86_CPU_LOCAL_OFFSET_SP0 \
(X86_CPU_LOCAL_OFFSET_TSS + X86_TSS_OFFSET_SP0)
@ -209,7 +209,9 @@ enter_user_mode:
callq release_runq_lock
movq $0, %rdi
movq %rsp, %rsi
call check_signal
call check_signal
movq $0, %rdi
call set_cputime
POP_ALL_REGS
addq $8, %rsp
iretq

View File

@ -38,6 +38,11 @@ void init_processors_local(int max_id)
kprintf("locals = %p\n", locals);
}
/*@
@ requires \valid(id);
@ ensures \result == locals + (LOCALS_SPAN * id);
@ assigns \nothing;
@*/
struct x86_cpu_local_variables *get_x86_cpu_local_variable(int id)
{
return (struct x86_cpu_local_variables *)
@ -98,6 +103,10 @@ void init_boot_processor_local(void)
}
/** IHK **/
/*@
@ ensures \result == %gs;
@ assigns \nothing;
*/
int ihk_mc_get_processor_id(void)
{
int id;
@ -107,6 +116,10 @@ int ihk_mc_get_processor_id(void)
return id;
}
/*@
@ ensures \result == (locals + (LOCALS_SPAN * %gs))->apic_id;
@ assigns \nothing;
*/
int ihk_mc_get_hardware_processor_id(void)
{
struct x86_cpu_local_variables *v = get_x86_this_cpu_local();

File diff suppressed because it is too large Load Diff

View File

@ -38,7 +38,7 @@ int ihk_mc_ikc_init_first_local(struct ihk_ikc_channel_desc *channel,
arch_master_channel_packet_handler = packet_handler;
ihk_ikc_init_desc(channel, IKC_OS_HOST, 0, rq, wq,
ihk_ikc_master_channel_packet_handler);
ihk_ikc_master_channel_packet_handler, channel);
ihk_ikc_enable_channel(channel);
/* Set boot parameter */

View File

@ -12,19 +12,72 @@
#include <errno.h>
#include <ihk/debug.h>
#include <registers.h>
#include <mc_perf_event.h>
extern unsigned int *x86_march_perfmap;
extern int running_on_kvm(void);
#define X86_CR4_PCE 0x00000100
int perf_counters_discovered = 0;
int X86_IA32_NUM_PERF_COUNTERS = 0;
unsigned long X86_IA32_PERF_COUNTERS_MASK = 0;
int X86_IA32_NUM_FIXED_PERF_COUNTERS = 0;
unsigned long X86_IA32_FIXED_PERF_COUNTERS_MASK = 0;
void x86_init_perfctr(void)
{
int i = 0;
unsigned long reg;
unsigned long value = 0;
uint64_t op;
uint64_t eax;
uint64_t ebx;
uint64_t ecx;
uint64_t edx;
/* Do not do it on KVM */
if (running_on_kvm()) return;
/* Allow PMC to be read from user space */
asm volatile("movq %%cr4, %0" : "=r"(reg));
reg |= X86_CR4_PCE;
asm volatile("movq %0, %%cr4" : : "r"(reg));
/* Detect number of supported performance counters */
if (!perf_counters_discovered) {
/* See Table 35.2 - Architectural MSRs in Vol 3C */
op = 0x0a;
asm volatile("cpuid" : "=a"(eax),"=b"(ebx),"=c"(ecx),"=d"(edx):"a"(op));
X86_IA32_NUM_PERF_COUNTERS = ((eax & 0xFF00) >> 8);
X86_IA32_PERF_COUNTERS_MASK = (1 << X86_IA32_NUM_PERF_COUNTERS) - 1;
X86_IA32_NUM_FIXED_PERF_COUNTERS = (edx & 0x0F);
X86_IA32_FIXED_PERF_COUNTERS_MASK =
((1UL << X86_IA32_NUM_FIXED_PERF_COUNTERS) - 1) <<
X86_IA32_BASE_FIXED_PERF_COUNTERS;
perf_counters_discovered = 1;
kprintf("X86_IA32_NUM_PERF_COUNTERS: %d, X86_IA32_NUM_FIXED_PERF_COUNTERS: %d\n",
X86_IA32_NUM_PERF_COUNTERS, X86_IA32_NUM_FIXED_PERF_COUNTERS);
}
/* Clear Fixed Counter Control */
value = rdmsr(MSR_PERF_FIXED_CTRL);
value &= 0xfffffffffffff000L;
wrmsr(MSR_PERF_FIXED_CTRL, value);
/* Clear Generic Counter Control */
for(i = 0; i < X86_IA32_NUM_PERF_COUNTERS; i++) {
wrmsr(MSR_IA32_PERFEVTSEL0 + i, 0);
}
/* Enable PMC Control */
value = rdmsr(MSR_PERF_GLOBAL_CTRL);
value |= X86_IA32_PERF_COUNTERS_MASK;
value |= X86_IA32_FIXED_PERF_COUNTERS_MASK;
wrmsr(MSR_PERF_GLOBAL_CTRL, value);
}
static int set_perfctr_x86_direct(int counter, int mode, unsigned int value)
@ -33,20 +86,53 @@ static int set_perfctr_x86_direct(int counter, int mode, unsigned int value)
return -EINVAL;
}
if (mode & PERFCTR_USER_MODE) {
// clear mode flags
value &= ~(3 << 16);
// set mode flags
if(mode & PERFCTR_USER_MODE) {
value |= 1 << 16;
}
if (mode & PERFCTR_KERNEL_MODE) {
}
if(mode & PERFCTR_KERNEL_MODE) {
value |= 1 << 17;
}
}
// wrmsr(MSR_PERF_GLOBAL_CTRL, 0);
value |= (1 << 22) | (1 << 18); /* EN */
value |= (1 << 20); /* Enable overflow interrupt */
wrmsr(MSR_IA32_PERFEVTSEL0 + counter, value);
kprintf("wrmsr: %d <= %x\n", MSR_PERF_GLOBAL_CTRL, 0);
kprintf("wrmsr: %d <= %x\n", MSR_IA32_PERFEVTSEL0 + counter, value);
//kprintf("wrmsr: %d <= %x\n", MSR_PERF_GLOBAL_CTRL, 0);
//kprintf("wrmsr: %d <= %x\n", MSR_IA32_PERFEVTSEL0 + counter, value);
return 0;
}
static int set_pmc_x86_direct(int counter, long val)
{
unsigned long cnt_bit = 0;
if (counter < 0) {
return -EINVAL;
}
val &= 0x000000ffffffffff; // 40bit Mask
cnt_bit = 1UL << counter;
if ( cnt_bit & X86_IA32_PERF_COUNTERS_MASK ) {
// set generic pmc
wrmsr(MSR_IA32_PMC0 + counter, val);
}
else if ( cnt_bit & X86_IA32_FIXED_PERF_COUNTERS_MASK ) {
// set fixed pmc
wrmsr(MSR_IA32_FIXED_CTR0 + counter - X86_IA32_BASE_FIXED_PERF_COUNTERS, val);
}
else {
return -EINVAL;
}
return 0;
}
@ -57,6 +143,45 @@ static int set_perfctr_x86(int counter, int event, int mask, int inv, int count,
CVAL2(event, mask, inv, count));
}
static int set_fixed_counter(int counter, int mode)
{
unsigned long value = 0;
unsigned int ctr_mask = 0xf;
int counter_idx = counter - X86_IA32_BASE_FIXED_PERF_COUNTERS ;
unsigned int set_val = 0;
if (counter_idx < 0 || counter_idx >= X86_IA32_NUM_FIXED_PERF_COUNTERS) {
return -EINVAL;
}
// clear specified fixed counter info
value = rdmsr(MSR_PERF_FIXED_CTRL);
ctr_mask <<= counter_idx * 4;
value &= ~ctr_mask;
if (mode & PERFCTR_USER_MODE) {
set_val |= 1 << 1;
}
if (mode & PERFCTR_KERNEL_MODE) {
set_val |= 1;
}
set_val <<= counter_idx * 4;
value |= set_val;
wrmsr(MSR_PERF_FIXED_CTRL, value);
return 0;
}
int ihk_mc_perfctr_init_raw(int counter, unsigned int code, int mode)
{
if (counter < 0 || counter >= X86_IA32_NUM_PERF_COUNTERS) {
return -EINVAL;
}
return set_perfctr_x86_direct(counter, mode, code);
}
int ihk_mc_perfctr_init(int counter, enum ihk_perfctr_type type, int mode)
{
if (counter < 0 || counter >= X86_IA32_NUM_PERF_COUNTERS) {
@ -78,14 +203,15 @@ extern void x86_march_perfctr_start(unsigned long counter_mask);
int ihk_mc_perfctr_start(unsigned long counter_mask)
{
unsigned int value = 0;
unsigned long value = 0;
unsigned long mask = X86_IA32_PERF_COUNTERS_MASK | X86_IA32_FIXED_PERF_COUNTERS_MASK;
#ifdef HAVE_MARCH_PERFCTR_START
x86_march_perfctr_start(counter_mask);
#endif
counter_mask &= ((1 << X86_IA32_NUM_PERF_COUNTERS) - 1);
counter_mask &= mask;
value = rdmsr(MSR_PERF_GLOBAL_CTRL);
value |= counter_mask;
value |= counter_mask;
wrmsr(MSR_PERF_GLOBAL_CTRL, value);
return 0;
@ -93,25 +219,78 @@ int ihk_mc_perfctr_start(unsigned long counter_mask)
int ihk_mc_perfctr_stop(unsigned long counter_mask)
{
unsigned int value;
unsigned long value;
unsigned long mask = X86_IA32_PERF_COUNTERS_MASK | X86_IA32_FIXED_PERF_COUNTERS_MASK;
counter_mask &= ((1 << X86_IA32_NUM_PERF_COUNTERS) - 1);
counter_mask &= mask;
value = rdmsr(MSR_PERF_GLOBAL_CTRL);
value &= ~counter_mask;
wrmsr(MSR_PERF_GLOBAL_CTRL, value);
if(counter_mask >> 32 & 0x1) {
value = rdmsr(MSR_PERF_FIXED_CTRL);
value &= ~(0xf);
wrmsr(MSR_PERF_FIXED_CTRL, value);
}
if(counter_mask >> 32 & 0x2) {
value = rdmsr(MSR_PERF_FIXED_CTRL);
value &= ~(0xf << 4);
wrmsr(MSR_PERF_FIXED_CTRL, value);
}
if(counter_mask >> 32 & 0x4) {
value = rdmsr(MSR_PERF_FIXED_CTRL);
value &= ~(0xf << 8);
wrmsr(MSR_PERF_FIXED_CTRL, value);
}
return 0;
}
// init for fixed counter
int ihk_mc_perfctr_fixed_init(int counter, int mode)
{
unsigned long value = 0;
unsigned int ctr_mask = 0xf;
int counter_idx = counter - X86_IA32_BASE_FIXED_PERF_COUNTERS ;
unsigned int set_val = 0;
if (counter_idx < 0 || counter_idx >= X86_IA32_NUM_FIXED_PERF_COUNTERS) {
return -EINVAL;
}
// clear specified fixed counter info
value = rdmsr(MSR_PERF_FIXED_CTRL);
ctr_mask <<= counter_idx * 4;
value &= ~ctr_mask;
if (mode & PERFCTR_USER_MODE) {
set_val |= 1 << 1;
}
if (mode & PERFCTR_KERNEL_MODE) {
set_val |= 1;
}
// enable PMI on overflow
set_val |= 1 << 3;
set_val <<= counter_idx * 4;
value |= set_val;
wrmsr(MSR_PERF_FIXED_CTRL, value);
return 0;
}
int ihk_mc_perfctr_reset(int counter)
{
if (counter < 0 || counter >= X86_IA32_NUM_PERF_COUNTERS) {
return -EINVAL;
}
return set_pmc_x86_direct(counter, 0);
}
wrmsr(MSR_IA32_PMC0 + counter, 0);
return 0;
int ihk_mc_perfctr_set(int counter, long val)
{
return set_pmc_x86_direct(counter, val);
}
int ihk_mc_perfctr_read_mask(unsigned long counter_mask, unsigned long *value)
@ -129,10 +308,87 @@ int ihk_mc_perfctr_read_mask(unsigned long counter_mask, unsigned long *value)
unsigned long ihk_mc_perfctr_read(int counter)
{
if (counter < 0 || counter >= X86_IA32_NUM_PERF_COUNTERS) {
unsigned long retval = 0;
unsigned long cnt_bit = 0;
if (counter < 0) {
return -EINVAL;
}
return rdpmc(counter);
cnt_bit = 1UL << counter;
if ( cnt_bit & X86_IA32_PERF_COUNTERS_MASK ) {
// read generic pmc
retval = rdpmc(counter);
}
else if ( cnt_bit & X86_IA32_FIXED_PERF_COUNTERS_MASK ) {
// read fixed pmc
retval = rdpmc((1 << 30) + (counter - X86_IA32_BASE_FIXED_PERF_COUNTERS));
}
else {
retval = -EINVAL;
}
return retval;
}
// read by rdmsr
unsigned long ihk_mc_perfctr_read_msr(int counter)
{
unsigned int idx = 0;
unsigned long retval = 0;
unsigned long cnt_bit = 0;
if (counter < 0) {
return -EINVAL;
}
cnt_bit = 1UL << counter;
if ( cnt_bit & X86_IA32_PERF_COUNTERS_MASK ) {
// read generic pmc
idx = MSR_IA32_PMC0 + counter;
retval = (unsigned long) rdmsr(idx);
}
else if ( cnt_bit & X86_IA32_FIXED_PERF_COUNTERS_MASK ) {
// read fixed pmc
idx = MSR_IA32_FIXED_CTR0 + counter;
retval = (unsigned long) rdmsr(idx);
}
else {
retval = -EINVAL;
}
return retval;
}
int ihk_mc_perfctr_alloc_counter(unsigned int *type, unsigned long *config, unsigned long pmc_status)
{
int ret = -1;
int i = 0;
if(*type == PERF_TYPE_HARDWARE) {
switch(*config){
case PERF_COUNT_HW_INSTRUCTIONS :
*type = PERF_TYPE_RAW;
*config = 0x5300c0;
break;
default :
// Unexpected config
return -1;
}
}
else if(*type != PERF_TYPE_RAW) {
return -1;
}
// find avail generic counter
for(i = 0; i < X86_IA32_NUM_PERF_COUNTERS; i++) {
if(!(pmc_status & (1 << i))) {
ret = i;
break;
}
}
return ret;
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,28 @@
# irqbalance is a daemon process that distributes interrupts across
# CPUS on SMP systems. The default is to rebalance once every 10
# seconds. This is the environment file that is specified to systemd via the
# EnvironmentFile key in the service unit file (or via whatever method the init
# system you're using has.
#
# ONESHOT=yes
# after starting, wait for a minute, then look at the interrupt
# load and balance it once; after balancing exit and do not change
# it again.
#IRQBALANCE_ONESHOT=
#
# IRQBALANCE_BANNED_CPUS
# 64 bit bitmask which allows you to indicate which cpu's should
# be skipped when reblancing irqs. Cpu numbers which have their
# corresponding bits set to one in this mask will not have any
# irq's assigned to them on rebalance
#
IRQBALANCE_BANNED_CPUS=%mask%
#
# IRQBALANCE_ARGS
# append any args here to the irqbalance daemon as documented in the man page
#
IRQBALANCE_ARGS=--banirq=%banirq%

View File

@ -0,0 +1,10 @@
[Unit]
Description=irqbalance daemon
After=syslog.target
[Service]
EnvironmentFile=@ETCDIR@/irqbalance_mck
ExecStart=/usr/sbin/irqbalance --foreground $IRQBALANCE_ARGS
[Install]
WantedBy=multi-user.target

View File

@ -13,15 +13,90 @@
# Note that the script does not output anything unless an error occurs.
prefix="@prefix@"
BINDIR="@BINDIR@"
SBINDIR="@SBINDIR@"
KMODDIR="@KMODDIR@"
KERNDIR="@KERNDIR@"
BINDIR="${prefix}/bin"
SBINDIR="${prefix}/sbin"
ETCDIR=@ETCDIR@
KMODDIR="${prefix}/kmod"
KERNDIR="${prefix}/@TARGET@/kernel"
ENABLE_MCOVERLAYFS="@ENABLE_MCOVERLAYFS@"
mem="512M@0"
cpus=""
INTERVAL=1
LOGMODE=0
facility="LOG_LOCAL6"
chown_option=`logname 2> /dev/null`
if [ "`systemctl status irqbalance_mck.service 2> /dev/null |grep -E 'Active: active'`" != "" -o "`systemctl status irqbalance.service 2> /dev/null |grep -E 'Active: active'`" != "" ]; then
irqbalance_used="yes"
else
irqbalance_used="no"
fi
while getopts :i:k:c:m:o:f: OPT
do
case ${OPT} in
f) facility=${OPTARG}
;;
o) chown_option=${OPTARG}
;;
i) INTERVAL=${OPTARG}
expr "${INTERVAL}" + 1 > /dev/null 2>&1
if [ $? -ge 2 ]
then
echo "invalid -i value" >&2
exit 1
fi
if [ ${INTERVAL} -le 0 ]
then
echo "invalid -i value" >&2
exit 1
fi
;;
k) LOGMODE=${OPTARG}
expr "${LOGMODE}" + 1 > /dev/null 2>&1
if [ $? -ge 2 ]
then
echo "invalid -k value" >&2
exit 1
fi
if [ ${LOGMODE} -lt 0 -o ${LOGMODE} -gt 2 ]
then
echo "invalid -k value" >&2
exit 1
fi
;;
c) cpus=${OPTARG}
;;
m) mem=${OPTARG}
;;
*) echo "invalid option -${OPT}" >&2
exit 1
esac
done
ihk_ikc_irq_core=0
release=`uname -r`
major=`echo ${release} | sed -e 's/^\([0-9]*\).*/\1/'`
minor=`echo ${release} | sed -e 's/^[0-9]*.\([0-9]*\).*/\1/'`
patch=`echo ${release} | sed -e 's/^[0-9]*.[0-9]*.\([0-9]*\).*/\1/'`
linux_version_code=`expr \( ${major} \* 65536 \) + \( ${minor} \* 256 \) + ${patch}`
rhel_release=`echo ${release} | sed -e 's/^[0-9]*.[0-9]*.[0-9]*-\([0-9]*\).*/\1/'`
if [ "${release}" == "${rhel_release}" ]; then rhel_release=""; fi
enable_mcoverlay="no"
if [ "${ENABLE_MCOVERLAYFS}" == "yes" ]; then
if [ "${rhel_release}" == "" ]; then
if [ ${linux_version_code} -ge 262144 -a ${linux_version_code} -lt 262400 ]; then
enable_mcoverlay="yes"
fi
else
if [ ${linux_version_code} -eq 199168 -a ${rhel_release} -ge 327 ]; then
enable_mcoverlay="yes"
fi
fi
fi
if [ "$cpus" == "" ]; then
# Get the number of CPUs on NUMA node 0
@ -30,17 +105,30 @@ if [ "$cpus" == "" ]; then
# Use the second half of the cores
let nr_cpus="$nr_cpus / 2"
cpus=`lscpu --parse | awk -F"," '{if ($4 == 0) print $1}' | tail -n $nr_cpus | xargs echo -n | sed 's/ /,/g'`
if [ "$cpus" == "" ]; then echo "error: no available CPUs on NUMA node 0?"; exit; fi
if [ "$cpus" == "" ]; then echo "error: no available CPUs on NUMA node 0?" >&2; exit 1; fi
fi
# Remove delegator if loaded
if [ "`lsmod | grep mcctrl`" != "" ]; then
if ! rmmod mcctrl; then echo "error: removing mcctrl"; exit; fi
# Remove mcoverlay if loaded
if [ "$enable_mcoverlay" == "yes" ]; then
if [ "`lsmod | grep mcoverlay`" != "" ]; then
if [ "`cat /proc/mounts | grep /tmp/mcos/mcos0_sys`" != "" ]; then umount -l /tmp/mcos/mcos0_sys; fi
if [ "`cat /proc/mounts | grep /tmp/mcos/mcos0_proc`" != "" ]; then umount -l /tmp/mcos/mcos0_proc; fi
if [ "`cat /proc/mounts | grep /tmp/mcos/linux_proc`" != "" ]; then umount -l /tmp/mcos/linux_proc; fi
if [ "`cat /proc/mounts | grep /tmp/mcos`" != "" ]; then umount -l /tmp/mcos; fi
if [ -e /tmp/mcos ]; then rm -rf /tmp/mcos; fi
if ! rmmod mcoverlay; then echo "error: removing mcoverlay" >&2; exit 1; fi
fi
fi
# Stop irqbalance
if [ "${irqbalance_used}" == "yes" ]; then
systemctl stop irqbalance_mck.service 2>/dev/null
if ! systemctl stop irqbalance.service 2>/dev/null ; then echo "error: stopping irqbalance" >&2; exit 1; fi;
fi
# Load IHK if not loaded
if [ "`lsmod | grep ihk`" == "" ]; then
if ! insmod ${KMODDIR}/ihk.ko; then echo "error: loading ihk"; exit; fi;
if ! insmod ${KMODDIR}/ihk.ko; then echo "error: loading ihk" >&2; exit 1; fi;
fi
# Load IHK-SMP if not loaded and reserve CPUs and memory
@ -52,47 +140,111 @@ if [ "`lsmod | grep ihk_smp_x86`" == "" ]; then
break
fi
done
if [ "$ihk_irq" == "" ]; then echo "error: no IRQ available"; exit; fi
if ! insmod ${KMODDIR}/ihk-smp-x86.ko ihk_start_irq=$ihk_irq ihk_ikc_irq_core=$ihk_ikc_irq_core; then echo "error: loading ihk-smp-x86"; exit; fi;
if ! ${SBINDIR}/ihkconfig 0 reserve cpu ${cpus}; then echo "error: reserving CPUs"; exit; fi
if ! ${SBINDIR}/ihkconfig 0 reserve mem ${mem}; then echo "error: reserving memory"; exit; fi
if [ "$ihk_irq" == "" ]; then echo "error: no IRQ available" >&2; exit 1; fi
if ! insmod ${KMODDIR}/ihk-smp-x86.ko ihk_start_irq=$ihk_irq ihk_ikc_irq_core=$ihk_ikc_irq_core; then echo "error: loading ihk-smp-x86" >&2; exit 1; fi;
if ! ${SBINDIR}/ihkconfig 0 reserve cpu ${cpus}; then echo "error: reserving CPUs" >&2; exit 1; fi
if ! ${SBINDIR}/ihkconfig 0 reserve mem ${mem}; then echo "error: reserving memory" >&2; exit 1; fi
# If loaded, but no resources allocated, get CPUs and memory
else
if ! ${SBINDIR}/ihkconfig 0 query cpu > /dev/null; then echo "error: querying cpus"; exit; fi
if ! ${SBINDIR}/ihkconfig 0 query cpu > /dev/null; then echo "error: querying cpus" >&2; exit 1; fi
cpus_allocated=`${SBINDIR}/ihkosctl 0 query cpu`
if [ "$cpus_allocated" == "" ]; then
if ! ${SBINDIR}/ihkconfig 0 reserve cpu ${cpus}; then echo "error: reserving CPUs"; exit; fi
if ! ${SBINDIR}/ihkconfig 0 reserve cpu ${cpus}; then echo "error: reserving CPUs" >&2; exit 1; fi
fi
if ! ${SBINDIR}/ihkosctl 0 query mem > /dev/null; then echo "error: querying memory"; exit; fi
if ! ${SBINDIR}/ihkosctl 0 query mem > /dev/null; then echo "error: querying memory" >&2; exit 1; fi
mem_allocated=`${SBINDIR}/ihkosctl 0 query mem`
if [ "$mem_allocated" == "" ]; then
if ! ${SBINDIR}/ihkconfig 0 reserve mem ${mem}; then echo "error: reserving memory"; exit; fi
if ! ${SBINDIR}/ihkconfig 0 reserve mem ${mem}; then echo "error: reserving memory" >&2; exit 1; fi
fi
fi
# Load mcctrl if not loaded
if [ "`lsmod | grep mcctrl`" == "" ]; then
if ! insmod ${KMODDIR}/mcctrl.ko; then echo "error: inserting mcctrl.ko" >&2; exit 1; fi
fi
# Check for existing OS instance and destroy
if [ -c /dev/mcos0 ]; then
# Query CPU cores and memory of OS instance so that the same values are used as previously
if ! ${SBINDIR}/ihkosctl 0 query cpu > /dev/null; then echo "error: querying cpus"; exit; fi
if ! ${SBINDIR}/ihkosctl 0 query cpu > /dev/null; then echo "error: querying cpus" >&2; exit 1; fi
cpus=`${SBINDIR}/ihkosctl 0 query cpu`
if ! ${SBINDIR}/ihkosctl 0 query mem > /dev/null; then echo "error: querying memory"; exit; fi
if ! ${SBINDIR}/ihkosctl 0 query mem > /dev/null; then echo "error: querying memory" >&2; exit 1; fi
mem=`${SBINDIR}/ihkosctl 0 query mem`
if ! ${SBINDIR}/ihkconfig 0 destroy 0; then echo "warning: destroy failed"; fi
if ! ${SBINDIR}/ihkconfig 0 destroy 0; then echo "warning: destroy failed" >&2; fi
else
# Otherwise query IHK-SMP for resources
if ! ${SBINDIR}/ihkconfig 0 query cpu > /dev/null; then echo "error: querying cpus"; exit; fi
if ! ${SBINDIR}/ihkconfig 0 query cpu > /dev/null; then echo "error: querying cpus" >&2; exit 1; fi
cpus=`${SBINDIR}/ihkconfig 0 query cpu`
if ! ${SBINDIR}/ihkconfig 0 query mem > /dev/null; then echo "error: querying memory"; exit; fi
if ! ${SBINDIR}/ihkconfig 0 query mem > /dev/null; then echo "error: querying memory" >&2; exit 1; fi
mem=`${SBINDIR}/ihkconfig 0 query mem`
fi
if ! ${SBINDIR}/ihkconfig 0 create; then echo "error: create"; exit; fi
if ! ${SBINDIR}/ihkosctl 0 assign cpu ${cpus}; then echo "error: assign CPUs"; exit; fi
if ! ${SBINDIR}/ihkosctl 0 assign mem ${mem}; then echo "error: assign memory"; exit; fi
if ! ${SBINDIR}/ihkosctl 0 load ${KERNDIR}/mckernel.img; then echo "error: loading kernel image"; exit; fi
if ! ${SBINDIR}/ihkosctl 0 kargs hidos; then echo "error: setting kernel arguments"; exit; fi
if ! ${SBINDIR}/ihkosctl 0 boot; then echo "error: booting"; exit; fi
if ! insmod ${KMODDIR}/mcctrl.ko; then echo "error: inserting mcctrl.ko"; exit; fi
if ! chown `logname` /dev/mcd* /dev/mcos*; then echo "error: chowning device files"; exit; fi
if ! ${SBINDIR}/ihkconfig 0 create; then echo "error: create" >&2; exit; fi
if ! ${SBINDIR}/ihkosctl 0 assign cpu ${cpus}; then echo "error: assign CPUs" >&2; exit 1; fi
if ! ${SBINDIR}/ihkosctl 0 assign mem ${mem}; then echo "error: assign memory" >&2; exit 1; fi
if ! ${SBINDIR}/ihkosctl 0 load ${KERNDIR}/mckernel.img; then echo "error: loading kernel image" >&2; exit 1; fi
if ! ${SBINDIR}/ihkosctl 0 kargs "hidos ksyslogd=${LOGMODE}"; then echo "error: setting kernel arguments" >&2; exit 1; fi
if ! ${SBINDIR}/ihkosctl 0 boot; then echo "error: booting" >&2; exit 1; fi
if ! chown ${chown_option} /dev/mcd* /dev/mcos*; then echo "error: chowning device files" >&2; exit 1; fi
if [ "$enable_mcoverlay" == "yes" ]; then
if [ ! -e /tmp/mcos ]; then mkdir -p /tmp/mcos; fi
if ! mount -t tmpfs tmpfs /tmp/mcos; then echo "error: mount /tmp/mcos" >&2; exit 1; fi
if [ ! -e /tmp/mcos/linux_proc ]; then mkdir -p /tmp/mcos/linux_proc; fi
if ! mount --bind /proc /tmp/mcos/linux_proc; then echo "error: mount /tmp/mcos/linux_proc" >&2; exit 1; fi
if ! insmod ${KMODDIR}/mcoverlay.ko; then echo "error: inserting mcoverlay.ko" >&2; exit 1; fi
while [ ! -e /proc/mcos0 ]
do
sleep 1
done
if [ ! -e /tmp/mcos/mcos0_proc ]; then mkdir -p /tmp/mcos/mcos0_proc; fi
if [ ! -e /tmp/mcos/mcos0_proc_upper ]; then mkdir -p /tmp/mcos/mcos0_proc_upper; fi
if [ ! -e /tmp/mcos/mcos0_proc_work ]; then mkdir -p /tmp/mcos/mcos0_proc_work; fi
if ! mount -t mcoverlay mcoverlay -o lowerdir=/proc/mcos0:/proc,upperdir=/tmp/mcos/mcos0_proc_upper,workdir=/tmp/mcos/mcos0_proc_work,nocopyupw,nofscheck /tmp/mcos/mcos0_proc; then echo "error: mount /tmp/mcos/mcos0_proc" >&2; exit 1; fi
mount --make-rprivate /proc
while [ ! -e /sys/devices/virtual/mcos/mcos0/sys ]
do
sleep 1
done
if [ ! -e /tmp/mcos/mcos0_sys ]; then mkdir -p /tmp/mcos/mcos0_sys; fi
if [ ! -e /tmp/mcos/mcos0_sys_upper ]; then mkdir -p /tmp/mcos/mcos0_sys_upper; fi
if [ ! -e /tmp/mcos/mcos0_sys_work ]; then mkdir -p /tmp/mcos/mcos0_sys_work; fi
if ! mount -t mcoverlay mcoverlay -o lowerdir=/sys/devices/virtual/mcos/mcos0/sys:/sys,upperdir=/tmp/mcos/mcos0_sys_upper,workdir=/tmp/mcos/mcos0_sys_work,nocopyupw,nofscheck /tmp/mcos/mcos0_sys; then echo "error: mount /tmp/mcos/mcos0_sys" >&2; exit 1; fi
mount --make-rprivate /sys
for cpuid in `find /sys/devices/system/cpu/* -maxdepth 0 -name "cpu[0123456789]*" -printf "%f "`; do
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/devices/system/cpu/$cpuid" ]; then
rm -rf /tmp/mcos/mcos0_sys/devices/system/cpu/$cpuid
fi
done
for cpuid in `find /sys/bus/cpu/devices/* -maxdepth 0 -name "cpu[0123456789]*" -printf "%f "`; do
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/bus/cpu/devices/$cpuid" ]; then
rm -rf /tmp/mcos/mcos0_sys/bus/cpu/devices/$cpuid
fi
done
fi
if [ ${LOGMODE} -ne 0 ]
then
# mcklogd survives when McKernel isn't shut down by mcstop+release.sh
pkill mcklogd
SBINDIR=${SBINDIR} ${SBINDIR}/mcklogd -i ${INTERVAL} -f ${facility}
fi
# Start irqbalance with CPUs and IRQ for McKernel banned
if [ "${irqbalance_used}" == "yes" ]; then
if ! etcdir=@ETCDIR@ perl -e 'use File::Copy qw(copy); $etcdir=$ENV{'etcdir'}; @files = grep { -f } glob "/proc/irq/*/smp_affinity"; foreach $file (@files) { $rel = substr($file, 1); $dir=substr($rel, 0, length($rel)-length("/smp_affinity")); if(0) { print "cp $file $etcdir/$rel\n";} if(system("mkdir -p $etcdir/$dir")){ exit 1;} if(!copy($file,"$etcdir/$rel")){ exit 1;} }' ; then echo "error: saving /proc/irq/*/smp_affinity" >&2; exit 1; fi;
ncpus=`lscpu | grep -E '^CPU\(s\):' | awk '{print $2}'`
smp_affinity_mask=`echo $cpus | ncpus=$ncpus perl -e 'while(<>){@tokens = split /,/;foreach $token (@tokens) {@nums = split /-/,$token; for($num = $nums[0]; $num <= $nums[$#nums]; $num++) {$ndx=int($num/32); $mask[$ndx] |= (1<<($num % 32))}}} $nint32s = int(($ENV{'ncpus'}+31)/32); for($j = $nint32s - 1; $j >= 0; $j--) { if($j != $nint32s - 1){print ",";} $nblks = $j == $nint32s - 1 ? int(($ENV{'ncpus'} % 32)/4) : 8; for($i = $nblks - 1;$i >= 0;$i--){ printf("%01x",($mask[$j] >> ($i*4)) & 0xf);}}'`
if ! ncpus=$ncpus smp_affinity_mask=$smp_affinity_mask perl -e '@dirs = grep { -d } glob "/proc/irq/*"; foreach $dir (@dirs) { $hit = 0; $affinity_str = `cat $dir/smp_affinity`; chomp $affinity_str; @int32strs = split /,/, $affinity_str; @int32strs_mask=split /,/, $ENV{'smp_affinity_mask'}; for($i=0;$i <= $#int32strs_mask; $i++) { $int32strs_inv[$i] = sprintf("%08x",hex($int32strs_mask[$i])^0xffffffff); if($i == 0) { $len = int((($ENV{'ncpus'}%32)+3)/4); $int32strs_inv[$i] = substr($int32strs_inv[$i], -$len, $len); } } $inv = join(",", @int32strs_inv); $nint32s = int(($ENV{'ncpus'}+31)/32); for($j = $nint32s - 1; $j >= 0; $j--) { if(hex($int32strs[$nint32s - 1 - $j]) & hex($int32strs_mask[$nint32s - 1 - $j])) { $hit = 1; }} if($hit == 1) { $cmd = "echo $inv > $dir/smp_affinity 2>/dev/null"; system $cmd;}}'; then echo "error: modifying /proc/irq/*/smp_affinity" >&2; exit 1; fi;
banirq=`cat /proc/interrupts| perl -e 'while(<>) { if(/^\s*(\d+).*IHK\-SMP\s*$/) {print $1;}}'`
sed "s/%mask%/$smp_affinity_mask/g" $ETCDIR/irqbalance_mck.in | sed "s/%banirq%/$banirq/g" > $ETCDIR/irqbalance_mck
if ! systemctl link $ETCDIR/irqbalance_mck.service >/dev/null 2>/dev/null; then echo "error: linking irqbalance_mck" >&2; exit 1; fi;
if ! systemctl start irqbalance_mck.service 2>/dev/null ; then echo "error: starting irqbalance_mck" >&2; exit 1; fi;
# echo cpus=$cpus mask=$smp_affinity_mask banirq=$banirq
fi

View File

@ -10,6 +10,7 @@
prefix="@prefix@"
BINDIR="@BINDIR@"
SBINDIR="@SBINDIR@"
ETCDIR=@ETCDIR@
KMODDIR="@KMODDIR@"
KERNDIR="@KERNDIR@"
@ -17,31 +18,47 @@ mem=""
cpus=""
# No SMP module? Exit.
if [ "`lsmod | grep ihk_smp_x86`" == "" ]; then exit; fi
if [ "`lsmod | grep ihk_smp_x86`" == "" ]; then exit 0; fi
# Destroy all LWK instances
if ls /dev/mcos* 1>/dev/null 2>&1; then
for i in /dev/mcos*; do
ind=`echo $i|cut -c10-`;
if ! ${SBINDIR}/ihkconfig 0 destroy $ind; then echo "error: destroying LWK instance $ind failed" >&2; exit 1; fi
done
fi
# Query IHK-SMP resources and release them
if ! ${SBINDIR}/ihkconfig 0 query cpu > /dev/null; then echo "error: querying cpus" >&2; exit 1; fi
cpus=`${SBINDIR}/ihkconfig 0 query cpu`
if ! ${SBINDIR}/ihkconfig 0 release cpu $cpus > /dev/null; then echo "error: releasing CPUs" >&2; exit 1; fi
if ! ${SBINDIR}/ihkconfig 0 query mem > /dev/null; then echo "error: querying memory" >&2; exit 1; fi
mem=`${SBINDIR}/ihkconfig 0 query mem`
if ! ${SBINDIR}/ihkconfig 0 release mem $mem > /dev/null; then echo "error: releasing memory" >&2; exit 1; fi
# Remove delegator if loaded
if [ "`lsmod | grep mcctrl`" != "" ]; then
if ! rmmod mcctrl; then echo "error: removing mcctrl"; exit; fi
if ! rmmod mcctrl; then echo "error: removing mcctrl" >&2; exit 1; fi
fi
# Destroy all LWK instances
for i in /dev/mcos*; do
ind=`echo $i|cut -c10-`;
if ! ${SBINDIR}/ihkconfig 0 destroy $ind; then echo "error: destroying LWK instance $ind failed"; exit; fi
done
# Query IHK-SMP resources and release them
if ! ${SBINDIR}/ihkconfig 0 query cpu > /dev/null; then echo "error: querying cpus"; exit; fi
cpus=`${SBINDIR}/ihkconfig 0 query cpu`
if ! ${SBINDIR}/ihkconfig 0 release cpu $cpus > /dev/null; then echo "error: releasing CPUs"; exit; fi
if ! ${SBINDIR}/ihkconfig 0 query mem > /dev/null; then echo "error: querying memory"; exit; fi
mem=`${SBINDIR}/ihkconfig 0 query mem`
if ! ${SBINDIR}/ihkconfig 0 release mem $mem > /dev/null; then echo "error: releasing memory"; exit; fi
# Remove SMP module
if [ "`lsmod | grep ihk_smp_x86`" != "" ]; then
if ! rmmod ihk_smp_x86; then echo "error: removing ihk_smp_x86"; exit; fi
if ! rmmod ihk_smp_x86; then echo "error: removing ihk_smp_x86" >&2; exit 1; fi
fi
# Remove core module
if [ "`lsmod | grep -E 'ihk\s' | awk '{print $1}'`" != "" ]; then
if ! rmmod ihk; then echo "error: removing ihk" >&2; exit 1; fi
fi
# Stop mcklogd
pkill mcklogd
# Start irqbalance with the original settings
if [ "`systemctl status irqbalance_mck.service 2> /dev/null |grep -E 'Active: active'`" != "" ]; then
if ! systemctl stop irqbalance_mck.service 2>/dev/null ; then echo "error: stopping irqbalance_mck" >&2; exit 1; fi;
if ! systemctl disable irqbalance_mck.service >/dev/null 2>/dev/null; then echo "error: disabling irqbalance_mck" >&2; exit 1; fi;
if ! etcdir=@ETCDIR@ perl -e '$etcdir=$ENV{'etcdir'}; @files = grep { -f } glob "$etcdir/proc/irq/*/smp_affinity"; foreach $file (@files) { $dest = substr($file, length($etcdir)); if(0) {print "cp $file $dest\n";} system("cp $file $dest 2>/dev/null"); }' ; then echo "error: restoring /proc/irq/*/smp_affinity" >&2; exit 1; fi;
if ! systemctl start irqbalance.service; then echo "error: starting irqbalance" >&2; exit 1; fi;
fi

2482
configure vendored

File diff suppressed because it is too large Load Diff

View File

@ -27,10 +27,27 @@ AC_ARG_WITH([target],
[--with-target={attached-mic | builtin-mic | builtin-x86 | smp-x86}],[target, default is attached-mic]),
[WITH_TARGET=$withval],[WITH_TARGET=yes])
AC_ARG_WITH([system_map],
AS_HELP_STRING(
[--with-system_map=path],[Path to 'System.map file', default is /boot/System.map-uname_r]),
[WITH_SYSTEM_MAP=$withval],[WITH_SYSTEM_MAP=yes])
AC_ARG_ENABLE([dcfa],
[AS_HELP_STRING(
[--enable-dcfa],[Enable DCFA modules])],[],[enable_dcfa=no])
AC_ARG_ENABLE([memdump],
AC_HELP_STRING([--enable-memdump],
[enable dumping memory and analyzing a dump]),
[ENABLE_MEMDUMP=$enableval],
[ENABLE_MEMDUMP=default])
AC_ARG_ENABLE([mcoverlayfs],
AC_HELP_STRING([--enable-mcoverlayfs],
[enable mcoverlayfs implementation]),
[ENABLE_MCOVERLAYFS=$enableval],
[ENABLE_MCOVERLAYFS=yes])
case "X$WITH_KERNELSRC" in
Xyes | Xno | X)
WITH_KERNELSRC='/lib/modules/`uname -r`/build'
@ -49,9 +66,26 @@ fi
test "x$prefix" = xNONE && prefix="$ac_default_prefix"
case $WITH_TARGET in
attached-mic)
attached-mic|builtin-x86|smp-x86)
ARCH=`uname -m`
AC_PROG_CC
XCC=$CC
;;
builtin-mic)
ARCH=k1om
AC_CHECK_PROG(XCC,
[x86_64-$ARCH-linux-gcc],
[x86_64-$ARCH-linux-gcc],
[no])
CC=$XCC
;;
*)
AC_MSG_ERROR([target $WITH_TARGET is unknwon])
;;
esac
case $WITH_TARGET in
attached-mic)
if test "X$KERNDIR" = X; then
KERNDIR="$prefix/attached/kernel"
fi
@ -69,12 +103,6 @@ case $WITH_TARGET in
fi
;;
builtin-mic)
ARCH=k1om
AC_CHECK_PROG(XCC,
[x86_64-$ARCH-linux-gcc],
[x86_64-$ARCH-linux-gcc],
[no])
CC=$XCC
if test "X$KERNDIR" = X; then
KERNDIR="$prefix/attached/kernel"
fi
@ -92,9 +120,6 @@ case $WITH_TARGET in
fi
;;
builtin-x86)
ARCH=`uname -m`
AC_PROG_CC
XCC=$CC
if test "X$KERNDIR" = X; then
KERNDIR="$prefix/attached/kernel"
fi
@ -112,9 +137,6 @@ case $WITH_TARGET in
fi
;;
smp-x86)
ARCH=`uname -m`
AC_PROG_CC
XCC=$CC
if test "X$KERNDIR" = X; then
KERNDIR="$prefix/smp-x86/kernel"
fi
@ -124,6 +146,9 @@ case $WITH_TARGET in
if test "X$SBINDIR" = X; then
SBINDIR="$prefix/sbin"
fi
if test "X$ETCDIR" = X; then
ETCDIR="$prefix/etc"
fi
if test "X$KMODDIR" = X; then
KMODDIR="$prefix/kmod"
fi
@ -139,6 +164,116 @@ esac
KDIR="$WITH_KERNELSRC"
TARGET="$WITH_TARGET"
MCCTRL_LINUX_SYMTAB=""
case "X$WITH_SYSTEM_MAP" in
Xyes | Xno | X)
MCCTRL_LINUX_SYMTAB=""
;;
*)
MCCTRL_LINUX_SYMTAB="$WITH_SYSTEM_MAP"
;;
esac
AC_MSG_CHECKING([[for System.map]])
if test -f "$MCCTRL_LINUX_SYMTAB"; then
MCCTRL_LINUX_SYMTAB="$MCCTRL_LINUX_SYMTAB"
elif test -f "/boot/System.map-`uname -r`"; then
MCCTRL_LINUX_SYMTAB="/boot/System.map-`uname -r`"
elif test -f "$KDIR/System.map"; then
MCCTRL_LINUX_SYMTAB="$KDIR/System.map"
fi
if test "$MCCTRL_LINUX_SYMTAB" == ""; then
AC_MSG_ERROR([could not find])
fi
if test -z "`eval cat $MCCTRL_LINUX_SYMTAB`"; then
AC_MSG_ERROR([could not read System.map file, no read permission?])
fi
AC_MSG_RESULT([$MCCTRL_LINUX_SYMTAB])
MCCTRL_LINUX_SYMTAB_CMD="cat $MCCTRL_LINUX_SYMTAB"
# MCCTRL_FIND_KSYM(SYMBOL)
# ------------------------------------------------------
# Search System.map for address of the given symbol and
# do one of three things in config.h:
# If not found, leave MCCTRL_KSYM_foo undefined
# If found to be exported, "#define MCCTRL_KSYM_foo 0"
# If found not to be exported, "#define MCCTRL_KSYM_foo 0x<value>"
AC_DEFUN([MCCTRL_FIND_KSYM],[
AC_MSG_CHECKING([[System.map for symbol $1]])
mcctrl_addr=`eval $MCCTRL_LINUX_SYMTAB_CMD | grep " $1\$" | cut -d\ -f1`
if test -z $mcctrl_addr; then
AC_MSG_RESULT([not found])
else
mcctrl_result=$mcctrl_addr
mcctrl_addr="0x$mcctrl_addr"
m4_ifval([$2],[],[
if `eval $MCCTRL_LINUX_SYMTAB_CMD | grep " __ksymtab_$1\$" >/dev/null`; then
mcctrl_result="exported"
mcctrl_addr="0"
fi
])
AC_MSG_RESULT([$mcctrl_result])
AC_DEFINE_UNQUOTED(MCCTRL_KSYM_[]$1,$mcctrl_addr,[Define to address of kernel symbol $1, or 0 if exported])
fi
])
MCCTRL_FIND_KSYM([sys_mount])
MCCTRL_FIND_KSYM([sys_unshare])
MCCTRL_FIND_KSYM([zap_page_range])
MCCTRL_FIND_KSYM([vdso_image_64])
MCCTRL_FIND_KSYM([vdso_start])
MCCTRL_FIND_KSYM([vdso_end])
MCCTRL_FIND_KSYM([vdso_pages])
MCCTRL_FIND_KSYM([__vvar_page])
MCCTRL_FIND_KSYM([hpet_address])
MCCTRL_FIND_KSYM([hv_clock])
MCCTRL_FIND_KSYM([sys_readlink])
case $ENABLE_MEMDUMP in
yes|no|auto)
;;
default)
if test "x$WITH_TARGET" = "xsmp-x86" ; then
ENABLE_MEMDUMP=auto
else
ENABLE_MEMDUMP=no
fi
;;
*)
AC_MSG_ERROR([unknown memdump argument: $ENABLE_MEMDUMP])
;;
esac
if test "x$ENABLE_MEMDUMP" != "xno" ; then
enableval=yes
AC_CHECK_LIB([bfd],[bfd_init],[],[enableval=no])
AC_CHECK_HEADER([bfd.h],[],[enableval=no])
if test "x$ENABLE_MEMDUMP" = "xyes" -a "x$enableval" = "xno" ; then
AC_MSG_ERROR([memdump feature needs bfd.h and libbfd a.k.a bunutils-devel])
fi
ENABLE_MEMDUMP=$enableval
fi
if test "x$ENABLE_MEMDUMP" = "xyes" ; then
AC_MSG_NOTICE([memdump feature is enabled])
AC_DEFINE([ENABLE_MEMDUMP],[1],[whether memdump feature is enabled])
uncomment_if_ENABLE_MEMDUMP=''
else
AC_MSG_NOTICE([memdump feature is disabled])
uncomment_if_ENABLE_MEMDUMP='#'
fi
if test "x$ENABLE_MCOVERLAYFS" = "xyes" ; then
AC_DEFINE([ENABLE_MCOVERLAYFS],[1],[whether mcoverlayfs is enabled])
AC_MSG_NOTICE([mcoverlayfs is enabled])
else
AC_MSG_NOTICE([mcoverlayfs is disabled])
fi
AC_SUBST(CC)
AC_SUBST(XCC)
AC_SUBST(ARCH)
@ -146,9 +281,11 @@ AC_SUBST(KDIR)
AC_SUBST(TARGET)
AC_SUBST(BINDIR)
AC_SUBST(SBINDIR)
AC_SUBST(ETCDIR)
AC_SUBST(KMODDIR)
AC_SUBST(KERNDIR)
AC_SUBST(MANDIR)
AC_SUBST(ENABLE_MCOVERLAYFS)
AC_SUBST(IHK_VERSION)
AC_SUBST(MCKERNEL_VERSION)
@ -156,11 +293,17 @@ AC_SUBST(DCFA_VERSION)
AC_SUBST(IHK_RELEASE_DATE)
AC_SUBST(MCKERNEL_RELEASE_DATE)
AC_SUBST(DCFA_RESEASE_DATE)
AC_SUBST(uncomment_if_ENABLE_MEMDUMP)
AC_CONFIG_HEADERS([executer/config.h])
AC_CONFIG_FILES([
Makefile
executer/user/Makefile
executer/kernel/Makefile
executer/kernel/mcctrl/Makefile
executer/kernel/mcctrl/arch/x86_64/Makefile
executer/kernel/mcoverlayfs/Makefile
executer/kernel/mcoverlayfs/linux-3.10.0-327.36.1.el7/Makefile
executer/kernel/mcoverlayfs/linux-4.0.9/Makefile
kernel/Makefile
kernel/Makefile.build
arch/x86/tools/mcreboot-attached-mic.sh
@ -170,6 +313,8 @@ AC_CONFIG_FILES([
arch/x86/tools/mcstop+release-smp-x86.sh
arch/x86/tools/mcshutdown-builtin-x86.sh
arch/x86/tools/mcreboot.1:arch/x86/tools/mcreboot.1in
arch/x86/tools/irqbalance_mck.service
arch/x86/tools/irqbalance_mck.in
])
AS_IF([test "x$enable_dcfa" = xyes], [

91
executer/config.h.in Normal file
View File

@ -0,0 +1,91 @@
/* executer/config.h.in. Generated from configure.ac by autoheader. */
/* whether mcoverlayfs is enabled */
#undef ENABLE_MCOVERLAYFS
/* whether memdump feature is enabled */
#undef ENABLE_MEMDUMP
/* Define to 1 if you have the <inttypes.h> header file. */
#undef HAVE_INTTYPES_H
/* Define to 1 if you have the `bfd' library (-lbfd). */
#undef HAVE_LIBBFD
/* Define to 1 if you have the <memory.h> header file. */
#undef HAVE_MEMORY_H
/* Define to 1 if you have the <stdint.h> header file. */
#undef HAVE_STDINT_H
/* Define to 1 if you have the <stdlib.h> header file. */
#undef HAVE_STDLIB_H
/* Define to 1 if you have the <strings.h> header file. */
#undef HAVE_STRINGS_H
/* Define to 1 if you have the <string.h> header file. */
#undef HAVE_STRING_H
/* Define to 1 if you have the <sys/stat.h> header file. */
#undef HAVE_SYS_STAT_H
/* Define to 1 if you have the <sys/types.h> header file. */
#undef HAVE_SYS_TYPES_H
/* Define to 1 if you have the <unistd.h> header file. */
#undef HAVE_UNISTD_H
/* Define to address of kernel symbol __vvar_page, or 0 if exported */
#undef MCCTRL_KSYM___vvar_page
/* Define to address of kernel symbol hpet_address, or 0 if exported */
#undef MCCTRL_KSYM_hpet_address
/* Define to address of kernel symbol hv_clock, or 0 if exported */
#undef MCCTRL_KSYM_hv_clock
/* Define to address of kernel symbol sys_mount, or 0 if exported */
#undef MCCTRL_KSYM_sys_mount
/* Define to address of kernel symbol sys_readlink, or 0 if exported */
#undef MCCTRL_KSYM_sys_readlink
/* Define to address of kernel symbol sys_unshare, or 0 if exported */
#undef MCCTRL_KSYM_sys_unshare
/* Define to address of kernel symbol vdso_end, or 0 if exported */
#undef MCCTRL_KSYM_vdso_end
/* Define to address of kernel symbol vdso_image_64, or 0 if exported */
#undef MCCTRL_KSYM_vdso_image_64
/* Define to address of kernel symbol vdso_pages, or 0 if exported */
#undef MCCTRL_KSYM_vdso_pages
/* Define to address of kernel symbol vdso_start, or 0 if exported */
#undef MCCTRL_KSYM_vdso_start
/* Define to address of kernel symbol zap_page_range, or 0 if exported */
#undef MCCTRL_KSYM_zap_page_range
/* Define to the address where bug reports for this package should be sent. */
#undef PACKAGE_BUGREPORT
/* Define to the full name of this package. */
#undef PACKAGE_NAME
/* Define to the full name and version of this package. */
#undef PACKAGE_STRING
/* Define to the one symbol short name of this package. */
#undef PACKAGE_TARNAME
/* Define to the home page for this package. */
#undef PACKAGE_URL
/* Define to the version of this package. */
#undef PACKAGE_VERSION
/* Define to 1 if you have the ANSI C header files. */
#undef STDC_HEADERS

View File

@ -48,6 +48,9 @@
#define MCEXEC_UP_OPEN_EXEC 0x30a02912
#define MCEXEC_UP_CLOSE_EXEC 0x30a02913
#define MCEXEC_UP_SYS_MOUNT 0x30a02914
#define MCEXEC_UP_SYS_UNSHARE 0x30a02915
#define MCEXEC_UP_DEBUG_LOG 0x40000000
#define MCEXEC_UP_TRANSFER_TO_REMOTE 0
@ -83,6 +86,9 @@ struct program_load_desc {
int stack_prot;
int pgid;
int cred[8];
int reloc;
char enable_vdso;
char padding[7];
unsigned long entry;
unsigned long user_start;
unsigned long user_end;
@ -104,6 +110,13 @@ struct program_load_desc {
};
struct syscall_request {
/* TID of requesting thread */
int rtid;
/*
* TID of target thread. Remote page fault response needs to designate the
* thread that must serve the request, 0 indicates any thread from the pool
*/
int ttid;
unsigned long valid;
unsigned long number;
unsigned long args[6];
@ -122,8 +135,17 @@ struct syscall_load_desc {
unsigned long size;
};
#define IHK_SCD_REQ_THREAD_SPINNING 0
#define IHK_SCD_REQ_THREAD_TO_BE_WOKEN 1
#define IHK_SCD_REQ_THREAD_DESCHEDULED 2
struct syscall_response {
/* TID of the thread that requested the service */
int ttid;
/* TID of the mcexec thread that is serving or has served the request */
int stid;
unsigned long status;
unsigned long req_thread_status;
long ret;
unsigned long fault_address;
unsigned long fault_reason;
@ -166,4 +188,16 @@ struct newprocess_desc {
int pid;
};
struct sys_mount_desc {
char *dev_name;
char *dir_name;
char *type;
unsigned long flags;
void *data;
};
struct sys_unshare_desc {
unsigned long unshare_flags;
};
#endif

View File

@ -1,26 +0,0 @@
KDIR ?= @KDIR@
ARCH ?= @ARCH@
src = @abs_srcdir@
KMODDIR=@KMODDIR@
BINDIR=@BINDIR@
IHK_BASE=$(src)/../../../ihk
obj-m += mcctrl.o
ccflags-y := -I$(IHK_BASE)/linux/include -I$(IHK_BASE)/ikc/include -I$(IHK_BASE)/include -I$(src)/../include -mcmodel=kernel -mno-red-zone -DMCEXEC_PATH=\"$(BINDIR)/mcexec\"
mcctrl-y := driver.o control.o ikc.o syscall.o procfs.o binfmt_mcexec.o
KBUILD_EXTRA_SYMBOLS = @abs_builddir@/../../../ihk/linux/core/Module.symvers
.PHONY: clean install modules
modules:
$(MAKE) -C $(KDIR) M=$(PWD) SUBDIRS=$(PWD) ARCH=$(ARCH) modules
clean:
$(RM) .*.cmd *.mod.c *.o *.ko* Module.symvers modules.order -r .tmp*
install:
mkdir -p -m 755 $(KMODDIR)
install -m 644 mcctrl.ko $(KMODDIR)

View File

@ -1,191 +0,0 @@
/**
* \file mcctrl.h
* License details are found in the file LICENSE.
* \brief
* define data structure
* \author Taku Shimosawa <shimosawa@is.s.u-tokyo.ac.jp> \par
* Copyright (C) 2011 - 2012 Taku Shimosawa
* \author Balazs Gerofi <bgerofi@riken.jp> \par
* Copyright (C) 2012 RIKEN AICS
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com> \par
* Copyright (C) 2012 - 2013 Hitachi, Ltd.
* \author Tomoki Shirasawa <tomoki.shirasawa.kk@hitachi-solutions.com> \par
* Copyright (C) 2012 - 2013 Hitachi, Ltd.
* \author Balazs Gerofi <bgerofi@is.s.u-tokyo.ac.jp> \par
* Copyright (C) 2013 The University of Tokyo
*/
/*
* HISTORY:
* 2013/11/07 hamada added <sys/resource.h> which is required by getrlimit(2)
* 2013/10/21 nakamura exclude interpreter's segment from data region
* 2013/10/11 nakamura mcexec: add a upper limit of the stack size
* 2013/10/11 nakamura mcexec: add a path prefix for interpreter search
* 2013/10/11 nakamura mcexec: add a interpreter invocation
* 2013/10/08 nakamura add a AT_ENTRY entry to the auxiliary vector
* 2013/09/02 shirasawa add terminate thread
* 2013/08/19 shirasawa mcexec forward signal to MIC process
* 2013/08/07 nakamura add page fault forwarding
* 2013/07/26 shirasawa mcexec print signum or exit status
* 2013/07/17 nakamura create more mcexec thread so that all cpu to be serviced
* 2013/04/17 nakamura add generic system call forwarding
*/
#ifndef HEADER_MCCTRL_H
#define HEADER_MCCTRL_H
#include <ihk/ihk_host_driver.h>
#include <uprotocol.h>
#include <linux/wait.h>
#include <ihk/ikc.h>
#include <ikc/master.h>
#define SCD_MSG_PREPARE_PROCESS 0x1
#define SCD_MSG_PREPARE_PROCESS_ACKED 0x2
#define SCD_MSG_PREPARE_PROCESS_NACKED 0x7
#define SCD_MSG_SCHEDULE_PROCESS 0x3
#define SCD_MSG_INIT_CHANNEL 0x5
#define SCD_MSG_INIT_CHANNEL_ACKED 0x6
#define SCD_MSG_SYSCALL_ONESIDE 0x4
#define SCD_MSG_SEND_SIGNAL 0x8
#define SCD_MSG_CLEANUP_PROCESS 0x9
#define SCD_MSG_PROCFS_CREATE 0x10
#define SCD_MSG_PROCFS_DELETE 0x11
#define SCD_MSG_PROCFS_REQUEST 0x12
#define SCD_MSG_PROCFS_ANSWER 0x13
#define SCD_MSG_DEBUG_LOG 0x20
#define DMA_PIN_SHIFT 21
#define DO_USER_MODE
#define __NR_coredump 999
struct coretable {
int len;
unsigned long addr;
};
struct ikc_scd_packet {
int msg;
int ref;
int osnum;
int pid;
int err;
unsigned long arg;
};
struct mcctrl_priv {
ihk_os_t os;
struct program_load_desc *desc;
};
struct ikc_scd_init_param {
unsigned long request_page;
unsigned long response_page;
unsigned long doorbell_page;
unsigned long post_page;
};
struct syscall_post {
unsigned long v[8];
};
struct syscall_params {
unsigned long request_pa;
struct syscall_request *request_va;
unsigned long response_rpa, response_pa;
struct syscall_response *response_va;
unsigned long post_pa;
struct syscall_post *post_va;
unsigned long doorbell_pa;
unsigned long *doorbell_va;
};
struct wait_queue_head_list_node {
struct list_head list;
wait_queue_head_t wq_syscall;
int pid;
int req;
};
struct mcctrl_channel {
struct ihk_ikc_channel_desc *c;
struct syscall_params param;
struct ikc_scd_init_param init;
void *dma_buf;
struct list_head wq_list;
ihk_spinlock_t wq_list_lock;
};
struct mcctrl_per_proc_data {
struct list_head list;
int pid;
unsigned long rpgtable; /* per process, not per OS */
};
struct mcctrl_usrdata {
struct ihk_ikc_listen_param listen_param;
struct ihk_ikc_listen_param listen_param2;
ihk_os_t os;
int num_channels;
struct mcctrl_channel *channels;
unsigned long *mcctrl_doorbell_va;
unsigned long mcctrl_doorbell_pa;
int remaining_job;
int base_cpu;
int job_pos;
int mcctrl_dma_abort;
unsigned long last_thread_exec;
wait_queue_head_t wq_prepare;
struct list_head per_proc_list;
ihk_spinlock_t per_proc_list_lock;
void **keys;
};
struct mcctrl_signal {
int cond;
int sig;
int pid;
int tid;
char info[128];
};
int mcctrl_ikc_send(ihk_os_t os, int cpu, struct ikc_scd_packet *pisp);
int mcctrl_ikc_send_msg(ihk_os_t os, int cpu, int msg, int ref, unsigned long arg);
int mcctrl_ikc_is_valid_thread(ihk_os_t os, int cpu);
int reserve_user_space(struct mcctrl_usrdata *usrdata, unsigned long *startp,
unsigned long *endp);
/* syscall.c */
int init_peer_channel_registry(struct mcctrl_usrdata *ud);
int register_peer_channel(struct mcctrl_usrdata *ud, void *key, struct mcctrl_channel *ch);
int deregister_peer_channel(struct mcctrl_usrdata *ud, void *key, struct mcctrl_channel *ch);
struct mcctrl_channel *get_peer_channel(struct mcctrl_usrdata *ud, void *key);
int __do_in_kernel_syscall(ihk_os_t os, struct mcctrl_channel *c, struct syscall_request *sc);
#define PROCFS_NAME_MAX 1000
struct procfs_read {
unsigned long pbuf; /* physical address of the host buffer (request) */
unsigned long offset; /* offset to read (request) */
int count; /* bytes to read (request) */
int eof; /* if eof is detected, 1 otherwise 0. (answer)*/
int ret; /* read bytes (answer) */
int status; /* non-zero if done (answer) */
int newcpu; /* migrated new cpu (answer) */
char fname[PROCFS_NAME_MAX]; /* procfs filename (request) */
};
struct procfs_file {
int status; /* status of processing (answer) */
int mode; /* file mode (request) */
char fname[PROCFS_NAME_MAX]; /* procfs filename (request) */
};
#endif

View File

@ -0,0 +1,27 @@
KDIR ?= @KDIR@
ARCH ?= @ARCH@
src = @abs_srcdir@
KMODDIR=@KMODDIR@
BINDIR=@BINDIR@
IHK_BASE=$(src)/../../../../ihk
obj-m += mcctrl.o
ccflags-y := -I$(IHK_BASE)/linux/include -I$(IHK_BASE)/linux/include/ihk/arch/$(ARCH) -I$(IHK_BASE)/ikc/include -I$(IHK_BASE)/ikc/include/ikc/arch/$(ARCH) -I$(IHK_BASE)/include -I$(IHK_BASE)/include/arch/$(ARCH) -I$(src)/../../include -mcmodel=kernel -mno-red-zone -DMCEXEC_PATH=\"$(BINDIR)/mcexec\" -I@abs_builddir@
mcctrl-y := driver.o control.o ikc.o syscall.o procfs.o binfmt_mcexec.o
mcctrl-y += sysfs.o sysfs_files.o arch/$(ARCH)/archdeps.o
KBUILD_EXTRA_SYMBOLS = @abs_builddir@/../../../../ihk/linux/core/Module.symvers
.PHONY: clean install modules
modules:
$(MAKE) -C $(KDIR) M=$(PWD) SUBDIRS=$(PWD) ARCH=$(ARCH) modules
clean:
$(RM) .*.cmd *.mod.c *.o *.ko* Module.symvers modules.order -r .tmp*
install:
mkdir -p -m 755 $(KMODDIR)
install -m 644 mcctrl.ko $(KMODDIR)

View File

@ -0,0 +1 @@
# dummy file

View File

@ -0,0 +1,192 @@
#include <linux/version.h>
#include "../../config.h"
#include "../../mcctrl.h"
#ifdef MCCTRL_KSYM_vdso_image_64
#if MCCTRL_KSYM_vdso_image_64
struct vdso_image *vdso_image = (void *)MCCTRL_KSYM_vdso_image_64;
#endif
#endif
#ifdef MCCTRL_KSYM_vdso_start
#if MCCTRL_KSYM_vdso_start
void *vdso_start = (void *)MCCTRL_KSYM_vdso_start;
#endif
#endif
#ifdef MCCTRL_KSYM_vdso_end
#if MCCTRL_KSYM_vdso_end
void *vdso_end = (void *)MCCTRL_KSYM_vdso_end;
#endif
#endif
#ifdef MCCTRL_KSYM_vdso_pages
#if MCCTRL_KSYM_vdso_pages
struct page **vdso_pages = (void *)MCCTRL_KSYM_vdso_pages;
#endif
#endif
#ifdef MCCTRL_KSYM___vvar_page
#if MCCTRL_KSYM___vvar_page
void *__vvar_page = (void *)MCCTRL_KSYM___vvar_page;
#endif
#endif
long *hpet_addressp
#ifdef MCCTRL_KSYM_hpet_address
#if MCCTRL_KSYM_hpet_address
= (void *)MCCTRL_KSYM_hpet_address;
#else
= &hpet_address;
#endif
#else
= NULL;
#endif
void **hv_clockp
#ifdef MCCTRL_KSYM_hv_clock
#if MCCTRL_KSYM_hv_clock
= (void *)MCCTRL_KSYM_hv_clock;
#else
= &hv_clock;
#endif
#else
= NULL;
#endif
unsigned long
reserve_user_space_common(struct mcctrl_usrdata *usrdata, unsigned long start, unsigned long end);
int
reserve_user_space(struct mcctrl_usrdata *usrdata, unsigned long *startp, unsigned long *endp)
{
struct vm_area_struct *vma;
unsigned long start = 0L;
unsigned long end;
#define DESIRED_USER_END 0x800000000000
#define GAP_FOR_MCEXEC 0x008000000000UL
end = DESIRED_USER_END;
down_write(&current->mm->mmap_sem);
vma = find_vma(current->mm, 0);
if (vma) {
end = (vma->vm_start - GAP_FOR_MCEXEC) & ~(GAP_FOR_MCEXEC - 1);
}
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,5,0)
up_write(&current->mm->mmap_sem);
#endif
start = reserve_user_space_common(usrdata, start, end);
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,5,0)
up_write(&current->mm->mmap_sem);
#endif
if (IS_ERR_VALUE(start)) {
return start;
}
*startp = start;
*endp = end;
return 0;
}
void get_vdso_info(ihk_os_t os, long vdso_rpa)
{
ihk_device_t dev = ihk_os_to_dev(os);
long vdso_pa;
struct vdso *vdso;
size_t size;
int i;
vdso_pa = ihk_device_map_memory(dev, vdso_rpa, sizeof(*vdso));
vdso = ihk_device_map_virtual(dev, vdso_pa, sizeof(*vdso), NULL, 0);
/* VDSO pages */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,16,0)
size = vdso_image->size;
vdso->vdso_npages = size >> PAGE_SHIFT;
if (vdso->vdso_npages > VDSO_MAXPAGES) {
vdso->vdso_npages = 0;
goto out;
}
for (i = 0; i < vdso->vdso_npages; ++i) {
vdso->vdso_physlist[i] = virt_to_phys(
vdso_image->data + (i * PAGE_SIZE));
}
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,23)
size = vdso_end - vdso_start;
size = (size + PAGE_SIZE - 1) & PAGE_MASK;
vdso->vdso_npages = size >> PAGE_SHIFT;
if (vdso->vdso_npages > VDSO_MAXPAGES) {
vdso->vdso_npages = 0;
goto out;
}
for (i = 0; i < vdso->vdso_npages; ++i) {
vdso->vdso_physlist[i] = page_to_phys(vdso_pages[i]);
}
#endif
/* VVAR page */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,5,0)
vdso->vvar_is_global = 0;
vdso->vvar_virt = (void *)(-3 * PAGE_SIZE);
vdso->vvar_phys = virt_to_phys(__vvar_page);
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,17,0)
vdso->vvar_is_global = 0;
vdso->vvar_virt = (void *)(-2 * PAGE_SIZE);
vdso->vvar_phys = virt_to_phys(__vvar_page);
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,16,0)
vdso->vvar_is_global = 0;
vdso->vvar_virt = (void *)(vdso->vdso_npages * PAGE_SIZE);
vdso->vvar_phys = virt_to_phys(__vvar_page);
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,1,0)
vdso->vvar_is_global = 1;
vdso->vvar_virt = (void *)fix_to_virt(VVAR_PAGE);
vdso->vvar_phys = virt_to_phys(__vvar_page);
#endif
/* HPET page */
if (hpet_addressp && *hpet_addressp) {
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,5,0)
vdso->hpet_is_global = 0;
vdso->hpet_virt = (void *)(-2 * PAGE_SIZE);
vdso->hpet_phys = *hpet_addressp;
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,17,0)
vdso->hpet_is_global = 0;
vdso->hpet_virt = (void *)(-1 * PAGE_SIZE);
vdso->hpet_phys = *hpet_addressp;
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,16,0)
vdso->hpet_is_global = 0;
vdso->hpet_virt = (void *)((vdso->vdso_npages + 1) * PAGE_SIZE);
vdso->hpet_phys = *hpet_addressp;
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,23)
vdso->hpet_is_global = 1;
vdso->hpet_virt = (void *)fix_to_virt(VSYSCALL_HPET);
vdso->hpet_phys = *hpet_addressp;
#endif
}
/* struct pvlock_vcpu_time_info table */
if (hv_clockp && *hv_clockp) {
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,5,0)
vdso->pvti_is_global = 0;
vdso->pvti_virt = (void *)(-1 * PAGE_SIZE);
vdso->pvti_phys = virt_to_phys(*hv_clockp);
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,8,0)
vdso->pvti_is_global = 1;
vdso->pvti_virt = (void *)fix_to_virt(PVCLOCK_FIXMAP_BEGIN);
vdso->pvti_phys = virt_to_phys(*hv_clockp);
#endif
}
out:
wmb();
vdso->busy = 0;
ihk_device_unmap_virtual(dev, vdso, sizeof(*vdso));
ihk_device_unmap_memory(dev, vdso_pa, sizeof(*vdso));
return;
} /* get_vdso_info() */

View File

@ -45,7 +45,6 @@ static int load_elf(struct linux_binprm *bprm
#endif
)
{
char mcexec[BINPRM_BUF_SIZE];
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,36)
const
#endif
@ -60,12 +59,8 @@ static int load_elf(struct linux_binprm *bprm
int l;
} envdata;
envdata env[] = {
{.name = "MCEXEC"},
#define env_mcexec (env[0].val)
{.name = "MCEXEC_WL"},
#define env_mcexec_wl (env[1].val)
{.name = "MCEXEC_BL"},
#define env_mcexec_bl (env[2].val)
#define env_mcexec_wl (env[0].val)
{.name = NULL}
};
envdata *ep;
@ -120,9 +115,15 @@ static int load_elf(struct linux_binprm *bprm
for(i = 0, st = 0; mode != 2;){
if(st == 0){
off = p & ~PAGE_MASK;
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,6,0)
rc = get_user_pages_remote(current, bprm->mm,
bprm->p, 1, 0, 1,
&page, NULL);
#else
rc = get_user_pages(current, bprm->mm,
bprm->p, 1, 0, 1,
&page, NULL);
#endif
if(rc <= 0)
return -EFAULT;
addr = kmap_atomic(page
@ -190,23 +191,10 @@ static int load_elf(struct linux_binprm *bprm
}
}
if(!env_mcexec || !strcmp(env_mcexec, "0") || !strcmp(env_mcexec, "off"))
rc = 1;
else{
rc = 0;
if(strchr(env_mcexec, '/') && strlen(env_mcexec) < BINPRM_BUF_SIZE)
strcpy(mcexec, env_mcexec);
else
strcpy(mcexec, MCEXEC_PATH);
}
if(rc);
else if(env_mcexec_wl)
if(env_mcexec_wl)
rc = !pathcheck(path, env_mcexec_wl);
else if(env_mcexec_bl)
rc = pathcheck(path, env_mcexec_bl);
else
rc = pathcheck(path, "/usr:/bin:/sbin:/opt");
rc = 1;
for(ep = env; ep->name; ep++)
if(ep->val)
@ -214,7 +202,7 @@ static int load_elf(struct linux_binprm *bprm
if(rc)
return -ENOEXEC;
file = open_exec(mcexec);
file = open_exec(MCEXEC_PATH);
if (IS_ERR(file))
return -ENOEXEC;
@ -229,29 +217,18 @@ static int load_elf(struct linux_binprm *bprm
return rc;
}
bprm->argc++;
wp = mcexec;
wp = MCEXEC_PATH;
rc = copy_strings_kernel(1, &wp, bprm);
if (rc){
fput(file);
return rc;
}
bprm->argc++;
#if 1
rc = bprm_change_interp(mcexec, bprm);
rc = bprm_change_interp(MCEXEC_PATH, bprm);
if (rc < 0){
fput(file);
return rc;
}
#else
if(brpm->interp != bprm->filename)
kfree(brpm->interp);
kfree(brpm->filename);
bprm->filename = bprm->interp = kstrdup(mcexec, GFP_KERNEL);
if(!bprm->interp){
fput(file);
return -ENOMEM;
}
#endif
allow_write_access(bprm->file);
fput(bprm->file);
@ -278,7 +255,7 @@ void __init binfmt_mcexec_init(void)
insert_binfmt(&mcexec_format);
}
void __exit binfmt_mcexec_exit(void)
void binfmt_mcexec_exit(void)
{
unregister_binfmt(&mcexec_format);
}

View File

@ -32,10 +32,12 @@
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/version.h>
#include <linux/semaphore.h>
#include <linux/interrupt.h>
#include <asm/uaccess.h>
#include <asm/delay.h>
#include <asm/msr.h>
#include <asm/io.h>
#include "../../config.h"
#include "mcctrl.h"
//#define DEBUG
@ -46,6 +48,28 @@
#define dprintk(...)
#endif
#ifdef MCCTRL_KSYM_sys_unshare
#if MCCTRL_KSYM_sys_unshare
typedef int (*int_star_fn_ulong_t)(unsigned long);
int (*mcctrl_sys_unshare)(unsigned long unshare_flags) =
(int_star_fn_ulong_t)
MCCTRL_KSYM_sys_unshare;
#else // exported
int (*mcctrl_sys_unshare)(unsigned long unshare_flags) = NULL;
#endif
#endif
#ifdef MCCTRL_KSYM_sys_mount
#if MCCTRL_KSYM_sys_mount
typedef int (*int_star_fn_char_char_char_ulong_void_t)(char *, char *, char *, unsigned long, void *);
int (*mcctrl_sys_mount)(char *dev_name,char *dir_name, char *type, unsigned long flags, void *data) =
(int_star_fn_char_char_char_ulong_void_t)
MCCTRL_KSYM_sys_mount;
#else // exported
int (*mcctrl_sys_mount)(char *dev_name,char *dir_name, char *type, unsigned long flags, void *data) = NULL;
#endif
#endif
//static DECLARE_WAIT_QUEUE_HEAD(wq_prepare);
//extern struct mcctrl_channel *channels;
int mcctrl_ikc_set_recv_cpu(ihk_os_t os, int cpu);
@ -58,7 +82,6 @@ static long mcexec_prepare_image(ihk_os_t os,
void *args, *envs;
long ret = 0;
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
unsigned long flags;
struct mcctrl_per_proc_data *ppd = NULL;
if (copy_from_user(&desc, udesc,
@ -101,52 +124,48 @@ static long mcexec_prepare_image(ihk_os_t os,
}
pdesc->args = (void*)virt_to_phys(args);
printk("args: 0x%lX\n", (unsigned long)pdesc->args);
printk("argc: %d\n", *(int*)args);
dprintk("args: 0x%lX\n", (unsigned long)pdesc->args);
dprintk("argc: %ld\n", *(long *)args);
pdesc->envs = (void*)virt_to_phys(envs);
printk("envs: 0x%lX\n", (unsigned long)pdesc->envs);
printk("envc: %d\n", *(int*)envs);
dprintk("envs: 0x%lX\n", (unsigned long)pdesc->envs);
dprintk("envc: %ld\n", *(long *)envs);
isp.msg = SCD_MSG_PREPARE_PROCESS;
isp.ref = pdesc->cpu;
isp.arg = virt_to_phys(pdesc);
printk("# of sections: %d\n", pdesc->num_sections);
printk("%p (%lx)\n", pdesc, isp.arg);
dprintk("# of sections: %d\n", pdesc->num_sections);
dprintk("%p (%lx)\n", pdesc, isp.arg);
pdesc->status = 0;
mcctrl_ikc_send(os, pdesc->cpu, &isp);
wait_event_interruptible(usrdata->wq_prepare, pdesc->status);
while (wait_event_interruptible(usrdata->wq_prepare, pdesc->status) != 0);
if(pdesc->err < 0){
ret = pdesc->err;
goto free_out;
}
ppd = kmalloc(sizeof(*ppd), GFP_ATOMIC);
ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(current));
if (!ppd) {
printk("ERROR: allocating per process data\n");
ret = -ENOMEM;
printk("ERROR: no per process data for PID %d\n", task_tgid_vnr(current));
ret = -EINVAL;
goto free_out;
}
ppd->pid = pdesc->pid;
/* Update rpgtable */
ppd->rpgtable = pdesc->rpgtable;
flags = ihk_ikc_spinlock_lock(&usrdata->per_proc_list_lock);
list_add_tail(&ppd->list, &usrdata->per_proc_list);
ihk_ikc_spinlock_unlock(&usrdata->per_proc_list_lock, flags);
dprintk("pid %d, rpgtable: 0x%lx added\n",
ppd->pid, ppd->rpgtable);
if (copy_to_user(udesc, pdesc, sizeof(struct program_load_desc) +
sizeof(struct program_image_section) * desc.num_sections)) {
ret = -EFAULT;
goto free_out;
}
dprintk("%s: pid %d, rpgtable: 0x%lx added\n",
__FUNCTION__, ppd->pid, ppd->rpgtable);
ret = 0;
free_out:
@ -264,12 +283,15 @@ static void release_handler(ihk_os_t os, void *param)
{
struct handlerinfo *info = param;
struct ikc_scd_packet isp;
int os_ind = ihk_host_os_get_index(os);
memset(&isp, '\0', sizeof isp);
isp.msg = SCD_MSG_CLEANUP_PROCESS;
isp.pid = info->pid;
mcctrl_ikc_send(os, 0, &isp);
if(os_ind >= 0)
delete_pid_entry(os_ind, info->pid);
kfree(param);
}
@ -391,19 +413,200 @@ static long mcexec_get_cpu(ihk_os_t os)
return info->n_cpus;
}
int mcexec_syscall(struct mcctrl_channel *c, int pid, unsigned long arg)
int mcctrl_add_per_proc_data(struct mcctrl_usrdata *ud, int pid,
struct mcctrl_per_proc_data *ppd)
{
struct mcctrl_per_proc_data *ppd_iter;
int hash = (pid & MCCTRL_PER_PROC_DATA_HASH_MASK);
int ret = 0;
unsigned long flags;
/* Check if data for this thread exists and add if not */
write_lock_irqsave(&ud->per_proc_data_hash_lock[hash], flags);
list_for_each_entry(ppd_iter, &ud->per_proc_data_hash[hash], hash) {
if (ppd_iter->pid == pid) {
ret = -EBUSY;
goto out;
}
}
list_add_tail(&ppd->hash, &ud->per_proc_data_hash[hash]);
out:
write_unlock_irqrestore(&ud->per_proc_data_hash_lock[hash], flags);
return ret;
}
int mcctrl_delete_per_proc_data(struct mcctrl_usrdata *ud, int pid)
{
struct mcctrl_per_proc_data *ppd_iter, *ppd = NULL;
int hash = (pid & MCCTRL_PER_PROC_DATA_HASH_MASK);
int ret = 0;
unsigned long flags;
write_lock_irqsave(&ud->per_proc_data_hash_lock[hash], flags);
list_for_each_entry(ppd_iter, &ud->per_proc_data_hash[hash], hash) {
if (ppd_iter->pid == pid) {
ppd = ppd_iter;
break;
}
}
if (!ppd) {
ret = -EINVAL;
goto out;
}
list_del(&ppd->hash);
out:
write_unlock_irqrestore(&ud->per_proc_data_hash_lock[hash], flags);
return ret;
}
inline struct mcctrl_per_proc_data *mcctrl_get_per_proc_data(
struct mcctrl_usrdata *ud, int pid)
{
struct mcctrl_per_proc_data *ppd_iter, *ppd = NULL;
int hash = (pid & MCCTRL_PER_PROC_DATA_HASH_MASK);
unsigned long flags;
/* Check if data for this process exists and return it */
read_lock_irqsave(&ud->per_proc_data_hash_lock[hash], flags);
list_for_each_entry(ppd_iter, &ud->per_proc_data_hash[hash], hash) {
if (ppd_iter->pid == pid) {
ppd = ppd_iter;
break;
}
}
read_unlock_irqrestore(&ud->per_proc_data_hash_lock[hash], flags);
return ppd;
}
/*
* Called indirectly from the IKC message handler.
*/
int mcexec_syscall(struct mcctrl_usrdata *ud, struct ikc_scd_packet *packet)
{
struct wait_queue_head_list_node *wqhln = NULL;
struct wait_queue_head_list_node *wqhln_iter;
struct wait_queue_head_list_node *wqhln_alloc = NULL;
int pid = packet->pid;
unsigned long flags;
struct mcctrl_per_proc_data *ppd;
/* Look up per-process structure */
ppd = mcctrl_get_per_proc_data(ud, pid);
if (unlikely(!ppd)) {
kprintf("%s: ERROR: no per-process structure for PID %d??\n",
__FUNCTION__, task_tgid_vnr(current));
return 0;
}
dprintk("%s: (packet_handler) rtid: %d, ttid: %d, sys nr: %d\n",
__FUNCTION__,
packet->req.rtid,
packet->req.ttid,
packet->req.number);
/*
* Three scenarios are possible:
* - Find the designated thread if req->ttid is specified.
* - Find any available thread if req->ttid is zero.
* - Add a request element if no threads are available.
*/
flags = ihk_ikc_spinlock_lock(&ppd->wq_list_lock);
/* Is this a request for a specific thread? See if it's waiting */
if (unlikely(packet->req.ttid)) {
list_for_each_entry(wqhln_iter, &ppd->wq_list_exact, list) {
if (packet->req.ttid != task_pid_vnr(wqhln_iter->task))
continue;
/* Look up per-process wait queue head with pid */
flags = ihk_ikc_spinlock_lock(&c->wq_list_lock);
list_for_each_entry(wqhln_iter, &c->wq_list, list) {
if (wqhln_iter->pid == pid) {
wqhln = wqhln_iter;
break;
}
if (!wqhln) {
printk("%s: WARNING: no target thread found for exact request??\n",
__FUNCTION__);
}
}
/* Is there any thread available? */
else {
list_for_each_entry(wqhln_iter, &ppd->wq_list, list) {
if (wqhln_iter->task && !wqhln_iter->req) {
wqhln = wqhln_iter;
break;
}
}
}
/* If no match found, add request to pending request list */
if (unlikely(!wqhln)) {
retry_alloc:
wqhln_alloc = kmalloc(sizeof(*wqhln), GFP_ATOMIC);
if (!wqhln_alloc) {
printk("WARNING: coudln't alloc wait queue head, retrying..\n");
goto retry_alloc;
}
wqhln = wqhln_alloc;
wqhln->req = 0;
wqhln->task = NULL;
init_waitqueue_head(&wqhln->wq_syscall);
list_add_tail(&wqhln->list, &ppd->wq_req_list);
}
wqhln->packet = packet;
wqhln->req = 1;
wake_up(&wqhln->wq_syscall);
ihk_ikc_spinlock_unlock(&ppd->wq_list_lock, flags);
return 0;
}
/*
* Called from an mcexec thread via ioctl().
*/
int mcexec_wait_syscall(ihk_os_t os, struct syscall_wait_desc *__user req)
{
struct ikc_scd_packet *packet;
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
struct wait_queue_head_list_node *wqhln = NULL;
struct wait_queue_head_list_node *wqhln_iter;
int ret = 0;
unsigned long irqflags;
struct mcctrl_per_proc_data *ppd;
/* Look up per-process structure */
ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(current));
if (unlikely(!ppd)) {
kprintf("%s: ERROR: no per-process structure for PID %d??\n",
__FUNCTION__, task_tgid_vnr(current));
return -EINVAL;
}
packet = (struct ikc_scd_packet *)mcctrl_get_per_thread_data(ppd, current);
if (packet) {
printk("%s: ERROR: packet %p is already registered for thread %d\n",
__FUNCTION__, packet, task_pid_vnr(current));
return -EBUSY;
}
retry:
/* Prepare per-thread wait queue head or find a valid request */
irqflags = ihk_ikc_spinlock_lock(&ppd->wq_list_lock);
/* First see if there is a valid request already that is not yet taken */
list_for_each_entry(wqhln_iter, &ppd->wq_req_list, list) {
if (wqhln_iter->task == NULL && wqhln_iter->req) {
wqhln = wqhln_iter;
wqhln->task = current;
list_del(&wqhln->list);
break;
}
}
if (!wqhln) {
@ -414,180 +617,86 @@ retry_alloc:
goto retry_alloc;
}
wqhln->pid = pid;
wqhln->task = current;
wqhln->req = 0;
init_waitqueue_head(&wqhln->wq_syscall);
list_add_tail(&wqhln->list, &c->wq_list);
/* Wait for a request.. */
list_add(&wqhln->list, &ppd->wq_list);
ihk_ikc_spinlock_unlock(&ppd->wq_list_lock, irqflags);
ret = wait_event_interruptible(wqhln->wq_syscall, wqhln->req);
/* Remove per-thread wait queue head */
irqflags = ihk_ikc_spinlock_lock(&ppd->wq_list_lock);
list_del(&wqhln->list);
}
ihk_ikc_spinlock_unlock(&ppd->wq_list_lock, irqflags);
wqhln->req = 1;
wake_up(&wqhln->wq_syscall);
ihk_ikc_spinlock_unlock(&c->wq_list_lock, flags);
return 0;
}
#ifndef DO_USER_MODE
// static int remaining_job, base_cpu, job_pos;
#endif
// extern int num_channels;
// extern int mcctrl_dma_abort;
int mcexec_wait_syscall(ihk_os_t os, struct syscall_wait_desc *__user req)
{
struct syscall_wait_desc swd;
struct mcctrl_channel *c;
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
struct wait_queue_head_list_node *wqhln;
struct wait_queue_head_list_node *wqhln_iter;
int ret = 0;
unsigned long irqflags;
#ifndef DO_USER_MODE
unsigned long s, w, d;
#endif
//printk("mcexec_wait_syscall swd=%p req=%p size=%d\n", &swd, req, sizeof(swd.cpu));
if (copy_from_user(&swd, req, sizeof(swd))) {
return -EFAULT;
}
if (swd.cpu >= usrdata->num_channels)
return -EINVAL;
c = get_peer_channel(usrdata, current);
if (c) {
printk("mcexec_wait_syscall:already registered. task %p ch %p\n",
current, c);
return -EBUSY;
}
c = usrdata->channels + swd.cpu;
#ifdef DO_USER_MODE
retry:
/* Prepare per-process wait queue head */
retry_alloc:
wqhln = kmalloc(sizeof(*wqhln), GFP_KERNEL);
if (!wqhln) {
printk("WARNING: coudln't alloc wait queue head, retrying..\n");
goto retry_alloc;
}
wqhln->pid = swd.pid;
wqhln->req = 0;
init_waitqueue_head(&wqhln->wq_syscall);
irqflags = ihk_ikc_spinlock_lock(&c->wq_list_lock);
/* First see if there is one wait queue already */
list_for_each_entry(wqhln_iter, &c->wq_list, list) {
if (wqhln_iter->pid == current->tgid) {
kfree(wqhln);
wqhln = wqhln_iter;
list_del(&wqhln->list);
break;
}
}
list_add_tail(&wqhln->list, &c->wq_list);
ihk_ikc_spinlock_unlock(&c->wq_list_lock, irqflags);
ret = wait_event_interruptible(wqhln->wq_syscall, wqhln->req);
/* Remove per-process wait queue head */
irqflags = ihk_ikc_spinlock_lock(&c->wq_list_lock);
list_del(&wqhln->list);
ihk_ikc_spinlock_unlock(&c->wq_list_lock, irqflags);
if (ret && !wqhln->req) {
kfree(wqhln);
wqhln = NULL;
return -EINTR;
}
packet = wqhln->packet;
kfree(wqhln);
wqhln = NULL;
if (c->param.request_va->number == 61 &&
c->param.request_va->args[0] == swd.pid) {
dprintk("%s: tid: %d request from CPU %d\n",
__FUNCTION__, task_pid_vnr(current), packet->ref);
dprintk("pid: %d, tid: %d: SC %d, swd.cpu: %d, WARNING: wait4() for self?\n",
current->tgid,
current->pid,
c->param.request_va->number,
swd.cpu);
return -EINTR;
}
#if 1
mb();
if (!c->param.request_va->valid) {
printk("mcexec_wait_syscall:stray wakeup\n");
if (!packet->req.valid) {
printk("%s: ERROR: stray wakeup pid: %d, tid: %d: SC %lu\n",
__FUNCTION__,
task_tgid_vnr(current),
task_pid_vnr(current),
packet->req.number);
ihk_ikc_release_packet((struct ihk_ikc_free_packet *)packet,
(usrdata->channels + packet->ref)->c);
goto retry;
}
#endif
#else
while (1) {
c = usrdata->channels + swd.cpu;
rdtscll(s);
if (!usrdata->remaining_job) {
while (!(*c->param.doorbell_va)) {
mb();
cpu_relax();
rdtscll(w);
if (w > s + 1024UL * 1024 * 1024 * 10) {
return -EINTR;
}
}
d = (*c->param.doorbell_va) - 1;
*c->param.doorbell_va = 0;
if (d < 0 || d >= usrdata->num_channels) {
d = 0;
}
usrdata->base_cpu = d;
usrdata->job_pos = 0;
usrdata->remaining_job = 1;
} else {
usrdata->job_pos++;
}
for (; usrdata->job_pos < usrdata->num_channels; usrdata->job_pos++) {
if (base_cpu + job_pos >= num_channels) {
c = usrdata->channels +
(usrdata->base_cpu + usrdata->job_pos - usrdata->num_channels);
} else {
c = usrdata->channels + usrdata->base_cpu + usrdata->job_pos;
}
if (!c) {
continue;
}
if (c->param.request_va &&
c->param.request_va->valid) {
#endif
c->param.request_va->valid = 0; /* ack */
dprintk("SC #%lx, %lx\n",
c->param.request_va->number,
c->param.request_va->args[0]);
register_peer_channel(usrdata, current, c);
if (__do_in_kernel_syscall(os, c, c->param.request_va)) {
if (copy_to_user(&req->sr, c->param.request_va,
sizeof(struct syscall_request))) {
deregister_peer_channel(usrdata, current, c);
return -EFAULT;
}
return 0;
}
deregister_peer_channel(usrdata, current, c);
#ifdef DO_USER_MODE
goto retry;
#endif
#ifndef DO_USER_MODE
if (usrdata->mcctrl_dma_abort) {
return -2;
}
}
}
usrdata->remaining_job = 0;
packet->req.valid = 0; /* ack */
dprintk("%s: system call: %d, args[0]: %lu, args[1]: %lu, args[2]: %lu, "
"args[3]: %lu, args[4]: %lu, args[5]: %lu\n",
__FUNCTION__,
packet->req.number,
packet->req.args[0],
packet->req.args[1],
packet->req.args[2],
packet->req.args[3],
packet->req.args[4],
packet->req.args[5]);
if (mcctrl_add_per_thread_data(ppd, current, packet) < 0) {
kprintf("%s: error adding per-thread data\n", __FUNCTION__);
return -EINVAL;
}
#endif
return 0;
if (__do_in_kernel_syscall(os, packet)) {
if (copy_to_user(&req->sr, &packet->req,
sizeof(struct syscall_request))) {
if (mcctrl_delete_per_thread_data(ppd, current) < 0) {
kprintf("%s: error deleting per-thread data\n", __FUNCTION__);
return -EINVAL;
}
return -EFAULT;
}
return 0;
}
ihk_ikc_release_packet((struct ihk_ikc_free_packet *)packet,
(usrdata->channels + packet->ref)->c);
if (mcctrl_delete_per_thread_data(ppd, current) < 0) {
kprintf("%s: error deleting per-thread data\n", __FUNCTION__);
return -EINVAL;
}
goto retry;
}
long mcexec_pin_region(ihk_os_t os, unsigned long *__user arg)
@ -670,33 +779,6 @@ long mcexec_load_syscall(ihk_os_t os, struct syscall_load_desc *__user arg)
#endif
ihk_device_unmap_memory(ihk_os_to_dev(os), phys, desc.size);
/*
ihk_dma_channel_t channel;
struct ihk_dma_request request;
unsigned long dma_status = 0;
channel = ihk_device_get_dma_channel(ihk_os_to_dev(os), 0);
if (!channel) {
return -EINVAL;
}
memset(&request, 0, sizeof(request));
request.src_os = os;
request.src_phys = desc.src;
request.dest_os = NULL;
request.dest_phys = desc.dest;
request.size = desc.size;
request.notify = (void *)virt_to_phys(&dma_status);
request.priv = (void *)1;
ihk_dma_request(channel, &request);
while (!dma_status) {
mb();
udelay(1);
}
*/
return 0;
}
@ -704,80 +786,66 @@ long mcexec_load_syscall(ihk_os_t os, struct syscall_load_desc *__user arg)
long mcexec_ret_syscall(ihk_os_t os, struct syscall_ret_desc *__user arg)
{
struct syscall_ret_desc ret;
struct mcctrl_channel *mc;
struct ikc_scd_packet *packet;
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
#if 0
ihk_dma_channel_t channel;
struct ihk_dma_request request;
channel = ihk_device_get_dma_channel(ihk_os_to_dev(os), 0);
if (!channel) {
return -EINVAL;
}
#endif
struct mcctrl_per_proc_data *ppd;
if (copy_from_user(&ret, arg, sizeof(struct syscall_ret_desc))) {
return -EFAULT;
}
mc = usrdata->channels + ret.cpu;
if (!mc) {
/* Look up per-process structure */
ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(current));
if (!ppd) {
kprintf("%s: ERROR: no per-process structure for PID %d??\n",
__FUNCTION__, task_tgid_vnr(current));
return -EINVAL;
}
deregister_peer_channel(usrdata, current, mc);
mc->param.response_va->ret = ret.ret;
packet = (struct ikc_scd_packet *)mcctrl_get_per_thread_data(ppd, current);
if (!packet) {
kprintf("%s: ERROR: no packet registered for TID %d\n",
__FUNCTION__, task_pid_vnr(current));
return -EINVAL;
}
mcctrl_delete_per_thread_data(ppd, current);
if (ret.size > 0) {
/* Host => Accel. Write is fast. */
unsigned long phys;
void *rpm;
phys = ihk_device_map_memory(ihk_os_to_dev(os), ret.dest,
ret.size);
phys = ihk_device_map_memory(ihk_os_to_dev(os), ret.dest, ret.size);
#ifdef CONFIG_MIC
rpm = ioremap_wc(phys, ret.size);
#else
rpm = ihk_device_map_virtual(ihk_os_to_dev(os), phys,
ret.size, NULL, 0);
#endif
if (copy_from_user(rpm, (void *__user)ret.src, ret.size)) {
return -EFAULT;
}
mb();
mc->param.response_va->status = 1;
#ifdef CONFIG_MIC
iounmap(rpm);
#else
ihk_device_unmap_virtual(ihk_os_to_dev(os), rpm, ret.size);
#endif
ihk_device_unmap_memory(ihk_os_to_dev(os), phys, ret.size);
}
/*
memset(&request, 0, sizeof(request));
request.src_os = NULL;
request.src_phys = ret.src;
request.dest_os = os;
request.dest_phys = ret.dest;
request.size = ret.size;
request.notify_os = os;
request.notify = (void *)mc->param.response_rpa;
request.priv = (void *)1;
ihk_dma_request(channel, &request);
*/
} else {
mb();
mc->param.response_va->status = 1;
}
__return_syscall(os, packet, ret.ret, task_pid_vnr(current));
/* Free packet */
ihk_ikc_release_packet((struct ihk_ikc_free_packet *)packet,
(usrdata->channels + packet->ref)->c);
return 0;
}
LIST_HEAD(mckernel_exec_files);
DEFINE_SPINLOCK(mckernel_exec_file_lock);
DEFINE_SEMAPHORE(mckernel_exec_file_lock);
struct mckernel_exec_file {
@ -834,11 +902,67 @@ int mcexec_open_exec(ihk_os_t os, char * __user filename)
struct mckernel_exec_file *mcef;
struct mckernel_exec_file *mcef_iter;
int retval;
int os_ind = ihk_host_os_get_index(os);
char *pathbuf, *fullpath;
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
struct mcctrl_per_proc_data *ppd = NULL;
int i;
if (os_ind < 0) {
return EINVAL;
}
ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(current));
if (!ppd) {
ppd = kmalloc(sizeof(*ppd), GFP_KERNEL);
if (!ppd) {
printk("ERROR: allocating per process data\n");
return -ENOMEM;
}
ppd->pid = task_tgid_vnr(current);
/*
* XXX: rpgtable will be updated in __do_in_kernel_syscall()
* under case __NR_munmap
*/
INIT_LIST_HEAD(&ppd->wq_list);
INIT_LIST_HEAD(&ppd->wq_req_list);
INIT_LIST_HEAD(&ppd->wq_list_exact);
spin_lock_init(&ppd->wq_list_lock);
for (i = 0; i < MCCTRL_PER_THREAD_DATA_HASH_SIZE; ++i) {
INIT_LIST_HEAD(&ppd->per_thread_data_hash[i]);
rwlock_init(&ppd->per_thread_data_hash_lock[i]);
}
if (mcctrl_add_per_proc_data(usrdata, ppd->pid, ppd) < 0) {
printk("%s: error adding per process data\n", __FUNCTION__);
retval = EINVAL;
goto out_free_ppd;
}
}
else {
/* Only deallocate in case of an error if we added it above */
ppd = NULL;
}
pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY);
if (!pathbuf) {
retval = ENOMEM;
goto out_error_drop_ppd;
}
file = open_exec(filename);
retval = PTR_ERR(file);
if (IS_ERR(file)) {
goto out_return;
goto out_error_free;
}
fullpath = d_path(&file->f_path, pathbuf, PATH_MAX);
if (IS_ERR(fullpath)) {
retval = PTR_ERR(fullpath);
goto out_error_free;
}
mcef = kmalloc(sizeof(*mcef), GFP_KERNEL);
@ -847,33 +971,43 @@ int mcexec_open_exec(ihk_os_t os, char * __user filename)
goto out_put_file;
}
spin_lock_irq(&mckernel_exec_file_lock);
down(&mckernel_exec_file_lock);
/* Find previous file (if exists) and drop it */
list_for_each_entry(mcef_iter, &mckernel_exec_files, list) {
if (mcef_iter->os == os && mcef_iter->pid == current->tgid) {
if (mcef_iter->os == os && mcef_iter->pid == task_tgid_vnr(current)) {
allow_write_access(mcef_iter->fp);
fput(mcef_iter->fp);
list_del(&mcef_iter->list);
kfree(mcef_iter);
dprintk("%d open_exec dropped previous executable \n", (int)current->tgid);
break;
}
}
/* Add new exec file to the list */
mcef->os = os;
mcef->pid = current->tgid;
mcef->pid = task_tgid_vnr(current);
mcef->fp = file;
list_add_tail(&mcef->list, &mckernel_exec_files);
spin_unlock(&mckernel_exec_file_lock);
dprintk("%d open_exec and holding file: %s\n", (int)current->tgid, filename);
/* Create /proc/self/exe entry */
add_pid_entry(os_ind, task_tgid_vnr(current));
proc_exe_link(os_ind, task_tgid_vnr(current), fullpath);
up(&mckernel_exec_file_lock);
dprintk("%d open_exec and holding file: %s\n", (int)task_tgid_vnr(current), filename);
kfree(pathbuf);
return 0;
out_put_file:
fput(file);
out_return:
out_error_free:
kfree(pathbuf);
out_error_drop_ppd:
if (ppd) mcctrl_delete_per_proc_data(usrdata, ppd->pid);
out_free_ppd:
if (ppd) kfree(ppd);
return -retval;
}
@ -882,20 +1016,43 @@ int mcexec_close_exec(ihk_os_t os)
{
struct mckernel_exec_file *mcef = NULL;
int found = 0;
int os_ind = ihk_host_os_get_index(os);
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
struct mcctrl_per_proc_data *ppd = NULL;
ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(current));
if (ppd) {
mcctrl_delete_per_proc_data(usrdata, ppd->pid);
dprintk("pid: %d, tid: %d: rpgtable for %d (0x%lx) removed\n",
task_tgid_vnr(current), current->pid, ppd->pid, ppd->rpgtable);
kfree(ppd);
}
else {
printk("WARNING: no per process data for pid %d ?\n",
task_tgid_vnr(current));
}
if (os_ind < 0) {
return EINVAL;
}
spin_lock_irq(&mckernel_exec_file_lock);
down(&mckernel_exec_file_lock);
list_for_each_entry(mcef, &mckernel_exec_files, list) {
if (mcef->os == os && mcef->pid == current->tgid) {
if (mcef->os == os && mcef->pid == task_tgid_vnr(current)) {
allow_write_access(mcef->fp);
fput(mcef->fp);
list_del(&mcef->list);
kfree(mcef);
found = 1;
dprintk("%d close_exec dropped executable \n", (int)current->tgid);
dprintk("%d close_exec dropped executable \n", (int)task_tgid_vnr(current));
break;
}
}
spin_unlock(&mckernel_exec_file_lock);
up(&mckernel_exec_file_lock);
return (found ? 0 : EINVAL);
}
@ -952,6 +1109,67 @@ long mcexec_strncpy_from_user(ihk_os_t os, struct strncpy_from_user_desc * __use
return 0;
}
long mcexec_sys_mount(struct sys_mount_desc *__user arg)
{
struct sys_mount_desc desc;
struct cred *promoted;
const struct cred *original;
int ret;
if (copy_from_user(&desc, arg, sizeof(desc))) {
return -EFAULT;
}
promoted = prepare_creds();
if (!promoted) {
return -ENOMEM;
}
cap_raise(promoted->cap_effective, CAP_SYS_ADMIN);
original = override_creds(promoted);
#if MCCTRL_KSYM_sys_mount
ret = mcctrl_sys_mount(desc.dev_name, desc.dir_name, desc.type,
desc.flags, desc.data);
#else
ret = -EFAULT;
#endif
revert_creds(original);
put_cred(promoted);
return ret;
}
long mcexec_sys_unshare(struct sys_unshare_desc *__user arg)
{
struct sys_unshare_desc desc;
struct cred *promoted;
const struct cred *original;
int ret;
if (copy_from_user(&desc, arg, sizeof(desc))) {
return -EFAULT;
}
promoted = prepare_creds();
if (!promoted) {
return -ENOMEM;
}
cap_raise(promoted->cap_effective, CAP_SYS_ADMIN);
original = override_creds(promoted);
#if MCCTRL_KSYM_sys_unshare
ret = mcctrl_sys_unshare(desc.unshare_flags);
#else
ret = -EFAULT;
#endif
revert_creds(original);
put_cred(promoted);
return ret;
}
long __mcctrl_control(ihk_os_t os, unsigned int req, unsigned long arg,
struct file *file)
{
@ -1006,6 +1224,12 @@ long __mcctrl_control(ihk_os_t os, unsigned int req, unsigned long arg,
case MCEXEC_UP_GET_CREDV:
return mcexec_getcredv((int *)arg);
case MCEXEC_UP_SYS_MOUNT:
return mcexec_sys_mount((struct sys_mount_desc *)arg);
case MCEXEC_UP_SYS_UNSHARE:
return mcexec_sys_unshare((struct sys_unshare_desc *)arg);
case MCEXEC_UP_DEBUG_LOG:
return mcexec_debug_log(os, arg);
}

View File

@ -25,6 +25,7 @@
#include <linux/fs.h>
#include <linux/miscdevice.h>
#include <linux/slab.h>
#include <linux/device.h>
#include "mcctrl.h"
#define OS_MAX_MINOR 64
@ -67,6 +68,8 @@ static struct ihk_os_user_call_handler mcctrl_uchs[] = {
{ .request = MCEXEC_UP_CLOSE_EXEC, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_GET_CRED, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_GET_CREDV, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_SYS_MOUNT, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_SYS_UNSHARE, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_DEBUG_LOG, .func = mcctrl_ioctl },
};
@ -79,71 +82,109 @@ static struct ihk_os_user_call mcctrl_uc[OS_MAX_MINOR];
static ihk_os_t os[OS_MAX_MINOR];
static int __init mcctrl_init(void)
ihk_os_t osnum_to_os(int n)
{
return os[n];
}
/* OS event notifier implementation */
int mcctrl_os_boot_notifier(int os_index)
{
int i;
int rc;
rc = -ENOENT;
for(i = 0; i < OS_MAX_MINOR; i++){
os[i] = ihk_host_find_os(i, NULL);
if (os[i]) {
printk("OS #%d found.\n", i);
rc = 0;
}
}
if(rc){
printk("OS not found.\n");
return rc;
os[os_index] = ihk_host_find_os(os_index, NULL);
if (!os[os_index]) {
printk("mcctrl: error: OS ID %d couldn't be found\n", os_index);
return -EINVAL;
}
for(i = 0; i < OS_MAX_MINOR; i++){
if (os[i]) {
if (prepare_ikc_channels(os[i]) != 0) {
printk("Preparing syscall channels failed.\n");
os[i] = NULL;
}
}
if (prepare_ikc_channels(os[os_index]) != 0) {
printk("mcctrl: error: preparing IKC channels for OS %d\n", os_index);
os[os_index] = NULL;
return -EFAULT;
}
memcpy(mcctrl_uc + os_index, &mcctrl_uc_proto, sizeof mcctrl_uc_proto);
rc = ihk_os_register_user_call_handlers(os[os_index], mcctrl_uc + os_index);
if (rc < 0) {
destroy_ikc_channels(os[os_index]);
printk("mcctrl: error: registering callbacks for OS %d\n", os_index);
goto error_cleanup_channels;
}
procfs_init(os_index);
printk("mcctrl: OS ID %d boot event handled\n", os_index);
return 0;
error_cleanup_channels:
destroy_ikc_channels(os[os_index]);
os[os_index] = NULL;
return rc;
}
int mcctrl_os_shutdown_notifier(int os_index)
{
sysfsm_cleanup(os[os_index]);
free_topology_info(os[os_index]);
ihk_os_unregister_user_call_handlers(os[os_index], mcctrl_uc + os_index);
destroy_ikc_channels(os[os_index]);
procfs_exit(os_index);
printk("mcctrl: OS ID %d shutdown event handled\n", os_index);
return 0;
}
static struct ihk_os_notifier_ops mcctrl_os_notifier_ops = {
.boot = mcctrl_os_boot_notifier,
.shutdown = mcctrl_os_shutdown_notifier,
};
static struct ihk_os_notifier mcctrl_os_notifier = {
.ops = &mcctrl_os_notifier_ops,
};
static int __init mcctrl_init(void)
{
int ret = 0;
#ifndef DO_USER_MODE
mcctrl_syscall_init();
#endif
rus_page_hash_init();
for(i = 0; i < OS_MAX_MINOR; i++){
if (os[i]) {
memcpy(mcctrl_uc + i, &mcctrl_uc_proto, sizeof mcctrl_uc_proto);
rc = ihk_os_register_user_call_handlers(os[i], mcctrl_uc + i);
if(rc < 0){
destroy_ikc_channels(os[i]);
os[i] = NULL;
}
procfs_init(i);
}
}
binfmt_mcexec_init();
return 0;
if ((ret = ihk_host_register_os_notifier(&mcctrl_os_notifier)) != 0) {
printk("mcctrl: error: registering OS notifier\n");
goto error;
}
printk("mcctrl: initialized successfully.\n");
return ret;
error:
binfmt_mcexec_exit();
rus_page_hash_put_pages();
return ret;
}
static void __exit mcctrl_exit(void)
{
int i;
binfmt_mcexec_exit();
printk("mcctrl: unregistered.\n");
for(i = 0; i < OS_MAX_MINOR; i++){
if(os[i]){
ihk_os_unregister_user_call_handlers(os[i], mcctrl_uc + i);
destroy_ikc_channels(os[i]);
procfs_exit(i);
}
if (ihk_host_deregister_os_notifier(&mcctrl_os_notifier) != 0) {
printk("mcctrl: warning: failed to deregister OS notifier??\n");
}
binfmt_mcexec_exit();
rus_page_hash_put_pages();
printk("mcctrl: unregistered.\n");
}
MODULE_LICENSE("GPL v2");

View File

@ -27,6 +27,7 @@
#include <linux/miscdevice.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/interrupt.h>
#include "mcctrl.h"
#ifdef ATTACHED_MIC
#include <sysdeps/mic/mic/micconst.h>
@ -40,19 +41,18 @@
void mcexec_prepare_ack(ihk_os_t os, unsigned long arg, int err);
static void mcctrl_ikc_init(ihk_os_t os, int cpu, unsigned long rphys, struct ihk_ikc_channel_desc *c);
int mcexec_syscall(struct mcctrl_channel *c, int pid, unsigned long arg);
void procfs_create(void *__os, int ref, int osnum, int pid, unsigned long arg);
void procfs_delete(void *__os, int osnum, unsigned long arg);
void procfs_answer(unsigned long arg, int err);
int mcexec_syscall(struct mcctrl_usrdata *ud, struct ikc_scd_packet *packet);
void sig_done(unsigned long arg, int err);
/* XXX: this runs in atomic context! */
static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
void *__packet, void *__os)
{
struct ikc_scd_packet *pisp = __packet;
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(__os);
int msg = pisp->msg;
switch (pisp->msg) {
switch (msg) {
case SCD_MSG_INIT_CHANNEL:
mcctrl_ikc_init(__os, pisp->ref, pisp->arg, c);
break;
@ -66,15 +66,7 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
break;
case SCD_MSG_SYSCALL_ONESIDE:
mcexec_syscall(usrdata->channels + pisp->ref, pisp->pid, pisp->arg);
break;
case SCD_MSG_PROCFS_CREATE:
procfs_create(__os, pisp->ref, pisp->osnum, pisp->pid, pisp->arg);
break;
case SCD_MSG_PROCFS_DELETE:
procfs_delete(__os, pisp->osnum, pisp->arg);
mcexec_syscall(usrdata, pisp);
break;
case SCD_MSG_PROCFS_ANSWER:
@ -84,6 +76,47 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
case SCD_MSG_SEND_SIGNAL:
sig_done(pisp->arg, pisp->err);
break;
case SCD_MSG_SYSFS_REQ_CREATE:
case SCD_MSG_SYSFS_REQ_MKDIR:
case SCD_MSG_SYSFS_REQ_SYMLINK:
case SCD_MSG_SYSFS_REQ_LOOKUP:
case SCD_MSG_SYSFS_REQ_UNLINK:
case SCD_MSG_SYSFS_REQ_SETUP:
case SCD_MSG_SYSFS_RESP_SHOW:
case SCD_MSG_SYSFS_RESP_STORE:
case SCD_MSG_SYSFS_RESP_RELEASE:
sysfsm_packet_handler(__os, pisp->msg, pisp->err,
pisp->sysfs_arg1, pisp->sysfs_arg2);
break;
case SCD_MSG_PROCFS_TID_CREATE:
case SCD_MSG_PROCFS_TID_DELETE:
procfsm_packet_handler(__os, pisp->msg, pisp->pid, pisp->arg);
break;
case SCD_MSG_GET_VDSO_INFO:
get_vdso_info(__os, pisp->arg);
break;
case SCD_MSG_REPLY_GET_CPU_MAPPING:
reply_get_cpu_mapping(pisp->arg);
break;
default:
printk(KERN_ERR "mcctrl:syscall_packet_handler:"
"unknown message (%d.%d.%d.%d.%d.%#lx)\n",
pisp->msg, pisp->ref, pisp->osnum, pisp->pid,
pisp->err, pisp->arg);
break;
}
/*
* SCD_MSG_SYSCALL_ONESIDE holds the packet and frees is it
* mcexec_ret_syscall(), for the rest, free it here.
*/
if (msg != SCD_MSG_SYSCALL_ONESIDE) {
ihk_ikc_release_packet((struct ihk_ikc_free_packet *)__packet, c);
}
return 0;
}
@ -121,8 +154,6 @@ int mcctrl_ikc_set_recv_cpu(ihk_os_t os, int cpu)
ihk_ikc_channel_set_cpu(usrdata->channels[cpu].c,
ihk_ikc_get_processor_id());
kprintf("Setting the target to %d\n",
ihk_ikc_get_processor_id());
return 0;
}
@ -168,12 +199,13 @@ static void mcctrl_ikc_init(ihk_os_t os, int cpu, unsigned long rphys, struct ih
#endif
pmc->param.request_va =
(void *)__get_free_pages(GFP_KERNEL,
(void *)__get_free_pages(in_interrupt() ? GFP_ATOMIC : GFP_KERNEL,
REQUEST_SHIFT - PAGE_SHIFT);
pmc->param.request_pa = virt_to_phys(pmc->param.request_va);
pmc->param.doorbell_va = usrdata->mcctrl_doorbell_va;
pmc->param.doorbell_pa = usrdata->mcctrl_doorbell_pa;
pmc->param.post_va = (void *)__get_free_page(GFP_KERNEL);
pmc->param.post_va = (void *)__get_free_page(in_interrupt() ?
GFP_ATOMIC : GFP_KERNEL);
pmc->param.post_pa = virt_to_phys(pmc->param.post_va);
memset(pmc->param.doorbell_va, 0, PAGE_SIZE);
memset(pmc->param.request_va, 0, PAGE_SIZE);
@ -193,8 +225,9 @@ static void mcctrl_ikc_init(ihk_os_t os, int cpu, unsigned long rphys, struct ih
PAGE_SIZE, NULL, 0);
#endif
pmc->dma_buf = (void *)__get_free_pages(GFP_KERNEL,
DMA_PIN_SHIFT - PAGE_SHIFT);
pmc->dma_buf = (void *)__get_free_pages(in_interrupt() ?
GFP_ATOMIC : GFP_KERNEL,
DMA_PIN_SHIFT - PAGE_SHIFT);
rpm->request_page = pmc->param.request_pa;
rpm->doorbell_page = pmc->param.doorbell_pa;
@ -240,9 +273,6 @@ static int connect_handler(struct ihk_ikc_channel_info *param)
}
param->packet_handler = syscall_packet_handler;
INIT_LIST_HEAD(&usrdata->channels[cpu].wq_list);
spin_lock_init(&usrdata->channels[cpu].wq_list_lock);
usrdata->channels[cpu].c = c;
kprintf("syscall: MC CPU %d connected. c=%p\n", cpu, c);
@ -261,9 +291,6 @@ static int connect_handler2(struct ihk_ikc_channel_info *param)
param->packet_handler = syscall_packet_handler;
INIT_LIST_HEAD(&usrdata->channels[cpu].wq_list);
spin_lock_init(&usrdata->channels[cpu].wq_list_lock);
usrdata->channels[cpu].c = c;
kprintf("syscall: MC CPU %d connected. c=%p\n", cpu, c);
@ -290,7 +317,7 @@ int prepare_ikc_channels(ihk_os_t os)
{
struct ihk_cpu_info *info;
struct mcctrl_usrdata *usrdata;
int error;
int i;
usrdata = kzalloc(sizeof(struct mcctrl_usrdata), GFP_KERNEL);
usrdata->mcctrl_doorbell_va = (void *)__get_free_page(GFP_KERNEL);
@ -322,14 +349,14 @@ int prepare_ikc_channels(ihk_os_t os)
memcpy(&usrdata->listen_param2, &listen_param2, sizeof listen_param2);
ihk_ikc_listen_port(os, &usrdata->listen_param2);
INIT_LIST_HEAD(&usrdata->per_proc_list);
spin_lock_init(&usrdata->per_proc_list_lock);
error = init_peer_channel_registry(usrdata);
if (error) {
return error;
for (i = 0; i < MCCTRL_PER_PROC_DATA_HASH_SIZE; ++i) {
INIT_LIST_HEAD(&usrdata->per_proc_data_hash[i]);
rwlock_init(&usrdata->per_proc_data_hash_lock[i]);
}
INIT_LIST_HEAD(&usrdata->cpu_topology_list);
INIT_LIST_HEAD(&usrdata->node_topology_list);
return 0;
}

View File

@ -0,0 +1,393 @@
/**
* \file mcctrl.h
* License details are found in the file LICENSE.
* \brief
* define data structure
* \author Taku Shimosawa <shimosawa@is.s.u-tokyo.ac.jp> \par
* Copyright (C) 2011 - 2012 Taku Shimosawa
* \author Balazs Gerofi <bgerofi@riken.jp> \par
* Copyright (C) 2012 RIKEN AICS
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com> \par
* Copyright (C) 2012 - 2013 Hitachi, Ltd.
* \author Tomoki Shirasawa <tomoki.shirasawa.kk@hitachi-solutions.com> \par
* Copyright (C) 2012 - 2013 Hitachi, Ltd.
* \author Balazs Gerofi <bgerofi@is.s.u-tokyo.ac.jp> \par
* Copyright (C) 2013 The University of Tokyo
*/
/*
* HISTORY:
* 2013/11/07 hamada added <sys/resource.h> which is required by getrlimit(2)
* 2013/10/21 nakamura exclude interpreter's segment from data region
* 2013/10/11 nakamura mcexec: add a upper limit of the stack size
* 2013/10/11 nakamura mcexec: add a path prefix for interpreter search
* 2013/10/11 nakamura mcexec: add a interpreter invocation
* 2013/10/08 nakamura add a AT_ENTRY entry to the auxiliary vector
* 2013/09/02 shirasawa add terminate thread
* 2013/08/19 shirasawa mcexec forward signal to MIC process
* 2013/08/07 nakamura add page fault forwarding
* 2013/07/26 shirasawa mcexec print signum or exit status
* 2013/07/17 nakamura create more mcexec thread so that all cpu to be serviced
* 2013/04/17 nakamura add generic system call forwarding
*/
#ifndef HEADER_MCCTRL_H
#define HEADER_MCCTRL_H
#include <linux/fs.h>
#include <ihk/ihk_host_driver.h>
#include <linux/resource.h>
#include <uprotocol.h>
#include <linux/wait.h>
#include <ihk/ikc.h>
#include <ikc/master.h>
#include <ihk/msr.h>
#include <linux/semaphore.h>
#include <linux/rwlock.h>
#include <linux/threads.h>
#include "sysfs.h"
#define SCD_MSG_PREPARE_PROCESS 0x1
#define SCD_MSG_PREPARE_PROCESS_ACKED 0x2
#define SCD_MSG_PREPARE_PROCESS_NACKED 0x7
#define SCD_MSG_SCHEDULE_PROCESS 0x3
#define SCD_MSG_WAKE_UP_SYSCALL_THREAD 0x14
#define SCD_MSG_INIT_CHANNEL 0x5
#define SCD_MSG_INIT_CHANNEL_ACKED 0x6
#define SCD_MSG_SYSCALL_ONESIDE 0x4
#define SCD_MSG_SEND_SIGNAL 0x8
#define SCD_MSG_CLEANUP_PROCESS 0x9
#define SCD_MSG_GET_VDSO_INFO 0xa
#define SCD_MSG_GET_CPU_MAPPING 0xc
#define SCD_MSG_REPLY_GET_CPU_MAPPING 0xd
#define SCD_MSG_PROCFS_CREATE 0x10
#define SCD_MSG_PROCFS_DELETE 0x11
#define SCD_MSG_PROCFS_REQUEST 0x12
#define SCD_MSG_PROCFS_ANSWER 0x13
#define SCD_MSG_DEBUG_LOG 0x20
#define SCD_MSG_SYSFS_REQ_CREATE 0x30
/* #define SCD_MSG_SYSFS_RESP_CREATE 0x31 */
#define SCD_MSG_SYSFS_REQ_MKDIR 0x32
/* #define SCD_MSG_SYSFS_RESP_MKDIR 0x33 */
#define SCD_MSG_SYSFS_REQ_SYMLINK 0x34
/* #define SCD_MSG_SYSFS_RESP_SYMLINK 0x35 */
#define SCD_MSG_SYSFS_REQ_LOOKUP 0x36
/* #define SCD_MSG_SYSFS_RESP_LOOKUP 0x37 */
#define SCD_MSG_SYSFS_REQ_UNLINK 0x38
/* #define SCD_MSG_SYSFS_RESP_UNLINK 0x39 */
#define SCD_MSG_SYSFS_REQ_SHOW 0x3a
#define SCD_MSG_SYSFS_RESP_SHOW 0x3b
#define SCD_MSG_SYSFS_REQ_STORE 0x3c
#define SCD_MSG_SYSFS_RESP_STORE 0x3d
#define SCD_MSG_SYSFS_REQ_RELEASE 0x3e
#define SCD_MSG_SYSFS_RESP_RELEASE 0x3f
#define SCD_MSG_SYSFS_REQ_SETUP 0x40
#define SCD_MSG_SYSFS_RESP_SETUP 0x41
/* #define SCD_MSG_SYSFS_REQ_CLEANUP 0x42 */
/* #define SCD_MSG_SYSFS_RESP_CLEANUP 0x43 */
#define SCD_MSG_PROCFS_TID_CREATE 0x44
#define SCD_MSG_PROCFS_TID_DELETE 0x45
#define DMA_PIN_SHIFT 21
#define DO_USER_MODE
#define __NR_coredump 999
struct coretable {
int len;
unsigned long addr;
};
struct ikc_scd_packet {
int msg;
int err;
union {
/* for traditional SCD_MSG_* */
struct {
int ref;
int osnum;
int pid;
unsigned long arg;
struct syscall_request req;
unsigned long resp_pa;
};
/* for SCD_MSG_SYSFS_* */
struct {
long sysfs_arg1;
long sysfs_arg2;
long sysfs_arg3;
};
/* SCD_MSG_SCHEDULE_THREAD */
struct {
int ttid;
};
};
char padding[12];
};
struct mcctrl_priv {
ihk_os_t os;
struct program_load_desc *desc;
};
struct ikc_scd_init_param {
unsigned long request_page;
unsigned long response_page;
unsigned long doorbell_page;
unsigned long post_page;
};
struct syscall_post {
unsigned long v[8];
};
struct syscall_params {
unsigned long request_pa;
struct syscall_request *request_va;
unsigned long response_rpa, response_pa;
struct syscall_response *response_va;
unsigned long post_pa;
struct syscall_post *post_va;
unsigned long doorbell_pa;
unsigned long *doorbell_va;
};
struct wait_queue_head_list_node {
struct list_head list;
wait_queue_head_t wq_syscall;
struct task_struct *task;
/* Denotes an exclusive wait for requester TID rtid */
int rtid;
int req;
struct ikc_scd_packet *packet;
};
struct mcctrl_channel {
struct ihk_ikc_channel_desc *c;
struct syscall_params param;
struct ikc_scd_init_param init;
void *dma_buf;
};
struct mcctrl_per_thread_data {
struct list_head hash;
struct task_struct *task;
void *data;
};
#define MCCTRL_PER_THREAD_DATA_HASH_SHIFT 8
#define MCCTRL_PER_THREAD_DATA_HASH_SIZE (1 << MCCTRL_PER_THREAD_DATA_HASH_SHIFT)
#define MCCTRL_PER_THREAD_DATA_HASH_MASK (MCCTRL_PER_THREAD_DATA_HASH_SIZE - 1)
struct mcctrl_per_proc_data {
struct list_head hash;
int pid;
unsigned long rpgtable; /* per process, not per OS */
struct list_head wq_list;
struct list_head wq_req_list;
struct list_head wq_list_exact;
ihk_spinlock_t wq_list_lock;
struct list_head per_thread_data_hash[MCCTRL_PER_THREAD_DATA_HASH_SIZE];
rwlock_t per_thread_data_hash_lock[MCCTRL_PER_THREAD_DATA_HASH_SIZE];
};
struct sysfsm_req {
int busy;
int padding;
long lresult;
wait_queue_head_t wq;
};
struct sysfsm_data {
size_t sysfs_bufsize;
void *sysfs_buf;
long sysfs_buf_rpa;
long sysfs_buf_pa;
struct kobject *sysfs_kobj;
struct sysfsm_node *sysfs_root;
struct semaphore sysfs_tree_sem;
struct semaphore sysfs_io_sem;
struct sysfsm_req sysfs_req;
ihk_os_t sysfs_os;
};
static inline int sysfs_inited(struct sysfsm_data *sdp)
{
return !!(sdp->sysfs_buf);
} /* sysfs_inited() */
struct cpu_mapping {
int cpu_number;
int hw_id;
};
struct cache_topology {
struct ihk_cache_topology *saved;
cpumask_t shared_cpu_map;
struct list_head chain;
};
struct cpu_topology {
struct cpu_mapping *cpu_mapping;
struct ihk_cpu_topology *saved;
cpumask_t core_siblings;
cpumask_t thread_siblings;
struct list_head chain;
struct list_head cache_list;
};
struct node_topology {
struct ihk_node_topology *saved;
cpumask_t cpumap;
struct list_head chain;
};
#define CPU_LONGS (((NR_CPUS) + (BITS_PER_LONG) - 1) / (BITS_PER_LONG))
#define MCCTRL_PER_PROC_DATA_HASH_SHIFT 7
#define MCCTRL_PER_PROC_DATA_HASH_SIZE (1 << MCCTRL_PER_PROC_DATA_HASH_SHIFT)
#define MCCTRL_PER_PROC_DATA_HASH_MASK (MCCTRL_PER_PROC_DATA_HASH_SIZE - 1)
struct mcctrl_usrdata {
struct ihk_ikc_listen_param listen_param;
struct ihk_ikc_listen_param listen_param2;
ihk_os_t os;
int num_channels;
struct mcctrl_channel *channels;
unsigned long *mcctrl_doorbell_va;
unsigned long mcctrl_doorbell_pa;
int remaining_job;
int base_cpu;
int job_pos;
int mcctrl_dma_abort;
unsigned long last_thread_exec;
wait_queue_head_t wq_prepare;
struct list_head per_proc_data_hash[MCCTRL_PER_PROC_DATA_HASH_SIZE];
rwlock_t per_proc_data_hash_lock[MCCTRL_PER_PROC_DATA_HASH_SIZE];
void **keys;
struct sysfsm_data sysfsm_data;
unsigned long cpu_online[CPU_LONGS];
int cpu_mapping_elems;
int padding;
struct cpu_mapping *cpu_mapping;
long cpu_mapping_pa;
struct list_head cpu_topology_list;
struct list_head node_topology_list;
};
struct mcctrl_signal {
int cond;
int sig;
int pid;
int tid;
char info[128];
};
int mcctrl_ikc_send(ihk_os_t os, int cpu, struct ikc_scd_packet *pisp);
int mcctrl_ikc_send_msg(ihk_os_t os, int cpu, int msg, int ref, unsigned long arg);
int mcctrl_ikc_is_valid_thread(ihk_os_t os, int cpu);
ihk_os_t osnum_to_os(int n);
/* syscall.c */
int __do_in_kernel_syscall(ihk_os_t os, struct ikc_scd_packet *packet);
int mcctrl_add_per_proc_data(struct mcctrl_usrdata *ud, int pid,
struct mcctrl_per_proc_data *ppd);
int mcctrl_delete_per_proc_data(struct mcctrl_usrdata *ud, int pid);
inline struct mcctrl_per_proc_data *mcctrl_get_per_proc_data(
struct mcctrl_usrdata *ud, int pid);
int mcctrl_add_per_thread_data(struct mcctrl_per_proc_data* ppd,
struct task_struct *task, void *data);
int mcctrl_delete_per_thread_data(struct mcctrl_per_proc_data* ppd,
struct task_struct *task);
inline struct mcctrl_per_thread_data *mcctrl_get_per_thread_data(
struct mcctrl_per_proc_data *ppd, struct task_struct *task);
void __return_syscall(ihk_os_t os, struct ikc_scd_packet *packet,
long ret, int stid);
#define PROCFS_NAME_MAX 1000
struct procfs_read {
unsigned long pbuf; /* physical address of the host buffer (request) */
unsigned long offset; /* offset to read (request) */
int count; /* bytes to read (request) */
int eof; /* if eof is detected, 1 otherwise 0. (answer)*/
int ret; /* read bytes (answer) */
int status; /* non-zero if done (answer) */
int newcpu; /* migrated new cpu (answer) */
int readwrite; /* 0:read, 1:write */
char fname[PROCFS_NAME_MAX]; /* procfs filename (request) */
};
struct procfs_file {
int status; /* status of processing (answer) */
int mode; /* file mode (request) */
char fname[PROCFS_NAME_MAX]; /* procfs filename (request) */
};
void procfs_answer(unsigned int arg, int err);
int procfsm_packet_handler(void *os, int msg, int pid, unsigned long arg);
void add_tid_entry(int osnum, int pid, int tid);
void add_pid_entry(int osnum, int pid);
void delete_tid_entry(int osnum, int pid, int tid);
void delete_pid_entry(int osnum, int pid);
void proc_exe_link(int osnum, int pid, const char *path);
void procfs_init(int osnum);
void procfs_exit(int osnum);
/* sysfs_files.c */
void setup_sysfs_files(ihk_os_t os);
void reply_get_cpu_mapping(long req_pa);
void free_topology_info(ihk_os_t os);
/* archdep.c */
#define VDSO_MAXPAGES 2
struct vdso {
long busy;
int vdso_npages;
char vvar_is_global;
char hpet_is_global;
char pvti_is_global;
char padding;
long vdso_physlist[VDSO_MAXPAGES];
void *vvar_virt;
long vvar_phys;
void *hpet_virt;
long hpet_phys;
void *pvti_virt;
long pvti_phys;
};
int reserve_user_space(struct mcctrl_usrdata *usrdata, unsigned long *startp,
unsigned long *endp);
void get_vdso_info(ihk_os_t os, long vdso_pa);
struct get_cpu_mapping_req {
int busy; /* INOUT: */
int error; /* OUT: */
long buf_rpa; /* OUT: physical address of struct cpu_mapping */
int buf_elems; /* OUT: # of elements of buf */
int padding;
/* work for mcctrl */
wait_queue_head_t wq;
};
#endif

View File

@ -0,0 +1,837 @@
/**
* \file procfs.c
* License details are found in the file LICENSE.
* \brief
* mcctrl procfs
* \author Naoki Hamada <nao@axe.bz> \par
* Copyright (C) 2014 AXE, Inc.
*/
/*
* HISTORY:
*/
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/proc_fs.h>
#include <linux/list.h>
#include <linux/uaccess.h>
#include <linux/fs.h>
#include <linux/resource.h>
#include <linux/interrupt.h>
#include "mcctrl.h"
#include <linux/version.h>
#include <linux/semaphore.h>
//#define PROCFS_DEBUG
#ifdef PROCFS_DEBUG
#define dprintk(...) printk(__VA_ARGS__)
#else
#define dprintk(...)
#endif
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,5,0)
typedef uid_t kuid_t;
typedef gid_t kgid_t;
#endif
struct procfs_entry {
char *name;
mode_t mode;
const struct file_operations *fops;
};
#define NOD(NAME, MODE, FOP) { \
.name = (NAME), \
.mode = MODE, \
.fops = FOP, \
}
#define PROC_DIR(NAME, MODE) \
NOD(NAME, (S_IFDIR|(MODE)), NULL)
#define PROC_REG(NAME, MODE, fops) \
NOD(NAME, (S_IFREG|(MODE)), fops)
#define PROC_TERM \
NOD(NULL, 0, NULL)
static const struct procfs_entry tid_entry_stuff[];
static const struct procfs_entry pid_entry_stuff[];
static const struct procfs_entry base_entry_stuff[];
static const struct file_operations mckernel_forward_ro;
static const struct file_operations mckernel_forward;
static DECLARE_WAIT_QUEUE_HEAD(procfsq);
static ssize_t mckernel_procfs_read(struct file *file, char __user *buf,
size_t nbytes, loff_t *ppos);
/* A private data for the procfs driver. */
struct procfs_list_entry;
struct procfs_list_entry {
struct list_head list;
struct proc_dir_entry *entry;
struct procfs_list_entry *parent;
struct list_head children;
int osnum;
char *data;
char name[0];
};
/*
* In the procfs_file_list, mckenrel procfs files are
* listed in the manner that the leaf file is located
* always nearer to the list top than its parent node
* file.
*/
LIST_HEAD(procfs_file_list);
DEFINE_SEMAPHORE(procfs_file_list_lock);
static char *
getpath(struct procfs_list_entry *e, char *buf, int bufsize)
{
char *w = buf + bufsize - 1;
*w = '\0';
for(;;){
int l = strlen(e->name);
w -= l;
memcpy(w, e->name, l);
e = e->parent;
if(!e)
return w;
w--;
*w = '/';
}
}
/**
* \brief Process SCD_MSG_PROCFS_ANSWER message.
*
* \param arg sent argument
* \param err error info (redundant)
*/
void
procfs_answer(unsigned int arg, int err)
{
dprintk("procfs: received SCD_MSG_PROCFS_ANSWER message(err = %d).\n", err);
wake_up_interruptible(&procfsq);
}
static struct procfs_list_entry *
find_procfs_entry(struct procfs_list_entry *parent, const char *name)
{
struct list_head *list;
struct procfs_list_entry *e;
if(parent == NULL)
list = &procfs_file_list;
else
list = &parent->children;
list_for_each_entry(e, list, list) {
if(!strcmp(e->name, name))
return e;
}
return NULL;
}
static void
delete_procfs_entries(struct procfs_list_entry *top)
{
struct procfs_list_entry *e;
struct procfs_list_entry *n;
list_del(&top->list);
list_for_each_entry_safe(e, n, &top->children, list) {
delete_procfs_entries(e);
}
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
e->entry->read_proc = NULL;
e->entry->data = NULL;
#endif
remove_proc_entry(top->name, top->parent? top->parent->entry: NULL);
if(top->data)
kfree(top->data);
kfree(top);
}
static struct procfs_list_entry *
add_procfs_entry(struct procfs_list_entry *parent, const char *name, int mode,
kuid_t uid, kgid_t gid, const void *opaque)
{
struct procfs_list_entry *e = find_procfs_entry(parent, name);
struct proc_dir_entry *pde;
struct proc_dir_entry *parent_pde = NULL;
int f_mode = mode & 0777;
if(e)
delete_procfs_entries(e);
e = kmalloc(sizeof(struct procfs_list_entry) + strlen(name) + 1,
GFP_KERNEL);
if(!e){
kprintf("ERROR: not enough memory to create PROCFS entry.\n");
return NULL;
}
memset(e, '\0', sizeof(struct procfs_list_entry));
INIT_LIST_HEAD(&e->children);
strcpy(e->name, name);
if(parent)
parent_pde = parent->entry;
if (mode & S_IFDIR) {
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
pde = proc_mkdir(name, parent_pde);
#else
pde = proc_mkdir_data(name, f_mode, parent_pde, e);
#endif
}
else if ((mode & S_IFLNK) == S_IFLNK) {
pde = proc_symlink(name, parent_pde, (char *)opaque);
}
else {
const struct file_operations *fop;
if(opaque)
fop = (const struct file_operations *)opaque;
else if(mode & S_IWUSR)
fop = &mckernel_forward;
else
fop = &mckernel_forward_ro;
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
pde = create_proc_entry(name, f_mode, parent_pde);
if(pde)
pde->proc_fops = fop;
#else
pde = proc_create_data(name, f_mode, parent_pde, fop, e);
if(pde)
proc_set_user(pde, uid, gid);
#endif
}
if(!pde){
kprintf("ERROR: cannot create a PROCFS entry for %s.\n", name);
kfree(e);
return NULL;
}
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
pde->uid = uid;
pde->gid = gid;
pde->data = e;
#endif
if(parent)
e->osnum = parent->osnum;
e->entry = pde;
e->parent = parent;
list_add(&(e->list), parent? &(parent->children): &procfs_file_list);
return e;
}
static void
add_procfs_entries(struct procfs_list_entry *parent,
const struct procfs_entry *entries, kuid_t uid, kgid_t gid)
{
const struct procfs_entry *p;
for(p = entries; p->name; p++){
add_procfs_entry(parent, p->name, p->mode, uid, gid, p->fops);
}
}
static const struct cred *
get_pid_cred(int pid)
{
struct task_struct *task = NULL;
if(pid > 0){
task = pid_task(find_vpid(pid), PIDTYPE_PID);
if(task){
return __task_cred(task);
}
}
return NULL;
}
static struct procfs_list_entry *
find_base_entry(int osnum)
{
char name[12];
sprintf(name, "mcos%d", osnum);
return find_procfs_entry(NULL, name);
}
static struct procfs_list_entry *
find_pid_entry(int osnum, int pid)
{
struct procfs_list_entry *e;
char name[12];
if(!(e = find_base_entry(osnum)))
return NULL;
sprintf(name, "%d", pid);
return find_procfs_entry(e, name);
}
static struct procfs_list_entry *
find_tid_entry(int osnum, int pid, int tid)
{
struct procfs_list_entry *e;
char name[12];
if(!(e = find_pid_entry(osnum, pid)))
return NULL;
if(!(e = find_procfs_entry(e, "task")))
return NULL;
sprintf(name, "%d", tid);
return find_procfs_entry(e, name);
}
static struct procfs_list_entry *
get_base_entry(int osnum)
{
struct procfs_list_entry *e;
char name[12];
kuid_t uid = KUIDT_INIT(0);
kgid_t gid = KGIDT_INIT(0);
sprintf(name, "mcos%d", osnum);
e = find_procfs_entry(NULL, name);
if(!e){
e = add_procfs_entry(NULL, name, S_IFDIR | 0555,
uid, gid, NULL);
e->osnum = osnum;
}
return e;
}
static struct procfs_list_entry *
get_pid_entry(int osnum, int pid)
{
struct procfs_list_entry *parent;
struct procfs_list_entry *e;
char name[12];
kuid_t uid = KUIDT_INIT(0);
kgid_t gid = KGIDT_INIT(0);
sprintf(name, "mcos%d", osnum);
if(!(parent = find_procfs_entry(NULL, name)))
return NULL;
sprintf(name, "%d", pid);
e = find_procfs_entry(parent, name);
if(!e)
e = add_procfs_entry(parent, name, S_IFDIR | 0555,
uid, gid, NULL);
return e;
}
static struct procfs_list_entry *
get_tid_entry(int osnum, int pid, int tid)
{
struct procfs_list_entry *parent;
struct procfs_list_entry *e;
char name[12];
kuid_t uid = KUIDT_INIT(0);
kgid_t gid = KGIDT_INIT(0);
sprintf(name, "mcos%d", osnum);
if(!(parent = find_procfs_entry(NULL, name)))
return NULL;
sprintf(name, "%d", pid);
if(!(parent = find_procfs_entry(parent, name)))
return NULL;
if(!(parent = find_procfs_entry(parent, "task")))
return NULL;
sprintf(name, "%d", tid);
e = find_procfs_entry(parent, name);
if(!e)
e = add_procfs_entry(parent, name, S_IFDIR | 0555,
uid, gid, NULL);
return e;
}
static void
_add_tid_entry(int osnum, int pid, int tid, const struct cred *cred)
{
struct procfs_list_entry *parent;
struct procfs_list_entry *exe;
parent = get_tid_entry(osnum, pid, tid);
if(parent){
add_procfs_entries(parent, tid_entry_stuff,
cred->uid, cred->gid);
exe = find_procfs_entry(parent->parent->parent, "exe");
if(exe){
add_procfs_entry(parent, "exe", S_IFLNK | 0777,
cred->uid, cred->gid, exe->data);
}
}
}
void
add_tid_entry(int osnum, int pid, int tid)
{
const struct cred *cred = get_pid_cred(pid);
if(!cred)
return;
down(&procfs_file_list_lock);
_add_tid_entry(osnum, pid, tid, cred);
up(&procfs_file_list_lock);
}
void
add_pid_entry(int osnum, int pid)
{
struct procfs_list_entry *parent;
const struct cred *cred = get_pid_cred(pid);
if(!cred)
return;
down(&procfs_file_list_lock);
parent = get_pid_entry(osnum, pid);
add_procfs_entries(parent, pid_entry_stuff, cred->uid, cred->gid);
_add_tid_entry(osnum, pid, pid, cred);
up(&procfs_file_list_lock);
}
void
delete_tid_entry(int osnum, int pid, int tid)
{
struct procfs_list_entry *e;
down(&procfs_file_list_lock);
e = find_tid_entry(osnum, pid, tid);
if(e)
delete_procfs_entries(e);
up(&procfs_file_list_lock);
}
void
delete_pid_entry(int osnum, int pid)
{
struct procfs_list_entry *e;
down(&procfs_file_list_lock);
e = find_pid_entry(osnum, pid);
if(e)
delete_procfs_entries(e);
up(&procfs_file_list_lock);
}
void
proc_exe_link(int osnum, int pid, const char *path)
{
struct procfs_list_entry *parent;
kuid_t uid = KUIDT_INIT(0);
kgid_t gid = KGIDT_INIT(0);
down(&procfs_file_list_lock);
parent = find_pid_entry(osnum, pid);
if(parent){
struct procfs_list_entry *task;
struct procfs_list_entry *e;
e = add_procfs_entry(parent, "exe", S_IFLNK | 0777, uid, gid,
path);
e->data = kmalloc(strlen(path) + 1, GFP_KERNEL);
strcpy(e->data, path);
task = find_procfs_entry(parent, "task");
list_for_each_entry(parent, &task->children, list) {
add_procfs_entry(parent, "exe", S_IFLNK | 0777,
uid, gid, path);
}
}
up(&procfs_file_list_lock);
}
/**
* \brief Initialization for procfs
*
* \param osnum os number
*/
void
procfs_init(int osnum)
{
struct procfs_list_entry *parent;
kuid_t uid = KUIDT_INIT(0);
kgid_t gid = KGIDT_INIT(0);
down(&procfs_file_list_lock);
parent = get_base_entry(osnum);
add_procfs_entries(parent, base_entry_stuff, uid, gid);
up(&procfs_file_list_lock);
}
/**
* \brief Finalization for procfs
*
* \param osnum os number
*/
void
procfs_exit(int osnum)
{
struct procfs_list_entry *e;
down(&procfs_file_list_lock);
e = find_base_entry(osnum);
if(e)
delete_procfs_entries(e);
up(&procfs_file_list_lock);
}
/**
* \brief The callback funciton for McKernel procfs
*
* This function conforms to the 2) way of fs/proc/generic.c
* from linux-2.6.39.4.
*/
static ssize_t
mckernel_procfs_read(struct file *file, char __user *buf, size_t nbytes,
loff_t *ppos)
{
struct inode * inode = file->f_path.dentry->d_inode;
char *kern_buffer = NULL;
int order = 0;
volatile struct procfs_read *r = NULL;
struct ikc_scd_packet isp;
int ret;
unsigned long pbuf;
unsigned long count = nbytes;
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
struct proc_dir_entry *dp = PDE(inode);
struct procfs_list_entry *e = dp->data;
#else
struct procfs_list_entry *e = PDE_DATA(inode);
#endif
loff_t offset = *ppos;
char pathbuf[PROCFS_NAME_MAX];
char *path;
path = getpath(e, pathbuf, 256);
dprintk("mckernel_procfs_read: invoked for %s, offset: %lu, count: %d\n",
path, offset, count);
if (count <= 0 || offset < 0) {
return 0;
}
while ((1 << order) < count) ++order;
if (order > 12) {
order -= 12;
}
else {
order = 1;
}
/* NOTE: we need physically contigous memory to pass through IKC */
kern_buffer = (char *)__get_free_pages(GFP_KERNEL, order);
if (!kern_buffer) {
printk("mckernel_procfs_read(): ERROR: allocating kernel buffer\n");
return -ENOMEM;
}
pbuf = virt_to_phys(kern_buffer);
r = kmalloc(sizeof(struct procfs_read), GFP_KERNEL);
if (r == NULL) {
ret = -ENOMEM;
goto out;
}
r->pbuf = pbuf;
r->eof = 0;
r->ret = -EIO; /* default */
r->status = 0;
r->offset = offset;
r->count = count;
r->readwrite = 0;
strncpy((char *)r->fname, path, PROCFS_NAME_MAX);
isp.msg = SCD_MSG_PROCFS_REQUEST;
isp.ref = 0;
isp.arg = virt_to_phys(r);
ret = mcctrl_ikc_send(osnum_to_os(e->osnum), 0, &isp);
if (ret < 0) {
goto out; /* error */
}
/* Wait for a reply. */
ret = -EIO; /* default exit code */
dprintk("now wait for a relpy\n");
/* Wait for the status field of the procfs_read structure set ready. */
if (wait_event_interruptible_timeout(procfsq, r->status != 0, HZ) == 0) {
kprintf("ERROR: mckernel_procfs_read: timeout (1 sec).\n");
goto out;
}
/* Wake up and check the result. */
dprintk("mckernel_procfs_read: woke up. ret: %d, eof: %d\n", r->ret, r->eof);
if (r->ret > 0) {
if (copy_to_user(buf, kern_buffer, r->ret)) {
kprintf("ERROR: mckernel_procfs_read: copy_to_user failed.\n");
ret = -EFAULT;
goto out;
}
*ppos += r->ret;
}
ret = r->ret;
out:
if(kern_buffer)
free_pages((uintptr_t)kern_buffer, order);
if(r)
kfree((void *)r);
return ret;
}
static ssize_t
mckernel_procfs_write(struct file *file, const char __user *buf, size_t nbytes,
loff_t *ppos)
{
struct inode * inode = file->f_path.dentry->d_inode;
char *kern_buffer = NULL;
int order = 0;
volatile struct procfs_read *r = NULL;
struct ikc_scd_packet isp;
int ret;
unsigned long pbuf;
unsigned long count = nbytes;
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
struct proc_dir_entry *dp = PDE(inode);
struct procfs_list_entry *e = dp->data;
#else
struct procfs_list_entry *e = PDE_DATA(inode);
#endif
loff_t offset = *ppos;
char pathbuf[PROCFS_NAME_MAX];
char *path;
path = getpath(e, pathbuf, 256);
dprintk("mckernel_procfs_read: invoked for %s, offset: %lu, count: %d\n",
path, offset, count);
if (count <= 0 || offset < 0) {
return 0;
}
while ((1 << order) < count) ++order;
if (order > 12) {
order -= 12;
}
else {
order = 1;
}
/* NOTE: we need physically contigous memory to pass through IKC */
kern_buffer = (char *)__get_free_pages(GFP_KERNEL, order);
if (!kern_buffer) {
printk("mckernel_procfs_read(): ERROR: allocating kernel buffer\n");
return -ENOMEM;
}
if (copy_from_user(kern_buffer, buf, nbytes)) {
ret = -EFAULT;
goto out;
}
pbuf = virt_to_phys(kern_buffer);
r = kmalloc(sizeof(struct procfs_read), GFP_KERNEL);
if (r == NULL) {
ret = -ENOMEM;
goto out;
}
dprintk("offset: %lx, count: %d, cpu: %d\n", offset, count, e->cpu);
r->pbuf = pbuf;
r->eof = 0;
r->ret = -EIO; /* default */
r->status = 0;
r->offset = offset;
r->count = count;
r->readwrite = 1;
strncpy((char *)r->fname, path, PROCFS_NAME_MAX);
isp.msg = SCD_MSG_PROCFS_REQUEST;
isp.ref = 0;
isp.arg = virt_to_phys(r);
ret = mcctrl_ikc_send(osnum_to_os(e->osnum), 0, &isp);
if (ret < 0) {
goto out; /* error */
}
/* Wait for a reply. */
ret = -EIO; /* default exit code */
dprintk("now wait for a relpy\n");
/* Wait for the status field of the procfs_read structure set ready. */
if (wait_event_interruptible_timeout(procfsq, r->status != 0, HZ) == 0) {
kprintf("ERROR: mckernel_procfs_read: timeout (1 sec).\n");
goto out;
}
/* Wake up and check the result. */
dprintk("mckernel_procfs_read: woke up. ret: %d, eof: %d\n", r->ret, r->eof);
if (r->ret > 0) {
*ppos += r->ret;
}
ret = r->ret;
out:
if(kern_buffer)
free_pages((uintptr_t)kern_buffer, order);
if(r)
kfree((void *)r);
return ret;
}
static loff_t
mckernel_procfs_lseek(struct file *file, loff_t offset, int orig)
{
switch (orig) {
case 0:
file->f_pos = offset;
break;
case 1:
file->f_pos += offset;
break;
default:
return -EINVAL;
}
return file->f_pos;
}
struct procfs_work {
void *os;
int msg;
int pid;
unsigned long arg;
struct work_struct work;
};
static void procfsm_work_main(struct work_struct *work0)
{
struct procfs_work *work = container_of(work0, struct procfs_work, work);
switch (work->msg) {
case SCD_MSG_PROCFS_TID_CREATE:
add_tid_entry(ihk_host_os_get_index(work->os), work->pid, work->arg);
break;
case SCD_MSG_PROCFS_TID_DELETE:
delete_tid_entry(ihk_host_os_get_index(work->os), work->pid, work->arg);
break;
default:
printk("%s: unknown work: msg: %d, pid: %d, arg: %lu)\n",
__FUNCTION__, work->msg, work->pid, work->arg);
break;
}
kfree(work);
return;
}
int procfsm_packet_handler(void *os, int msg, int pid, unsigned long arg)
{
struct procfs_work *work = NULL;
work = kzalloc(sizeof(*work), GFP_ATOMIC);
if (!work) {
printk("%s: kzalloc failed\n", __FUNCTION__);
return -1;
}
work->os = os;
work->msg = msg;
work->pid = pid;
work->arg = arg;
INIT_WORK(&work->work, &procfsm_work_main);
schedule_work(&work->work);
return 0;
}
static const struct file_operations mckernel_forward_ro = {
.llseek = mckernel_procfs_lseek,
.read = mckernel_procfs_read,
.write = NULL,
};
static const struct file_operations mckernel_forward = {
.llseek = mckernel_procfs_lseek,
.read = mckernel_procfs_read,
.write = mckernel_procfs_write,
};
static const struct procfs_entry tid_entry_stuff[] = {
// PROC_REG("auxv", S_IRUSR, NULL),
// PROC_REG("clear_refs", S_IWUSR, NULL),
// PROC_REG("cmdline", S_IRUGO, NULL),
// PROC_REG("comm", S_IRUGO|S_IWUSR, NULL),
// PROC_REG("environ", S_IRUSR, NULL),
// PROC_LNK("exe", mckernel_readlink),
// PROC_REG("limits", S_IRUSR|S_IWUSR, NULL),
// PROC_REG("maps", S_IRUGO, NULL),
PROC_REG("mem", S_IRUSR|S_IWUSR, NULL),
// PROC_REG("pagemap", S_IRUGO, NULL),
// PROC_REG("smaps", S_IRUGO, NULL),
PROC_REG("stat", S_IRUGO, NULL),
// PROC_REG("statm", S_IRUGO, NULL),
// PROC_REG("status", S_IRUGO, NULL),
// PROC_REG("syscall", S_IRUGO, NULL),
// PROC_REG("wchan", S_IRUGO, NULL),
PROC_TERM
};
static const struct procfs_entry pid_entry_stuff[] = {
PROC_REG("auxv", S_IRUSR, NULL),
PROC_REG("cgroup", S_IXUSR, NULL),
// PROC_REG("clear_refs", S_IWUSR, NULL),
PROC_REG("cmdline", S_IRUGO, NULL),
// PROC_REG("comm", S_IRUGO|S_IWUSR, NULL),
// PROC_REG("coredump_filter", S_IRUGO|S_IWUSR, NULL),
PROC_REG("cpuset", S_IXUSR, NULL),
// PROC_REG("environ", S_IRUSR, NULL),
// PROC_LNK("exe", mckernel_readlink),
// PROC_REG("limits", S_IRUSR|S_IWUSR, NULL),
PROC_REG("maps", S_IRUGO, NULL),
PROC_REG("mem", S_IRUSR|S_IWUSR, NULL),
PROC_REG("pagemap", S_IRUGO, NULL),
PROC_REG("smaps", S_IRUGO, NULL),
// PROC_REG("stat", S_IRUGO, NULL),
// PROC_REG("statm", S_IRUGO, NULL),
PROC_REG("status", S_IRUGO, NULL),
// PROC_REG("syscall", S_IRUGO, NULL),
PROC_DIR("task", S_IRUGO|S_IXUGO),
// PROC_REG("wchan", S_IRUGO, NULL),
PROC_TERM
};
static const struct procfs_entry base_entry_stuff[] = {
// PROC_REG("cmdline", S_IRUGO, NULL),
// PROC_REG("cpuinfo", S_IRUGO, NULL),
// PROC_REG("meminfo", S_IRUGO, NULL),
// PROC_REG("pagetypeinfo",S_IRUGO, NULL),
// PROC_REG("softirq", S_IRUGO, NULL),
PROC_REG("stat", S_IRUGO, NULL),
// PROC_REG("uptime", S_IRUGO, NULL),
// PROC_REG("version", S_IRUGO, NULL),
// PROC_REG("vmallocinfo",S_IRUSR, NULL),
// PROC_REG("vmstat", S_IRUGO, NULL),
// PROC_REG("zoneinfo", S_IRUGO, NULL),
PROC_TERM
};

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,73 @@
/**
* \file sysfs.h
* License details are found in the file LICENSE.
* \brief
* sysfs framework API definitions
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com> \par
* Copyright (C) 2016 RIKEN AICS
*/
/*
* HISTORY:
*/
#ifndef MCCTRL_SYSFS_H
#define MCCTRL_SYSFS_H
#define SYSFS_PATH_MAX 1024
/* for sysfs_unlinkf() */
#define SYSFS_UNLINK_KEEP_ANCESTOR 0x01
struct sysfsm_ops {
ssize_t (*show)(struct sysfsm_ops *ops, void *instance, void *buf,
size_t bufsize);
ssize_t (*store)(struct sysfsm_ops *ops, void *instance,
const void *buf, size_t bufsize);
void (*release)(struct sysfsm_ops *ops, void *instance);
};
struct sysfs_handle {
long handle;
};
typedef struct sysfs_handle sysfs_handle_t;
struct sysfsm_bitmap_param {
int nbits;
int padding;
void *ptr;
};
#define SYSFS_SPECIAL_OPS_MIN ((void *)1)
#define SYSFS_SPECIAL_OPS_MAX ((void *)1000)
#define SYSFS_SNOOPING_OPS_d32 ((void *)1)
#define SYSFS_SNOOPING_OPS_d64 ((void *)2)
#define SYSFS_SNOOPING_OPS_u32 ((void *)3)
#define SYSFS_SNOOPING_OPS_u64 ((void *)4)
#define SYSFS_SNOOPING_OPS_s ((void *)5)
#define SYSFS_SNOOPING_OPS_pbl ((void *)6)
#define SYSFS_SNOOPING_OPS_pb ((void *)7)
#define SYSFS_SNOOPING_OPS_u32K ((void *)8)
static inline int is_special_sysfs_ops(void *ops)
{
return (((long)SYSFS_SPECIAL_OPS_MIN <= (long)ops)
&& ((long)ops <= (long)SYSFS_SPECIAL_OPS_MAX));
}
extern int sysfsm_createf(ihk_os_t os, struct sysfsm_ops *ops, void *instance,
int mode, const char *fmt, ...);
extern int sysfsm_mkdirf(ihk_os_t os, sysfs_handle_t *dirhp,
const char *fmt, ...);
extern int sysfsm_symlinkf(ihk_os_t os, sysfs_handle_t targeth,
const char *fmt, ...);
extern int sysfsm_lookupf(ihk_os_t os, sysfs_handle_t *objhp,
const char *fmt, ...);
extern int sysfsm_unlinkf(ihk_os_t os, int flags, const char *fmt, ...);
extern void sysfsm_cleanup(ihk_os_t os);
extern void sysfsm_packet_handler(void *os, int msg, int err, long arg1,
long arg2);
#endif /* MCCTRL_SYSFS_H */

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,88 @@
/**
* \file sysfs_msg.h
* License details are found in the file LICENSE.
* \brief
* message declarations for sysfs framework
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com> \par
* Copyright (C) 2015 RIKEN AICS
*/
/*
* HISTORY:
*/
#ifndef MCKERNEL_SYSFS_MSG_H
#define MCKERNEL_SYSFS_MSG_H
#define SYSFS_PATH_MAX 1024
struct sysfs_req_create_param {
int mode;
int error;
long client_ops;
long client_instance;
char path[SYSFS_PATH_MAX];
int padding;
int busy;
}; /* struct sysfs_req_create_param */
#define SYSFS_SPECIAL_OPS_MIN ((void *)1)
#define SYSFS_SPECIAL_OPS_MAX ((void *)1000)
#define SYSFS_SNOOPING_OPS_d32 ((void *)1)
#define SYSFS_SNOOPING_OPS_d64 ((void *)2)
#define SYSFS_SNOOPING_OPS_u32 ((void *)3)
#define SYSFS_SNOOPING_OPS_u64 ((void *)4)
#define SYSFS_SNOOPING_OPS_s ((void *)5)
#define SYSFS_SNOOPING_OPS_pbl ((void *)6)
#define SYSFS_SNOOPING_OPS_pb ((void *)7)
#define SYSFS_SNOOPING_OPS_u32K ((void *)8)
struct sysfs_req_mkdir_param {
int error;
int padding;
long handle;
char path[SYSFS_PATH_MAX];
int padding2;
int busy;
}; /* struct sysfs_req_mkdir_param */
struct sysfs_req_symlink_param {
int error;
int padding;
long target;
char path[SYSFS_PATH_MAX];
int padding2;
int busy;
}; /* struct sysfs_req_symlink_param */
struct sysfs_req_lookup_param {
int error;
int padding;
long handle;
char path[SYSFS_PATH_MAX];
int padding2;
int busy;
}; /* struct sysfs_req_lookup_param */
/* for sysfs_req_unlink_param.flags */
#define SYSFS_UNLINK_KEEP_ANCESTOR 0x01
struct sysfs_req_unlink_param {
int flags;
int error;
char path[SYSFS_PATH_MAX];
int padding;
int busy;
}; /* struct sysfs_req_unlink_param */
struct sysfs_req_setup_param {
int error;
int padding;
long buf_rpa;
long bufsize;
char padding3[SYSFS_PATH_MAX];
int padding2;
int busy;
}; /* struct sysfs_req_setup_param */
#endif /* MCKERNEL_SYSFS_MSG_H */

View File

@ -0,0 +1,40 @@
ENABLE_MCOVERLAYFS=@ENABLE_MCOVERLAYFS@
RELEASE=$(shell uname -r)
MAJOR=$(shell echo ${RELEASE} | sed -e 's/^\([0-9]*\).*/\1/')
MINOR=$(shell echo ${RELEASE} | sed -e 's/^[0-9]*.\([0-9]*\).*/\1/')
PATCH=$(shell echo ${RELEASE} | sed -e 's/^[0-9]*.[0-9]*.\([0-9]*\).*/\1/')
LINUX_VERSION_CODE=$(shell expr \( ${MAJOR} \* 65536 \) + \( ${MINOR} \* 256 \) + ${PATCH})
RHEL_RELEASE_TMP=$(shell echo ${RELEASE} | sed -e 's/^[0-9]*.[0-9]*.[0-9]*-\([0-9]*\).*/\1/')
RHEL_RELEASE=$(shell if [ "${RELEASE}" == "${RHEL_RELEASE_TMP}" ]; then echo ""; else echo ${RHEL_RELEASE_TMP}; fi)
BUILD_MODULE_TMP=$(shell if [ "${RHEL_RELEASE}" == "" ]; then echo "org"; else echo "rhel"; fi)
BUILD_MODULE=none
ifeq ($(ENABLE_MCOVERLAYFS),yes)
ifeq ($(BUILD_MODULE_TMP),org)
ifeq ($(BUILD_MODULE),none)
BUILD_MODULE=$(shell if [ ${LINUX_VERSION_CODE} -ge 262144 -a ${LINUX_VERSION_CODE} -lt 262400 ]; then echo "linux-4.0.9"; else echo "none"; fi)
endif
endif
ifeq ($(BUILD_MODULE_TMP),rhel)
ifeq ($(BUILD_MODULE),none)
BUILD_MODULE=$(shell if [ ${LINUX_VERSION_CODE} -eq 199168 -a ${RHEL_RELEASE} -eq 327 ]; then echo "linux-3.10.0-327.36.1.el7"; else echo "none"; fi)
endif
endif
endif
.PHONY: clean install modules
modules:
ifneq ($(BUILD_MODULE),none)
@(cd $(BUILD_MODULE); make modules)
endif
clean:
@(cd linux-3.10.0-327.36.1.el7; make clean)
@(cd linux-4.0.9; make clean)
install:
ifneq ($(BUILD_MODULE),none)
@(cd $(BUILD_MODULE); make install)
endif

View File

@ -0,0 +1,21 @@
KDIR ?= @KDIR@
ARCH ?= @ARCH@
KMODDIR = @KMODDIR@
src = @abs_srcdir@
obj-m += mcoverlay.o
mcoverlay-y := copy_up.o dir.o inode.o readdir.o super.o
.PHONY: clean install modules
modules:
$(MAKE) -C $(KDIR) M=$(PWD) SUBDIRS=$(PWD) ARCH=$(ARCH) modules
clean:
$(RM) .*.cmd *.mod.c *.o *.ko* Module.symvers modules.order -r .tmp*
install:
mkdir -p -m 755 $(KMODDIR)
install -m 644 mcoverlay.ko $(KMODDIR)

View File

@ -0,0 +1,461 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/splice.h>
#include <linux/xattr.h>
#include <linux/security.h>
#include <linux/uaccess.h>
#include <linux/sched.h>
#include <linux/namei.h>
#include <linux/fdtable.h>
#include <linux/ratelimit.h>
#include "overlayfs.h"
#define OVL_COPY_UP_CHUNK_SIZE (1 << 20)
static unsigned ovl_check_copy_up = 1;
module_param_named(check_copy_up, ovl_check_copy_up, uint,
S_IWUSR | S_IRUGO);
MODULE_PARM_DESC(ovl_check_copy_up,
"Warn on copy-up when causing process also has a R/O fd open");
static int ovl_check_fd(const void *data, struct file *f, unsigned fd)
{
const struct dentry *dentry = data;
if (f->f_path.dentry == dentry)
pr_warn_ratelimited("overlayfs: Warning: Copying up %pD, but open R/O on fd %u which will cease to be coherent [pid=%d %s]\n",
f, fd, current->pid, current->comm);
return 0;
}
/*
* Check the fds open by this process and warn if something like the following
* scenario is about to occur:
*
* fd1 = open("foo", O_RDONLY);
* fd2 = open("foo", O_RDWR);
*/
static void ovl_do_check_copy_up(struct dentry *dentry)
{
if (ovl_check_copy_up)
iterate_fd(current->files, 0, ovl_check_fd, dentry);
}
int ovl_copy_xattr(struct dentry *old, struct dentry *new)
{
ssize_t list_size, size, value_size = 0;
char *buf, *name, *value = NULL;
int uninitialized_var(error);
if (!old->d_inode->i_op->getxattr ||
!new->d_inode->i_op->getxattr)
return 0;
list_size = vfs_listxattr(old, NULL, 0);
if (list_size <= 0) {
if (list_size == -EOPNOTSUPP)
return 0;
return list_size;
}
buf = kzalloc(list_size, GFP_KERNEL);
if (!buf)
return -ENOMEM;
list_size = vfs_listxattr(old, buf, list_size);
if (list_size <= 0) {
error = list_size;
goto out;
}
for (name = buf; name < (buf + list_size); name += strlen(name) + 1) {
retry:
size = vfs_getxattr(old, name, value, value_size);
if (size == -ERANGE)
size = vfs_getxattr(old, name, NULL, 0);
if (size < 0) {
error = size;
break;
}
if (size > value_size) {
void *new;
new = krealloc(value, size, GFP_KERNEL);
if (!new) {
error = -ENOMEM;
break;
}
value = new;
value_size = size;
goto retry;
}
error = vfs_setxattr(new, name, value, size, 0);
if (error)
break;
}
kfree(value);
out:
kfree(buf);
return error;
}
static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
{
struct file *old_file;
struct file *new_file;
loff_t old_pos = 0;
loff_t new_pos = 0;
int error = 0;
if (len == 0)
return 0;
old_file = ovl_path_open(old, O_RDONLY);
if (IS_ERR(old_file))
return PTR_ERR(old_file);
new_file = ovl_path_open(new, O_WRONLY);
if (IS_ERR(new_file)) {
error = PTR_ERR(new_file);
goto out_fput;
}
/* FIXME: copy up sparse files efficiently */
while (len) {
size_t this_len = OVL_COPY_UP_CHUNK_SIZE;
long bytes;
if (len < this_len)
this_len = len;
if (signal_pending_state(TASK_KILLABLE, current)) {
error = -EINTR;
break;
}
bytes = do_splice_direct(old_file, &old_pos,
new_file, &new_pos,
this_len, SPLICE_F_MOVE);
if (bytes <= 0) {
error = bytes;
break;
}
WARN_ON(old_pos != new_pos);
len -= bytes;
}
fput(new_file);
out_fput:
fput(old_file);
return error;
}
static char *ovl_read_symlink(struct dentry *realdentry)
{
int res;
char *buf;
struct inode *inode = realdentry->d_inode;
mm_segment_t old_fs;
res = -EINVAL;
if (!inode->i_op->readlink)
goto err;
res = -ENOMEM;
buf = (char *) __get_free_page(GFP_KERNEL);
if (!buf)
goto err;
old_fs = get_fs();
set_fs(get_ds());
/* The cast to a user pointer is valid due to the set_fs() */
res = inode->i_op->readlink(realdentry,
(char __user *)buf, PAGE_SIZE - 1);
set_fs(old_fs);
if (res < 0) {
free_page((unsigned long) buf);
goto err;
}
buf[res] = '\0';
return buf;
err:
return ERR_PTR(res);
}
static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
{
struct iattr attr = {
.ia_valid =
ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET,
.ia_atime = stat->atime,
.ia_mtime = stat->mtime,
};
return notify_change(upperdentry, &attr, NULL);
}
int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
{
int err = 0;
if (!S_ISLNK(stat->mode)) {
struct iattr attr = {
.ia_valid = ATTR_MODE,
.ia_mode = stat->mode,
};
err = notify_change(upperdentry, &attr, NULL);
}
if (!err) {
struct iattr attr = {
.ia_valid = ATTR_UID | ATTR_GID,
.ia_uid = stat->uid,
.ia_gid = stat->gid,
};
err = notify_change(upperdentry, &attr, NULL);
}
if (!err)
ovl_set_timestamps(upperdentry, stat);
return err;
}
static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
struct dentry *dentry, struct path *lowerpath,
struct kstat *stat, struct iattr *attr,
const char *link)
{
struct inode *wdir = workdir->d_inode;
struct inode *udir = upperdir->d_inode;
struct dentry *newdentry = NULL;
struct dentry *upper = NULL;
umode_t mode = stat->mode;
int err;
newdentry = ovl_lookup_temp(workdir, dentry);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out;
upper = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto out1;
/* Can't properly set mode on creation because of the umask */
stat->mode &= S_IFMT;
err = ovl_create_real(wdir, newdentry, stat, link, NULL, true);
stat->mode = mode;
if (err)
goto out2;
if (S_ISREG(stat->mode)) {
struct path upperpath;
ovl_path_upper(dentry, &upperpath);
BUG_ON(upperpath.dentry != NULL);
upperpath.dentry = newdentry;
err = ovl_copy_up_data(lowerpath, &upperpath, stat->size);
if (err)
goto out_cleanup;
}
err = ovl_copy_xattr(lowerpath->dentry, newdentry);
if (err)
goto out_cleanup;
mutex_lock(&newdentry->d_inode->i_mutex);
err = ovl_set_attr(newdentry, stat);
if (!err && attr)
err = notify_change(newdentry, attr, NULL);
mutex_unlock(&newdentry->d_inode->i_mutex);
if (err)
goto out_cleanup;
err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
if (err)
goto out_cleanup;
ovl_dentry_update(dentry, newdentry);
newdentry = NULL;
/*
* Non-directores become opaque when copied up.
*/
if (!S_ISDIR(stat->mode))
ovl_dentry_set_opaque(dentry, true);
out2:
dput(upper);
out1:
dput(newdentry);
out:
return err;
out_cleanup:
ovl_cleanup(wdir, newdentry);
goto out;
}
/*
* Copy up a single dentry
*
* Directory renames only allowed on "pure upper" (already created on
* upper filesystem, never copied up). Directories which are on lower or
* are merged may not be renamed. For these -EXDEV is returned and
* userspace has to deal with it. This means, when copying up a
* directory we can rely on it and ancestors being stable.
*
* Non-directory renames start with copy up of source if necessary. The
* actual rename will only proceed once the copy up was successful. Copy
* up uses upper parent i_mutex for exclusion. Since rename can change
* d_parent it is possible that the copy up will lock the old parent. At
* that point the file will have already been copied up anyway.
*/
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
struct path *lowerpath, struct kstat *stat,
struct iattr *attr)
{
struct dentry *workdir = ovl_workdir(dentry);
int err;
struct kstat pstat;
struct path parentpath;
struct dentry *upperdir;
struct dentry *upperdentry;
const struct cred *old_cred;
struct cred *override_cred;
char *link = NULL;
if (WARN_ON(!workdir))
return -EROFS;
ovl_do_check_copy_up(lowerpath->dentry);
ovl_path_upper(parent, &parentpath);
upperdir = parentpath.dentry;
err = vfs_getattr(&parentpath, &pstat);
if (err)
return err;
if (S_ISLNK(stat->mode)) {
link = ovl_read_symlink(lowerpath->dentry);
if (IS_ERR(link))
return PTR_ERR(link);
}
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_free_link;
override_cred->fsuid = stat->uid;
override_cred->fsgid = stat->gid;
/*
* CAP_SYS_ADMIN for copying up extended attributes
* CAP_DAC_OVERRIDE for create
* CAP_FOWNER for chmod, timestamp update
* CAP_FSETID for chmod
* CAP_CHOWN for chown
* CAP_MKNOD for mknod
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
cap_raise(override_cred->cap_effective, CAP_FSETID);
cap_raise(override_cred->cap_effective, CAP_CHOWN);
cap_raise(override_cred->cap_effective, CAP_MKNOD);
old_cred = override_creds(override_cred);
err = -EIO;
if (lock_rename(workdir, upperdir) != NULL) {
pr_err("overlayfs: failed to lock workdir+upperdir\n");
goto out_unlock;
}
upperdentry = ovl_dentry_upper(dentry);
if (upperdentry) {
unlock_rename(workdir, upperdir);
err = 0;
/* Raced with another copy-up? Do the setattr here */
if (attr) {
mutex_lock(&upperdentry->d_inode->i_mutex);
err = notify_change(upperdentry, attr, NULL);
mutex_unlock(&upperdentry->d_inode->i_mutex);
}
goto out_put_cred;
}
err = ovl_copy_up_locked(workdir, upperdir, dentry, lowerpath,
stat, attr, link);
if (!err) {
/* Restore timestamps on parent (best effort) */
ovl_set_timestamps(upperdir, &pstat);
}
out_unlock:
unlock_rename(workdir, upperdir);
out_put_cred:
revert_creds(old_cred);
put_cred(override_cred);
out_free_link:
if (link)
free_page((unsigned long) link);
return err;
}
int ovl_copy_up(struct dentry *dentry)
{
int err;
err = 0;
while (!err) {
struct dentry *next;
struct dentry *parent;
struct path lowerpath;
struct kstat stat;
enum ovl_path_type type = ovl_path_type(dentry);
if (OVL_TYPE_UPPER(type))
break;
next = dget(dentry);
/* find the topmost dentry not yet copied up */
for (;;) {
parent = dget_parent(next);
type = ovl_path_type(parent);
if (OVL_TYPE_UPPER(type))
break;
dput(next);
next = parent;
}
ovl_path_lower(next, &lowerpath);
err = vfs_getattr(&lowerpath, &stat);
if (!err)
err = ovl_copy_up_one(parent, next, &lowerpath, &stat, NULL);
dput(parent);
dput(next);
}
return err;
}

View File

@ -0,0 +1,972 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/xattr.h>
#include <linux/security.h>
#include <linux/cred.h>
#include "overlayfs.h"
void ovl_cleanup(struct inode *wdir, struct dentry *wdentry)
{
int err;
dget(wdentry);
if (S_ISDIR(wdentry->d_inode->i_mode))
err = ovl_do_rmdir(wdir, wdentry);
else
err = ovl_do_unlink(wdir, wdentry);
dput(wdentry);
if (err) {
pr_err("overlayfs: cleanup of '%pd2' failed (%i)\n",
wdentry, err);
}
}
struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry)
{
struct dentry *temp;
char name[20];
snprintf(name, sizeof(name), "#%lx", (unsigned long) dentry);
temp = lookup_one_len(name, workdir, strlen(name));
if (!IS_ERR(temp) && temp->d_inode) {
pr_err("overlayfs: workdir/%s already exists\n", name);
dput(temp);
temp = ERR_PTR(-EIO);
}
return temp;
}
/* caller holds i_mutex on workdir */
static struct dentry *ovl_whiteout(struct dentry *workdir,
struct dentry *dentry)
{
int err;
struct dentry *whiteout;
struct inode *wdir = workdir->d_inode;
whiteout = ovl_lookup_temp(workdir, dentry);
if (IS_ERR(whiteout))
return whiteout;
err = ovl_do_whiteout(wdir, whiteout);
if (err) {
dput(whiteout);
whiteout = ERR_PTR(err);
}
return whiteout;
}
int ovl_create_real(struct inode *dir, struct dentry *newdentry,
struct kstat *stat, const char *link,
struct dentry *hardlink, bool debug)
{
int err;
if (newdentry->d_inode)
return -ESTALE;
if (hardlink) {
err = ovl_do_link(hardlink, dir, newdentry, debug);
} else {
switch (stat->mode & S_IFMT) {
case S_IFREG:
err = ovl_do_create(dir, newdentry, stat->mode, debug);
break;
case S_IFDIR:
err = ovl_do_mkdir(dir, newdentry, stat->mode, debug);
break;
case S_IFCHR:
case S_IFBLK:
case S_IFIFO:
case S_IFSOCK:
err = ovl_do_mknod(dir, newdentry,
stat->mode, stat->rdev, debug);
break;
case S_IFLNK:
err = ovl_do_symlink(dir, newdentry, link, debug);
break;
default:
err = -EPERM;
}
}
if (!err && WARN_ON(!newdentry->d_inode)) {
/*
* Not quite sure if non-instantiated dentry is legal or not.
* VFS doesn't seem to care so check and warn here.
*/
err = -ENOENT;
}
return err;
}
static int ovl_set_opaque(struct dentry *upperdentry)
{
return ovl_do_setxattr(upperdentry, OVL_XATTR_OPAQUE, "y", 1, 0);
}
static void ovl_remove_opaque(struct dentry *upperdentry)
{
int err;
err = ovl_do_removexattr(upperdentry, OVL_XATTR_OPAQUE);
if (err) {
pr_warn("overlayfs: failed to remove opaque from '%s' (%i)\n",
upperdentry->d_name.name, err);
}
}
static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
int err;
enum ovl_path_type type;
struct path realpath;
type = ovl_path_real(dentry, &realpath);
err = vfs_getattr(&realpath, stat);
if (err)
return err;
stat->dev = dentry->d_sb->s_dev;
stat->ino = dentry->d_inode->i_ino;
/*
* It's probably not worth it to count subdirs to get the
* correct link count. nlink=1 seems to pacify 'find' and
* other utilities.
*/
if (OVL_TYPE_MERGE(type))
stat->nlink = 1;
return 0;
}
static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
struct kstat *stat, const char *link,
struct dentry *hardlink)
{
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *udir = upperdir->d_inode;
struct dentry *newdentry;
int err;
mutex_lock_nested(&udir->i_mutex, I_MUTEX_PARENT);
newdentry = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out_unlock;
err = ovl_create_real(udir, newdentry, stat, link, hardlink, false);
if (err)
goto out_dput;
ovl_dentry_version_inc(dentry->d_parent);
ovl_dentry_update(dentry, newdentry);
ovl_copyattr(newdentry->d_inode, inode);
d_instantiate(dentry, inode);
newdentry = NULL;
out_dput:
dput(newdentry);
out_unlock:
mutex_unlock(&udir->i_mutex);
return err;
}
static int ovl_lock_rename_workdir(struct dentry *workdir,
struct dentry *upperdir)
{
/* Workdir should not be the same as upperdir */
if (workdir == upperdir)
goto err;
/* Workdir should not be subdir of upperdir and vice versa */
if (lock_rename(workdir, upperdir) != NULL)
goto err_unlock;
return 0;
err_unlock:
unlock_rename(workdir, upperdir);
err:
pr_err("overlayfs: failed to lock workdir+upperdir\n");
return -EIO;
}
static struct dentry *ovl_clear_empty(struct dentry *dentry,
struct list_head *list)
{
struct dentry *workdir = ovl_workdir(dentry);
struct inode *wdir = workdir->d_inode;
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *udir = upperdir->d_inode;
struct path upperpath;
struct dentry *upper;
struct dentry *opaquedir;
struct kstat stat;
int err;
if (WARN_ON(!workdir))
return ERR_PTR(-EROFS);
err = ovl_lock_rename_workdir(workdir, upperdir);
if (err)
goto out;
ovl_path_upper(dentry, &upperpath);
err = vfs_getattr(&upperpath, &stat);
if (err)
goto out_unlock;
err = -ESTALE;
if (!S_ISDIR(stat.mode))
goto out_unlock;
upper = upperpath.dentry;
if (upper->d_parent->d_inode != udir)
goto out_unlock;
opaquedir = ovl_lookup_temp(workdir, dentry);
err = PTR_ERR(opaquedir);
if (IS_ERR(opaquedir))
goto out_unlock;
err = ovl_create_real(wdir, opaquedir, &stat, NULL, NULL, true);
if (err)
goto out_dput;
err = ovl_copy_xattr(upper, opaquedir);
if (err)
goto out_cleanup;
err = ovl_set_opaque(opaquedir);
if (err)
goto out_cleanup;
mutex_lock(&opaquedir->d_inode->i_mutex);
err = ovl_set_attr(opaquedir, &stat);
mutex_unlock(&opaquedir->d_inode->i_mutex);
if (err)
goto out_cleanup;
err = ovl_do_rename(wdir, opaquedir, udir, upper, RENAME_EXCHANGE);
if (err)
goto out_cleanup;
ovl_cleanup_whiteouts(upper, list);
ovl_cleanup(wdir, upper);
unlock_rename(workdir, upperdir);
/* dentry's upper doesn't match now, get rid of it */
d_drop(dentry);
return opaquedir;
out_cleanup:
ovl_cleanup(wdir, opaquedir);
out_dput:
dput(opaquedir);
out_unlock:
unlock_rename(workdir, upperdir);
out:
return ERR_PTR(err);
}
static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry)
{
int err;
struct dentry *ret = NULL;
LIST_HEAD(list);
err = ovl_check_empty_dir(dentry, &list);
if (err)
ret = ERR_PTR(err);
else {
/*
* If no upperdentry then skip clearing whiteouts.
*
* Can race with copy-up, since we don't hold the upperdir
* mutex. Doesn't matter, since copy-up can't create a
* non-empty directory from an empty one.
*/
if (ovl_dentry_upper(dentry))
ret = ovl_clear_empty(dentry, &list);
}
ovl_cache_free(&list);
return ret;
}
static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
struct kstat *stat, const char *link,
struct dentry *hardlink)
{
struct dentry *workdir = ovl_workdir(dentry);
struct inode *wdir = workdir->d_inode;
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *udir = upperdir->d_inode;
struct dentry *upper;
struct dentry *newdentry;
int err;
if (WARN_ON(!workdir))
return -EROFS;
err = ovl_lock_rename_workdir(workdir, upperdir);
if (err)
goto out;
newdentry = ovl_lookup_temp(workdir, dentry);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out_unlock;
upper = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto out_dput;
err = ovl_create_real(wdir, newdentry, stat, link, hardlink, true);
if (err)
goto out_dput2;
if (S_ISDIR(stat->mode)) {
err = ovl_set_opaque(newdentry);
if (err)
goto out_cleanup;
err = ovl_do_rename(wdir, newdentry, udir, upper,
RENAME_EXCHANGE);
if (err)
goto out_cleanup;
ovl_cleanup(wdir, upper);
} else {
err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
if (err)
goto out_cleanup;
}
ovl_dentry_version_inc(dentry->d_parent);
ovl_dentry_update(dentry, newdentry);
ovl_copyattr(newdentry->d_inode, inode);
d_instantiate(dentry, inode);
newdentry = NULL;
out_dput2:
dput(upper);
out_dput:
dput(newdentry);
out_unlock:
unlock_rename(workdir, upperdir);
out:
return err;
out_cleanup:
ovl_cleanup(wdir, newdentry);
goto out_dput2;
}
static int ovl_create_or_link(struct dentry *dentry, int mode, dev_t rdev,
const char *link, struct dentry *hardlink)
{
int err;
struct inode *inode;
struct kstat stat = {
.mode = mode,
.rdev = rdev,
};
err = -ENOMEM;
inode = ovl_new_inode(dentry->d_sb, mode, dentry->d_fsdata);
if (!inode)
goto out;
err = ovl_copy_up(dentry->d_parent);
if (err)
goto out_iput;
if (!ovl_dentry_is_opaque(dentry)) {
err = ovl_create_upper(dentry, inode, &stat, link, hardlink);
} else {
const struct cred *old_cred;
struct cred *override_cred;
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_iput;
/*
* CAP_SYS_ADMIN for setting opaque xattr
* CAP_DAC_OVERRIDE for create in workdir, rename
* CAP_FOWNER for removing whiteout from sticky dir
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
old_cred = override_creds(override_cred);
err = ovl_create_over_whiteout(dentry, inode, &stat, link,
hardlink);
revert_creds(old_cred);
put_cred(override_cred);
}
if (!err)
inode = NULL;
out_iput:
iput(inode);
out:
return err;
}
static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
const char *link)
{
int err;
err = ovl_want_write(dentry);
if (!err) {
err = ovl_create_or_link(dentry, mode, rdev, link, NULL);
ovl_drop_write(dentry);
}
return err;
}
static int ovl_create(struct inode *dir, struct dentry *dentry, umode_t mode,
bool excl)
{
return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL);
}
static int ovl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL);
}
static int ovl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
dev_t rdev)
{
/* Don't allow creation of "whiteout" on overlay */
if (S_ISCHR(mode) && rdev == WHITEOUT_DEV)
return -EPERM;
return ovl_create_object(dentry, mode, rdev, NULL);
}
static int ovl_symlink(struct inode *dir, struct dentry *dentry,
const char *link)
{
return ovl_create_object(dentry, S_IFLNK, 0, link);
}
static int ovl_link(struct dentry *old, struct inode *newdir,
struct dentry *new)
{
int err;
struct dentry *upper;
err = ovl_want_write(old);
if (err)
goto out;
err = ovl_copy_up(old);
if (err)
goto out_drop_write;
upper = ovl_dentry_upper(old);
err = ovl_create_or_link(new, upper->d_inode->i_mode, 0, NULL, upper);
out_drop_write:
ovl_drop_write(old);
out:
return err;
}
static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir)
{
struct dentry *workdir = ovl_workdir(dentry);
struct inode *wdir = workdir->d_inode;
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *udir = upperdir->d_inode;
struct dentry *whiteout;
struct dentry *upper;
struct dentry *opaquedir = NULL;
int err;
int flags = 0;
if (WARN_ON(!workdir))
return -EROFS;
if (is_dir) {
if (OVL_TYPE_MERGE_OR_LOWER(ovl_path_type(dentry))) {
opaquedir = ovl_check_empty_and_clear(dentry);
err = PTR_ERR(opaquedir);
if (IS_ERR(opaquedir))
goto out;
} else {
LIST_HEAD(list);
/*
* When removing an empty opaque directory, then it
* makes no sense to replace it with an exact replica of
* itself. But emptiness still needs to be checked.
*/
err = ovl_check_empty_dir(dentry, &list);
ovl_cache_free(&list);
if (err)
goto out;
}
}
err = ovl_lock_rename_workdir(workdir, upperdir);
if (err)
goto out_dput;
upper = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto out_unlock;
err = -ESTALE;
if ((opaquedir && upper != opaquedir) ||
(!opaquedir && ovl_dentry_upper(dentry) &&
upper != ovl_dentry_upper(dentry))) {
goto out_dput_upper;
}
whiteout = ovl_whiteout(workdir, dentry);
err = PTR_ERR(whiteout);
if (IS_ERR(whiteout))
goto out_dput_upper;
if (d_is_dir(upper))
flags = RENAME_EXCHANGE;
err = ovl_do_rename(wdir, whiteout, udir, upper, flags);
if (err)
goto kill_whiteout;
if (flags)
ovl_cleanup(wdir, upper);
ovl_dentry_version_inc(dentry->d_parent);
out_d_drop:
d_drop(dentry);
dput(whiteout);
out_dput_upper:
dput(upper);
out_unlock:
unlock_rename(workdir, upperdir);
out_dput:
dput(opaquedir);
out:
return err;
kill_whiteout:
ovl_cleanup(wdir, whiteout);
goto out_d_drop;
}
static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
{
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *dir = upperdir->d_inode;
struct dentry *upper;
int err;
mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
upper = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto out_unlock;
err = -ESTALE;
if (upper == ovl_dentry_upper(dentry)) {
if (is_dir)
err = vfs_rmdir(dir, upper);
else
err = vfs_unlink(dir, upper, NULL);
ovl_dentry_version_inc(dentry->d_parent);
}
dput(upper);
/*
* Keeping this dentry hashed would mean having to release
* upperpath/lowerpath, which could only be done if we are the
* sole user of this dentry. Too tricky... Just unhash for
* now.
*/
if (!err)
d_drop(dentry);
out_unlock:
mutex_unlock(&dir->i_mutex);
return err;
}
static inline int ovl_check_sticky(struct dentry *dentry)
{
struct inode *dir = ovl_dentry_real(dentry->d_parent)->d_inode;
struct inode *inode = ovl_dentry_real(dentry)->d_inode;
if (check_sticky(dir, inode))
return -EPERM;
return 0;
}
static int ovl_do_remove(struct dentry *dentry, bool is_dir)
{
enum ovl_path_type type;
int err;
err = ovl_check_sticky(dentry);
if (err)
goto out;
err = ovl_want_write(dentry);
if (err)
goto out;
err = ovl_copy_up(dentry->d_parent);
if (err)
goto out_drop_write;
type = ovl_path_type(dentry);
if (OVL_TYPE_PURE_UPPER(type)) {
err = ovl_remove_upper(dentry, is_dir);
} else {
const struct cred *old_cred;
struct cred *override_cred;
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_drop_write;
/*
* CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
* CAP_DAC_OVERRIDE for create in workdir, rename
* CAP_FOWNER for removing whiteout from sticky dir
* CAP_FSETID for chmod of opaque dir
* CAP_CHOWN for chown of opaque dir
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
cap_raise(override_cred->cap_effective, CAP_FSETID);
cap_raise(override_cred->cap_effective, CAP_CHOWN);
old_cred = override_creds(override_cred);
err = ovl_remove_and_whiteout(dentry, is_dir);
revert_creds(old_cred);
put_cred(override_cred);
}
out_drop_write:
ovl_drop_write(dentry);
out:
return err;
}
static int ovl_unlink(struct inode *dir, struct dentry *dentry)
{
return ovl_do_remove(dentry, false);
}
static int ovl_rmdir(struct inode *dir, struct dentry *dentry)
{
return ovl_do_remove(dentry, true);
}
static int ovl_rename2(struct inode *olddir, struct dentry *old,
struct inode *newdir, struct dentry *new,
unsigned int flags)
{
int err;
enum ovl_path_type old_type;
enum ovl_path_type new_type;
struct dentry *old_upperdir;
struct dentry *new_upperdir;
struct dentry *olddentry;
struct dentry *newdentry;
struct dentry *trap;
bool old_opaque;
bool new_opaque;
bool new_create = false;
bool cleanup_whiteout = false;
bool overwrite = !(flags & RENAME_EXCHANGE);
bool is_dir = S_ISDIR(old->d_inode->i_mode);
bool new_is_dir = false;
struct dentry *opaquedir = NULL;
const struct cred *old_cred = NULL;
struct cred *override_cred = NULL;
err = -EINVAL;
if (flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE))
goto out;
flags &= ~RENAME_NOREPLACE;
err = ovl_check_sticky(old);
if (err)
goto out;
/* Don't copy up directory trees */
old_type = ovl_path_type(old);
err = -EXDEV;
if (OVL_TYPE_MERGE_OR_LOWER(old_type) && is_dir)
goto out;
if (new->d_inode) {
err = ovl_check_sticky(new);
if (err)
goto out;
if (S_ISDIR(new->d_inode->i_mode))
new_is_dir = true;
new_type = ovl_path_type(new);
err = -EXDEV;
if (!overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir)
goto out;
err = 0;
if (!OVL_TYPE_UPPER(new_type) && !OVL_TYPE_UPPER(old_type)) {
if (ovl_dentry_lower(old)->d_inode ==
ovl_dentry_lower(new)->d_inode)
goto out;
}
if (OVL_TYPE_UPPER(new_type) && OVL_TYPE_UPPER(old_type)) {
if (ovl_dentry_upper(old)->d_inode ==
ovl_dentry_upper(new)->d_inode)
goto out;
}
} else {
if (ovl_dentry_is_opaque(new))
new_type = __OVL_PATH_UPPER;
else
new_type = __OVL_PATH_UPPER | __OVL_PATH_PURE;
}
err = ovl_want_write(old);
if (err)
goto out;
err = ovl_copy_up(old);
if (err)
goto out_drop_write;
err = ovl_copy_up(new->d_parent);
if (err)
goto out_drop_write;
if (!overwrite) {
err = ovl_copy_up(new);
if (err)
goto out_drop_write;
}
old_opaque = !OVL_TYPE_PURE_UPPER(old_type);
new_opaque = !OVL_TYPE_PURE_UPPER(new_type);
if (old_opaque || new_opaque) {
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_drop_write;
/*
* CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
* CAP_DAC_OVERRIDE for create in workdir
* CAP_FOWNER for removing whiteout from sticky dir
* CAP_FSETID for chmod of opaque dir
* CAP_CHOWN for chown of opaque dir
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
cap_raise(override_cred->cap_effective, CAP_FSETID);
cap_raise(override_cred->cap_effective, CAP_CHOWN);
old_cred = override_creds(override_cred);
}
if (overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir) {
opaquedir = ovl_check_empty_and_clear(new);
err = PTR_ERR(opaquedir);
if (IS_ERR(opaquedir)) {
opaquedir = NULL;
goto out_revert_creds;
}
}
if (overwrite) {
if (old_opaque) {
if (new->d_inode || !new_opaque) {
/* Whiteout source */
flags |= RENAME_WHITEOUT;
} else {
/* Switch whiteouts */
flags |= RENAME_EXCHANGE;
}
} else if (is_dir && !new->d_inode && new_opaque) {
flags |= RENAME_EXCHANGE;
cleanup_whiteout = true;
}
}
old_upperdir = ovl_dentry_upper(old->d_parent);
new_upperdir = ovl_dentry_upper(new->d_parent);
trap = lock_rename(new_upperdir, old_upperdir);
olddentry = lookup_one_len(old->d_name.name, old_upperdir,
old->d_name.len);
err = PTR_ERR(olddentry);
if (IS_ERR(olddentry))
goto out_unlock;
err = -ESTALE;
if (olddentry != ovl_dentry_upper(old))
goto out_dput_old;
newdentry = lookup_one_len(new->d_name.name, new_upperdir,
new->d_name.len);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out_dput_old;
err = -ESTALE;
if (ovl_dentry_upper(new)) {
if (opaquedir) {
if (newdentry != opaquedir)
goto out_dput;
} else {
if (newdentry != ovl_dentry_upper(new))
goto out_dput;
}
} else {
new_create = true;
if (!d_is_negative(newdentry) &&
(!new_opaque || !ovl_is_whiteout(newdentry)))
goto out_dput;
}
if (olddentry == trap)
goto out_dput;
if (newdentry == trap)
goto out_dput;
if (is_dir && !old_opaque && new_opaque) {
err = ovl_set_opaque(olddentry);
if (err)
goto out_dput;
}
if (!overwrite && new_is_dir && old_opaque && !new_opaque) {
err = ovl_set_opaque(newdentry);
if (err)
goto out_dput;
}
if (old_opaque || new_opaque) {
err = ovl_do_rename(old_upperdir->d_inode, olddentry,
new_upperdir->d_inode, newdentry,
flags);
} else {
/* No debug for the plain case */
BUG_ON(flags & ~RENAME_EXCHANGE);
err = vfs_rename(old_upperdir->d_inode, olddentry,
new_upperdir->d_inode, newdentry,
NULL, flags);
}
if (err) {
if (is_dir && !old_opaque && new_opaque)
ovl_remove_opaque(olddentry);
if (!overwrite && new_is_dir && old_opaque && !new_opaque)
ovl_remove_opaque(newdentry);
goto out_dput;
}
if (is_dir && old_opaque && !new_opaque)
ovl_remove_opaque(olddentry);
if (!overwrite && new_is_dir && !old_opaque && new_opaque)
ovl_remove_opaque(newdentry);
if (old_opaque != new_opaque) {
ovl_dentry_set_opaque(old, new_opaque);
if (!overwrite)
ovl_dentry_set_opaque(new, old_opaque);
}
if (cleanup_whiteout)
ovl_cleanup(old_upperdir->d_inode, newdentry);
ovl_dentry_version_inc(old->d_parent);
ovl_dentry_version_inc(new->d_parent);
out_dput:
dput(newdentry);
out_dput_old:
dput(olddentry);
out_unlock:
unlock_rename(new_upperdir, old_upperdir);
out_revert_creds:
if (old_opaque || new_opaque) {
revert_creds(old_cred);
put_cred(override_cred);
}
out_drop_write:
ovl_drop_write(old);
out:
dput(opaquedir);
return err;
}
static int ovl_rename(struct inode *olddir, struct dentry *old,
struct inode *newdir, struct dentry *new)
{
return ovl_rename2(olddir, old, newdir, new, 0);
}
const struct inode_operations_wrapper ovl_dir_inode_operations = {
.ops = {
.lookup = ovl_lookup,
.mkdir = ovl_mkdir,
.symlink = ovl_symlink,
.unlink = ovl_unlink,
.rmdir = ovl_rmdir,
.rename = ovl_rename,
.link = ovl_link,
.setattr = ovl_setattr,
.create = ovl_create,
.mknod = ovl_mknod,
.permission = ovl_permission,
.getattr = ovl_dir_getattr,
.setxattr = ovl_setxattr,
.getxattr = ovl_getxattr,
.listxattr = ovl_listxattr,
.removexattr = ovl_removexattr,
},
.rename2 = ovl_rename2,
};

View File

@ -0,0 +1,442 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/xattr.h>
#include "overlayfs.h"
static int ovl_copy_up_last(struct dentry *dentry, struct iattr *attr,
bool no_data)
{
int err;
struct dentry *parent;
struct kstat stat;
struct path lowerpath;
parent = dget_parent(dentry);
err = ovl_copy_up(parent);
if (err)
goto out_dput_parent;
ovl_path_lower(dentry, &lowerpath);
err = vfs_getattr(&lowerpath, &stat);
if (err)
goto out_dput_parent;
if (no_data)
stat.size = 0;
err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat, attr);
out_dput_parent:
dput(parent);
return err;
}
int ovl_setattr(struct dentry *dentry, struct iattr *attr)
{
int err;
struct dentry *upperdentry;
err = ovl_want_write(dentry);
if (err)
goto out;
err = ovl_copy_up(dentry);
if (!err) {
upperdentry = ovl_dentry_upper(dentry);
mutex_lock(&upperdentry->d_inode->i_mutex);
err = notify_change(upperdentry, attr, NULL);
mutex_unlock(&upperdentry->d_inode->i_mutex);
}
ovl_drop_write(dentry);
out:
return err;
}
static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
struct path realpath;
ovl_path_real(dentry, &realpath);
return vfs_getattr(&realpath, stat);
}
int ovl_permission(struct inode *inode, int mask)
{
struct ovl_entry *oe;
struct dentry *alias = NULL;
struct inode *realinode;
struct dentry *realdentry;
bool is_upper;
int err;
if (S_ISDIR(inode->i_mode)) {
oe = inode->i_private;
} else if (mask & MAY_NOT_BLOCK) {
return -ECHILD;
} else {
/*
* For non-directories find an alias and get the info
* from there.
*/
alias = d_find_any_alias(inode);
if (WARN_ON(!alias))
return -ENOENT;
oe = alias->d_fsdata;
}
realdentry = ovl_entry_real(oe, &is_upper);
/* Careful in RCU walk mode */
realinode = ACCESS_ONCE(realdentry->d_inode);
if (!realinode) {
WARN_ON(!(mask & MAY_NOT_BLOCK));
err = -ENOENT;
goto out_dput;
}
if (mask & MAY_WRITE) {
umode_t mode = realinode->i_mode;
/*
* Writes will always be redirected to upper layer, so
* ignore lower layer being read-only.
*
* If the overlay itself is read-only then proceed
* with the permission check, don't return EROFS.
* This will only happen if this is the lower layer of
* another overlayfs.
*
* If upper fs becomes read-only after the overlay was
* constructed return EROFS to prevent modification of
* upper layer.
*/
err = -EROFS;
if (is_upper && !IS_RDONLY(inode) && IS_RDONLY(realinode) &&
(S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
goto out_dput;
}
err = __inode_permission(realinode, mask);
out_dput:
dput(alias);
return err;
}
struct ovl_link_data {
struct dentry *realdentry;
void *cookie;
};
static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd)
{
void *ret;
struct dentry *realdentry;
struct inode *realinode;
struct ovl_link_data *data = NULL;
realdentry = ovl_dentry_real(dentry);
realinode = realdentry->d_inode;
if (WARN_ON(!realinode->i_op->follow_link))
return ERR_PTR(-EPERM);
if (realinode->i_op->put_link) {
data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL);
if (!data)
return ERR_PTR(-ENOMEM);
data->realdentry = realdentry;
}
ret = realinode->i_op->follow_link(realdentry, nd);
if (IS_ERR(ret)) {
kfree(data);
return ret;
}
if (data)
data->cookie = ret;
return data;
}
static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
{
struct inode *realinode;
struct ovl_link_data *data = c;
if (!data)
return;
realinode = data->realdentry->d_inode;
realinode->i_op->put_link(data->realdentry, nd, data->cookie);
kfree(data);
}
static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
{
struct path realpath;
struct inode *realinode;
ovl_path_real(dentry, &realpath);
realinode = realpath.dentry->d_inode;
if (!realinode->i_op->readlink)
return -EINVAL;
touch_atime(&realpath);
return realinode->i_op->readlink(realpath.dentry, buf, bufsiz);
}
static bool ovl_is_private_xattr(const char *name)
{
return strncmp(name, OVL_XATTR_PRE_NAME, OVL_XATTR_PRE_LEN) == 0;
}
int ovl_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags)
{
int err;
struct dentry *upperdentry;
err = ovl_want_write(dentry);
if (err)
goto out;
err = -EPERM;
if (ovl_is_private_xattr(name))
goto out_drop_write;
err = ovl_copy_up(dentry);
if (err)
goto out_drop_write;
upperdentry = ovl_dentry_upper(dentry);
err = vfs_setxattr(upperdentry, name, value, size, flags);
out_drop_write:
ovl_drop_write(dentry);
out:
return err;
}
static bool ovl_need_xattr_filter(struct dentry *dentry,
enum ovl_path_type type)
{
if ((type & (__OVL_PATH_PURE | __OVL_PATH_UPPER)) == __OVL_PATH_UPPER)
return S_ISDIR(dentry->d_inode->i_mode);
else
return false;
}
ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
void *value, size_t size)
{
struct path realpath;
enum ovl_path_type type = ovl_path_real(dentry, &realpath);
if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
return -ENODATA;
return vfs_getxattr(realpath.dentry, name, value, size);
}
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
{
struct path realpath;
enum ovl_path_type type = ovl_path_real(dentry, &realpath);
ssize_t res;
int off;
res = vfs_listxattr(realpath.dentry, list, size);
if (res <= 0 || size == 0)
return res;
if (!ovl_need_xattr_filter(dentry, type))
return res;
/* filter out private xattrs */
for (off = 0; off < res;) {
char *s = list + off;
size_t slen = strlen(s) + 1;
BUG_ON(off + slen > res);
if (ovl_is_private_xattr(s)) {
res -= slen;
memmove(s, s + slen, res - off);
} else {
off += slen;
}
}
return res;
}
int ovl_removexattr(struct dentry *dentry, const char *name)
{
int err;
struct path realpath;
enum ovl_path_type type = ovl_path_real(dentry, &realpath);
err = ovl_want_write(dentry);
if (err)
goto out;
err = -ENODATA;
if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
goto out_drop_write;
if (!OVL_TYPE_UPPER(type)) {
err = vfs_getxattr(realpath.dentry, name, NULL, 0);
if (err < 0)
goto out_drop_write;
err = ovl_copy_up(dentry);
if (err)
goto out_drop_write;
ovl_path_upper(dentry, &realpath);
}
err = vfs_removexattr(realpath.dentry, name);
out_drop_write:
ovl_drop_write(dentry);
out:
return err;
}
static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type,
struct dentry *realdentry)
{
if (OVL_TYPE_UPPER(type))
return false;
if (special_file(realdentry->d_inode->i_mode))
return false;
if (!(OPEN_FMODE(flags) & FMODE_WRITE) && !(flags & O_TRUNC))
return false;
return true;
}
static int ovl_dentry_open(struct dentry *dentry, struct file *file,
const struct cred *cred)
{
int err;
struct path realpath;
enum ovl_path_type type;
bool want_write = false;
type = ovl_path_real(dentry, &realpath);
if (!ovl_is_nocopyupw(dentry)) {
if (ovl_open_need_copy_up(file->f_flags, type,
realpath.dentry)) {
want_write = true;
err = ovl_want_write(dentry);
if (err)
goto out;
if (file->f_flags & O_TRUNC)
err = ovl_copy_up_last(dentry, NULL, true);
else
err = ovl_copy_up(dentry);
if (err)
goto out_drop_write;
ovl_path_upper(dentry, &realpath);
}
}
err = vfs_open(&realpath, file, cred);
out_drop_write:
if (want_write)
ovl_drop_write(dentry);
out:
return err;
}
static const struct inode_operations_wrapper ovl_file_inode_operations = {
.ops = {
.setattr = ovl_setattr,
.permission = ovl_permission,
.getattr = ovl_getattr,
.setxattr = ovl_setxattr,
.getxattr = ovl_getxattr,
.listxattr = ovl_listxattr,
.removexattr = ovl_removexattr,
},
.dentry_open = ovl_dentry_open,
};
static const struct inode_operations ovl_symlink_inode_operations = {
.setattr = ovl_setattr,
.follow_link = ovl_follow_link,
.put_link = ovl_put_link,
.readlink = ovl_readlink,
.getattr = ovl_getattr,
.setxattr = ovl_setxattr,
.getxattr = ovl_getxattr,
.listxattr = ovl_listxattr,
.removexattr = ovl_removexattr,
};
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
struct ovl_entry *oe)
{
struct inode *inode;
inode = new_inode(sb);
if (!inode)
return NULL;
mode &= S_IFMT;
inode->i_ino = get_next_ino();
inode->i_mode = mode;
inode->i_flags |= S_NOATIME | S_NOCMTIME;
switch (mode) {
case S_IFDIR:
inode->i_private = oe;
inode->i_op = &ovl_dir_inode_operations.ops;
inode->i_fop = &ovl_dir_operations;
inode->i_flags |= S_IOPS_WRAPPER;
break;
case S_IFLNK:
inode->i_op = &ovl_symlink_inode_operations;
break;
case S_IFREG:
case S_IFSOCK:
case S_IFBLK:
case S_IFCHR:
case S_IFIFO:
inode->i_op = &ovl_file_inode_operations.ops;
inode->i_flags |= S_IOPS_WRAPPER;
break;
default:
WARN(1, "illegal file type: %i\n", mode);
iput(inode);
inode = NULL;
}
return inode;
}

View File

@ -0,0 +1,200 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/kernel.h>
struct ovl_entry;
enum ovl_path_type {
__OVL_PATH_PURE = (1 << 0),
__OVL_PATH_UPPER = (1 << 1),
__OVL_PATH_MERGE = (1 << 2),
};
#define OVL_TYPE_UPPER(type) ((type) & __OVL_PATH_UPPER)
#define OVL_TYPE_MERGE(type) ((type) & __OVL_PATH_MERGE)
#define OVL_TYPE_PURE_UPPER(type) ((type) & __OVL_PATH_PURE)
#define OVL_TYPE_MERGE_OR_LOWER(type) \
(OVL_TYPE_MERGE(type) || !OVL_TYPE_UPPER(type))
#define OVL_XATTR_PRE_NAME "trusted.overlay."
#define OVL_XATTR_PRE_LEN 16
#define OVL_XATTR_OPAQUE OVL_XATTR_PRE_NAME"opaque"
static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry)
{
int err = vfs_rmdir(dir, dentry);
pr_debug("rmdir(%pd2) = %i\n", dentry, err);
return err;
}
static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry)
{
int err = vfs_unlink(dir, dentry, NULL);
pr_debug("unlink(%pd2) = %i\n", dentry, err);
return err;
}
static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir,
struct dentry *new_dentry, bool debug)
{
int err = vfs_link(old_dentry, dir, new_dentry, NULL);
if (debug) {
pr_debug("link(%pd2, %pd2) = %i\n",
old_dentry, new_dentry, err);
}
return err;
}
static inline int ovl_do_create(struct inode *dir, struct dentry *dentry,
umode_t mode, bool debug)
{
int err = vfs_create(dir, dentry, mode, true);
if (debug)
pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err);
return err;
}
static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry,
umode_t mode, bool debug)
{
int err = vfs_mkdir(dir, dentry, mode);
if (debug)
pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err);
return err;
}
static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry,
umode_t mode, dev_t dev, bool debug)
{
int err = vfs_mknod(dir, dentry, mode, dev);
if (debug) {
pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n",
dentry, mode, dev, err);
}
return err;
}
static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry,
const char *oldname, bool debug)
{
int err = vfs_symlink(dir, dentry, oldname);
if (debug)
pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err);
return err;
}
static inline int ovl_do_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags)
{
int err = vfs_setxattr(dentry, name, value, size, flags);
pr_debug("setxattr(%pd2, \"%s\", \"%*s\", 0x%x) = %i\n",
dentry, name, (int) size, (char *) value, flags, err);
return err;
}
static inline int ovl_do_removexattr(struct dentry *dentry, const char *name)
{
int err = vfs_removexattr(dentry, name);
pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, err);
return err;
}
static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry,
struct inode *newdir, struct dentry *newdentry,
unsigned int flags)
{
int err;
pr_debug("rename2(%pd2, %pd2, 0x%x)\n",
olddentry, newdentry, flags);
err = vfs_rename(olddir, olddentry, newdir, newdentry, NULL, flags);
if (err) {
pr_debug("...rename2(%pd2, %pd2, ...) = %i\n",
olddentry, newdentry, err);
}
return err;
}
static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry)
{
int err = vfs_whiteout(dir, dentry);
pr_debug("whiteout(%pd2) = %i\n", dentry, err);
return err;
}
bool ovl_is_nocopyupw(struct dentry *dentry);
enum ovl_path_type ovl_path_type(struct dentry *dentry);
u64 ovl_dentry_version_get(struct dentry *dentry);
void ovl_dentry_version_inc(struct dentry *dentry);
void ovl_path_upper(struct dentry *dentry, struct path *path);
void ovl_path_lower(struct dentry *dentry, struct path *path);
enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path);
int ovl_path_next(int idx, struct dentry *dentry, struct path *path);
struct dentry *ovl_dentry_upper(struct dentry *dentry);
struct dentry *ovl_dentry_lower(struct dentry *dentry);
struct dentry *ovl_dentry_real(struct dentry *dentry);
struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper);
struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry);
void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache);
struct dentry *ovl_workdir(struct dentry *dentry);
int ovl_want_write(struct dentry *dentry);
void ovl_drop_write(struct dentry *dentry);
bool ovl_dentry_is_opaque(struct dentry *dentry);
void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque);
bool ovl_is_whiteout(struct dentry *dentry);
void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry);
struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags);
struct file *ovl_path_open(struct path *path, int flags);
struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry,
struct kstat *stat, const char *link);
/* readdir.c */
extern const struct file_operations ovl_dir_operations;
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list);
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list);
void ovl_cache_free(struct list_head *list);
/* inode.c */
int ovl_setattr(struct dentry *dentry, struct iattr *attr);
int ovl_permission(struct inode *inode, int mask);
int ovl_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags);
ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
void *value, size_t size);
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
int ovl_removexattr(struct dentry *dentry, const char *name);
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
struct ovl_entry *oe);
static inline void ovl_copyattr(struct inode *from, struct inode *to)
{
to->i_uid = from->i_uid;
to->i_gid = from->i_gid;
}
/* dir.c */
extern const struct inode_operations_wrapper ovl_dir_inode_operations;
struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry);
int ovl_create_real(struct inode *dir, struct dentry *newdentry,
struct kstat *stat, const char *link,
struct dentry *hardlink, bool debug);
void ovl_cleanup(struct inode *dir, struct dentry *dentry);
/* copy_up.c */
int ovl_copy_up(struct dentry *dentry);
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
struct path *lowerpath, struct kstat *stat,
struct iattr *attr);
int ovl_copy_xattr(struct dentry *old, struct dentry *new);
int ovl_set_attr(struct dentry *upper, struct kstat *stat);

View File

@ -0,0 +1,588 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/file.h>
#include <linux/xattr.h>
#include <linux/rbtree.h>
#include <linux/security.h>
#include <linux/cred.h>
#include "overlayfs.h"
struct ovl_cache_entry {
unsigned int len;
unsigned int type;
u64 ino;
struct list_head l_node;
struct rb_node node;
struct ovl_cache_entry *next_maybe_whiteout;
bool is_whiteout;
char name[];
};
struct ovl_dir_cache {
long refcount;
u64 version;
struct list_head entries;
};
struct dir_context {
const filldir_t actor;
//loff_t pos;
};
struct ovl_readdir_data {
struct dir_context ctx;
bool is_merge;
struct rb_root root;
struct list_head *list;
struct list_head middle;
struct ovl_cache_entry *first_maybe_whiteout;
int count;
int err;
};
struct ovl_dir_file {
bool is_real;
bool is_upper;
struct ovl_dir_cache *cache;
struct list_head *cursor;
struct file *realfile;
struct file *upperfile;
};
static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n)
{
return container_of(n, struct ovl_cache_entry, node);
}
static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root,
const char *name, int len)
{
struct rb_node *node = root->rb_node;
int cmp;
while (node) {
struct ovl_cache_entry *p = ovl_cache_entry_from_node(node);
cmp = strncmp(name, p->name, len);
if (cmp > 0)
node = p->node.rb_right;
else if (cmp < 0 || len < p->len)
node = p->node.rb_left;
else
return p;
}
return NULL;
}
static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd,
const char *name, int len,
u64 ino, unsigned int d_type)
{
struct ovl_cache_entry *p;
size_t size = offsetof(struct ovl_cache_entry, name[len + 1]);
p = kmalloc(size, GFP_KERNEL);
if (!p)
return NULL;
memcpy(p->name, name, len);
p->name[len] = '\0';
p->len = len;
p->type = d_type;
p->ino = ino;
p->is_whiteout = false;
if (d_type == DT_CHR) {
p->next_maybe_whiteout = rdd->first_maybe_whiteout;
rdd->first_maybe_whiteout = p;
}
return p;
}
static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
const char *name, int len, u64 ino,
unsigned int d_type)
{
struct rb_node **newp = &rdd->root.rb_node;
struct rb_node *parent = NULL;
struct ovl_cache_entry *p;
while (*newp) {
int cmp;
struct ovl_cache_entry *tmp;
parent = *newp;
tmp = ovl_cache_entry_from_node(*newp);
cmp = strncmp(name, tmp->name, len);
if (cmp > 0)
newp = &tmp->node.rb_right;
else if (cmp < 0 || len < tmp->len)
newp = &tmp->node.rb_left;
else
return 0;
}
p = ovl_cache_entry_new(rdd, name, len, ino, d_type);
if (p == NULL)
return -ENOMEM;
list_add_tail(&p->l_node, rdd->list);
rb_link_node(&p->node, parent, newp);
rb_insert_color(&p->node, &rdd->root);
return 0;
}
static int ovl_fill_lower(struct ovl_readdir_data *rdd,
const char *name, int namelen,
loff_t offset, u64 ino, unsigned int d_type)
{
struct ovl_cache_entry *p;
p = ovl_cache_entry_find(&rdd->root, name, namelen);
if (p) {
list_move_tail(&p->l_node, &rdd->middle);
} else {
p = ovl_cache_entry_new(rdd, name, namelen, ino, d_type);
if (p == NULL)
rdd->err = -ENOMEM;
else
list_add_tail(&p->l_node, &rdd->middle);
}
return rdd->err;
}
void ovl_cache_free(struct list_head *list)
{
struct ovl_cache_entry *p;
struct ovl_cache_entry *n;
list_for_each_entry_safe(p, n, list, l_node)
kfree(p);
INIT_LIST_HEAD(list);
}
static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry)
{
struct ovl_dir_cache *cache = od->cache;
WARN_ON(cache->refcount <= 0);
cache->refcount--;
if (!cache->refcount) {
if (ovl_dir_cache(dentry) == cache)
ovl_set_dir_cache(dentry, NULL);
ovl_cache_free(&cache->entries);
kfree(cache);
}
}
static int ovl_fill_merge(void *buf, const char *name, int namelen,
loff_t offset, u64 ino, unsigned int d_type)
{
struct dir_context *ctx = buf;
struct ovl_readdir_data *rdd =
container_of(ctx, struct ovl_readdir_data, ctx);
rdd->count++;
if (!rdd->is_merge)
return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type);
else
return ovl_fill_lower(rdd, name, namelen, offset, ino, d_type);
}
static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd)
{
int err;
struct ovl_cache_entry *p;
struct dentry *dentry;
const struct cred *old_cred;
struct cred *override_cred;
override_cred = prepare_creds();
if (!override_cred)
return -ENOMEM;
/*
* CAP_DAC_OVERRIDE for lookup
*/
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
old_cred = override_creds(override_cred);
err = mutex_lock_killable(&dir->d_inode->i_mutex);
if (!err) {
while (rdd->first_maybe_whiteout) {
p = rdd->first_maybe_whiteout;
rdd->first_maybe_whiteout = p->next_maybe_whiteout;
dentry = lookup_one_len(p->name, dir, p->len);
if (!IS_ERR(dentry)) {
p->is_whiteout = ovl_is_whiteout(dentry);
dput(dentry);
}
}
mutex_unlock(&dir->d_inode->i_mutex);
}
revert_creds(old_cred);
put_cred(override_cred);
return err;
}
static inline int ovl_dir_read(struct path *realpath,
struct ovl_readdir_data *rdd)
{
struct file *realfile;
int err;
realfile = ovl_path_open(realpath, O_RDONLY | O_DIRECTORY);
if (IS_ERR(realfile))
return PTR_ERR(realfile);
rdd->first_maybe_whiteout = NULL;
//rdd->ctx.pos = 0;
do {
rdd->count = 0;
rdd->err = 0;
err = vfs_readdir(realfile, rdd->ctx.actor, rdd);
if (err >= 0)
err = rdd->err;
} while (!err && rdd->count);
if (!err && rdd->first_maybe_whiteout)
err = ovl_check_whiteouts(realpath->dentry, rdd);
fput(realfile);
return err;
}
static void ovl_dir_reset(struct file *file)
{
struct ovl_dir_file *od = file->private_data;
struct ovl_dir_cache *cache = od->cache;
struct dentry *dentry = file->f_path.dentry;
enum ovl_path_type type = ovl_path_type(dentry);
if (cache && ovl_dentry_version_get(dentry) != cache->version) {
ovl_cache_put(od, dentry);
od->cache = NULL;
od->cursor = NULL;
}
WARN_ON(!od->is_real && !OVL_TYPE_MERGE(type));
if (od->is_real && OVL_TYPE_MERGE(type))
od->is_real = false;
}
static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list)
{
int err;
struct path realpath;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_fill_merge,
.list = list,
.root = RB_ROOT,
.is_merge = false,
};
int idx, next;
for (idx = 0; idx != -1; idx = next) {
next = ovl_path_next(idx, dentry, &realpath);
if (next != -1) {
err = ovl_dir_read(&realpath, &rdd);
if (err)
break;
} else {
/*
* Insert lowest layer entries before upper ones, this
* allows offsets to be reasonably constant
*/
list_add(&rdd.middle, rdd.list);
rdd.is_merge = true;
err = ovl_dir_read(&realpath, &rdd);
list_del(&rdd.middle);
}
}
return err;
}
static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos)
{
struct list_head *p;
loff_t off = 0;
list_for_each(p, &od->cache->entries) {
if (off >= pos)
break;
off++;
}
/* Cursor is safe since the cache is stable */
od->cursor = p;
}
static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
{
int res;
struct ovl_dir_cache *cache;
cache = ovl_dir_cache(dentry);
if (cache && ovl_dentry_version_get(dentry) == cache->version) {
cache->refcount++;
return cache;
}
ovl_set_dir_cache(dentry, NULL);
cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL);
if (!cache)
return ERR_PTR(-ENOMEM);
cache->refcount = 1;
INIT_LIST_HEAD(&cache->entries);
res = ovl_dir_read_merged(dentry, &cache->entries);
if (res) {
ovl_cache_free(&cache->entries);
kfree(cache);
return ERR_PTR(res);
}
cache->version = ovl_dentry_version_get(dentry);
ovl_set_dir_cache(dentry, cache);
return cache;
}
static int ovl_readdir(struct file *file, void *buf, filldir_t filler)
{
struct ovl_dir_file *od = file->private_data;
struct dentry *dentry = file->f_path.dentry;
struct ovl_cache_entry *p;
int res;
if (!file->f_pos)
ovl_dir_reset(file);
if (od->is_real) {
res = vfs_readdir(od->realfile, filler, buf);
file->f_pos = od->realfile->f_pos;
return res;
}
if (!od->cache) {
struct ovl_dir_cache *cache;
cache = ovl_cache_get(dentry);
if (IS_ERR(cache))
return PTR_ERR(cache);
od->cache = cache;
ovl_seek_cursor(od, file->f_pos);
}
while (od->cursor != &od->cache->entries) {
p = list_entry(od->cursor, struct ovl_cache_entry, l_node);
if (!p->is_whiteout)
if (filler(buf, p->name, p->len, file->f_pos, p->ino, p->type))
break;
od->cursor = p->l_node.next;
file->f_pos++;
}
return 0;
}
static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin)
{
loff_t res;
struct ovl_dir_file *od = file->private_data;
mutex_lock(&file_inode(file)->i_mutex);
if (!file->f_pos)
ovl_dir_reset(file);
if (od->is_real) {
res = vfs_llseek(od->realfile, offset, origin);
file->f_pos = od->realfile->f_pos;
} else {
res = -EINVAL;
switch (origin) {
case SEEK_CUR:
offset += file->f_pos;
break;
case SEEK_SET:
break;
default:
goto out_unlock;
}
if (offset < 0)
goto out_unlock;
if (offset != file->f_pos) {
file->f_pos = offset;
if (od->cache)
ovl_seek_cursor(od, offset);
}
res = offset;
}
out_unlock:
mutex_unlock(&file_inode(file)->i_mutex);
return res;
}
static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
int datasync)
{
struct ovl_dir_file *od = file->private_data;
struct dentry *dentry = file->f_path.dentry;
struct file *realfile = od->realfile;
/*
* Need to check if we started out being a lower dir, but got copied up
*/
if (!od->is_upper && OVL_TYPE_UPPER(ovl_path_type(dentry))) {
struct inode *inode = file_inode(file);
realfile = lockless_dereference(od->upperfile);
if (!realfile) {
struct path upperpath;
ovl_path_upper(dentry, &upperpath);
realfile = ovl_path_open(&upperpath, O_RDONLY);
smp_mb__before_spinlock();
mutex_lock(&inode->i_mutex);
if (!od->upperfile) {
if (IS_ERR(realfile)) {
mutex_unlock(&inode->i_mutex);
return PTR_ERR(realfile);
}
od->upperfile = realfile;
} else {
/* somebody has beaten us to it */
if (!IS_ERR(realfile))
fput(realfile);
realfile = od->upperfile;
}
mutex_unlock(&inode->i_mutex);
}
}
return vfs_fsync_range(realfile, start, end, datasync);
}
static int ovl_dir_release(struct inode *inode, struct file *file)
{
struct ovl_dir_file *od = file->private_data;
if (od->cache) {
mutex_lock(&inode->i_mutex);
ovl_cache_put(od, file->f_path.dentry);
mutex_unlock(&inode->i_mutex);
}
fput(od->realfile);
if (od->upperfile)
fput(od->upperfile);
kfree(od);
return 0;
}
static int ovl_dir_open(struct inode *inode, struct file *file)
{
struct path realpath;
struct file *realfile;
struct ovl_dir_file *od;
enum ovl_path_type type;
od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL);
if (!od)
return -ENOMEM;
type = ovl_path_real(file->f_path.dentry, &realpath);
realfile = ovl_path_open(&realpath, file->f_flags);
if (IS_ERR(realfile)) {
kfree(od);
return PTR_ERR(realfile);
}
od->realfile = realfile;
od->is_real = !OVL_TYPE_MERGE(type);
od->is_upper = OVL_TYPE_UPPER(type);
file->private_data = od;
return 0;
}
const struct file_operations ovl_dir_operations = {
.read = generic_read_dir,
.open = ovl_dir_open,
.readdir = ovl_readdir,
.llseek = ovl_dir_llseek,
.fsync = ovl_dir_fsync,
.release = ovl_dir_release,
};
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list)
{
int err;
struct ovl_cache_entry *p;
err = ovl_dir_read_merged(dentry, list);
if (err)
return err;
err = 0;
list_for_each_entry(p, list, l_node) {
if (p->is_whiteout)
continue;
if (p->name[0] == '.') {
if (p->len == 1)
continue;
if (p->len == 2 && p->name[1] == '.')
continue;
}
err = -ENOTEMPTY;
break;
}
return err;
}
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
{
struct ovl_cache_entry *p;
mutex_lock_nested(&upper->d_inode->i_mutex, I_MUTEX_CHILD);
list_for_each_entry(p, list, l_node) {
struct dentry *dentry;
if (!p->is_whiteout)
continue;
dentry = lookup_one_len(p->name, upper, p->len);
if (IS_ERR(dentry)) {
pr_err("overlayfs: lookup '%s/%.*s' failed (%i)\n",
upper->d_name.name, p->len, p->name,
(int) PTR_ERR(dentry));
continue;
}
ovl_cleanup(upper->d_inode, dentry);
dput(dentry);
}
mutex_unlock(&upper->d_inode->i_mutex);
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,21 @@
KDIR ?= @KDIR@
ARCH ?= @ARCH@
KMODDIR = @KMODDIR@
src = @abs_srcdir@
obj-m += mcoverlay.o
mcoverlay-y := copy_up.o dir.o inode.o readdir.o super.o
.PHONY: clean install modules
modules:
$(MAKE) -C $(KDIR) M=$(PWD) SUBDIRS=$(PWD) ARCH=$(ARCH) modules
clean:
$(RM) .*.cmd *.mod.c *.o *.ko* Module.symvers modules.order -r .tmp*
install:
mkdir -p -m 755 $(KMODDIR)
install -m 644 mcoverlay.ko $(KMODDIR)

View File

@ -0,0 +1,416 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/splice.h>
#include <linux/xattr.h>
#include <linux/security.h>
#include <linux/uaccess.h>
#include <linux/sched.h>
#include <linux/namei.h>
#include "overlayfs.h"
#define OVL_COPY_UP_CHUNK_SIZE (1 << 20)
int ovl_copy_xattr(struct dentry *old, struct dentry *new)
{
ssize_t list_size, size;
char *buf, *name, *value;
int error;
if (!old->d_inode->i_op->getxattr ||
!new->d_inode->i_op->getxattr)
return 0;
list_size = vfs_listxattr(old, NULL, 0);
if (list_size <= 0) {
if (list_size == -EOPNOTSUPP)
return 0;
return list_size;
}
buf = kzalloc(list_size, GFP_KERNEL);
if (!buf)
return -ENOMEM;
error = -ENOMEM;
value = kmalloc(XATTR_SIZE_MAX, GFP_KERNEL);
if (!value)
goto out;
list_size = vfs_listxattr(old, buf, list_size);
if (list_size <= 0) {
error = list_size;
goto out_free_value;
}
for (name = buf; name < (buf + list_size); name += strlen(name) + 1) {
size = vfs_getxattr(old, name, value, XATTR_SIZE_MAX);
if (size <= 0) {
error = size;
goto out_free_value;
}
error = vfs_setxattr(new, name, value, size, 0);
if (error)
goto out_free_value;
}
out_free_value:
kfree(value);
out:
kfree(buf);
return error;
}
static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
{
struct file *old_file;
struct file *new_file;
loff_t old_pos = 0;
loff_t new_pos = 0;
int error = 0;
if (len == 0)
return 0;
old_file = ovl_path_open(old, O_RDONLY);
if (IS_ERR(old_file))
return PTR_ERR(old_file);
new_file = ovl_path_open(new, O_WRONLY);
if (IS_ERR(new_file)) {
error = PTR_ERR(new_file);
goto out_fput;
}
/* FIXME: copy up sparse files efficiently */
while (len) {
size_t this_len = OVL_COPY_UP_CHUNK_SIZE;
long bytes;
if (len < this_len)
this_len = len;
if (signal_pending_state(TASK_KILLABLE, current)) {
error = -EINTR;
break;
}
bytes = do_splice_direct(old_file, &old_pos,
new_file, &new_pos,
this_len, SPLICE_F_MOVE);
if (bytes <= 0) {
error = bytes;
break;
}
WARN_ON(old_pos != new_pos);
len -= bytes;
}
fput(new_file);
out_fput:
fput(old_file);
return error;
}
static char *ovl_read_symlink(struct dentry *realdentry)
{
int res;
char *buf;
struct inode *inode = realdentry->d_inode;
mm_segment_t old_fs;
res = -EINVAL;
if (!inode->i_op->readlink)
goto err;
res = -ENOMEM;
buf = (char *) __get_free_page(GFP_KERNEL);
if (!buf)
goto err;
old_fs = get_fs();
set_fs(get_ds());
/* The cast to a user pointer is valid due to the set_fs() */
res = inode->i_op->readlink(realdentry,
(char __user *)buf, PAGE_SIZE - 1);
set_fs(old_fs);
if (res < 0) {
free_page((unsigned long) buf);
goto err;
}
buf[res] = '\0';
return buf;
err:
return ERR_PTR(res);
}
static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
{
struct iattr attr = {
.ia_valid =
ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET,
.ia_atime = stat->atime,
.ia_mtime = stat->mtime,
};
return notify_change(upperdentry, &attr, NULL);
}
int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
{
int err = 0;
if (!S_ISLNK(stat->mode)) {
struct iattr attr = {
.ia_valid = ATTR_MODE,
.ia_mode = stat->mode,
};
err = notify_change(upperdentry, &attr, NULL);
}
if (!err) {
struct iattr attr = {
.ia_valid = ATTR_UID | ATTR_GID,
.ia_uid = stat->uid,
.ia_gid = stat->gid,
};
err = notify_change(upperdentry, &attr, NULL);
}
if (!err)
ovl_set_timestamps(upperdentry, stat);
return err;
}
static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
struct dentry *dentry, struct path *lowerpath,
struct kstat *stat, struct iattr *attr,
const char *link)
{
struct inode *wdir = workdir->d_inode;
struct inode *udir = upperdir->d_inode;
struct dentry *newdentry = NULL;
struct dentry *upper = NULL;
umode_t mode = stat->mode;
int err;
newdentry = ovl_lookup_temp(workdir, dentry);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out;
upper = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto out1;
/* Can't properly set mode on creation because of the umask */
stat->mode &= S_IFMT;
err = ovl_create_real(wdir, newdentry, stat, link, NULL, true);
stat->mode = mode;
if (err)
goto out2;
if (S_ISREG(stat->mode)) {
struct path upperpath;
ovl_path_upper(dentry, &upperpath);
BUG_ON(upperpath.dentry != NULL);
upperpath.dentry = newdentry;
err = ovl_copy_up_data(lowerpath, &upperpath, stat->size);
if (err)
goto out_cleanup;
}
err = ovl_copy_xattr(lowerpath->dentry, newdentry);
if (err)
goto out_cleanup;
mutex_lock(&newdentry->d_inode->i_mutex);
err = ovl_set_attr(newdentry, stat);
if (!err && attr)
err = notify_change(newdentry, attr, NULL);
mutex_unlock(&newdentry->d_inode->i_mutex);
if (err)
goto out_cleanup;
err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
if (err)
goto out_cleanup;
ovl_dentry_update(dentry, newdentry);
newdentry = NULL;
/*
* Non-directores become opaque when copied up.
*/
if (!S_ISDIR(stat->mode))
ovl_dentry_set_opaque(dentry, true);
out2:
dput(upper);
out1:
dput(newdentry);
out:
return err;
out_cleanup:
ovl_cleanup(wdir, newdentry);
goto out;
}
/*
* Copy up a single dentry
*
* Directory renames only allowed on "pure upper" (already created on
* upper filesystem, never copied up). Directories which are on lower or
* are merged may not be renamed. For these -EXDEV is returned and
* userspace has to deal with it. This means, when copying up a
* directory we can rely on it and ancestors being stable.
*
* Non-directory renames start with copy up of source if necessary. The
* actual rename will only proceed once the copy up was successful. Copy
* up uses upper parent i_mutex for exclusion. Since rename can change
* d_parent it is possible that the copy up will lock the old parent. At
* that point the file will have already been copied up anyway.
*/
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
struct path *lowerpath, struct kstat *stat,
struct iattr *attr)
{
struct dentry *workdir = ovl_workdir(dentry);
int err;
struct kstat pstat;
struct path parentpath;
struct dentry *upperdir;
struct dentry *upperdentry;
const struct cred *old_cred;
struct cred *override_cred;
char *link = NULL;
if (WARN_ON(!workdir))
return -EROFS;
ovl_path_upper(parent, &parentpath);
upperdir = parentpath.dentry;
err = vfs_getattr(&parentpath, &pstat);
if (err)
return err;
if (S_ISLNK(stat->mode)) {
link = ovl_read_symlink(lowerpath->dentry);
if (IS_ERR(link))
return PTR_ERR(link);
}
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_free_link;
override_cred->fsuid = stat->uid;
override_cred->fsgid = stat->gid;
/*
* CAP_SYS_ADMIN for copying up extended attributes
* CAP_DAC_OVERRIDE for create
* CAP_FOWNER for chmod, timestamp update
* CAP_FSETID for chmod
* CAP_CHOWN for chown
* CAP_MKNOD for mknod
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
cap_raise(override_cred->cap_effective, CAP_FSETID);
cap_raise(override_cred->cap_effective, CAP_CHOWN);
cap_raise(override_cred->cap_effective, CAP_MKNOD);
old_cred = override_creds(override_cred);
err = -EIO;
if (lock_rename(workdir, upperdir) != NULL) {
pr_err("overlayfs: failed to lock workdir+upperdir\n");
goto out_unlock;
}
upperdentry = ovl_dentry_upper(dentry);
if (upperdentry) {
unlock_rename(workdir, upperdir);
err = 0;
/* Raced with another copy-up? Do the setattr here */
if (attr) {
mutex_lock(&upperdentry->d_inode->i_mutex);
err = notify_change(upperdentry, attr, NULL);
mutex_unlock(&upperdentry->d_inode->i_mutex);
}
goto out_put_cred;
}
err = ovl_copy_up_locked(workdir, upperdir, dentry, lowerpath,
stat, attr, link);
if (!err) {
/* Restore timestamps on parent (best effort) */
ovl_set_timestamps(upperdir, &pstat);
}
out_unlock:
unlock_rename(workdir, upperdir);
out_put_cred:
revert_creds(old_cred);
put_cred(override_cred);
out_free_link:
if (link)
free_page((unsigned long) link);
return err;
}
int ovl_copy_up(struct dentry *dentry)
{
int err;
err = 0;
while (!err) {
struct dentry *next;
struct dentry *parent;
struct path lowerpath;
struct kstat stat;
enum ovl_path_type type = ovl_path_type(dentry);
if (OVL_TYPE_UPPER(type))
break;
next = dget(dentry);
/* find the topmost dentry not yet copied up */
for (;;) {
parent = dget_parent(next);
type = ovl_path_type(parent);
if (OVL_TYPE_UPPER(type))
break;
dput(next);
next = parent;
}
ovl_path_lower(next, &lowerpath);
err = vfs_getattr(&lowerpath, &stat);
if (!err)
err = ovl_copy_up_one(parent, next, &lowerpath, &stat, NULL);
dput(parent);
dput(next);
}
return err;
}

View File

@ -0,0 +1,951 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/xattr.h>
#include <linux/security.h>
#include <linux/cred.h>
#include "overlayfs.h"
void ovl_cleanup(struct inode *wdir, struct dentry *wdentry)
{
int err;
dget(wdentry);
if (d_is_dir(wdentry))
err = ovl_do_rmdir(wdir, wdentry);
else
err = ovl_do_unlink(wdir, wdentry);
dput(wdentry);
if (err) {
pr_err("overlayfs: cleanup of '%pd2' failed (%i)\n",
wdentry, err);
}
}
struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry)
{
struct dentry *temp;
char name[20];
snprintf(name, sizeof(name), "#%lx", (unsigned long) dentry);
temp = lookup_one_len(name, workdir, strlen(name));
if (!IS_ERR(temp) && temp->d_inode) {
pr_err("overlayfs: workdir/%s already exists\n", name);
dput(temp);
temp = ERR_PTR(-EIO);
}
return temp;
}
/* caller holds i_mutex on workdir */
static struct dentry *ovl_whiteout(struct dentry *workdir,
struct dentry *dentry)
{
int err;
struct dentry *whiteout;
struct inode *wdir = workdir->d_inode;
whiteout = ovl_lookup_temp(workdir, dentry);
if (IS_ERR(whiteout))
return whiteout;
err = ovl_do_whiteout(wdir, whiteout);
if (err) {
dput(whiteout);
whiteout = ERR_PTR(err);
}
return whiteout;
}
int ovl_create_real(struct inode *dir, struct dentry *newdentry,
struct kstat *stat, const char *link,
struct dentry *hardlink, bool debug)
{
int err;
if (newdentry->d_inode)
return -ESTALE;
if (hardlink) {
err = ovl_do_link(hardlink, dir, newdentry, debug);
} else {
switch (stat->mode & S_IFMT) {
case S_IFREG:
err = ovl_do_create(dir, newdentry, stat->mode, debug);
break;
case S_IFDIR:
err = ovl_do_mkdir(dir, newdentry, stat->mode, debug);
break;
case S_IFCHR:
case S_IFBLK:
case S_IFIFO:
case S_IFSOCK:
err = ovl_do_mknod(dir, newdentry,
stat->mode, stat->rdev, debug);
break;
case S_IFLNK:
err = ovl_do_symlink(dir, newdentry, link, debug);
break;
default:
err = -EPERM;
}
}
if (!err && WARN_ON(!newdentry->d_inode)) {
/*
* Not quite sure if non-instantiated dentry is legal or not.
* VFS doesn't seem to care so check and warn here.
*/
err = -ENOENT;
}
return err;
}
static int ovl_set_opaque(struct dentry *upperdentry)
{
return ovl_do_setxattr(upperdentry, OVL_XATTR_OPAQUE, "y", 1, 0);
}
static void ovl_remove_opaque(struct dentry *upperdentry)
{
int err;
err = ovl_do_removexattr(upperdentry, OVL_XATTR_OPAQUE);
if (err) {
pr_warn("overlayfs: failed to remove opaque from '%s' (%i)\n",
upperdentry->d_name.name, err);
}
}
static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
int err;
enum ovl_path_type type;
struct path realpath;
type = ovl_path_real(dentry, &realpath);
err = vfs_getattr(&realpath, stat);
if (err)
return err;
stat->dev = dentry->d_sb->s_dev;
stat->ino = dentry->d_inode->i_ino;
/*
* It's probably not worth it to count subdirs to get the
* correct link count. nlink=1 seems to pacify 'find' and
* other utilities.
*/
if (OVL_TYPE_MERGE(type))
stat->nlink = 1;
return 0;
}
static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
struct kstat *stat, const char *link,
struct dentry *hardlink)
{
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *udir = upperdir->d_inode;
struct dentry *newdentry;
int err;
mutex_lock_nested(&udir->i_mutex, I_MUTEX_PARENT);
newdentry = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out_unlock;
err = ovl_create_real(udir, newdentry, stat, link, hardlink, false);
if (err)
goto out_dput;
ovl_dentry_version_inc(dentry->d_parent);
ovl_dentry_update(dentry, newdentry);
ovl_copyattr(newdentry->d_inode, inode);
d_instantiate(dentry, inode);
newdentry = NULL;
out_dput:
dput(newdentry);
out_unlock:
mutex_unlock(&udir->i_mutex);
return err;
}
static int ovl_lock_rename_workdir(struct dentry *workdir,
struct dentry *upperdir)
{
/* Workdir should not be the same as upperdir */
if (workdir == upperdir)
goto err;
/* Workdir should not be subdir of upperdir and vice versa */
if (lock_rename(workdir, upperdir) != NULL)
goto err_unlock;
return 0;
err_unlock:
unlock_rename(workdir, upperdir);
err:
pr_err("overlayfs: failed to lock workdir+upperdir\n");
return -EIO;
}
static struct dentry *ovl_clear_empty(struct dentry *dentry,
struct list_head *list)
{
struct dentry *workdir = ovl_workdir(dentry);
struct inode *wdir = workdir->d_inode;
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *udir = upperdir->d_inode;
struct path upperpath;
struct dentry *upper;
struct dentry *opaquedir;
struct kstat stat;
int err;
if (WARN_ON(!workdir))
return ERR_PTR(-EROFS);
err = ovl_lock_rename_workdir(workdir, upperdir);
if (err)
goto out;
ovl_path_upper(dentry, &upperpath);
err = vfs_getattr(&upperpath, &stat);
if (err)
goto out_unlock;
err = -ESTALE;
if (!S_ISDIR(stat.mode))
goto out_unlock;
upper = upperpath.dentry;
if (upper->d_parent->d_inode != udir)
goto out_unlock;
opaquedir = ovl_lookup_temp(workdir, dentry);
err = PTR_ERR(opaquedir);
if (IS_ERR(opaquedir))
goto out_unlock;
err = ovl_create_real(wdir, opaquedir, &stat, NULL, NULL, true);
if (err)
goto out_dput;
err = ovl_copy_xattr(upper, opaquedir);
if (err)
goto out_cleanup;
err = ovl_set_opaque(opaquedir);
if (err)
goto out_cleanup;
mutex_lock(&opaquedir->d_inode->i_mutex);
err = ovl_set_attr(opaquedir, &stat);
mutex_unlock(&opaquedir->d_inode->i_mutex);
if (err)
goto out_cleanup;
err = ovl_do_rename(wdir, opaquedir, udir, upper, RENAME_EXCHANGE);
if (err)
goto out_cleanup;
ovl_cleanup_whiteouts(upper, list);
ovl_cleanup(wdir, upper);
unlock_rename(workdir, upperdir);
/* dentry's upper doesn't match now, get rid of it */
d_drop(dentry);
return opaquedir;
out_cleanup:
ovl_cleanup(wdir, opaquedir);
out_dput:
dput(opaquedir);
out_unlock:
unlock_rename(workdir, upperdir);
out:
return ERR_PTR(err);
}
static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry)
{
int err;
struct dentry *ret = NULL;
LIST_HEAD(list);
err = ovl_check_empty_dir(dentry, &list);
if (err)
ret = ERR_PTR(err);
else {
/*
* If no upperdentry then skip clearing whiteouts.
*
* Can race with copy-up, since we don't hold the upperdir
* mutex. Doesn't matter, since copy-up can't create a
* non-empty directory from an empty one.
*/
if (ovl_dentry_upper(dentry))
ret = ovl_clear_empty(dentry, &list);
}
ovl_cache_free(&list);
return ret;
}
static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
struct kstat *stat, const char *link,
struct dentry *hardlink)
{
struct dentry *workdir = ovl_workdir(dentry);
struct inode *wdir = workdir->d_inode;
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *udir = upperdir->d_inode;
struct dentry *upper;
struct dentry *newdentry;
int err;
if (WARN_ON(!workdir))
return -EROFS;
err = ovl_lock_rename_workdir(workdir, upperdir);
if (err)
goto out;
newdentry = ovl_lookup_temp(workdir, dentry);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out_unlock;
upper = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto out_dput;
err = ovl_create_real(wdir, newdentry, stat, link, hardlink, true);
if (err)
goto out_dput2;
if (S_ISDIR(stat->mode)) {
err = ovl_set_opaque(newdentry);
if (err)
goto out_cleanup;
err = ovl_do_rename(wdir, newdentry, udir, upper,
RENAME_EXCHANGE);
if (err)
goto out_cleanup;
ovl_cleanup(wdir, upper);
} else {
err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
if (err)
goto out_cleanup;
}
ovl_dentry_version_inc(dentry->d_parent);
ovl_dentry_update(dentry, newdentry);
ovl_copyattr(newdentry->d_inode, inode);
d_instantiate(dentry, inode);
newdentry = NULL;
out_dput2:
dput(upper);
out_dput:
dput(newdentry);
out_unlock:
unlock_rename(workdir, upperdir);
out:
return err;
out_cleanup:
ovl_cleanup(wdir, newdentry);
goto out_dput2;
}
static int ovl_create_or_link(struct dentry *dentry, int mode, dev_t rdev,
const char *link, struct dentry *hardlink)
{
int err;
struct inode *inode;
struct kstat stat = {
.mode = mode,
.rdev = rdev,
};
err = -ENOMEM;
inode = ovl_new_inode(dentry->d_sb, mode, dentry->d_fsdata);
if (!inode)
goto out;
err = ovl_copy_up(dentry->d_parent);
if (err)
goto out_iput;
if (!ovl_dentry_is_opaque(dentry)) {
err = ovl_create_upper(dentry, inode, &stat, link, hardlink);
} else {
const struct cred *old_cred;
struct cred *override_cred;
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_iput;
/*
* CAP_SYS_ADMIN for setting opaque xattr
* CAP_DAC_OVERRIDE for create in workdir, rename
* CAP_FOWNER for removing whiteout from sticky dir
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
old_cred = override_creds(override_cred);
err = ovl_create_over_whiteout(dentry, inode, &stat, link,
hardlink);
revert_creds(old_cred);
put_cred(override_cred);
}
if (!err)
inode = NULL;
out_iput:
iput(inode);
out:
return err;
}
static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
const char *link)
{
int err;
err = ovl_want_write(dentry);
if (!err) {
err = ovl_create_or_link(dentry, mode, rdev, link, NULL);
ovl_drop_write(dentry);
}
return err;
}
static int ovl_create(struct inode *dir, struct dentry *dentry, umode_t mode,
bool excl)
{
return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL);
}
static int ovl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL);
}
static int ovl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
dev_t rdev)
{
/* Don't allow creation of "whiteout" on overlay */
if (S_ISCHR(mode) && rdev == WHITEOUT_DEV)
return -EPERM;
return ovl_create_object(dentry, mode, rdev, NULL);
}
static int ovl_symlink(struct inode *dir, struct dentry *dentry,
const char *link)
{
return ovl_create_object(dentry, S_IFLNK, 0, link);
}
static int ovl_link(struct dentry *old, struct inode *newdir,
struct dentry *new)
{
int err;
struct dentry *upper;
err = ovl_want_write(old);
if (err)
goto out;
err = ovl_copy_up(old);
if (err)
goto out_drop_write;
upper = ovl_dentry_upper(old);
err = ovl_create_or_link(new, upper->d_inode->i_mode, 0, NULL, upper);
out_drop_write:
ovl_drop_write(old);
out:
return err;
}
static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir)
{
struct dentry *workdir = ovl_workdir(dentry);
struct inode *wdir = workdir->d_inode;
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *udir = upperdir->d_inode;
struct dentry *whiteout;
struct dentry *upper;
struct dentry *opaquedir = NULL;
int err;
if (WARN_ON(!workdir))
return -EROFS;
if (is_dir) {
if (OVL_TYPE_MERGE_OR_LOWER(ovl_path_type(dentry))) {
opaquedir = ovl_check_empty_and_clear(dentry);
err = PTR_ERR(opaquedir);
if (IS_ERR(opaquedir))
goto out;
} else {
LIST_HEAD(list);
/*
* When removing an empty opaque directory, then it
* makes no sense to replace it with an exact replica of
* itself. But emptiness still needs to be checked.
*/
err = ovl_check_empty_dir(dentry, &list);
ovl_cache_free(&list);
if (err)
goto out;
}
}
err = ovl_lock_rename_workdir(workdir, upperdir);
if (err)
goto out_dput;
whiteout = ovl_whiteout(workdir, dentry);
err = PTR_ERR(whiteout);
if (IS_ERR(whiteout))
goto out_unlock;
upper = ovl_dentry_upper(dentry);
if (!upper) {
upper = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto kill_whiteout;
err = ovl_do_rename(wdir, whiteout, udir, upper, 0);
dput(upper);
if (err)
goto kill_whiteout;
} else {
int flags = 0;
if (opaquedir)
upper = opaquedir;
err = -ESTALE;
if (upper->d_parent != upperdir)
goto kill_whiteout;
if (is_dir)
flags |= RENAME_EXCHANGE;
err = ovl_do_rename(wdir, whiteout, udir, upper, flags);
if (err)
goto kill_whiteout;
if (is_dir)
ovl_cleanup(wdir, upper);
}
ovl_dentry_version_inc(dentry->d_parent);
out_d_drop:
d_drop(dentry);
dput(whiteout);
out_unlock:
unlock_rename(workdir, upperdir);
out_dput:
dput(opaquedir);
out:
return err;
kill_whiteout:
ovl_cleanup(wdir, whiteout);
goto out_d_drop;
}
static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
{
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *dir = upperdir->d_inode;
struct dentry *upper = ovl_dentry_upper(dentry);
int err;
mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
err = -ESTALE;
if (upper->d_parent == upperdir) {
/* Don't let d_delete() think it can reset d_inode */
dget(upper);
if (is_dir)
err = vfs_rmdir(dir, upper);
else
err = vfs_unlink(dir, upper, NULL);
dput(upper);
ovl_dentry_version_inc(dentry->d_parent);
}
/*
* Keeping this dentry hashed would mean having to release
* upperpath/lowerpath, which could only be done if we are the
* sole user of this dentry. Too tricky... Just unhash for
* now.
*/
d_drop(dentry);
mutex_unlock(&dir->i_mutex);
return err;
}
static inline int ovl_check_sticky(struct dentry *dentry)
{
struct inode *dir = ovl_dentry_real(dentry->d_parent)->d_inode;
struct inode *inode = ovl_dentry_real(dentry)->d_inode;
if (check_sticky(dir, inode))
return -EPERM;
return 0;
}
static int ovl_do_remove(struct dentry *dentry, bool is_dir)
{
enum ovl_path_type type;
int err;
err = ovl_check_sticky(dentry);
if (err)
goto out;
err = ovl_want_write(dentry);
if (err)
goto out;
err = ovl_copy_up(dentry->d_parent);
if (err)
goto out_drop_write;
type = ovl_path_type(dentry);
if (OVL_TYPE_PURE_UPPER(type)) {
err = ovl_remove_upper(dentry, is_dir);
} else {
const struct cred *old_cred;
struct cred *override_cred;
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_drop_write;
/*
* CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
* CAP_DAC_OVERRIDE for create in workdir, rename
* CAP_FOWNER for removing whiteout from sticky dir
* CAP_FSETID for chmod of opaque dir
* CAP_CHOWN for chown of opaque dir
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
cap_raise(override_cred->cap_effective, CAP_FSETID);
cap_raise(override_cred->cap_effective, CAP_CHOWN);
old_cred = override_creds(override_cred);
err = ovl_remove_and_whiteout(dentry, is_dir);
revert_creds(old_cred);
put_cred(override_cred);
}
out_drop_write:
ovl_drop_write(dentry);
out:
return err;
}
static int ovl_unlink(struct inode *dir, struct dentry *dentry)
{
return ovl_do_remove(dentry, false);
}
static int ovl_rmdir(struct inode *dir, struct dentry *dentry)
{
return ovl_do_remove(dentry, true);
}
static int ovl_rename2(struct inode *olddir, struct dentry *old,
struct inode *newdir, struct dentry *new,
unsigned int flags)
{
int err;
enum ovl_path_type old_type;
enum ovl_path_type new_type;
struct dentry *old_upperdir;
struct dentry *new_upperdir;
struct dentry *olddentry;
struct dentry *newdentry;
struct dentry *trap;
bool old_opaque;
bool new_opaque;
bool new_create = false;
bool cleanup_whiteout = false;
bool overwrite = !(flags & RENAME_EXCHANGE);
bool is_dir = d_is_dir(old);
bool new_is_dir = false;
struct dentry *opaquedir = NULL;
const struct cred *old_cred = NULL;
struct cred *override_cred = NULL;
err = -EINVAL;
if (flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE))
goto out;
flags &= ~RENAME_NOREPLACE;
err = ovl_check_sticky(old);
if (err)
goto out;
/* Don't copy up directory trees */
old_type = ovl_path_type(old);
err = -EXDEV;
if (OVL_TYPE_MERGE_OR_LOWER(old_type) && is_dir)
goto out;
if (new->d_inode) {
err = ovl_check_sticky(new);
if (err)
goto out;
if (d_is_dir(new))
new_is_dir = true;
new_type = ovl_path_type(new);
err = -EXDEV;
if (!overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir)
goto out;
err = 0;
if (!OVL_TYPE_UPPER(new_type) && !OVL_TYPE_UPPER(old_type)) {
if (ovl_dentry_lower(old)->d_inode ==
ovl_dentry_lower(new)->d_inode)
goto out;
}
if (OVL_TYPE_UPPER(new_type) && OVL_TYPE_UPPER(old_type)) {
if (ovl_dentry_upper(old)->d_inode ==
ovl_dentry_upper(new)->d_inode)
goto out;
}
} else {
if (ovl_dentry_is_opaque(new))
new_type = __OVL_PATH_UPPER;
else
new_type = __OVL_PATH_UPPER | __OVL_PATH_PURE;
}
err = ovl_want_write(old);
if (err)
goto out;
err = ovl_copy_up(old);
if (err)
goto out_drop_write;
err = ovl_copy_up(new->d_parent);
if (err)
goto out_drop_write;
if (!overwrite) {
err = ovl_copy_up(new);
if (err)
goto out_drop_write;
}
old_opaque = !OVL_TYPE_PURE_UPPER(old_type);
new_opaque = !OVL_TYPE_PURE_UPPER(new_type);
if (old_opaque || new_opaque) {
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_drop_write;
/*
* CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
* CAP_DAC_OVERRIDE for create in workdir
* CAP_FOWNER for removing whiteout from sticky dir
* CAP_FSETID for chmod of opaque dir
* CAP_CHOWN for chown of opaque dir
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
cap_raise(override_cred->cap_effective, CAP_FSETID);
cap_raise(override_cred->cap_effective, CAP_CHOWN);
old_cred = override_creds(override_cred);
}
if (overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir) {
opaquedir = ovl_check_empty_and_clear(new);
err = PTR_ERR(opaquedir);
if (IS_ERR(opaquedir)) {
opaquedir = NULL;
goto out_revert_creds;
}
}
if (overwrite) {
if (old_opaque) {
if (new->d_inode || !new_opaque) {
/* Whiteout source */
flags |= RENAME_WHITEOUT;
} else {
/* Switch whiteouts */
flags |= RENAME_EXCHANGE;
}
} else if (is_dir && !new->d_inode && new_opaque) {
flags |= RENAME_EXCHANGE;
cleanup_whiteout = true;
}
}
old_upperdir = ovl_dentry_upper(old->d_parent);
new_upperdir = ovl_dentry_upper(new->d_parent);
trap = lock_rename(new_upperdir, old_upperdir);
olddentry = ovl_dentry_upper(old);
newdentry = ovl_dentry_upper(new);
if (newdentry) {
if (opaquedir) {
newdentry = opaquedir;
opaquedir = NULL;
} else {
dget(newdentry);
}
} else {
new_create = true;
newdentry = lookup_one_len(new->d_name.name, new_upperdir,
new->d_name.len);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out_unlock;
}
err = -ESTALE;
if (olddentry->d_parent != old_upperdir)
goto out_dput;
if (newdentry->d_parent != new_upperdir)
goto out_dput;
if (olddentry == trap)
goto out_dput;
if (newdentry == trap)
goto out_dput;
if (is_dir && !old_opaque && new_opaque) {
err = ovl_set_opaque(olddentry);
if (err)
goto out_dput;
}
if (!overwrite && new_is_dir && old_opaque && !new_opaque) {
err = ovl_set_opaque(newdentry);
if (err)
goto out_dput;
}
if (old_opaque || new_opaque) {
err = ovl_do_rename(old_upperdir->d_inode, olddentry,
new_upperdir->d_inode, newdentry,
flags);
} else {
/* No debug for the plain case */
BUG_ON(flags & ~RENAME_EXCHANGE);
err = vfs_rename(old_upperdir->d_inode, olddentry,
new_upperdir->d_inode, newdentry,
NULL, flags);
}
if (err) {
if (is_dir && !old_opaque && new_opaque)
ovl_remove_opaque(olddentry);
if (!overwrite && new_is_dir && old_opaque && !new_opaque)
ovl_remove_opaque(newdentry);
goto out_dput;
}
if (is_dir && old_opaque && !new_opaque)
ovl_remove_opaque(olddentry);
if (!overwrite && new_is_dir && !old_opaque && new_opaque)
ovl_remove_opaque(newdentry);
if (old_opaque != new_opaque) {
ovl_dentry_set_opaque(old, new_opaque);
if (!overwrite)
ovl_dentry_set_opaque(new, old_opaque);
}
if (cleanup_whiteout)
ovl_cleanup(old_upperdir->d_inode, newdentry);
ovl_dentry_version_inc(old->d_parent);
ovl_dentry_version_inc(new->d_parent);
out_dput:
dput(newdentry);
out_unlock:
unlock_rename(new_upperdir, old_upperdir);
out_revert_creds:
if (old_opaque || new_opaque) {
revert_creds(old_cred);
put_cred(override_cred);
}
out_drop_write:
ovl_drop_write(old);
out:
dput(opaquedir);
return err;
}
const struct inode_operations ovl_dir_inode_operations = {
.lookup = ovl_lookup,
.mkdir = ovl_mkdir,
.symlink = ovl_symlink,
.unlink = ovl_unlink,
.rmdir = ovl_rmdir,
.rename2 = ovl_rename2,
.link = ovl_link,
.setattr = ovl_setattr,
.create = ovl_create,
.mknod = ovl_mknod,
.permission = ovl_permission,
.getattr = ovl_dir_getattr,
.setxattr = ovl_setxattr,
.getxattr = ovl_getxattr,
.listxattr = ovl_listxattr,
.removexattr = ovl_removexattr,
};

View File

@ -0,0 +1,438 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/xattr.h>
#include "overlayfs.h"
static int ovl_copy_up_last(struct dentry *dentry, struct iattr *attr,
bool no_data)
{
int err;
struct dentry *parent;
struct kstat stat;
struct path lowerpath;
parent = dget_parent(dentry);
err = ovl_copy_up(parent);
if (err)
goto out_dput_parent;
ovl_path_lower(dentry, &lowerpath);
err = vfs_getattr(&lowerpath, &stat);
if (err)
goto out_dput_parent;
if (no_data)
stat.size = 0;
err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat, attr);
out_dput_parent:
dput(parent);
return err;
}
int ovl_setattr(struct dentry *dentry, struct iattr *attr)
{
int err;
struct dentry *upperdentry;
err = ovl_want_write(dentry);
if (err)
goto out;
upperdentry = ovl_dentry_upper(dentry);
if (upperdentry) {
mutex_lock(&upperdentry->d_inode->i_mutex);
err = notify_change(upperdentry, attr, NULL);
mutex_unlock(&upperdentry->d_inode->i_mutex);
} else {
err = ovl_copy_up_last(dentry, attr, false);
}
ovl_drop_write(dentry);
out:
return err;
}
static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
struct path realpath;
ovl_path_real(dentry, &realpath);
return vfs_getattr(&realpath, stat);
}
int ovl_permission(struct inode *inode, int mask)
{
struct ovl_entry *oe;
struct dentry *alias = NULL;
struct inode *realinode;
struct dentry *realdentry;
bool is_upper;
int err;
if (S_ISDIR(inode->i_mode)) {
oe = inode->i_private;
} else if (mask & MAY_NOT_BLOCK) {
return -ECHILD;
} else {
/*
* For non-directories find an alias and get the info
* from there.
*/
alias = d_find_any_alias(inode);
if (WARN_ON(!alias))
return -ENOENT;
oe = alias->d_fsdata;
}
realdentry = ovl_entry_real(oe, &is_upper);
/* Careful in RCU walk mode */
realinode = ACCESS_ONCE(realdentry->d_inode);
if (!realinode) {
WARN_ON(!(mask & MAY_NOT_BLOCK));
err = -ENOENT;
goto out_dput;
}
if (mask & MAY_WRITE) {
umode_t mode = realinode->i_mode;
/*
* Writes will always be redirected to upper layer, so
* ignore lower layer being read-only.
*
* If the overlay itself is read-only then proceed
* with the permission check, don't return EROFS.
* This will only happen if this is the lower layer of
* another overlayfs.
*
* If upper fs becomes read-only after the overlay was
* constructed return EROFS to prevent modification of
* upper layer.
*/
err = -EROFS;
if (is_upper && !IS_RDONLY(inode) && IS_RDONLY(realinode) &&
(S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
goto out_dput;
}
err = __inode_permission(realinode, mask);
out_dput:
dput(alias);
return err;
}
struct ovl_link_data {
struct dentry *realdentry;
void *cookie;
};
static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd)
{
void *ret;
struct dentry *realdentry;
struct inode *realinode;
realdentry = ovl_dentry_real(dentry);
realinode = realdentry->d_inode;
if (WARN_ON(!realinode->i_op->follow_link))
return ERR_PTR(-EPERM);
ret = realinode->i_op->follow_link(realdentry, nd);
if (IS_ERR(ret))
return ret;
if (realinode->i_op->put_link) {
struct ovl_link_data *data;
data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL);
if (!data) {
realinode->i_op->put_link(realdentry, nd, ret);
return ERR_PTR(-ENOMEM);
}
data->realdentry = realdentry;
data->cookie = ret;
return data;
} else {
return NULL;
}
}
static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
{
struct inode *realinode;
struct ovl_link_data *data = c;
if (!data)
return;
realinode = data->realdentry->d_inode;
realinode->i_op->put_link(data->realdentry, nd, data->cookie);
kfree(data);
}
static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
{
struct path realpath;
struct inode *realinode;
ovl_path_real(dentry, &realpath);
realinode = realpath.dentry->d_inode;
if (!realinode->i_op->readlink)
return -EINVAL;
touch_atime(&realpath);
return realinode->i_op->readlink(realpath.dentry, buf, bufsiz);
}
static bool ovl_is_private_xattr(const char *name)
{
return strncmp(name, OVL_XATTR_PRE_NAME, OVL_XATTR_PRE_LEN) == 0;
}
int ovl_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags)
{
int err;
struct dentry *upperdentry;
err = ovl_want_write(dentry);
if (err)
goto out;
err = -EPERM;
if (ovl_is_private_xattr(name))
goto out_drop_write;
err = ovl_copy_up(dentry);
if (err)
goto out_drop_write;
upperdentry = ovl_dentry_upper(dentry);
err = vfs_setxattr(upperdentry, name, value, size, flags);
out_drop_write:
ovl_drop_write(dentry);
out:
return err;
}
static bool ovl_need_xattr_filter(struct dentry *dentry,
enum ovl_path_type type)
{
if ((type & (__OVL_PATH_PURE | __OVL_PATH_UPPER)) == __OVL_PATH_UPPER)
return S_ISDIR(dentry->d_inode->i_mode);
else
return false;
}
ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
void *value, size_t size)
{
struct path realpath;
enum ovl_path_type type = ovl_path_real(dentry, &realpath);
if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
return -ENODATA;
return vfs_getxattr(realpath.dentry, name, value, size);
}
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
{
struct path realpath;
enum ovl_path_type type = ovl_path_real(dentry, &realpath);
ssize_t res;
int off;
res = vfs_listxattr(realpath.dentry, list, size);
if (res <= 0 || size == 0)
return res;
if (!ovl_need_xattr_filter(dentry, type))
return res;
/* filter out private xattrs */
for (off = 0; off < res;) {
char *s = list + off;
size_t slen = strlen(s) + 1;
BUG_ON(off + slen > res);
if (ovl_is_private_xattr(s)) {
res -= slen;
memmove(s, s + slen, res - off);
} else {
off += slen;
}
}
return res;
}
int ovl_removexattr(struct dentry *dentry, const char *name)
{
int err;
struct path realpath;
enum ovl_path_type type = ovl_path_real(dentry, &realpath);
err = ovl_want_write(dentry);
if (err)
goto out;
err = -ENODATA;
if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
goto out_drop_write;
if (!OVL_TYPE_UPPER(type)) {
err = vfs_getxattr(realpath.dentry, name, NULL, 0);
if (err < 0)
goto out_drop_write;
err = ovl_copy_up(dentry);
if (err)
goto out_drop_write;
ovl_path_upper(dentry, &realpath);
}
err = vfs_removexattr(realpath.dentry, name);
out_drop_write:
ovl_drop_write(dentry);
out:
return err;
}
static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type,
struct dentry *realdentry)
{
if (OVL_TYPE_UPPER(type))
return false;
if (special_file(realdentry->d_inode->i_mode))
return false;
if (!(OPEN_FMODE(flags) & FMODE_WRITE) && !(flags & O_TRUNC))
return false;
return true;
}
static int ovl_dentry_open(struct dentry *dentry, struct file *file,
const struct cred *cred)
{
int err;
struct path realpath;
enum ovl_path_type type;
bool want_write = false;
type = ovl_path_real(dentry, &realpath);
if (!ovl_is_nocopyupw(dentry)) {
if (ovl_open_need_copy_up(file->f_flags, type, realpath.dentry)) {
want_write = true;
err = ovl_want_write(dentry);
if (err)
goto out;
if (file->f_flags & O_TRUNC)
err = ovl_copy_up_last(dentry, NULL, true);
else
err = ovl_copy_up(dentry);
if (err)
goto out_drop_write;
ovl_path_upper(dentry, &realpath);
}
}
err = vfs_open(&realpath, file, cred);
out_drop_write:
if (want_write)
ovl_drop_write(dentry);
out:
return err;
}
static const struct inode_operations ovl_file_inode_operations = {
.setattr = ovl_setattr,
.permission = ovl_permission,
.getattr = ovl_getattr,
.setxattr = ovl_setxattr,
.getxattr = ovl_getxattr,
.listxattr = ovl_listxattr,
.removexattr = ovl_removexattr,
.dentry_open = ovl_dentry_open,
};
static const struct inode_operations ovl_symlink_inode_operations = {
.setattr = ovl_setattr,
.follow_link = ovl_follow_link,
.put_link = ovl_put_link,
.readlink = ovl_readlink,
.getattr = ovl_getattr,
.setxattr = ovl_setxattr,
.getxattr = ovl_getxattr,
.listxattr = ovl_listxattr,
.removexattr = ovl_removexattr,
};
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
struct ovl_entry *oe)
{
struct inode *inode;
inode = new_inode(sb);
if (!inode)
return NULL;
mode &= S_IFMT;
inode->i_ino = get_next_ino();
inode->i_mode = mode;
inode->i_flags |= S_NOATIME | S_NOCMTIME;
switch (mode) {
case S_IFDIR:
inode->i_private = oe;
inode->i_op = &ovl_dir_inode_operations;
inode->i_fop = &ovl_dir_operations;
break;
case S_IFLNK:
inode->i_op = &ovl_symlink_inode_operations;
break;
case S_IFREG:
case S_IFSOCK:
case S_IFBLK:
case S_IFCHR:
case S_IFIFO:
inode->i_op = &ovl_file_inode_operations;
break;
default:
WARN(1, "illegal file type: %i\n", mode);
iput(inode);
inode = NULL;
}
return inode;
}

View File

@ -0,0 +1,200 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/kernel.h>
struct ovl_entry;
enum ovl_path_type {
__OVL_PATH_PURE = (1 << 0),
__OVL_PATH_UPPER = (1 << 1),
__OVL_PATH_MERGE = (1 << 2),
};
#define OVL_TYPE_UPPER(type) ((type) & __OVL_PATH_UPPER)
#define OVL_TYPE_MERGE(type) ((type) & __OVL_PATH_MERGE)
#define OVL_TYPE_PURE_UPPER(type) ((type) & __OVL_PATH_PURE)
#define OVL_TYPE_MERGE_OR_LOWER(type) \
(OVL_TYPE_MERGE(type) || !OVL_TYPE_UPPER(type))
#define OVL_XATTR_PRE_NAME "trusted.overlay."
#define OVL_XATTR_PRE_LEN 16
#define OVL_XATTR_OPAQUE OVL_XATTR_PRE_NAME"opaque"
static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry)
{
int err = vfs_rmdir(dir, dentry);
pr_debug("rmdir(%pd2) = %i\n", dentry, err);
return err;
}
static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry)
{
int err = vfs_unlink(dir, dentry, NULL);
pr_debug("unlink(%pd2) = %i\n", dentry, err);
return err;
}
static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir,
struct dentry *new_dentry, bool debug)
{
int err = vfs_link(old_dentry, dir, new_dentry, NULL);
if (debug) {
pr_debug("link(%pd2, %pd2) = %i\n",
old_dentry, new_dentry, err);
}
return err;
}
static inline int ovl_do_create(struct inode *dir, struct dentry *dentry,
umode_t mode, bool debug)
{
int err = vfs_create(dir, dentry, mode, true);
if (debug)
pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err);
return err;
}
static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry,
umode_t mode, bool debug)
{
int err = vfs_mkdir(dir, dentry, mode);
if (debug)
pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err);
return err;
}
static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry,
umode_t mode, dev_t dev, bool debug)
{
int err = vfs_mknod(dir, dentry, mode, dev);
if (debug) {
pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n",
dentry, mode, dev, err);
}
return err;
}
static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry,
const char *oldname, bool debug)
{
int err = vfs_symlink(dir, dentry, oldname);
if (debug)
pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err);
return err;
}
static inline int ovl_do_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags)
{
int err = vfs_setxattr(dentry, name, value, size, flags);
pr_debug("setxattr(%pd2, \"%s\", \"%*s\", 0x%x) = %i\n",
dentry, name, (int) size, (char *) value, flags, err);
return err;
}
static inline int ovl_do_removexattr(struct dentry *dentry, const char *name)
{
int err = vfs_removexattr(dentry, name);
pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, err);
return err;
}
static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry,
struct inode *newdir, struct dentry *newdentry,
unsigned int flags)
{
int err;
pr_debug("rename2(%pd2, %pd2, 0x%x)\n",
olddentry, newdentry, flags);
err = vfs_rename(olddir, olddentry, newdir, newdentry, NULL, flags);
if (err) {
pr_debug("...rename2(%pd2, %pd2, ...) = %i\n",
olddentry, newdentry, err);
}
return err;
}
static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry)
{
int err = vfs_whiteout(dir, dentry);
pr_debug("whiteout(%pd2) = %i\n", dentry, err);
return err;
}
bool ovl_is_nocopyupw(struct dentry *dentry);
enum ovl_path_type ovl_path_type(struct dentry *dentry);
u64 ovl_dentry_version_get(struct dentry *dentry);
void ovl_dentry_version_inc(struct dentry *dentry);
void ovl_path_upper(struct dentry *dentry, struct path *path);
void ovl_path_lower(struct dentry *dentry, struct path *path);
enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path);
int ovl_path_next(int idx, struct dentry *dentry, struct path *path);
struct dentry *ovl_dentry_upper(struct dentry *dentry);
struct dentry *ovl_dentry_lower(struct dentry *dentry);
struct dentry *ovl_dentry_real(struct dentry *dentry);
struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper);
struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry);
void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache);
struct dentry *ovl_workdir(struct dentry *dentry);
int ovl_want_write(struct dentry *dentry);
void ovl_drop_write(struct dentry *dentry);
bool ovl_dentry_is_opaque(struct dentry *dentry);
void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque);
bool ovl_is_whiteout(struct dentry *dentry);
void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry);
struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags);
struct file *ovl_path_open(struct path *path, int flags);
struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry,
struct kstat *stat, const char *link);
/* readdir.c */
extern const struct file_operations ovl_dir_operations;
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list);
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list);
void ovl_cache_free(struct list_head *list);
/* inode.c */
int ovl_setattr(struct dentry *dentry, struct iattr *attr);
int ovl_permission(struct inode *inode, int mask);
int ovl_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags);
ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
void *value, size_t size);
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
int ovl_removexattr(struct dentry *dentry, const char *name);
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
struct ovl_entry *oe);
static inline void ovl_copyattr(struct inode *from, struct inode *to)
{
to->i_uid = from->i_uid;
to->i_gid = from->i_gid;
}
/* dir.c */
extern const struct inode_operations ovl_dir_inode_operations;
struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry);
int ovl_create_real(struct inode *dir, struct dentry *newdentry,
struct kstat *stat, const char *link,
struct dentry *hardlink, bool debug);
void ovl_cleanup(struct inode *dir, struct dentry *dentry);
/* copy_up.c */
int ovl_copy_up(struct dentry *dentry);
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
struct path *lowerpath, struct kstat *stat,
struct iattr *attr);
int ovl_copy_xattr(struct dentry *old, struct dentry *new);
int ovl_set_attr(struct dentry *upper, struct kstat *stat);

View File

@ -0,0 +1,557 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/file.h>
#include <linux/xattr.h>
#include <linux/rbtree.h>
#include <linux/security.h>
#include <linux/cred.h>
#include "overlayfs.h"
struct ovl_cache_entry {
unsigned int len;
unsigned int type;
u64 ino;
struct list_head l_node;
struct rb_node node;
bool is_whiteout;
char name[];
};
struct ovl_dir_cache {
long refcount;
u64 version;
struct list_head entries;
};
struct ovl_readdir_data {
struct dir_context ctx;
bool is_merge;
struct rb_root root;
struct list_head *list;
struct list_head middle;
struct dentry *dir;
int count;
int err;
};
struct ovl_dir_file {
bool is_real;
bool is_upper;
struct ovl_dir_cache *cache;
struct list_head *cursor;
struct file *realfile;
struct file *upperfile;
};
static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n)
{
return container_of(n, struct ovl_cache_entry, node);
}
static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root,
const char *name, int len)
{
struct rb_node *node = root->rb_node;
int cmp;
while (node) {
struct ovl_cache_entry *p = ovl_cache_entry_from_node(node);
cmp = strncmp(name, p->name, len);
if (cmp > 0)
node = p->node.rb_right;
else if (cmp < 0 || len < p->len)
node = p->node.rb_left;
else
return p;
}
return NULL;
}
static struct ovl_cache_entry *ovl_cache_entry_new(struct dentry *dir,
const char *name, int len,
u64 ino, unsigned int d_type)
{
struct ovl_cache_entry *p;
size_t size = offsetof(struct ovl_cache_entry, name[len + 1]);
p = kmalloc(size, GFP_KERNEL);
if (!p)
return NULL;
memcpy(p->name, name, len);
p->name[len] = '\0';
p->len = len;
p->type = d_type;
p->ino = ino;
p->is_whiteout = false;
if (d_type == DT_CHR) {
struct dentry *dentry;
const struct cred *old_cred;
struct cred *override_cred;
override_cred = prepare_creds();
if (!override_cred) {
kfree(p);
return NULL;
}
/*
* CAP_DAC_OVERRIDE for lookup
*/
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
old_cred = override_creds(override_cred);
dentry = lookup_one_len(name, dir, len);
if (!IS_ERR(dentry)) {
p->is_whiteout = ovl_is_whiteout(dentry);
dput(dentry);
}
revert_creds(old_cred);
put_cred(override_cred);
}
return p;
}
static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
const char *name, int len, u64 ino,
unsigned int d_type)
{
struct rb_node **newp = &rdd->root.rb_node;
struct rb_node *parent = NULL;
struct ovl_cache_entry *p;
while (*newp) {
int cmp;
struct ovl_cache_entry *tmp;
parent = *newp;
tmp = ovl_cache_entry_from_node(*newp);
cmp = strncmp(name, tmp->name, len);
if (cmp > 0)
newp = &tmp->node.rb_right;
else if (cmp < 0 || len < tmp->len)
newp = &tmp->node.rb_left;
else
return 0;
}
p = ovl_cache_entry_new(rdd->dir, name, len, ino, d_type);
if (p == NULL)
return -ENOMEM;
list_add_tail(&p->l_node, rdd->list);
rb_link_node(&p->node, parent, newp);
rb_insert_color(&p->node, &rdd->root);
return 0;
}
static int ovl_fill_lower(struct ovl_readdir_data *rdd,
const char *name, int namelen,
loff_t offset, u64 ino, unsigned int d_type)
{
struct ovl_cache_entry *p;
p = ovl_cache_entry_find(&rdd->root, name, namelen);
if (p) {
list_move_tail(&p->l_node, &rdd->middle);
} else {
p = ovl_cache_entry_new(rdd->dir, name, namelen, ino, d_type);
if (p == NULL)
rdd->err = -ENOMEM;
else
list_add_tail(&p->l_node, &rdd->middle);
}
return rdd->err;
}
void ovl_cache_free(struct list_head *list)
{
struct ovl_cache_entry *p;
struct ovl_cache_entry *n;
list_for_each_entry_safe(p, n, list, l_node)
kfree(p);
INIT_LIST_HEAD(list);
}
static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry)
{
struct ovl_dir_cache *cache = od->cache;
WARN_ON(cache->refcount <= 0);
cache->refcount--;
if (!cache->refcount) {
if (ovl_dir_cache(dentry) == cache)
ovl_set_dir_cache(dentry, NULL);
ovl_cache_free(&cache->entries);
kfree(cache);
}
}
static int ovl_fill_merge(struct dir_context *ctx, const char *name,
int namelen, loff_t offset, u64 ino,
unsigned int d_type)
{
struct ovl_readdir_data *rdd =
container_of(ctx, struct ovl_readdir_data, ctx);
rdd->count++;
if (!rdd->is_merge)
return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type);
else
return ovl_fill_lower(rdd, name, namelen, offset, ino, d_type);
}
static inline int ovl_dir_read(struct path *realpath,
struct ovl_readdir_data *rdd)
{
struct file *realfile;
int err;
realfile = ovl_path_open(realpath, O_RDONLY | O_DIRECTORY);
if (IS_ERR(realfile))
return PTR_ERR(realfile);
rdd->dir = realpath->dentry;
rdd->ctx.pos = 0;
do {
rdd->count = 0;
rdd->err = 0;
err = iterate_dir(realfile, &rdd->ctx);
if (err >= 0)
err = rdd->err;
} while (!err && rdd->count);
fput(realfile);
return err;
}
static void ovl_dir_reset(struct file *file)
{
struct ovl_dir_file *od = file->private_data;
struct ovl_dir_cache *cache = od->cache;
struct dentry *dentry = file->f_path.dentry;
enum ovl_path_type type = ovl_path_type(dentry);
if (cache && ovl_dentry_version_get(dentry) != cache->version) {
ovl_cache_put(od, dentry);
od->cache = NULL;
od->cursor = NULL;
}
WARN_ON(!od->is_real && !OVL_TYPE_MERGE(type));
if (od->is_real && OVL_TYPE_MERGE(type))
od->is_real = false;
}
static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list)
{
int err;
struct path realpath;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_fill_merge,
.list = list,
.root = RB_ROOT,
.is_merge = false,
};
int idx, next;
for (idx = 0; idx != -1; idx = next) {
next = ovl_path_next(idx, dentry, &realpath);
if (next != -1) {
err = ovl_dir_read(&realpath, &rdd);
if (err)
break;
} else {
/*
* Insert lowest layer entries before upper ones, this
* allows offsets to be reasonably constant
*/
list_add(&rdd.middle, rdd.list);
rdd.is_merge = true;
err = ovl_dir_read(&realpath, &rdd);
list_del(&rdd.middle);
}
}
return err;
}
static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos)
{
struct list_head *p;
loff_t off = 0;
list_for_each(p, &od->cache->entries) {
if (off >= pos)
break;
off++;
}
/* Cursor is safe since the cache is stable */
od->cursor = p;
}
static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
{
int res;
struct ovl_dir_cache *cache;
cache = ovl_dir_cache(dentry);
if (cache && ovl_dentry_version_get(dentry) == cache->version) {
cache->refcount++;
return cache;
}
ovl_set_dir_cache(dentry, NULL);
cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL);
if (!cache)
return ERR_PTR(-ENOMEM);
cache->refcount = 1;
INIT_LIST_HEAD(&cache->entries);
res = ovl_dir_read_merged(dentry, &cache->entries);
if (res) {
ovl_cache_free(&cache->entries);
kfree(cache);
return ERR_PTR(res);
}
cache->version = ovl_dentry_version_get(dentry);
ovl_set_dir_cache(dentry, cache);
return cache;
}
static int ovl_iterate(struct file *file, struct dir_context *ctx)
{
struct ovl_dir_file *od = file->private_data;
struct dentry *dentry = file->f_path.dentry;
struct ovl_cache_entry *p;
if (!ctx->pos)
ovl_dir_reset(file);
if (od->is_real)
return iterate_dir(od->realfile, ctx);
if (!od->cache) {
struct ovl_dir_cache *cache;
cache = ovl_cache_get(dentry);
if (IS_ERR(cache))
return PTR_ERR(cache);
od->cache = cache;
ovl_seek_cursor(od, ctx->pos);
}
while (od->cursor != &od->cache->entries) {
p = list_entry(od->cursor, struct ovl_cache_entry, l_node);
if (!p->is_whiteout)
if (!dir_emit(ctx, p->name, p->len, p->ino, p->type))
break;
od->cursor = p->l_node.next;
ctx->pos++;
}
return 0;
}
static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin)
{
loff_t res;
struct ovl_dir_file *od = file->private_data;
mutex_lock(&file_inode(file)->i_mutex);
if (!file->f_pos)
ovl_dir_reset(file);
if (od->is_real) {
res = vfs_llseek(od->realfile, offset, origin);
file->f_pos = od->realfile->f_pos;
} else {
res = -EINVAL;
switch (origin) {
case SEEK_CUR:
offset += file->f_pos;
break;
case SEEK_SET:
break;
default:
goto out_unlock;
}
if (offset < 0)
goto out_unlock;
if (offset != file->f_pos) {
file->f_pos = offset;
if (od->cache)
ovl_seek_cursor(od, offset);
}
res = offset;
}
out_unlock:
mutex_unlock(&file_inode(file)->i_mutex);
return res;
}
static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
int datasync)
{
struct ovl_dir_file *od = file->private_data;
struct dentry *dentry = file->f_path.dentry;
struct file *realfile = od->realfile;
/*
* Need to check if we started out being a lower dir, but got copied up
*/
if (!od->is_upper && OVL_TYPE_UPPER(ovl_path_type(dentry))) {
struct inode *inode = file_inode(file);
realfile = lockless_dereference(od->upperfile);
if (!realfile) {
struct path upperpath;
ovl_path_upper(dentry, &upperpath);
realfile = ovl_path_open(&upperpath, O_RDONLY);
smp_mb__before_spinlock();
mutex_lock(&inode->i_mutex);
if (!od->upperfile) {
if (IS_ERR(realfile)) {
mutex_unlock(&inode->i_mutex);
return PTR_ERR(realfile);
}
od->upperfile = realfile;
} else {
/* somebody has beaten us to it */
if (!IS_ERR(realfile))
fput(realfile);
realfile = od->upperfile;
}
mutex_unlock(&inode->i_mutex);
}
}
return vfs_fsync_range(realfile, start, end, datasync);
}
static int ovl_dir_release(struct inode *inode, struct file *file)
{
struct ovl_dir_file *od = file->private_data;
if (od->cache) {
mutex_lock(&inode->i_mutex);
ovl_cache_put(od, file->f_path.dentry);
mutex_unlock(&inode->i_mutex);
}
fput(od->realfile);
if (od->upperfile)
fput(od->upperfile);
kfree(od);
return 0;
}
static int ovl_dir_open(struct inode *inode, struct file *file)
{
struct path realpath;
struct file *realfile;
struct ovl_dir_file *od;
enum ovl_path_type type;
od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL);
if (!od)
return -ENOMEM;
type = ovl_path_real(file->f_path.dentry, &realpath);
realfile = ovl_path_open(&realpath, file->f_flags);
if (IS_ERR(realfile)) {
kfree(od);
return PTR_ERR(realfile);
}
od->realfile = realfile;
od->is_real = !OVL_TYPE_MERGE(type);
od->is_upper = OVL_TYPE_UPPER(type);
file->private_data = od;
return 0;
}
const struct file_operations ovl_dir_operations = {
.read = generic_read_dir,
.open = ovl_dir_open,
.iterate = ovl_iterate,
.llseek = ovl_dir_llseek,
.fsync = ovl_dir_fsync,
.release = ovl_dir_release,
};
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list)
{
int err;
struct ovl_cache_entry *p;
err = ovl_dir_read_merged(dentry, list);
if (err)
return err;
err = 0;
list_for_each_entry(p, list, l_node) {
if (p->is_whiteout)
continue;
if (p->name[0] == '.') {
if (p->len == 1)
continue;
if (p->len == 2 && p->name[1] == '.')
continue;
}
err = -ENOTEMPTY;
break;
}
return err;
}
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
{
struct ovl_cache_entry *p;
mutex_lock_nested(&upper->d_inode->i_mutex, I_MUTEX_CHILD);
list_for_each_entry(p, list, l_node) {
struct dentry *dentry;
if (!p->is_whiteout)
continue;
dentry = lookup_one_len(p->name, upper, p->len);
if (IS_ERR(dentry)) {
pr_err("overlayfs: lookup '%s/%.*s' failed (%i)\n",
upper->d_name.name, p->len, p->name,
(int) PTR_ERR(dentry));
continue;
}
ovl_cleanup(upper->d_inode, dentry);
dput(dentry);
}
mutex_unlock(&upper->d_inode->i_mutex);
}

File diff suppressed because it is too large Load Diff

View File

@ -1,488 +0,0 @@
/**
* \file procfs.c
* License details are found in the file LICENSE.
* \brief
* mcctrl procfs
* \author Naoki Hamada <nao@axe.bz> \par
* Copyright (C) 2014 AXE, Inc.
*/
/*
* HISTORY:
*/
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/proc_fs.h>
#include <linux/list.h>
#include <linux/uaccess.h>
#include <linux/fs.h>
#include <linux/resource.h>
#include "mcctrl.h"
#include <linux/version.h>
//#define PROCFS_DEBUG
#ifdef PROCFS_DEBUG
#define dprintk(...) printk(__VA_ARGS__)
#else
#define dprintk(...)
#endif
static DECLARE_WAIT_QUEUE_HEAD(procfsq);
static ssize_t mckernel_procfs_read(struct file *file, char __user *buf,
size_t nbytes, loff_t *ppos);
/* A private data for the procfs driver. */
struct procfs_list_entry;
struct procfs_list_entry {
struct list_head list;
struct proc_dir_entry *entry;
struct procfs_list_entry *parent;
ihk_os_t os;
int osnum;
int pid;
int cpu;
char fname[PROCFS_NAME_MAX];
};
/*
* In the procfs_file_list, mckenrel procfs files are
* listed in the manner that the leaf file is located
* always nearer to the list top than its parent node
* file.
*/
LIST_HEAD(procfs_file_list);
static ihk_spinlock_t procfs_file_list_lock;
loff_t mckernel_procfs_lseek(struct file *file, loff_t offset, int orig)
{
switch (orig) {
case 0:
file->f_pos = offset;
break;
case 1:
file->f_pos += offset;
break;
default:
return -EINVAL;
}
return file->f_pos;
}
static const struct file_operations mckernel_procfs_file_operations = {
.llseek = mckernel_procfs_lseek,
.read = mckernel_procfs_read,
.write = NULL,
};
/**
* \brief Return specified procfs entry.
*
* \param p a name of the procfs file
* \param osnum os number
* \param mode if zero create a directory otherwise a file
*
* return value: NULL: Something wrong has occurred.
* otherwise: address of the proc_dir_entry structure of the procfs file
*
* p should not be NULL nor terminated by "/".
*
* We create a procfs entry if there is not already one.
* This process is recursive to the root of the procfs tree.
*/
/*
* XXX: Two or more entries which have same name can be created.
*
* get_procfs_list_entry() avoids creating an entry which has already been created.
* But, it allows creating an entry which is being created by another thread.
*
* This problem occurred when two requests which created files with a common
* ancestor directory which was not explicitly created were racing.
*/
static struct procfs_list_entry *get_procfs_list_entry(char *p, int osnum, int mode)
{
char *r;
struct proc_dir_entry *pde = NULL;
struct procfs_list_entry *e, *ret = NULL, *parent = NULL;
char name[PROCFS_NAME_MAX];
unsigned long irqflags;
dprintk("get_procfs_list_entry: %s for osnum %d mode %o\n", p, osnum, mode);
irqflags = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
list_for_each_entry(e, &procfs_file_list, list) {
if (e == NULL) {
kprintf("ERROR: The procfs_file_list has a null entry.\n");
return NULL;
}
if (strncmp(e->fname, p, PROCFS_NAME_MAX) == 0) {
/* We found the entry */
ret = e;
break;
}
}
ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflags);
if (ret != NULL) {
return ret;
}
r = strrchr(p, '/');
if (r != NULL) {
/* We have non-null parent dir. */
strncpy(name, p, r - p);
name[r - p] = '\0';
parent = get_procfs_list_entry(name, osnum, 0);
if (parent == NULL) {
/* We counld not get a parent procfs entry. Give up.*/
return NULL;
}
}
ret = kmalloc(sizeof(struct procfs_list_entry), GFP_KERNEL);
if (ret == NULL) {
kprintf("ERROR: not enough memory to create PROCFS entry.\n");
return NULL;
}
/* Fill the fname field of the entry */
strncpy(ret->fname, p, PROCFS_NAME_MAX);
if (r != NULL) {
strncpy(name, r + 1, p + PROCFS_NAME_MAX - r - 1);
} else {
strncpy(name, p, PROCFS_NAME_MAX);
}
if (mode == 0) {
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
pde = proc_mkdir(name, parent ? parent->entry : NULL);
#else
pde = proc_mkdir_data(name, 0555, parent ? parent->entry : NULL, ret);
#endif
} else {
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
pde = create_proc_entry(name, mode, parent->entry);
if (pde)
pde->proc_fops = &mckernel_procfs_file_operations;
#else
pde = proc_create_data(name, mode, parent->entry,
&mckernel_procfs_file_operations, ret);
#endif
}
if (pde == NULL) {
kprintf("ERROR: cannot create a PROCFS entry for %s.\n", p);
kfree(ret);
return NULL;
}
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
pde->data = ret;
#endif
ret->osnum = osnum;
ret->entry = pde;
ret->parent = parent;
irqflags = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
list_add(&(ret->list), &procfs_file_list);
ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflags);
dprintk("get_procfs_list_entry: %s done\n", p);
return ret;
}
/**
* \brief Create a procfs entry.
*
* \param __os (opeque) os variable
* \param ref cpuid of the requesting mckernel process
* \param osnum osnum of the requesting mckernel process
* \param pid pid of the requesting mckernel process
* \param arg sent argument
*/
void procfs_create(void *__os, int ref, int osnum, int pid, unsigned long arg)
{
struct procfs_list_entry *e;
ihk_device_t dev = ihk_os_to_dev(__os);
unsigned long parg;
struct procfs_file *f;
int mode;
char name[PROCFS_NAME_MAX];
dprintk("procfs_create: osnum: %d, cpu: %d, pid: %d\n", osnum, ref, pid);
parg = ihk_device_map_memory(dev, arg, sizeof(struct procfs_file));
f = ihk_device_map_virtual(dev, parg, sizeof(struct procfs_file), NULL, 0);
dprintk("name: %s mode: %o\n", f->fname, f->mode);
strncpy(name, f->fname, PROCFS_NAME_MAX);
mode = f->mode;
if (name[PROCFS_NAME_MAX - 1] != '\0') {
printk("ERROR: procfs_creat: file name not properly terminated.\n");
goto quit;
}
e = get_procfs_list_entry(name, osnum, mode);
if (e == NULL) {
printk("ERROR: could not create a procfs entry for %s.\n", name);
goto quit;
}
e->os = __os;
e->cpu = ref;
e->pid = pid;
quit:
f->status = 1; /* Now the peer can free the data. */
ihk_device_unmap_virtual(dev, f, sizeof(struct procfs_file));
ihk_device_unmap_memory(dev, parg, sizeof(struct procfs_file));
dprintk("procfs_create: done\n");
}
/**
* \brief Delete a procfs entry.
*
* \param __os (opaque) os variable
* \param osnum os number
* \param arg sent argument
*/
void procfs_delete(void *__os, int osnum, unsigned long arg)
{
ihk_device_t dev = ihk_os_to_dev(__os);
unsigned long parg;
struct procfs_file *f;
struct procfs_list_entry *e;
struct procfs_list_entry *parent = NULL;
char name[PROCFS_NAME_MAX];
char *r;
unsigned long irqflags;
dprintk("procfs_delete: \n");
parg = ihk_device_map_memory(dev, arg, sizeof(struct procfs_file));
f = ihk_device_map_virtual(dev, parg, sizeof(struct procfs_file), NULL, 0);
dprintk("fname: %s.\n", f->fname);
irqflags = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
list_for_each_entry(e, &procfs_file_list, list) {
if ((strncmp(e->fname, f->fname, PROCFS_NAME_MAX) == 0) &&
(e->osnum == osnum)) {
list_del(&e->list);
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
e->entry->read_proc = NULL;
e->entry->data = NULL;
#endif
parent = e->parent;
kfree(e);
r = strrchr(f->fname, '/');
if (r == NULL) {
strncpy(name, f->fname, PROCFS_NAME_MAX);
} else {
strncpy(name, r + 1, PROCFS_NAME_MAX);
}
dprintk("found and remove %s from the list.\n", name);
remove_proc_entry(name, parent->entry);
break;
}
}
ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflags);
f->status = 1; /* Now the peer can free the data. */
ihk_device_unmap_virtual(dev, f, sizeof(struct procfs_file));
ihk_device_unmap_memory(dev, parg, sizeof(struct procfs_file));
dprintk("procfs_delete: done\n");
}
/**
* \brief Process SCD_MSG_PROCFS_ANSWER message.
*
* \param arg sent argument
* \param err error info (redundant)
*/
void procfs_answer(unsigned int arg, int err)
{
dprintk("procfs: received SCD_MSG_PROCFS_ANSWER message(err = %d).\n", err);
wake_up_interruptible(&procfsq);
}
/**
* \brief The callback funciton for McKernel procfs
*
* This function conforms to the 2) way of fs/proc/generic.c
* from linux-2.6.39.4.
*/
static ssize_t
mckernel_procfs_read(struct file *file, char __user *buf, size_t nbytes,
loff_t *ppos)
{
struct inode * inode = file->f_path.dentry->d_inode;
char *kern_buffer;
int order = 0;
volatile struct procfs_read *r;
struct ikc_scd_packet isp;
int ret, retrycount = 0;
unsigned long pbuf;
unsigned long count = nbytes;
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
struct proc_dir_entry *dp = PDE(inode);
struct procfs_list_entry *e = dp->data;
#else
struct procfs_list_entry *e = PDE_DATA(inode);
#endif
loff_t offset = *ppos;
dprintk("mckernel_procfs_read: invoked for %s, offset: %lu, count: %d\n",
e->fname, offset, count);
if (count <= 0 || offset < 0) {
return 0;
}
while ((1 << order) < count) ++order;
if (order > 12) {
order -= 12;
}
else {
order = 1;
}
/* NOTE: we need physically contigous memory to pass through IKC */
kern_buffer = (char *)__get_free_pages(GFP_KERNEL, order);
if (!kern_buffer) {
printk("mckernel_procfs_read(): ERROR: allocating kernel buffer\n");
return -ENOMEM;
}
pbuf = virt_to_phys(kern_buffer);
r = kmalloc(sizeof(struct procfs_read), GFP_KERNEL);
if (r == NULL) {
return -ENOMEM;
}
retry:
dprintk("offset: %lx, count: %d, cpu: %d\n", offset, count, e->cpu);
r->pbuf = pbuf;
r->eof = 0;
r->ret = -EIO; /* default */
r->status = 0;
r->offset = offset;
r->count = count;
strncpy((char *)r->fname, e->fname, PROCFS_NAME_MAX);
isp.msg = SCD_MSG_PROCFS_REQUEST;
isp.ref = e->cpu;
isp.arg = virt_to_phys(r);
ret = mcctrl_ikc_send(e->os, e->cpu, &isp);
if (ret < 0) {
goto out; /* error */
}
/* Wait for a reply. */
ret = -EIO; /* default exit code */
dprintk("now wait for a relpy\n");
/* Wait for the status field of the procfs_read structure set ready. */
if (wait_event_interruptible_timeout(procfsq, r->status != 0, HZ) == 0) {
kprintf("ERROR: mckernel_procfs_read: timeout (1 sec).\n");
goto out;
}
/* Wake up and check the result. */
dprintk("mckernel_procfs_read: woke up. ret: %d, eof: %d\n", r->ret, r->eof);
if ((r->ret == 0) && (r->eof != 1)) {
/* A miss-hit caused by migration has occurred.
* We simply retry the query with a new CPU.
*/
if (retrycount++ > 10) {
kprintf("ERROR: mckernel_procfs_read: excessive retry.\n");
goto out;
}
e->cpu = r->newcpu;
dprintk("retry\n");
goto retry;
}
if (r->ret > 0) {
if (copy_to_user(buf, kern_buffer, r->ret)) {
kprintf("ERROR: mckernel_procfs_read: copy_to_user failed.\n");
ret = -EFAULT;
goto out;
}
*ppos += r->ret;
}
ret = r->ret;
out:
free_pages((uintptr_t)kern_buffer, order);
kfree((void *)r);
return ret;
}
/**
* \brief Initialization for procfs
*
* \param osnum os number
*/
void procfs_init(int osnum) {
}
/**
* \brief Finalization for procfs
*
* \param osnum os number
*/
void procfs_exit(int osnum) {
char buf[20], *r;
int error;
mm_segment_t old_fs = get_fs();
struct kstat stat;
struct procfs_list_entry *parent;
struct procfs_list_entry *e, *temp = NULL;
unsigned long irqflags;
dprintk("remove remaining mckernel procfs files.\n");
irqflags = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
list_for_each_entry_safe(e, temp, &procfs_file_list, list) {
if (e->osnum == osnum) {
dprintk("found entry for %s.\n", e->fname);
list_del(&e->list);
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
e->entry->read_proc = NULL;
e->entry->data = NULL;
#endif
parent = e->parent;
r = strrchr(e->fname, '/');
if (r == NULL) {
r = e->fname;
} else {
r += 1;
}
if (parent) {
remove_proc_entry(r, parent->entry);
}
dprintk("free the entry\n");
kfree(e);
}
dprintk("iterate it.\n");
}
ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflags);
sprintf(buf, "/proc/mcos%d", osnum);
set_fs(KERNEL_DS);
error = vfs_stat (buf, &stat);
set_fs(old_fs);
if (error != 0) {
return;
}
printk("procfs_exit: We have to remove unexpectedly remaining %s.\n", buf);
/* remove remnant of previous mcos%d */
remove_proc_entry(buf + 6, NULL);
}

View File

@ -1,13 +1,19 @@
CC=@CC@
BINDIR=@BINDIR@
CFLAGS=-Wall -O -fPIE -pie
KDIR ?= @KDIR@
CFLAGS=-Wall -O -I.
VPATH=@abs_srcdir@
TARGET=mcexec
@uncomment_if_ENABLE_MEMDUMP@TARGET+=eclair
LIBS=@LIBS@
all: $(TARGET)
mcexec: mcexec.c
$(CC) $(CFLAGS) $(EXTRA_CFLAGS) -pthread -o $@ $^ $(EXTRA_OBJS)
$(CC) -I${KDIR} $(CFLAGS) $(EXTRA_CFLAGS) -fPIE -pie -lrt -pthread -o $@ $^ $(EXTRA_OBJS)
eclair: eclair.c
$(CC) $(CFLAGS) -o $@ $^ $(LIBS)
clean:
$(RM) $(TARGET) *.o
@ -17,4 +23,5 @@ clean:
install:
mkdir -p -m 755 $(BINDIR)
install -m 755 mcexec $(BINDIR)
@uncomment_if_ENABLE_MEMDUMP@install -m 755 eclair $(BINDIR)

1026
executer/user/eclair.c Normal file

File diff suppressed because it is too large Load Diff

View File

@ -59,7 +59,12 @@
#include <semaphore.h>
#include <signal.h>
#include <sys/signalfd.h>
#include <sys/mount.h>
#include <include/generated/uapi/linux/version.h>
#include <sys/user.h>
#include "../include/uprotocol.h"
#include <getopt.h>
#include "../config.h"
//#define DEBUG
@ -96,6 +101,19 @@ int __glob_argc = -1;
char **__glob_argv = 0;
#endif
#ifdef ENABLE_MCOVERLAYFS
#undef ENABLE_MCOVERLAYFS
#ifndef RHEL_RELEASE_CODE
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,0,0) && LINUX_VERSION_CODE < KERNEL_VERSION(4,1,0)
#define ENABLE_MCOVERLAYFS 1
#endif // LINUX_VERSION_CODE == 4.0
#else
#if RHEL_RELEASE_CODE == RHEL_RELEASE_VERSION(7,2)
#define ENABLE_MCOVERLAYFS 1
#endif // RHEL_RELEASE_CODE == 7.2
#endif // RHEL_RELEASE_CODE
#endif // ENABLE_MCOVERLAYFS
typedef unsigned char cc_t;
typedef unsigned int speed_t;
typedef unsigned int tcflag_t;
@ -129,6 +147,7 @@ static char *exec_path = NULL;
static char *altroot;
static const char rlimit_stack_envname[] = "MCKERNEL_RLIMIT_STACK";
static int ischild;
static int enable_vdso = 1;
struct fork_sync {
pid_t pid;
@ -183,6 +202,8 @@ struct program_load_desc *load_elf(FILE *fp, char **interp_pathp)
desc = malloc(sizeof(struct program_load_desc)
+ sizeof(struct program_image_section) * nhdrs);
memset(desc, '\0', sizeof(struct program_load_desc)
+ sizeof(struct program_image_section) * nhdrs);
desc->shell_path[0] = '\0';
fseek(fp, hdr.e_phoff, SEEK_SET);
j = 0;
@ -243,6 +264,8 @@ struct program_load_desc *load_elf(FILE *fp, char **interp_pathp)
}
desc->pid = getpid();
desc->pgid = getpgid(0);
if(*interp_pathp)
desc->reloc = hdr.e_type == ET_DYN;
desc->entry = hdr.e_entry;
ioctl(fd, MCEXEC_UP_GET_CREDV, desc->cred);
desc->at_phdr = load_addr + hdr.e_phoff;
@ -365,7 +388,7 @@ struct program_load_desc *load_interp(struct program_load_desc *desc0, FILE *fp)
unsigned char *dma_buf;
int lookup_exec_path(char *filename, char *path, int max_len)
int lookup_exec_path(char *filename, char *path, int max_len, int execvp)
{
int found;
int error;
@ -383,28 +406,27 @@ retry:
char *token, *string, *tofree;
char *PATH = getenv("COKERNEL_PATH");
if (!PATH) {
if (!execvp) {
if (strlen(filename) + 1 > max_len) {
return ENAMETOOLONG;
}
strcpy(path, filename);
error = access(path, X_OK);
if (error) {
return errno;
}
found = 1;
break;
}
if (!(PATH = getenv("COKERNEL_PATH"))) {
PATH = getenv("PATH");
}
if (strlen(filename) >= 255) {
return ENAMETOOLONG;
}
/* See first whether file is available in current working dir */
error = access(filename, X_OK);
if (error == 0) {
__dprintf("lookup_exec_path(): found %s in cwd\n", filename);
error = snprintf(path, max_len, "%s", filename);
if (error < 0 || error >= max_len) {
fprintf(stderr, "lookup_exec_path(): array too small?\n");
return ENOMEM;
}
found = 1;
break;
}
__dprintf("PATH: %s\n", PATH);
@ -432,6 +454,9 @@ retry:
}
free(tofree);
if(!found){
return ENOENT;
}
break;
}
@ -478,7 +503,7 @@ retry:
}
if ((sb.st_mode & S_IFMT) == S_IFLNK) {
char *link_path = malloc(max_len);
link_path = malloc(max_len);
if (!link_path) {
fprintf(stderr, "lookup_exec_path(): error allocating\n");
return ENOMEM;
@ -489,9 +514,18 @@ retry:
fprintf(stderr, "lookup_exec_path(): error readlink\n");
return EINVAL;
}
link_path[error] = '\0';
__dprintf("lookup_exec_path(): %s is link -> %s\n", path, link_path);
if(link_path[0] != '/'){
char *t = strrchr(path, '/');
if(t){
t++;
strcpy(t, link_path);
strcpy(link_path, path);
}
}
filename = link_path;
goto retry;
}
@ -635,10 +669,7 @@ int load_elf_desc(char *filename, struct program_load_desc **desc_p,
return 0;
}
#define PAGE_SIZE 4096
#define PAGE_MASK ~((unsigned long)PAGE_SIZE - 1)
void transfer_image(int fd, struct program_load_desc *desc)
int transfer_image(int fd, struct program_load_desc *desc)
{
struct remote_transfer pt;
unsigned long s, e, flen, rpa;
@ -652,13 +683,17 @@ void transfer_image(int fd, struct program_load_desc *desc)
+ PAGE_SIZE - 1) & PAGE_MASK;
rpa = desc->sections[i].remote_pa;
fseek(fp, desc->sections[i].offset, SEEK_SET);
if (fseek(fp, desc->sections[i].offset, SEEK_SET) != 0) {
fprintf(stderr, "transfer_image(): error: seeking file position\n");
return -1;
}
flen = desc->sections[i].filesz;
__dprintf("seeked to %lx | size %ld\n",
desc->sections[i].offset, flen);
while (s < e) {
memset(&pt, '\0', sizeof pt);
pt.rphys = rpa;
pt.userp = dma_buf;
pt.size = PAGE_SIZE;
@ -673,7 +708,20 @@ void transfer_image(int fd, struct program_load_desc *desc)
if (lr > flen) {
lr = flen;
}
fread(dma_buf + l, 1, lr, fp);
if (fread(dma_buf + l, 1, lr, fp) != lr) {
if (ferror(fp) > 0) {
fprintf(stderr, "transfer_image(): error: accessing file\n");
return -EINVAL;
}
else if (feof(fp) > 0) {
fprintf(stderr, "transfer_image(): file too short?\n");
return -EINVAL;
}
else {
/* TODO: handle smaller reads.. */
return -EINVAL;
}
}
flen -= lr;
}
else if (flen > 0) {
@ -682,7 +730,20 @@ void transfer_image(int fd, struct program_load_desc *desc)
} else {
lr = flen;
}
fread(dma_buf, 1, lr, fp);
if (fread(dma_buf, 1, lr, fp) != lr) {
if (ferror(fp) > 0) {
fprintf(stderr, "transfer_image(): error: accessing file\n");
return -EINVAL;
}
else if (feof(fp) > 0) {
fprintf(stderr, "transfer_image(): file too short?\n");
return -EINVAL;
}
else {
/* TODO: handle smaller reads.. */
return -EINVAL;
}
}
flen -= lr;
}
s += PAGE_SIZE;
@ -698,6 +759,8 @@ void transfer_image(int fd, struct program_load_desc *desc)
}
}
}
return 0;
}
void print_desc(struct program_load_desc *desc)
@ -762,7 +825,7 @@ int flatten_strings(int nr_strings, char *first, char **strings, char **flat)
}
/* Count full length */
full_len = sizeof(int) + sizeof(char *); // Counter and terminating NULL
full_len = sizeof(long) + sizeof(char *); // Counter and terminating NULL
if (first) {
full_len += sizeof(char *) + strlen(first) + 1;
}
@ -772,6 +835,8 @@ int flatten_strings(int nr_strings, char *first, char **strings, char **flat)
full_len += sizeof(char *) + strlen(strings[string_i]) + 1;
}
full_len = (full_len + sizeof(long) - 1) & ~(sizeof(long) - 1);
_flat = (char *)malloc(full_len);
if (!_flat) {
return 0;
@ -780,14 +845,14 @@ int flatten_strings(int nr_strings, char *first, char **strings, char **flat)
memset(_flat, 0, full_len);
/* Number of strings */
*((int*)_flat) = nr_strings + (first ? 1 : 0);
*((long *)_flat) = nr_strings + (first ? 1 : 0);
// Actual offset
flat_offset = sizeof(int) + sizeof(char *) * (nr_strings + 1 +
flat_offset = sizeof(long) + sizeof(char *) * (nr_strings + 1 +
(first ? 1 : 0));
if (first) {
*((char **)(_flat + sizeof(int))) = (void *)flat_offset;
*((char **)(_flat + sizeof(long))) = (void *)flat_offset;
memcpy(_flat + flat_offset, first, strlen(first) + 1);
flat_offset += strlen(first) + 1;
}
@ -795,7 +860,7 @@ int flatten_strings(int nr_strings, char *first, char **strings, char **flat)
for (string_i = 0; string_i < nr_strings; ++string_i) {
/* Fabricate the string */
*((char **)(_flat + sizeof(int) + (string_i + (first ? 1 : 0))
*((char **)(_flat + sizeof(long) + (string_i + (first ? 1 : 0))
* sizeof(char *))) = (void *)flat_offset;
memcpy(_flat + flat_offset, strings[string_i], strlen(strings[string_i]) + 1);
flat_offset += strlen(strings[string_i]) + 1;
@ -818,7 +883,10 @@ struct thread_data_s {
pthread_mutex_t *lock;
pthread_barrier_t *init_ready;
} *thread_data;
int ncpu;
int n_threads;
pid_t master_tid;
pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
@ -829,7 +897,7 @@ static void *main_loop_thread_func(void *arg)
struct thread_data_s *td = (struct thread_data_s *)arg;
td->tid = gettid();
td->remote_tid = (int)td->tid;
td->remote_tid = -1;
pthread_barrier_wait(&init_ready);
td->ret = main_loop(td->fd, td->cpu, td->lock);
@ -878,6 +946,7 @@ sendsig(int sig, siginfo_t *siginfo, void *context)
remote_tid = -1;
}
memset(&sigdesc, '\0', sizeof sigdesc);
sigdesc.cpu = cpu;
sigdesc.pid = (int)pid;
sigdesc.tid = remote_tid;
@ -904,13 +973,17 @@ act_signalfd4(struct syscall_wait_desc *w)
switch(mode){
case 0: /* new signalfd */
sfd = malloc(sizeof(struct sigfd));
memset(sfd, '\0', sizeof(struct sigfd));
tmp = w->sr.args[1];
flags = 0;
if(tmp & SFD_NONBLOCK)
flags |= O_NONBLOCK;
if(tmp & SFD_CLOEXEC)
flags |= O_CLOEXEC;
pipe2(sfd->sigpipe, flags);
if (pipe2(sfd->sigpipe, flags) < 0) {
perror("pipe2 failed:");
return -1;
}
sfd->next = sigfdtop;
sigfdtop = sfd;
rc = sfd->sigpipe[0];
@ -941,7 +1014,11 @@ act_signalfd4(struct syscall_wait_desc *w)
rc = -EBADF;
else{
info = (struct signalfd_siginfo *)w->sr.args[2];
write(sfd->sigpipe[1], info, sizeof(struct signalfd_siginfo));
if (write(sfd->sigpipe[1], info, sizeof(struct signalfd_siginfo))
!= sizeof(struct signalfd_siginfo)) {
fprintf(stderr, "error: writing sigpipe\n");
rc = -EBADF;
}
}
break;
}
@ -1047,9 +1124,9 @@ void init_worker_threads(int fd)
int i;
pthread_mutex_init(&lock, NULL);
pthread_barrier_init(&init_ready, NULL, ncpu + 2);
pthread_barrier_init(&init_ready, NULL, n_threads + 2);
for (i = 0; i <= ncpu; ++i) {
for (i = 0; i <= n_threads; ++i) {
int ret;
thread_data[i].fd = fd;
@ -1069,6 +1146,80 @@ void init_worker_threads(int fd)
pthread_barrier_wait(&init_ready);
}
#ifdef ENABLE_MCOVERLAYFS
#define READ_BUFSIZE 1024
static int isunshare(void)
{
int err = 0;
int ret;
int fd;
char proc_path[PATH_MAX];
ssize_t len_read;
char buf_read[READ_BUFSIZE + 1];
char *buf_read_off;
char *buf_find;
char buf_cmp[READ_BUFSIZE + 1];
char *buf_cmp_off;
ssize_t len_copy;
snprintf(proc_path, sizeof(proc_path), "/proc/%d/mounts", getpid());
fd = open(proc_path, O_RDONLY);
if (fd < 0) {
fprintf(stderr, "Error: Failed to open %s.\n", proc_path);
return -1;
}
buf_cmp_off = buf_cmp;
while (1) {
len_read = read(fd, buf_read, READ_BUFSIZE);
if (len_read == -1) {
fprintf(stderr, "Error: Failed to read.\n");
err = -1;
break;
}
buf_read_off = buf_read;
while (1) {
if ((len_read - (buf_read_off - buf_read)) <= 0) {
break;
}
buf_find = memchr(buf_read_off, '\n',
len_read - (buf_read_off - buf_read));
if (buf_find) {
len_copy = buf_find - buf_read_off;
} else {
len_copy = len_read - (buf_read_off - buf_read);
}
memcpy(buf_cmp_off, buf_read_off, len_copy);
*(buf_cmp_off + len_copy) = '\0';
if (buf_find) {
buf_read_off = buf_read_off + len_copy + 1;
buf_cmp_off = buf_cmp;
ret = strncmp(buf_cmp, "mcoverlay /proc ", 16);
if (!ret) {
err = 1;
break;
}
} else {
buf_read_off = buf_read_off + len_copy;
buf_cmp_off = buf_cmp_off + len_copy;
break;
}
}
if (err == 1 || len_read == 0) {
break;
}
}
close(fd);
__dprintf("err=%d\n", err);
return err;
}
#endif // ENABLE_MCOVERLAYFS
#define MCK_RLIMIT_AS 0
#define MCK_RLIMIT_CORE 1
#define MCK_RLIMIT_CPU 2
@ -1139,6 +1290,24 @@ static int rlimits[] = {
char dev[64];
static struct option mcexec_options[] = {
{
.name = "disable-vdso",
.has_arg = no_argument,
.flag = &enable_vdso,
.val = 0,
},
{
.name = "enable-vdso",
.has_arg = no_argument,
.flag = &enable_vdso,
.val = 1,
},
/* end */
{ NULL, 0, NULL, 0, },
};
int main(int argc, char **argv)
{
// int fd;
@ -1190,12 +1359,15 @@ int main(int argc, char **argv)
}
/* Parse options ("+" denotes stop at the first non-option) */
while ((opt = getopt(argc, argv, "+c:")) != -1) {
while ((opt = getopt_long(argc, argv, "+c:", mcexec_options, NULL)) != -1) {
switch (opt) {
case 'c':
target_core = atoi(optarg);
break;
case 0: /* long opt */
break;
default: /* '?' */
print_usage(argv);
exit(EXIT_FAILURE);
@ -1234,7 +1406,59 @@ int main(int argc, char **argv)
return 1;
}
if (lookup_exec_path(argv[optind], path, sizeof(path)) != 0) {
#ifdef ENABLE_MCOVERLAYFS
__dprintf("mcoverlay enable\n");
char mcos_procdir[PATH_MAX];
char mcos_sysdir[PATH_MAX];
error = isunshare();
if (error == 0) {
struct sys_unshare_desc unshare_desc;
struct sys_mount_desc mount_desc;
memset(&unshare_desc, '\0', sizeof unshare_desc);
memset(&mount_desc, '\0', sizeof mount_desc);
unshare_desc.unshare_flags = CLONE_NEWNS;
if (ioctl(fd, MCEXEC_UP_SYS_UNSHARE,
(unsigned long)&unshare_desc) != 0) {
fprintf(stderr, "Error: Failed to unshare. (%s)\n",
strerror(errno));
return 1;
}
sprintf(mcos_procdir, "/tmp/mcos/mcos%d_proc", mcosid);
mount_desc.dev_name = mcos_procdir;
mount_desc.dir_name = "/proc";
mount_desc.type = NULL;
mount_desc.flags = MS_BIND;
mount_desc.data = NULL;
if (ioctl(fd, MCEXEC_UP_SYS_MOUNT,
(unsigned long)&mount_desc) != 0) {
fprintf(stderr, "Error: Failed to mount /proc. (%s)\n",
strerror(errno));
return 1;
}
sprintf(mcos_sysdir, "/tmp/mcos/mcos%d_sys", mcosid);
mount_desc.dev_name = mcos_sysdir;
mount_desc.dir_name = "/sys";
mount_desc.type = NULL;
mount_desc.flags = MS_BIND;
mount_desc.data = NULL;
if (ioctl(fd, MCEXEC_UP_SYS_MOUNT,
(unsigned long)&mount_desc) != 0) {
fprintf(stderr, "Error: Failed to mount /sys. (%s)\n",
strerror(errno));
return 1;
}
} else if (error == -1) {
return 1;
}
#else
__dprintf("mcoverlay disable\n");
#endif // ENABLE_MCOVERLAYFS
if (lookup_exec_path(argv[optind], path, sizeof(path), 1) != 0) {
fprintf(stderr, "error: finding file: %s\n", argv[optind]);
return 1;
}
@ -1246,7 +1470,7 @@ int main(int argc, char **argv)
/* Check whether shell script */
if (shell) {
if (lookup_exec_path(shell, shell_path, sizeof(shell_path)) != 0) {
if (lookup_exec_path(shell, shell_path, sizeof(shell_path), 0) != 0) {
fprintf(stderr, "error: finding file: %s\n", shell);
return 1;
}
@ -1272,6 +1496,8 @@ int main(int argc, char **argv)
//print_flat(args);
desc->cpu = target_core;
desc->enable_vdso = enable_vdso;
p = getenv(rlimit_stack_envname);
if (p) {
errno = 0;
@ -1306,6 +1532,19 @@ int main(int argc, char **argv)
return 1;
}
n_threads = ncpu;
if (ncpu > 16) {
n_threads = 16;
}
/*
* XXX: keep thread_data ncpu sized despite that there are only
* n_threads worker threads in the pool so that signaling code
* keeps working.
*
* TODO: fix signaling code to be independent of TIDs.
* TODO: implement dynaic thread pool resizing.
*/
thread_data = (struct thread_data_s *)malloc(sizeof(struct thread_data_s) * (ncpu + 1));
memset(thread_data, '\0', sizeof(struct thread_data_s) * (ncpu + 1));
@ -1348,7 +1587,10 @@ int main(int argc, char **argv)
}
print_desc(desc);
transfer_image(fd, desc);
if (transfer_image(fd, desc) < 0) {
fprintf(stderr, "error: transferring image\n");
return -1;
}
fflush(stdout);
fflush(stderr);
@ -1387,7 +1629,7 @@ int main(int argc, char **argv)
return 1;
}
for (i = 0; i <= ncpu; ++i) {
for (i = 0; i <= n_threads; ++i) {
pthread_join(thread_data[i].thread_id, NULL);
}
@ -1401,6 +1643,7 @@ void do_syscall_return(int fd, int cpu,
{
struct syscall_ret_desc desc;
memset(&desc, '\0', sizeof desc);
desc.cpu = cpu;
desc.ret = ret;
desc.src = src;
@ -1417,6 +1660,7 @@ void do_syscall_load(int fd, int cpu, unsigned long dest, unsigned long src,
{
struct syscall_load_desc desc;
memset(&desc, '\0', sizeof desc);
desc.cpu = cpu;
desc.src = src;
desc.dest = dest;
@ -1447,16 +1691,14 @@ do_generic_syscall(
}
static void
kill_thread(unsigned long cpu)
kill_thread(unsigned long tid)
{
if(cpu >= 0 && cpu < ncpu){
pthread_kill(thread_data[cpu].thread_id, LOCALSIG);
}
else{
int i;
int i;
for (i = 0; i < ncpu; ++i) {
for (i = 0; i < n_threads; ++i) {
if(thread_data[i].remote_tid == tid){
pthread_kill(thread_data[i].thread_id, LOCALSIG);
break;
}
}
}
@ -1466,6 +1708,7 @@ static long do_strncpy_from_user(int fd, void *dest, void *src, unsigned long n)
struct strncpy_from_user_desc desc;
int ret;
memset(&desc, '\0', sizeof desc);
desc.dest = dest;
desc.src = src;
desc.n = n;
@ -1560,6 +1803,9 @@ int close_cloexec_fds(int mcos_fd)
char *
chgpath(char *in, char *buf)
{
#ifdef ENABLE_MCOVERLAYFS
return in;
#endif // ENABLE_MCOVERLAYFS
char *fn = in;
struct stat sb;
@ -1589,10 +1835,11 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock)
char *fn;
int sig;
int term;
struct timeval tv;
struct timespec tv;
char pathbuf[PATH_MAX];
char tmpbuf[PATH_MAX];
memset(&w, '\0', sizeof w);
w.cpu = cpu;
w.pid = getpid();
@ -1608,6 +1855,8 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock)
//pthread_mutex_lock(lock);
thread_data[cpu].remote_tid = w.sr.rtid;
switch (w.sr.number) {
case __NR_open:
ret = do_strncpy_from_user(fd, pathbuf, (void *)w.sr.args[0], PATH_MAX);
@ -1628,13 +1877,13 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock)
break;
case __NR_futex:
ret = gettimeofday(&tv, NULL);
ret = clock_gettime(w.sr.args[1], &tv);
SET_ERR(ret);
__dprintf("gettimeofday=%016ld,%09ld\n",
__dprintf("clock_gettime=%016ld,%09ld\n",
tv.tv_sec,
tv.tv_usec);
tv.tv_nsec);
do_syscall_return(fd, cpu, ret, 1, (unsigned long)&tv,
w.sr.args[0], sizeof(struct timeval));
w.sr.args[0], sizeof(struct timespec));
break;
case __NR_kill: // interrupt syscall
@ -1646,13 +1895,13 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock)
sig = 0;
term = 0;
do_syscall_return(fd, cpu, 0, 0, 0, 0, 0);
/* Drop executable file */
if ((ret = ioctl(fd, MCEXEC_UP_CLOSE_EXEC)) != 0) {
fprintf(stderr, "WARNING: close_exec() couldn't find exec file?\n");
}
do_syscall_return(fd, cpu, 0, 0, 0, 0, 0);
__dprintf("__NR_exit/__NR_exit_group: %ld (cpu_id: %d)\n",
w.sr.args[0], cpu);
if(w.sr.number == __NR_exit_group){
@ -1690,17 +1939,12 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock)
return w.sr.args[0];
case __NR_mmap:
case __NR_munmap:
case __NR_mprotect:
/* reserved for internal use */
do_syscall_return(fd, cpu, -ENOSYS, 0, 0, 0, 0);
break;
case __NR_munmap:
ret = madvise((void *)w.sr.args[0], w.sr.args[1], MADV_DONTNEED);
SET_ERR(ret);
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;
#ifdef USE_SYSCALL_MOD_CALL
case 303:{
__dprintf("mcexec.c,mod_cal,mod=%ld,cmd=%ld\n", w.sr.args[0], w.sr.args[1]);
@ -1725,6 +1969,39 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock)
thread_data[oldcpuid].remote_tid = wtid;
}
/*
* Number of TIDs and the remote physical address where TIDs are
* expected are passed in arg 4 and 5, respectively.
*/
if (w.sr.args[4] > 0) {
struct remote_transfer trans;
int i = 0;
int *tids = malloc(sizeof(int) * w.sr.args[4]);
if (!tids) {
fprintf(stderr, "__NR_gettid(): error allocating TIDs\n");
goto gettid_out;
}
for (i = 0; i < ncpu && i < w.sr.args[4]; ++i) {
tids[i] = thread_data[i].tid;
}
for (; i < ncpu; ++i) {
tids[i] = 0;
}
trans.userp = (void*)tids;
trans.rphys = w.sr.args[5];
trans.size = sizeof(int) * w.sr.args[4];
trans.direction = MCEXEC_UP_TRANSFER_TO_REMOTE;
if (ioctl(fd, MCEXEC_UP_TRANSFER, &trans) != 0) {
fprintf(stderr, "__NR_gettid(): error transfering TIDs\n");
}
free(tids);
}
gettid_out:
do_syscall_return(fd, cpu, thread_data[newcpuid].remote_tid, 0, 0, 0, 0);
break;
}
@ -1734,6 +2011,7 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock)
struct fork_sync_container *fsc;
struct fork_sync_container *fp;
struct fork_sync_container *fb;
int flag = w.sr.args[0];
int rc = -1;
pid_t pid;
@ -1753,7 +2031,45 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock)
memset(fs, '\0', sizeof(struct fork_sync));
sem_init(&fs->sem, 1, 0);
pid = fork();
if(flag){
int pipefds[2];
if(pipe(pipefds) == -1){
rc = -errno;
sem_destroy(&fs->sem);
goto fork_err;
}
pid = fork();
if(pid == 0){
close(pipefds[0]);
pid = fork();
if(pid != 0){
if (write(pipefds[1], &pid, sizeof pid) != sizeof(pid)) {
fprintf(stderr, "error: writing pipefds\n");
}
exit(0);
}
}
else if(pid != -1){
int npid;
int st;
close(pipefds[1]);
if (read(pipefds[0], &npid, sizeof npid) != sizeof(npid)) {
fprintf(stderr, "error: reading pipefds\n");
}
close(pipefds[0]);
waitpid(pid, &st, 0);
pid = npid;
}
else{
rc = -errno;
sem_destroy(&fs->sem);
goto fork_err;
}
}
else
pid = fork();
switch (pid) {
/* Error */
@ -1781,7 +2097,6 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock)
/* Reinit signals and syscall threads */
init_sigaction();
init_worker_threads(fd);
__dprintf("pid(%d): signals and syscall threads OK\n",
getpid());
@ -1795,6 +2110,8 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock)
goto fork_child_sync_pipe;
}
init_worker_threads(fd);
fork_child_sync_pipe:
sem_post(&fs->sem);
if (fs->status)
@ -1896,15 +2213,16 @@ fork_err:
char path[1024];
char *filename;
int ret;
char *shell = NULL;
char *shell;
char shell_path[1024];
/* Load descriptor phase */
case 1:
shell = NULL;
filename = (char *)w.sr.args[1];
if ((ret = lookup_exec_path(filename, path, sizeof(path)))
if ((ret = lookup_exec_path(filename, path, sizeof(path), 0))
!= 0) {
goto return_execve1;
}
@ -1918,7 +2236,7 @@ fork_err:
/* Check whether shell script */
if (shell) {
if ((ret = lookup_exec_path(shell, shell_path,
sizeof(shell_path))) != 0) {
sizeof(shell_path), 0)) != 0) {
fprintf(stderr, "execve(): error: finding file: %s\n", shell);
goto return_execve1;
}
@ -1939,6 +2257,7 @@ fork_err:
strcpy(desc->shell_path, shell_path);
}
desc->enable_vdso = enable_vdso;
__dprintf("execve(): load_elf_desc() for %s OK, num sections: %d\n",
path, desc->num_sections);
@ -1979,6 +2298,7 @@ return_execve1:
fprintf(stderr, "execve(): error allocating desc\n");
goto return_execve2;
}
memset(desc, '\0', w.sr.args[2]);
/* Copy descriptor from co-kernel side */
trans.userp = (void*)desc;
@ -1995,7 +2315,10 @@ return_execve1:
__dprintf("%s", "execve(): transfer ELF desc OK\n");
transfer_image(fd, desc);
if (transfer_image(fd, desc) != 0) {
fprintf(stderr, "error: transferring image\n");
return -1;
}
__dprintf("%s", "execve(): image transferred\n");
if (close_cloexec_fds(fd) < 0) {
@ -2021,6 +2344,11 @@ return_execve2:
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;
case __NR_perf_event_open:
ret = open("/dev/null", O_RDONLY);
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;
case __NR_rt_sigaction:
act_sigaction(&w);
do_syscall_return(fd, cpu, 0, 0, 0, 0, 0);
@ -2042,6 +2370,53 @@ return_execve2:
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;
case __NR_setresuid:
ret = setresuid(w.sr.args[0], w.sr.args[1], w.sr.args[2]);
if(ret == -1)
ret = -errno;
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;
case __NR_setreuid:
ret = setreuid(w.sr.args[0], w.sr.args[1]);
if(ret == -1)
ret = -errno;
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;
case __NR_setuid:
ret = setuid(w.sr.args[0]);
if(ret == -1)
ret = -errno;
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;
case __NR_setresgid:
ret = setresgid(w.sr.args[0], w.sr.args[1], w.sr.args[2]);
if(ret == -1)
ret = -errno;
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;
case __NR_setregid:
ret = setregid(w.sr.args[0], w.sr.args[1]);
if(ret == -1)
ret = -errno;
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;
case __NR_setgid:
ret = setgid(w.sr.args[0]);
if(ret == -1)
ret = -errno;
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;
case __NR_setfsgid:
ret = setfsgid(w.sr.args[0]);
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;
case __NR_close:
if(w.sr.args[0] == fd)
ret = -EBADF;
@ -2050,13 +2425,34 @@ return_execve2:
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;
case __NR_readlink:
ret = do_strncpy_from_user(fd, pathbuf, (void *)w.sr.args[0], PATH_MAX);
if (ret >= PATH_MAX) {
ret = -ENAMETOOLONG;
}
if (ret < 0) {
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;
}
fn = chgpath(pathbuf, tmpbuf);
ret = readlink(fn, (char *)w.sr.args[1], w.sr.args[2]);
__dprintf("readlink: path=%s, buf=%s, ret=%ld\n",
fn, (char *)w.sr.args[1], ret);
SET_ERR(ret);
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;
default:
ret = do_generic_syscall(&w);
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;
}
thread_data[cpu].remote_tid = -1;
//pthread_mutex_unlock(lock);
}
__dprint("timed out.\n");

View File

@ -3,7 +3,7 @@ OBJS = init.o mem.o debug.o mikc.o listeners.o ap.o syscall.o cls.o host.o
OBJS += process.o copy.o waitq.o futex.o timer.o plist.o fileobj.o
DEPSRCS=$(wildcard $(SRC)/*.c)
CFLAGS += -I$(SRC)/include -mcmodel=kernel -D__KERNEL__
CFLAGS += -I$(SRC)/include -D__KERNEL__
CFLAGS += -DKNC_MAP_MICPA $(EXTRA_CFLAGS)
ifeq ("$(DCFA_MODE)", "kmod")

View File

@ -3,10 +3,10 @@ SRC=$(VPATH)
IHKDIR=$(IHKBASE)/$(TARGETDIR)
OBJS = init.o mem.o debug.o mikc.o listeners.o ap.o syscall.o cls.o host.o
OBJS += process.o copy.o waitq.o futex.o timer.o plist.o fileobj.o shmobj.o
OBJS += zeroobj.o procfs.o devobj.o
OBJS += zeroobj.o procfs.o devobj.o sysfs.o
DEPSRCS=$(wildcard $(SRC)/*.c)
CFLAGS += -I$(SRC)/include -mcmodel=kernel -D__KERNEL__ -g
CFLAGS += -I$(SRC)/include -D__KERNEL__ -g
LDFLAGS += -e arch_start
IHKOBJ = ihk/ihk.o

View File

@ -28,19 +28,19 @@
int num_processors = 1;
static volatile int ap_stop = 1;
extern void zero_tsc(void);
static void ap_wait(void)
{
init_tick();
while (ap_stop) {
barrier();
cpu_pause();
}
zero_tsc();
sync_tick();
kmalloc_init();
sched_init();
arch_start_pvclock();
if (find_command_line("hidos")) {
init_host_syscall_channel();
@ -56,7 +56,9 @@ static void ap_wait(void)
void ap_start(void)
{
init_tick();
ap_stop = 0;
sync_tick();
}
void ap_init(void)
@ -66,6 +68,7 @@ void ap_init(void)
int bsp_hw_id;
ihk_mc_init_ap();
init_delay();
cpu_info = ihk_mc_get_cpu_info();
bsp_hw_id = ihk_mc_get_hardware_processor_id();
@ -89,3 +92,140 @@ void ap_init(void)
kprintf("AP Booting: Done\n");
}
#include <sysfs.h>
#include <kmalloc.h>
#include <string.h>
#include <vsprintf.h>
static ssize_t
show_int(struct sysfs_ops *ops, void *instance, void *buf, size_t size)
{
int *p = instance;
return snprintf(buf, size, "%d\n", *p);
}/* show_int() */
struct sysfs_ops show_int_ops = {
.show = &show_int,
};
struct fake_cpu_info {
int online;
};
static struct fake_cpu_info *fake_cpu_infos = NULL;
enum fake_cpu_info_member {
ONLINE,
};
struct fake_cpu_info_ops {
enum fake_cpu_info_member member;
struct sysfs_ops ops;
};
static ssize_t
show_fake_cpu_info(struct sysfs_ops *ops0, void *instance, void *buf,
size_t size)
{
struct fake_cpu_info_ops *ops
= container_of(ops0, struct fake_cpu_info_ops, ops);
struct fake_cpu_info *info = instance;
ssize_t n;
switch (ops->member) {
case ONLINE:
n = snprintf(buf, size, "%d\n", info->online);
break;
default:
n = -EINVAL;
break;
}
if (n >= size) {
n = -ENOSPC;
}
return n;
} /* show_fake_cpu_info() */
static ssize_t
store_fake_cpu_info(struct sysfs_ops *ops0, void *instance, void *buf,
size_t size)
{
struct fake_cpu_info_ops *ops
= container_of(ops0, struct fake_cpu_info_ops, ops);
struct fake_cpu_info *info = instance;
ssize_t n;
switch (ops->member) {
case ONLINE:
kprintf("NYI:store_fake_cpu_info(%p,%p,%p,%ld): "
"online %d --> \"%.*s\"\n",
ops0, instance, buf, size, info->online,
(int)size, buf);
n = size;
break;
default:
n = -EIO;
break;
}
return n;
} /* store_fake_cpu_info() */
static struct fake_cpu_info_ops show_fci_online = {
.member = ONLINE,
.ops.show = &show_fake_cpu_info,
.ops.store = &store_fake_cpu_info,
};
void
cpu_sysfs_setup(void)
{
int error;
int cpu;
sysfs_handle_t targeth;
struct fake_cpu_info *info;
/* sample of simple variable **********************************/
error = sysfs_createf(&show_int_ops, &num_processors, 0444,
"/sys/devices/system/cpu/num_processors");
if (error) {
panic("cpu_sysfs_setup:sysfs_createf(num_processors) failed\n");
}
/* sample of more complex variable ****************************/
/* setup table */
info = kmalloc(sizeof(*info) * num_processors, IHK_MC_AP_CRITICAL);
for (cpu = 0; cpu < num_processors; ++cpu) {
info[cpu].online = 10+cpu;
}
fake_cpu_infos = info;
/* setup sysfs tree */
for (cpu = 0; cpu < num_processors; ++cpu) {
/* online */
error = sysfs_createf(&show_fci_online.ops,
&fake_cpu_infos[cpu], 0644,
"/sys/devices/system/cpu/cpu%d/online", cpu);
if (error) {
panic("cpu_sysfs_setup:sysfs_createf failed\n");
}
/* link to cpu%d */
error = sysfs_lookupf(&targeth,
"/sys/devices/system/cpu/cpu%d", cpu);
if (error) {
panic("cpu_sysfs_setup:sysfs_lookupf failed\n");
}
error = sysfs_symlinkf(targeth, "/sys/bus/cpu/devices/cpu%d",
cpu);
if (error) {
panic("cpu_sysfs_setup:sysfs_symlinkf failed\n");
}
}
return;
} /* cpu_sysfs_setup() */

View File

@ -1,6 +1,5 @@
CC = /usr/linux-k1om-4.7/bin/x86_64-k1om-linux-gcc
LD = /usr/linux-k1om-4.7/bin/x86_64-k1om-linux-ld
CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow
LDFLAGS += -m elf_k1om -T $(SRC)/config/attached-mic.lds
LDFLAGS_MKIMAGE = -m elf_k1om

View File

@ -3,6 +3,5 @@ LD = /usr/linux-k1om-4.7/bin/x86_64-k1om-linux-ld
OBJDUMP = /usr/linux-k1om-4.7/bin/x86_64-k1om-linux-objdump
OBJCOPY = /usr/linux-k1om-4.7/bin/x86_64-k1om-linux-objcopy
CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow
LDFLAGS += -m elf_k1om -T $(SRC)/config/builtin-mic.lds
LDFLAGS_MKIMAGE = -m elf_k1om

View File

@ -1,2 +1 @@
CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow
LDFLAGS += -T $(SRC)/config/builtin-x86.lds

View File

@ -1,2 +1 @@
CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow
LDFLAGS += -T $(SRC)/config/smp-x86.lds

View File

@ -22,13 +22,44 @@ extern int vsnprintf(char *buf, size_t size, const char *fmt, va_list args);
extern int sprintf(char * buf, const char *fmt, ...);
static ihk_spinlock_t kmsg_lock;
static unsigned long kprintf_lock_head(void);
static void kprintf_unlock_head(unsigned long irqflags);
static void kprintf_wait(int len, unsigned long *flags_head, int *slide) {
int head, tail, buf_len, mode, adj;
mode = kmsg_buf.mode;
while (1) {
adj = 0;
tail = kmsg_buf.tail;
buf_len = kmsg_buf.len;
head = kmsg_buf.head;
if (head < tail) head += buf_len;
if (tail + len > buf_len) adj = buf_len - tail;
if (head > tail && head <= tail + len + adj) {
if (mode != 1) {
*slide = 1;
break;
} else {
kprintf_unlock_head(*flags_head);
*flags_head = kprintf_lock_head();
}
} else {
break;
}
}
}
/* TODO: lock */
void kputs(char *buf)
{
int len = strlen(buf);
unsigned long flags;
int slide = 0;
unsigned long flags_tail, flags_head;
flags = __ihk_mc_spinlock_lock(&kmsg_lock);
flags_tail = kprintf_lock();
flags_head = kprintf_lock_head();
kprintf_wait(len, &flags_head, &slide);
if (len + kmsg_buf.tail > kmsg_buf.len) {
kmsg_buf.tail = 0;
@ -39,8 +70,12 @@ void kputs(char *buf)
memcpy(kmsg_buf.str + kmsg_buf.tail, buf, len);
kmsg_buf.tail += len;
__ihk_mc_spinlock_unlock(&kmsg_lock, flags);
if (slide == 1) {
kmsg_buf.head = kmsg_buf.tail + 1;
if (kmsg_buf.head >= kmsg_buf.len) kmsg_buf.head = 0;
}
kprintf_unlock_head(flags_head);
kprintf_unlock(flags_tail);
}
#define KPRINTF_LOCAL_BUF_LEN 1024
@ -55,44 +90,34 @@ void kprintf_unlock(unsigned long irqflags)
__ihk_mc_spinlock_unlock(&kmsg_lock, irqflags);
}
static unsigned long kprintf_lock_head(void)
{
return __ihk_mc_spinlock_lock(&kmsg_buf.lock);
}
static void kprintf_unlock_head(unsigned long irqflags)
{
__ihk_mc_spinlock_unlock(&kmsg_buf.lock, irqflags);
}
/* Caller must hold kmsg_lock! */
int __kprintf(const char *format, ...)
{
int len = 0;
int slide = 0;
va_list va;
unsigned long flags_head;
char buf[KPRINTF_LOCAL_BUF_LEN];
/* Copy into the local buf */
va_start(va, format);
len += vsnprintf(buf + len, KPRINTF_LOCAL_BUF_LEN - len - 2, format, va);
va_end(va);
/* Append to kmsg buffer */
if (kmsg_buf.tail + len > kmsg_buf.len) {
kmsg_buf.tail = 0;
}
memcpy(kmsg_buf.str + kmsg_buf.tail, buf, len);
kmsg_buf.tail += len;
return len;
}
int kprintf(const char *format, ...)
{
int len = 0;
va_list va;
unsigned long flags;
char buf[KPRINTF_LOCAL_BUF_LEN];
flags = __ihk_mc_spinlock_lock(&kmsg_lock);
/* Copy into the local buf */
len = sprintf(buf, "[%3d]: ", ihk_mc_get_processor_id());
va_start(va, format);
len += vsnprintf(buf + len, KPRINTF_LOCAL_BUF_LEN - len - 2, format, va);
va_end(va);
flags_head = kprintf_lock_head();
kprintf_wait(len, &flags_head, &slide);
/* Append to kmsg buffer */
if (kmsg_buf.tail + len > kmsg_buf.len) {
kmsg_buf.tail = 0;
@ -100,16 +125,58 @@ int kprintf(const char *format, ...)
memcpy(kmsg_buf.str + kmsg_buf.tail, buf, len);
kmsg_buf.tail += len;
if (slide == 1) {
kmsg_buf.head = kmsg_buf.tail + 1;
if (kmsg_buf.head >= kmsg_buf.len) kmsg_buf.head = 0;
}
__ihk_mc_spinlock_unlock(&kmsg_lock, flags);
kprintf_unlock_head(flags_head);
return len;
}
int kprintf(const char *format, ...)
{
int len = 0;
int slide = 0;
va_list va;
unsigned long flags_tail, flags_head;
char buf[KPRINTF_LOCAL_BUF_LEN];
/* Copy into the local buf */
len = sprintf(buf, "[%3d]: ", ihk_mc_get_processor_id());
va_start(va, format);
len += vsnprintf(buf + len, KPRINTF_LOCAL_BUF_LEN - len - 2, format, va);
va_end(va);
flags_tail = kprintf_lock();
flags_head = kprintf_lock_head();
kprintf_wait(len, &flags_head, &slide);
/* Append to kmsg buffer */
if (kmsg_buf.tail + len > kmsg_buf.len) {
kmsg_buf.tail = 0;
}
memcpy(kmsg_buf.str + kmsg_buf.tail, buf, len);
kmsg_buf.tail += len;
if (slide == 1) {
kmsg_buf.head = kmsg_buf.tail + 1;
if (kmsg_buf.head >= kmsg_buf.len) kmsg_buf.head = 0;
}
kprintf_unlock_head(flags_head);
kprintf_unlock(flags_tail);
return len;
}
void kmsg_init(void)
void kmsg_init(int mode)
{
ihk_mc_spinlock_init(&kmsg_lock);
kmsg_buf.tail = 0;
kmsg_buf.len = sizeof(kmsg_buf.str);
kmsg_buf.head = 0;
kmsg_buf.mode = mode;
ihk_mc_spinlock_init(&kmsg_buf.lock);
memset(kmsg_buf.str, 0, kmsg_buf.len);
}

View File

@ -78,51 +78,52 @@ static struct memobj *to_memobj(struct devobj *devobj)
/***********************************************************************
* devobj
*/
int devobj_create(int fd, size_t len, off_t off, struct memobj **objp, int *maxprotp)
int devobj_create(int fd, size_t len, off_t off, struct memobj **objp, int *maxprotp,
int prot, int populate_flags)
{
ihk_mc_user_context_t ctx;
struct pager_map_result result; // XXX: assumes contiguous physical
int error;
struct devobj *obj = NULL;
const size_t npages = (len + PAGE_SIZE - 1) / PAGE_SIZE;
const size_t pfn_npages = (npages / (PAGE_SIZE / sizeof(uintptr_t))) + 1;
dkprintf("devobj_create(%d,%lx,%lx)\n", fd, len, off);
#define MAX_PAGES_IN_DEVOBJ (PAGE_SIZE / sizeof(uintptr_t))
if (npages > MAX_PAGES_IN_DEVOBJ) {
error = -EFBIG;
kprintf("devobj_create(%d,%lx,%lx):too large len. %d\n", fd, len, off, error);
goto out;
}
dkprintf("%s: fd: %d, len: %lu, off: %lu \n", __FUNCTION__, fd, len, off);
obj = kmalloc(sizeof(*obj), IHK_MC_AP_NOWAIT);
if (!obj) {
error = -ENOMEM;
kprintf("devobj_create(%d,%lx,%lx):kmalloc failed. %d\n", fd, len, off, error);
kprintf("%s: error: fd: %d, len: %lu, off: %lu kmalloc failed.\n",
__FUNCTION__, fd, len, off);
goto out;
}
memset(obj, 0, sizeof(*obj));
obj->pfn_table = allocate_pages(1, IHK_MC_AP_NOWAIT);
obj->pfn_table = ihk_mc_alloc_pages(pfn_npages, IHK_MC_AP_NOWAIT);
if (!obj->pfn_table) {
error = -ENOMEM;
kprintf("devobj_create(%d,%lx,%lx):allocate_pages failed. %d\n", fd, len, off, error);
kprintf("%s: error: fd: %d, len: %lu, off: %lu allocating PFN failed.\n",
__FUNCTION__, fd, len, off);
goto out;
}
memset(obj->pfn_table, 0, 1*PAGE_SIZE);
memset(obj->pfn_table, 0, pfn_npages * PAGE_SIZE);
ihk_mc_syscall_arg0(&ctx) = PAGER_REQ_MAP;
ihk_mc_syscall_arg1(&ctx) = fd;
ihk_mc_syscall_arg2(&ctx) = len;
ihk_mc_syscall_arg3(&ctx) = off;
ihk_mc_syscall_arg4(&ctx) = virt_to_phys(&result);
ihk_mc_syscall_arg5(&ctx) = prot | populate_flags;
error = syscall_generic_forwarding(__NR_mmap, &ctx);
if (error) {
kprintf("devobj_create(%d,%lx,%lx):map failed. %d\n", fd, len, off, error);
kprintf("%s: error: fd: %d, len: %lu, off: %lu map failed.\n",
__FUNCTION__, fd, len, off);
goto out;
}
dkprintf("devobj_create:handle: %lx\n", result.handle);
dkprintf("devobj_create:maxprot: %x\n", result.maxprot);
dkprintf("%s: fd: %d, len: %lu, off: %lu, handle: %p, maxprot: %x\n",
__FUNCTION__, fd, len, off, result.handle, result.maxprot);
obj->memobj.ops = &devobj_ops;
obj->memobj.flags = MF_HAS_PAGER;
@ -140,11 +141,12 @@ int devobj_create(int fd, size_t len, off_t off, struct memobj **objp, int *maxp
out:
if (obj) {
if (obj->pfn_table) {
free_pages(obj->pfn_table, 1);
ihk_mc_free_pages(obj->pfn_table, pfn_npages);
}
kfree(obj);
}
dkprintf("devobj_create(%d,%lx,%lx): %d %p %x%d\n", fd, len, off, error, *objp, *maxprotp);
dkprintf("%s: ret: %d, fd: %d, len: %lu, off: %lu, handle: %p, maxprot: %x \n",
__FUNCTION__, error, fd, len, off, result.handle, result.maxprot);
return error;
}
@ -164,6 +166,8 @@ static void devobj_release(struct memobj *memobj)
struct devobj *obj = to_devobj(memobj);
struct devobj *free_obj = NULL;
uintptr_t handle;
const size_t pfn_npages =
(obj->npages / (PAGE_SIZE / sizeof(uintptr_t))) + 1;
dkprintf("devobj_release(%p %lx)\n", obj, obj->handle);
@ -192,7 +196,7 @@ static void devobj_release(struct memobj *memobj)
}
if (obj->pfn_table) {
free_pages(obj->pfn_table, 1);
ihk_mc_free_pages(obj->pfn_table, pfn_npages);
}
kfree(free_obj);
}
@ -204,7 +208,7 @@ static void devobj_release(struct memobj *memobj)
static int devobj_get_page(struct memobj *memobj, off_t off, int p2align, uintptr_t *physp, unsigned long *flag)
{
const off_t pgoff = off >> PAGE_SHIFT;
const off_t pgoff = off / PAGE_SIZE;
struct devobj *obj = to_devobj(memobj);
int error;
uintptr_t pfn;
@ -216,7 +220,7 @@ static int devobj_get_page(struct memobj *memobj, off_t off, int p2align, uintpt
if ((pgoff < obj->pfn_pgoff) || ((obj->pfn_pgoff + obj->npages) <= pgoff)) {
error = -EFBIG;
kprintf("devobj_get_page(%p %lx,%lx,%d): out of range. %d\n", memobj, obj->handle, off, p2align, error);
kprintf("%s: error: out of range: off: %lu, page off: %lu obj->npages: %d\n", __FUNCTION__, off, pgoff, obj->npages);
goto out;
}
ix = pgoff - obj->pfn_pgoff;

View File

@ -47,6 +47,7 @@ static memobj_get_page_func_t fileobj_get_page;
static memobj_copy_page_func_t fileobj_copy_page;
static memobj_flush_page_func_t fileobj_flush_page;
static memobj_invalidate_page_func_t fileobj_invalidate_page;
static memobj_lookup_page_func_t fileobj_lookup_page;
static struct memobj_ops fileobj_ops = {
.release = &fileobj_release,
@ -55,6 +56,7 @@ static struct memobj_ops fileobj_ops = {
.copy_page = &fileobj_copy_page,
.flush_page = &fileobj_flush_page,
.invalidate_page = &fileobj_invalidate_page,
.lookup_page = &fileobj_lookup_page,
};
static struct fileobj *to_fileobj(struct memobj *memobj)
@ -180,7 +182,7 @@ int fileobj_create(int fd, struct memobj **objp, int *maxprotp)
error = syscall_generic_forwarding(__NR_mmap, &ctx);
if (error) {
kprintf("fileobj_create(%d):create failed. %d\n", fd, error);
dkprintf("fileobj_create(%d):create failed. %d\n", fd, error);
goto out;
}
@ -609,3 +611,37 @@ out:
memobj, phys, pgsize, error);
return error;
}
static int fileobj_lookup_page(struct memobj *memobj, off_t off, int p2align, uintptr_t *physp, unsigned long *pflag)
{
struct fileobj *obj = to_fileobj(memobj);
int error;
uintptr_t phys = -1;
struct page *page;
dkprintf("fileobj_lookup_page(%p,%lx,%x,%p)\n", obj, off, p2align, physp);
memobj_lock(&obj->memobj);
if (p2align != PAGE_P2ALIGN) {
error = -ENOMEM;
goto out;
}
page = page_list_lookup(obj, off);
if (!page) {
error = -ENOENT;
dkprintf("fileobj_lookup_page(%p,%lx,%x,%p): page not found. %d\n", obj, off, p2align, physp, error);
goto out;
}
phys = page_to_phys(page);
error = 0;
if (physp) {
*physp = phys;
}
out:
memobj_unlock(&obj->memobj);
dkprintf("fileobj_lookup_page(%p,%lx,%x,%p): %d %lx\n",
obj, off, p2align, physp, error, phys);
return error;
}

View File

@ -79,8 +79,6 @@
#define dkprintf(...) do { if (0) kprintf(__VA_ARGS__); } while (0)
#endif
extern struct sigpending *hassigpending(struct thread *thread);
int futex_cmpxchg_enabled;
/**
@ -153,7 +151,7 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
*/
static void get_futex_key_refs(union futex_key *key)
{
/* RIKEN: only !fshared futexes... */
/* RIKEN: no swapping in McKernel */
return;
}
@ -163,7 +161,7 @@ static void get_futex_key_refs(union futex_key *key)
*/
static void drop_futex_key_refs(union futex_key *key)
{
/* RIKEN: only !fshared futexes... */
/* RIKEN: no swapping in McKernel */
return;
}
/**
@ -185,6 +183,7 @@ static int
get_futex_key(uint32_t *uaddr, int fshared, union futex_key *key)
{
unsigned long address = (unsigned long)uaddr;
unsigned long phys;
struct process_vm *mm = cpu_local_var(current)->vm;
/*
@ -203,15 +202,31 @@ get_futex_key(uint32_t *uaddr, int fshared, union futex_key *key)
* but access_ok() should be faster than find_vma()
*/
if (!fshared) {
key->private.mm = mm;
key->private.address = address;
get_futex_key_refs(key);
return 0;
}
/* RIKEN: No shared futex support... */
return -EFAULT;
key->both.offset |= FUT_OFF_MMSHARED;
retry_v2p:
/* Just use physical address of page, McKernel does not do swapping */
if (ihk_mc_pt_virt_to_phys(mm->address_space->page_table,
(void *)uaddr, &phys)) {
/* Check if we can fault in page */
if (page_fault_process_vm(mm, uaddr, PF_POPULATE | PF_WRITE | PF_USER)) {
kprintf("error: get_futex_key() virt to phys translation failed\n");
return -EFAULT;
}
goto retry_v2p;
}
key->shared.phys = (void *)phys;
key->shared.pgoff = 0;
return 0;
}
@ -234,7 +249,7 @@ static int cmpxchg_futex_value_locked(uint32_t __user *uaddr, uint32_t uval, uin
static int get_futex_value_locked(uint32_t *dest, uint32_t *from)
{
/* RIKEN: futexes are always on not swappable pages */
*dest = *from;
*dest = getint_user((int *)from);
return 0;
}
@ -265,6 +280,7 @@ static void wake_futex(struct futex_q *q)
barrier();
q->lock_ptr = NULL;
dkprintf("wake_futex(): waking up tid %d\n", p->tid);
sched_wakeup_thread(p, PS_NORMAL);
}
@ -667,12 +683,16 @@ static uint64_t futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q
/* RIKEN: use mcos timers */
if (timeout) {
dkprintf("futex_wait_queue_me(): tid: %d schedule_timeout()\n", cpu_local_var(current)->tid);
time_remain = schedule_timeout(timeout);
}
else {
dkprintf("futex_wait_queue_me(): tid: %d schedule()\n", cpu_local_var(current)->tid);
schedule();
time_remain = 0;
}
dkprintf("futex_wait_queue_me(): tid: %d woken up\n", cpu_local_var(current)->tid);
}
/* This does not need to be serialized */
@ -777,10 +797,10 @@ retry:
if (timeout && !time_remain)
goto out_put_key;
if(hassigpending(cpu_local_var(current))){
if (hassigpending(cpu_local_var(current))) {
ret = -EINTR;
goto out_put_key;
}
}
/* RIKEN: no signals */
put_futex_key(fshared, &q.key);
@ -793,17 +813,10 @@ out:
}
int futex(uint32_t *uaddr, int op, uint32_t val, uint64_t timeout,
uint32_t *uaddr2, uint32_t val2, uint32_t val3)
uint32_t *uaddr2, uint32_t val2, uint32_t val3, int fshared)
{
int clockrt, ret = -ENOSYS;
int cmd = op & FUTEX_CMD_MASK;
int fshared = 0;
/* RIKEN: Assume address space private futexes.
if (!(op & FUTEX_PRIVATE_FLAG)) {
fshared = 1;
}
*/
clockrt = op & FUTEX_CLOCK_REALTIME;
if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
@ -824,8 +837,7 @@ int futex(uint32_t *uaddr, int op, uint32_t val, uint64_t timeout,
ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0);
break;
case FUTEX_CMP_REQUEUE:
ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3,
0);
ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, 0);
break;
case FUTEX_WAKE_OP:
ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3);

View File

@ -30,6 +30,7 @@
#include <mman.h>
#include <init.h>
#include <kmalloc.h>
#include <sysfs.h>
//#define DEBUG_PRINT_HOST
@ -84,15 +85,17 @@ int prepare_process_ranges_args_envs(struct thread *thread,
struct process *proc = thread->proc;
struct process_vm *vm = proc->vm;
struct address_space *as = vm->address_space;
long aout_base;
int error;
n = p->num_sections;
aout_base = (pn->reloc)? vm->region.map_end: 0;
for (i = 0; i < n; i++) {
if (pn->sections[i].interp && (interp_nbase == (uintptr_t)-1)) {
interp_obase = pn->sections[i].vaddr;
interp_obase -= (interp_obase % pn->interp_align);
interp_nbase = vm->region.map_start;
interp_nbase = vm->region.map_end;
interp_nbase = (interp_nbase + pn->interp_align - 1)
& ~(pn->interp_align - 1);
}
@ -102,6 +105,10 @@ int prepare_process_ranges_args_envs(struct thread *thread,
pn->sections[i].vaddr += interp_nbase;
p->sections[i].vaddr = pn->sections[i].vaddr;
}
else{
pn->sections[i].vaddr += aout_base;
p->sections[i].vaddr = pn->sections[i].vaddr;
}
s = (pn->sections[i].vaddr) & PAGE_MASK;
e = (pn->sections[i].vaddr + pn->sections[i].len
+ PAGE_SIZE - 1) & PAGE_MASK;
@ -117,7 +124,8 @@ int prepare_process_ranges_args_envs(struct thread *thread,
}
up = virt_to_phys(up_v);
if (add_process_memory_range(vm, s, e, up, flags, NULL, 0) != 0) {
if (add_process_memory_range(vm, s, e, up, flags, NULL, 0,
PAGE_SHIFT) != 0) {
ihk_mc_free_pages(up_v, range_npages);
kprintf("ERROR: adding memory range for ELF section %i\n", i);
goto err;
@ -170,6 +178,10 @@ int prepare_process_ranges_args_envs(struct thread *thread,
(e > vm->region.data_end ?
e : vm->region.data_end);
}
if (aout_base) {
vm->region.map_end = e;
}
}
if (interp_nbase != (uintptr_t)-1) {
@ -181,6 +193,11 @@ int prepare_process_ranges_args_envs(struct thread *thread,
pn->entry);
}
if (aout_base) {
pn->at_phdr += aout_base;
pn->at_entry += aout_base;
}
vm->region.brk_start = vm->region.brk_end = vm->region.data_end;
/* Map, copy and update args and envs */
@ -196,7 +213,7 @@ int prepare_process_ranges_args_envs(struct thread *thread,
args_envs_p = virt_to_phys(args_envs);
if(add_process_memory_range(vm, addr, e, args_envs_p,
flags, NULL, 0) != 0){
flags, NULL, 0, PAGE_SHIFT) != 0){
ihk_mc_free_pages(args_envs, ARGENV_PAGE_COUNT);
kprintf("ERROR: adding memory range for args/envs\n");
goto err;
@ -227,9 +244,9 @@ int prepare_process_ranges_args_envs(struct thread *thread,
p->args_len = args_len;
}
dkprintf("args copy, nr: %d\n", *((int*)args_envs_r));
dkprintf("args copy, nr: %d\n", *((long *)args_envs_r));
memcpy_long(args_envs, args_envs_r, p->args_len + 8);
memcpy_long(args_envs, args_envs_r, p->args_len + sizeof(long) - 1);
/* Only unmap remote address if it wasn't specified as an argument */
if (!args) {
@ -262,9 +279,9 @@ int prepare_process_ranges_args_envs(struct thread *thread,
p->envs_len = envs_len;
}
dkprintf("envs copy, nr: %d\n", *((int*)args_envs_r));
dkprintf("envs copy, nr: %d\n", *((long *)args_envs_r));
memcpy_long(args_envs + p->args_len, args_envs_r, p->envs_len + 8);
memcpy_long(args_envs + p->args_len, args_envs_r, p->envs_len + sizeof(long) - 1);
/* Only map remote address if it wasn't specified as an argument */
if (!envs) {
@ -274,10 +291,10 @@ int prepare_process_ranges_args_envs(struct thread *thread,
flush_tlb();
// Update variables
argc = *((int*)(args_envs));
argc = *((long *)(args_envs));
dkprintf("argc: %d\n", argc);
argv = (char **)(args_envs + (sizeof(int)));
argv = (char **)(args_envs + (sizeof(long)));
if(proc->saved_cmdline){
kfree(proc->saved_cmdline);
proc->saved_cmdline_len = 0;
@ -294,20 +311,31 @@ int prepare_process_ranges_args_envs(struct thread *thread,
*a = (char *)addr + (unsigned long)*a; // Process' address space!
}
envc = *((int*)(args_envs + p->args_len));
envc = *((long *)(args_envs + p->args_len));
dkprintf("envc: %d\n", envc);
env = (char **)(args_envs + p->args_len + sizeof(int));
env = (char **)(args_envs + p->args_len + sizeof(long));
while (*env) {
char **_env = env;
//dkprintf("%s\n", args_envs + p->args_len + (unsigned long)*env);
*env = (char *)addr + p->args_len + (unsigned long)*env;
env = ++_env;
}
env = (char **)(args_envs + p->args_len + sizeof(int));
env = (char **)(args_envs + p->args_len + sizeof(long));
dkprintf("env OK\n");
if (pn->enable_vdso) {
error = arch_map_vdso(vm);
if (error) {
kprintf("ERROR: mapping vdso pages. %d\n", error);
goto err;
}
}
else {
vm->vdso_addr = NULL;
}
p->rprocess = (unsigned long)thread;
p->rpgtable = virt_to_phys(as->page_table);
@ -348,10 +376,16 @@ static int process_msg_prepare_process(unsigned long rphys)
}
n = p->num_sections;
if (n > 16) {
kprintf("%s: ERROR: more ELF sections than 16??\n",
__FUNCTION__);
return -ENOMEM;
}
dkprintf("# of sections: %d\n", n);
if((pn = ihk_mc_allocate(sizeof(struct program_load_desc)
+ sizeof(struct program_image_section) * n, IHK_MC_AP_NOWAIT)) == NULL){
if((pn = kmalloc(sizeof(struct program_load_desc)
+ sizeof(struct program_image_section) * n,
IHK_MC_AP_NOWAIT)) == NULL){
ihk_mc_unmap_virtual(p, npages, 0);
ihk_mc_unmap_memory(NULL, phys, sz);
return -ENOMEM;
@ -360,7 +394,7 @@ static int process_msg_prepare_process(unsigned long rphys)
+ sizeof(struct program_image_section) * n);
if((thread = create_thread(p->entry)) == NULL){
ihk_mc_free(pn);
kfree(pn);
ihk_mc_unmap_virtual(p, npages, 1);
ihk_mc_unmap_memory(NULL, phys, sz);
return -ENOMEM;
@ -379,11 +413,23 @@ static int process_msg_prepare_process(unsigned long rphys)
proc->egid = pn->cred[5];
proc->sgid = pn->cred[6];
proc->fsgid = pn->cred[7];
proc->termsig = SIGCHLD;
vm->region.user_start = pn->user_start;
vm->region.user_end = pn->user_end;
vm->region.map_start = (USER_END / 3) & LARGE_PAGE_MASK;
vm->region.map_end = proc->vm->region.map_start;
if(vm->region.user_end > USER_END)
vm->region.user_end = USER_END;
if(vm->region.user_start != 0UL ||
vm->region.user_end < TASK_UNMAPPED_BASE){
vm->region.map_start =
(vm->region.user_start +
(vm->region.user_end - vm->region.user_start) / 3) &
LARGE_PAGE_MASK;
}
else{
vm->region.map_start = TASK_UNMAPPED_BASE;
}
vm->region.map_end = vm->region.map_start;
memcpy(proc->rlimit, pn->rlimit, sizeof(struct rlimit) * MCK_RLIM_MAX);
/* TODO: Clear it at the proper timing */
@ -398,7 +444,7 @@ static int process_msg_prepare_process(unsigned long rphys)
dkprintf("new process : %p [%d] / table : %p\n", proc, proc->pid,
vm->address_space->page_table);
ihk_mc_free(pn);
kfree(pn);
ihk_mc_unmap_virtual(p, npages, 1);
ihk_mc_unmap_memory(NULL, phys, sz);
@ -406,7 +452,7 @@ static int process_msg_prepare_process(unsigned long rphys)
return 0;
err:
ihk_mc_free(pn);
kfree(pn);
ihk_mc_unmap_virtual(p, npages, 1);
ihk_mc_unmap_memory(NULL, phys, sz);
destroy_thread(thread);
@ -415,7 +461,7 @@ err:
static void process_msg_init(struct ikc_scd_init_param *pcp, struct syscall_params *lparam)
{
lparam->response_va = allocate_pages(RESPONSE_PAGE_COUNT, 0);
lparam->response_va = ihk_mc_alloc_pages(RESPONSE_PAGE_COUNT, 0);
lparam->response_pa = virt_to_phys(lparam->response_va);
pcp->request_page = 0;
@ -425,7 +471,7 @@ static void process_msg_init(struct ikc_scd_init_param *pcp, struct syscall_para
static void process_msg_init_acked(struct ihk_ikc_channel_desc *c, unsigned long pphys)
{
struct ikc_scd_init_param *param = (void *)pphys;
struct ikc_scd_init_param *param = phys_to_virt(pphys);
struct syscall_params *lparam;
enum ihk_mc_pt_attribute attr;
@ -484,21 +530,42 @@ static void syscall_channel_send(struct ihk_ikc_channel_desc *c,
}
extern unsigned long do_kill(struct thread *, int, int, int, struct siginfo *, int ptracecont);
extern void settid(struct thread *proc, int mode, int newcpuid, int oldcpuid);
extern void process_procfs_request(unsigned long rarg);
extern int memcheckall();
extern int freecheck(int runcount);
extern int runcount;
extern void terminate_host(int pid);
extern void debug_log(long);
static void req_get_cpu_mapping(long req_rpa)
{
size_t mapsize;
size_t size;
int npages;
long phys;
struct get_cpu_mapping_req *req;
struct cpu_mapping *buf;
size = sizeof(*req);
mapsize = size + (req_rpa & (PAGE_SIZE - 1));
npages = (mapsize + PAGE_SIZE - 1) >> PAGE_SHIFT;
phys = ihk_mc_map_memory(NULL, req_rpa, size);
req = ihk_mc_map_virtual(phys, npages, PTATTR_WRITABLE);
req->error = arch_get_cpu_mapping(&buf, &req->buf_elems);
if (!req->error) {
req->buf_rpa = virt_to_phys(buf);
}
ihk_mc_unmap_virtual(req, npages, 0);
ihk_mc_unmap_memory(NULL, phys, size);
return;
} /* req_get_cpu_mapping() */
static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
void *__packet, void *ihk_os)
{
struct ikc_scd_packet *packet = __packet;
struct ikc_scd_packet pckt;
int rc;
struct mcs_rwlock_node_irqsave lock;
struct thread *thread;
struct process *proc;
struct mcctrl_signal {
@ -510,22 +577,17 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
} *sp, info;
unsigned long pp;
int cpuid;
int ret = 0;
switch (packet->msg) {
case SCD_MSG_INIT_CHANNEL_ACKED:
dkprintf("SCD_MSG_INIT_CHANNEL_ACKED\n");
process_msg_init_acked(c, packet->arg);
return 0;
ret = 0;
break;
case SCD_MSG_PREPARE_PROCESS:
if (find_command_line("memdebug")) {
memcheckall();
if (runcount)
freecheck(runcount);
runcount++;
}
if((rc = process_msg_prepare_process(packet->arg)) == 0){
pckt.msg = SCD_MSG_PREPARE_PROCESS_ACKED;
pckt.err = 0;
@ -538,19 +600,21 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
pckt.arg = packet->arg;
syscall_channel_send(c, &pckt);
return 0;
ret = 0;
break;
case SCD_MSG_SCHEDULE_PROCESS:
cpuid = obtain_clone_cpuid();
if(cpuid == -1){
kprintf("No CPU available\n");
return -1;
ret = -1;
break;
}
dkprintf("SCD_MSG_SCHEDULE_PROCESS: %lx\n", packet->arg);
thread = (struct thread *)packet->arg;
proc = thread->proc;
settid(thread, 0, cpuid, -1);
settid(thread, 0, cpuid, -1, 0, NULL);
proc->status = PS_RUNNING;
thread->status = PS_RUNNING;
chain_thread(thread);
@ -558,7 +622,29 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
runq_add_thread(thread, cpuid);
//cpu_local_var(next) = (struct thread *)packet->arg;
return 0;
ret = 0;
break;
/*
* Used for syscall offload reply message to explicitly schedule in
* the waiting thread
*/
case SCD_MSG_WAKE_UP_SYSCALL_THREAD:
thread = find_thread(0, packet->ttid, &lock);
if (!thread) {
kprintf("%s: WARNING: no thread for SCD reply? TID: %d\n",
__FUNCTION__, packet->ttid);
ret = -EINVAL;
break;
}
thread_unlock(thread, &lock);
dkprintf("%s: SCD_MSG_WAKE_UP_SYSCALL_THREAD: waking up tid %d\n",
__FUNCTION__, packet->ttid);
waitq_wakeup(&thread->scd_wq);
ret = 0;
break;
case SCD_MSG_SEND_SIGNAL:
pp = ihk_mc_map_memory(NULL, packet->arg, sizeof(struct mcctrl_signal));
sp = (struct mcctrl_signal *)ihk_mc_map_virtual(pp, 1, PTATTR_WRITABLE | PTATTR_ACTIVE);
@ -573,20 +659,56 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
rc = do_kill(NULL, info.pid, info.tid, info.sig, &info.info, 0);
kprintf("SCD_MSG_SEND_SIGNAL: do_kill(pid=%d, tid=%d, sig=%d)=%d\n", info.pid, info.tid, info.sig, rc);
return 0;
ret = 0;
break;
case SCD_MSG_PROCFS_REQUEST:
process_procfs_request(packet->arg);
return 0;
ret = 0;
break;
case SCD_MSG_CLEANUP_PROCESS:
dkprintf("SCD_MSG_CLEANUP_PROCESS pid=%d\n", packet->pid);
terminate_host(packet->pid);
return 0;
ret = 0;
break;
case SCD_MSG_DEBUG_LOG:
dkprintf("SCD_MSG_DEBUG_LOG code=%lx\n", packet->arg);
debug_log(packet->arg);
return 0;
ret = 0;
break;
case SCD_MSG_SYSFS_REQ_SHOW:
case SCD_MSG_SYSFS_REQ_STORE:
case SCD_MSG_SYSFS_REQ_RELEASE:
sysfss_packet_handler(c, packet->msg, packet->err,
packet->sysfs_arg1, packet->sysfs_arg2,
packet->sysfs_arg3);
ret = 0;
break;
case SCD_MSG_GET_CPU_MAPPING:
req_get_cpu_mapping(packet->arg);
pckt.msg = SCD_MSG_REPLY_GET_CPU_MAPPING;
pckt.arg = packet->arg;
syscall_channel_send(c, &pckt);
ret = 0;
break;
default:
kprintf("syscall_pakcet_handler:unknown message "
"(%d.%d.%d.%d.%d.%#lx)\n",
packet->msg, packet->ref, packet->osnum,
packet->pid, packet->err, packet->arg);
ret = 0;
break;
}
return 0;
ihk_ikc_release_packet((struct ihk_ikc_free_packet *)packet, c);
return ret;
}
void init_host_syscall_channel(void)

View File

@ -1,6 +1,8 @@
#ifndef _LINUX_AUXVEC_H
#define _LINUX_AUXVEC_H
#include <arch/auxvec.h>
/* Symbolic values for the entries in the auxiliary table
put on the initial stack */
#define AT_NULL 0 /* end of vector */

View File

@ -19,11 +19,13 @@
* CPU Local Storage (cls)
*/
struct malloc_header {
unsigned int check;
struct kmalloc_header {
unsigned int front_magic;
unsigned int cpu_id;
struct malloc_header *next;
unsigned long size;
struct list_head list;
int size; /* The size of this chunk without the header */
unsigned int end_magic;
/* 32 bytes */
};
#include <ihk/lock.h>
@ -38,8 +40,9 @@ extern ihk_spinlock_t cpu_status_lock;
struct cpu_local_var {
/* malloc */
struct malloc_header free_list;
ihk_spinlock_t free_list_lock;
struct list_head free_list;
struct list_head remote_free_list;
ihk_spinlock_t remote_free_list_lock;
struct thread idle;
struct process idle_proc;
@ -73,6 +76,7 @@ struct cpu_local_var {
int in_interrupt;
int no_preempt;
int timer_enabled;
int kmalloc_initialized;
} __attribute__((aligned(64)));

View File

@ -99,6 +99,8 @@
#ifdef __KERNEL__
#define __user
/* We don't deal with uaccess at the moment, because x86 can access
* userspace directly, we rely on glibc and the app developers.
*/
@ -106,42 +108,14 @@
#include <arch/uaccess.h>
#endif
#include <asm.h>
#include <errno.h>
#define __user
#include <arch-futex.h>
#if 0
#include <arch/processor.h>
#include <arch/system.h>
#endif
#define __futex_atomic_op1(insn, ret, oldval, uaddr, oparg) \
asm volatile("1:\t" insn "\n" \
"2:\t.section .fixup,\"ax\"\n" \
"3:\tmov\t%3, %1\n" \
"\tjmp\t2b\n" \
"\t.previous\n" \
_ASM_EXTABLE(1b, 3b) \
: "=r" (oldval), "=r" (ret), "+m" (*uaddr) \
: "i" (-EFAULT), "0" (oparg), "1" (0))
#define __futex_atomic_op2(insn, ret, oldval, uaddr, oparg) \
asm volatile("1:\tmovl %2, %0\n" \
"\tmovl\t%0, %3\n" \
"\t" insn "\n" \
"2:\tlock; cmpxchgl %3, %2\n" \
"\tjnz\t1b\n" \
"3:\t.section .fixup,\"ax\"\n" \
"4:\tmov\t%5, %1\n" \
"\tjmp\t3b\n" \
"\t.previous\n" \
_ASM_EXTABLE(1b, 4b) \
_ASM_EXTABLE(2b, 4b) \
: "=&a" (oldval), "=&r" (ret), \
"+m" (*uaddr), "=&r" (tem) \
: "r" (oparg), "i" (-EFAULT), "1" (0))
static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
{
int op = (encoded_op >> 28) & 7;
@ -206,28 +180,6 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
return ret;
}
static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval,
int newval)
{
#ifdef __UACCESS__
if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
return -EFAULT;
#endif
asm volatile("1:\tlock; cmpxchgl %3, %1\n"
"2:\t.section .fixup, \"ax\"\n"
"3:\tmov %2, %0\n"
"\tjmp 2b\n"
"\t.previous\n"
_ASM_EXTABLE(1b, 3b)
: "=a" (oldval), "+m" (*uaddr)
: "i" (-EFAULT), "r" (newval), "0" (oldval)
: "memory"
);
return oldval;
}
#endif // __KERNEL__
#endif // _ASM_X86_FUTEX_H
@ -241,13 +193,11 @@ static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval,
struct process_vm;
union futex_key {
#if 0
struct {
unsigned long pgoff;
struct inode *inode;
void *phys;
int offset;
} shared;
#endif
struct {
unsigned long address;
struct process_vm *mm;
@ -261,6 +211,7 @@ union futex_key {
};
#define FUTEX_KEY_INIT (union futex_key) { .both = { .ptr = NULL } }
#define FUT_OFF_MMSHARED 2
extern int futex_init(void);
@ -272,7 +223,8 @@ futex(
uint64_t timeout,
uint32_t __user * uaddr2,
uint32_t val2,
uint32_t val3
uint32_t val3,
int fshared
);

View File

@ -0,0 +1,23 @@
/**
* \file rlimit.h
* License details are found in the file LICENSE.
* \brief
* Kinds of resource limit
* \author Taku Shimosawa <shimosawa@is.s.u-tokyo.ac.jp> \par
* Copyright (C) 2011 - 2012 Taku Shimosawa
*/
/*
* HISTORY
*/
#ifndef __GENERIC_RLIMIT_H
#define __GENERIC_RLIMIT_H
typedef uint64_t rlim_t;
struct rlimit {
rlim_t rlim_cur; /* Soft limit */
rlim_t rlim_max; /* Hard limit (ceiling for rlim_cur) */
};
#endif

View File

@ -14,7 +14,7 @@
#define INIT_H
extern void arch_init(void);
extern void kmsg_init(void);
extern void kmsg_init(int);
extern void mem_init(void);
extern void ikc_master_init(void);
extern void ap_init(void);
@ -28,6 +28,7 @@ extern void init_host_syscall_channel(void);
extern void init_host_syscall_channel2(void);
extern void sched_init(void);
extern void pc_ap_init(void);
extern void cpu_sysfs_setup(void);
extern char *find_command_line(char *name);

View File

@ -32,11 +32,10 @@ void *_kmalloc(int size, enum ihk_mc_ap_flag flag, char *file, int line);
void _kfree(void *ptr, char *file, int line);
void *__kmalloc(int size, enum ihk_mc_ap_flag flag);
void __kfree(void *ptr);
void *___kmalloc(int size, enum ihk_mc_ap_flag flag);
void ___kfree(void *ptr);
int _memcheck(void *ptr, char *msg, char *file, int line, int free);
int memcheckall();
int freecheck(int runcount);
void kmalloc_consolidate_free_list(void);
#endif

View File

@ -16,6 +16,6 @@
void kputs(char *buf);
int kprintf(const char *format, ...);
void kmsg_init(void);
void kmsg_init(int);
#endif

View File

@ -92,7 +92,8 @@ futex(
uint64_t timeout,
uint32_t __user * uaddr2,
uint32_t val2,
uint32_t val3
uint32_t val3,
int fshared
);
extern long

View File

@ -47,6 +47,7 @@ typedef int memobj_get_page_func_t(struct memobj *obj, off_t off, int p2align, u
typedef uintptr_t memobj_copy_page_func_t(struct memobj *obj, uintptr_t orgphys, int p2align);
typedef int memobj_flush_page_func_t(struct memobj *obj, uintptr_t phys, size_t pgsize);
typedef int memobj_invalidate_page_func_t(struct memobj *obj, uintptr_t phys, size_t pgsize);
typedef int memobj_lookup_page_func_t(struct memobj *obj, off_t off, int p2align, uintptr_t *physp, unsigned long *flag);
struct memobj_ops {
memobj_release_func_t * release;
@ -55,6 +56,7 @@ struct memobj_ops {
memobj_copy_page_func_t * copy_page;
memobj_flush_page_func_t * flush_page;
memobj_invalidate_page_func_t * invalidate_page;
memobj_lookup_page_func_t * lookup_page;
};
static inline void memobj_release(struct memobj *obj)
@ -106,6 +108,15 @@ static inline int memobj_invalidate_page(struct memobj *obj, uintptr_t phys,
return 0;
}
static inline int memobj_lookup_page(struct memobj *obj, off_t off,
int p2align, uintptr_t *physp, unsigned long *pflag)
{
if (obj->ops->lookup_page) {
return (*obj->ops->lookup_page)(obj, off, p2align, physp, pflag);
}
return -ENXIO;
}
static inline void memobj_lock(struct memobj *obj)
{
ihk_mc_spinlock_lock_noirq(&obj->lock);
@ -130,6 +141,7 @@ int fileobj_create(int fd, struct memobj **objp, int *maxprotp);
struct shmid_ds;
int shmobj_create(struct shmid_ds *ds, struct memobj **objp);
int zeroobj_create(struct memobj **objp);
int devobj_create(int fd, size_t len, off_t off, struct memobj **objp, int *maxprotp);
int devobj_create(int fd, size_t len, off_t off, struct memobj **objp, int *maxprotp,
int prot, int populate_flags);
#endif /* HEADER_MEMOBJ_H */

View File

@ -29,6 +29,7 @@
#define VR_IO_NOCACHE 0x100
#define VR_REMOTE 0x200
#define VR_WRITE_COMBINED 0x400
#define VR_DONTFORK 0x800
#define VR_DEMAND_PAGING 0x1000
#define VR_PRIVATE 0x2000
#define VR_LOCKED 0x4000
@ -160,7 +161,9 @@
#endif
#define USER_STACK_NR_PAGES 8192
#define KERNEL_STACK_NR_PAGES 25
#define KERNEL_STACK_NR_PAGES 32
#define NOPHYS ((uintptr_t)-1)
#include <waitq.h>
#include <futex.h>
@ -216,9 +219,11 @@ struct thread_hash {
struct address_space {
struct page_table *page_table;
int type;
#define ADDRESS_SPACE_NORMAL 1
#define ADDRESS_SPACE_PVAS 2
void *opt;
void (*free_cb)(struct address_space *, void *);
ihk_atomic_t refcount;
cpu_set_t cpu_set;
ihk_spinlock_t cpu_set_lock;
int nslots;
int pids[];
};
@ -288,7 +293,7 @@ struct user
unsigned long int u_debugreg [8];
};
#define AUXV_LEN 16
#define AUXV_LEN 18
struct vm_range {
struct list_head list;
@ -296,6 +301,8 @@ struct vm_range {
unsigned long flag;
struct memobj *memobj;
off_t objoff;
int pgshift; /* page size. 0 means THP */
int padding;
};
struct vm_regions {
@ -310,18 +317,25 @@ struct vm_regions {
struct process_vm;
struct sigfd {
struct sigfd *next;
struct mckfd {
struct mckfd *next;
int fd;
__sigset_t mask;
int sig_no;
long data;
void *opt;
long (*read_cb)(struct mckfd *, ihk_mc_user_context_t *);
int (*ioctl_cb)(struct mckfd *, ihk_mc_user_context_t *);
long (*mmap_cb)(struct mckfd *, ihk_mc_user_context_t *);
int (*close_cb)(struct mckfd *, ihk_mc_user_context_t *);
int (*fcntl_cb)(struct mckfd *, ihk_mc_user_context_t *);
};
#define SFD_CLOEXEC 02000000
#define SFD_NONBLOCK 04000
struct sig_common {
ihk_spinlock_t lock;
ihk_atomic_t use;
struct sigfd *sigfd;
struct k_sigaction action[_NSIG];
struct list_head sigpending;
};
@ -335,13 +349,18 @@ struct sig_pending {
typedef void pgio_func_t(void *arg);
struct mcexec_tid {
int tid;
struct thread *thread;
};
/* Represents a node in the process fork tree, it may exist even after the
* corresponding process exited due to references from the parent and/or
* children and is used for implementing wait/waitpid without having a
* special "init" process */
struct process {
struct list_head hash_list;
mcs_rwlock_lock_t update_lock; // lock for parent, status, ...?
mcs_rwlock_lock_t update_lock; // lock for parent, status, cpu time...
// process vm
struct process_vm *vm;
@ -349,6 +368,9 @@ struct process {
// threads and children
struct list_head threads_list;
mcs_rwlock_lock_t threads_lock; // lock for threads_list
/* TID set of proxy process */
struct mcexec_tid *tids;
int nr_tids;
/* The ptracing process behave as the parent of the ptraced process
after using PTRACE_ATTACH except getppid. So we save it here. */
@ -398,6 +420,7 @@ struct process {
int fsgid;
int execed;
int nohost;
int nowait;
struct rlimit rlimit[MCK_RLIM_MAX];
unsigned long saved_auxv[AUXV_LEN];
char *saved_cmdline;
@ -422,6 +445,27 @@ struct process {
/* Store signal sent to parent when the process terminates. */
int termsig;
ihk_spinlock_t mckfd_lock;
struct mckfd *mckfd;
// cpu time (summary)
struct timespec stime;
struct timespec utime;
// cpu time (children)
struct timespec stime_children;
struct timespec utime_children;
long maxrss;
long maxrss_children;
// perf_event
int perf_status;
#define PP_NONE 0
#define PP_RESET 1
#define PP_COUNT 2
#define PP_STOP 3
struct mc_perf_event *monitoring_event;
};
void hold_thread(struct thread *ftn);
@ -509,6 +553,23 @@ struct thread {
unsigned long *ptrace_debugreg; /* debug registers for ptrace */
struct sig_pending *ptrace_recvsig;
struct sig_pending *ptrace_sendsig;
// cpu time
struct timespec stime;
struct timespec utime;
struct timespec btime;
int times_update;
int in_kernel;
// interval timers
int itimer_enabled;
struct itimerval itimer_virtual;
struct itimerval itimer_prof;
struct timespec itimer_virtual_value;
struct timespec itimer_prof_value;
/* Syscall offload wait queue head */
struct waitq scd_wq;
};
struct process_vm {
@ -516,6 +577,10 @@ struct process_vm {
struct list_head vm_range_list;
struct vm_regions region;
struct process *proc; /* process that reside on the same page */
void *opt;
void (*free_cb)(struct process_vm *, void *);
void *vdso_addr;
void *vvar_addr;
ihk_spinlock_t page_table_lock;
ihk_spinlock_t memory_range_lock;
@ -526,12 +591,25 @@ struct process_vm {
// is protected by its own lock (see ihk/manycore/generic/page_alloc.c)
ihk_atomic_t refcount;
cpu_set_t cpu_set;
ihk_spinlock_t cpu_set_lock;
int exiting;
long currss;
};
static inline int has_cap_ipc_lock(struct thread *th)
{
/* CAP_IPC_LOCK (= 14) */
return !(th->proc->euid);
}
static inline int has_cap_sys_admin(struct thread *th)
{
/* CAP_SYS_ADMIN (= 21) */
return !(th->proc->euid);
}
void hold_address_space(struct address_space *);
void release_address_space(struct address_space *);
struct thread *create_thread(unsigned long user_pc);
struct thread *clone_thread(struct thread *org, unsigned long pc,
unsigned long sp, int clone_flags);
@ -549,7 +627,7 @@ int populate_process_memory(struct process_vm *vm, void *start, size_t len);
int add_process_memory_range(struct process_vm *vm,
unsigned long start, unsigned long end,
unsigned long phys, unsigned long flag,
struct memobj *memobj, off_t objoff);
struct memobj *memobj, off_t objoff, int pgshift);
int remove_process_memory_range(struct process_vm *vm, unsigned long start,
unsigned long end, int *ro_freedp);
int split_process_memory_range(struct process_vm *vm,
@ -610,5 +688,9 @@ void process_unlock(struct process *proc, struct mcs_rwlock_node_irqsave *lock);
void chain_process(struct process *);
void chain_thread(struct thread *);
void proc_init();
void set_timer();
struct sig_pending *hassigpending(struct thread *thread);
void settid(struct thread *thread, int mode, int newcpuid, int oldcpuid,
int nr_tids, int *tids);
#endif

View File

@ -25,6 +25,7 @@ enum {
IPC_CREAT = 01000,
IPC_EXCL = 02000,
SHM_HUGETLB = 04000,
SHM_RDONLY = 010000,
SHM_RND = 020000,
SHM_REMAP = 040000,
@ -46,11 +47,14 @@ enum {
SHM_INFO = 14,
};
struct shmlock_user;
struct shmobj {
struct memobj memobj; /* must be first */
int index;
uint8_t padding[4];
int pgshift;
size_t real_segsz;
struct shmlock_user * user;
struct shmid_ds ds;
struct list_head page_list;
struct list_head chain; /* shmobj_list */
@ -75,9 +79,33 @@ struct shm_info {
uint64_t swap_successes;
};
struct shmlock_user {
uid_t ruid;
int padding;
size_t locked;
struct list_head chain;
};
extern ihk_spinlock_t shmlock_users_lock_body;
static inline void shmlock_users_lock(void)
{
ihk_mc_spinlock_lock_noirq(&shmlock_users_lock_body);
return;
}
static inline void shmlock_users_unlock(void)
{
ihk_mc_spinlock_unlock_noirq(&shmlock_users_lock_body);
return;
}
void shmobj_list_lock(void);
void shmobj_list_unlock(void);
int shmobj_create_indexed(struct shmid_ds *ds, struct shmobj **objp);
void shmobj_destroy(struct shmobj *obj);
void shmlock_user_free(struct shmlock_user *user);
int shmlock_user_get(uid_t ruid, struct shmlock_user **userp);
#endif /* HEADER_SHM_H */

View File

@ -31,6 +31,7 @@
#define SCD_MSG_PREPARE_PROCESS_ACKED 0x2
#define SCD_MSG_PREPARE_PROCESS_NACKED 0x7
#define SCD_MSG_SCHEDULE_PROCESS 0x3
#define SCD_MSG_WAKE_UP_SYSCALL_THREAD 0x14
#define SCD_MSG_INIT_CHANNEL 0x5
#define SCD_MSG_INIT_CHANNEL_ACKED 0x6
@ -38,6 +39,10 @@
#define SCD_MSG_SYSCALL_ONESIDE 0x4
#define SCD_MSG_SEND_SIGNAL 0x8
#define SCD_MSG_CLEANUP_PROCESS 0x9
#define SCD_MSG_GET_VDSO_INFO 0xa
#define SCD_MSG_GET_CPU_MAPPING 0xc
#define SCD_MSG_REPLY_GET_CPU_MAPPING 0xd
#define SCD_MSG_PROCFS_CREATE 0x10
#define SCD_MSG_PROCFS_DELETE 0x11
@ -46,10 +51,28 @@
#define SCD_MSG_DEBUG_LOG 0x20
#define ARCH_SET_GS 0x1001
#define ARCH_SET_FS 0x1002
#define ARCH_GET_FS 0x1003
#define ARCH_GET_GS 0x1004
#define SCD_MSG_SYSFS_REQ_CREATE 0x30
/* #define SCD_MSG_SYSFS_RESP_CREATE 0x31 */
#define SCD_MSG_SYSFS_REQ_MKDIR 0x32
/* #define SCD_MSG_SYSFS_RESP_MKDIR 0x33 */
#define SCD_MSG_SYSFS_REQ_SYMLINK 0x34
/* #define SCD_MSG_SYSFS_RESP_SYMLINK 0x35 */
#define SCD_MSG_SYSFS_REQ_LOOKUP 0x36
/* #define SCD_MSG_SYSFS_RESP_LOOKUP 0x37 */
#define SCD_MSG_SYSFS_REQ_UNLINK 0x38
/* #define SCD_MSG_SYSFS_RESP_UNLINK 0x39 */
#define SCD_MSG_SYSFS_REQ_SHOW 0x3a
#define SCD_MSG_SYSFS_RESP_SHOW 0x3b
#define SCD_MSG_SYSFS_REQ_STORE 0x3c
#define SCD_MSG_SYSFS_RESP_STORE 0x3d
#define SCD_MSG_SYSFS_REQ_RELEASE 0x3e
#define SCD_MSG_SYSFS_RESP_RELEASE 0x3f
#define SCD_MSG_SYSFS_REQ_SETUP 0x40
#define SCD_MSG_SYSFS_RESP_SETUP 0x41
/* #define SCD_MSG_SYSFS_REQ_CLEANUP 0x42 */
/* #define SCD_MSG_SYSFS_RESP_CLEANUP 0x43 */
#define SCD_MSG_PROCFS_TID_CREATE 0x44
#define SCD_MSG_PROCFS_TID_DELETE 0x45
/* Cloning flags. */
# define CSIGNAL 0x000000ff /* Signal mask to be sent at exit. */
@ -94,14 +117,6 @@ struct user_desc {
unsigned int useable:1;
unsigned int lm:1;
};
struct ikc_scd_packet {
int msg;
int ref;
int osnum;
int pid;
int err;
unsigned long arg;
};
struct program_image_section {
unsigned long vaddr;
@ -143,6 +158,9 @@ struct program_load_desc {
int stack_prot;
int pgid;
int cred[8];
int reloc;
char enable_vdso;
char padding[7];
unsigned long entry;
unsigned long user_start;
unsigned long user_end;
@ -171,13 +189,58 @@ struct ikc_scd_init_param {
};
struct syscall_request {
/* TID of requesting thread */
int rtid;
/*
* TID of target thread. Remote page fault response needs to designate the
* thread that must serve the request, 0 indicates any thread from the pool
*/
int ttid;
unsigned long valid;
unsigned long number;
unsigned long args[6];
};
struct ikc_scd_packet {
int msg;
int err;
union {
/* for traditional SCD_MSG_* */
struct {
int ref;
int osnum;
int pid;
unsigned long arg;
struct syscall_request req;
unsigned long resp_pa;
};
/* for SCD_MSG_SYSFS_* */
struct {
long sysfs_arg1;
long sysfs_arg2;
long sysfs_arg3;
};
/* SCD_MSG_SCHEDULE_THREAD */
struct {
int ttid;
};
};
char padding[12];
};
#define IHK_SCD_REQ_THREAD_SPINNING 0
#define IHK_SCD_REQ_THREAD_TO_BE_WOKEN 1
#define IHK_SCD_REQ_THREAD_DESCHEDULED 2
struct syscall_response {
/* TID of the thread that requested the service */
int ttid;
/* TID of the mcexec thread that is serving the request */
int stid;
unsigned long status;
unsigned long req_thread_status;
long ret;
unsigned long fault_address;
unsigned long fault_reason;
@ -278,6 +341,7 @@ struct procfs_read {
int ret; /* read bytes (answer) */
int status; /* non-zero if done (answer) */
int newcpu; /* migrated new cpu (answer) */
int readwrite; /* 0:read, 1:write */
char fname[PROCFS_NAME_MAX]; /* procfs filename (request) */
};
@ -287,6 +351,29 @@ struct procfs_file {
char fname[PROCFS_NAME_MAX]; /* procfs filename (request) */
};
#define RUSAGE_SELF 0
#define RUSAGE_CHILDREN -1
#define RUSAGE_THREAD 1
struct rusage {
struct timeval ru_utime;
struct timeval ru_stime;
long ru_maxrss;
long ru_ixrss;
long ru_idrss;
long ru_isrss;
long ru_minflt;
long ru_majflt;
long ru_nswap;
long ru_inblock;
long ru_oublock;
long ru_msgsnd;
long ru_msgrcv;
long ru_nsignals;
long ru_nvcsw;
long ru_nivcsw;
};
extern void terminate(int, int);
struct tod_data_s {
@ -298,4 +385,50 @@ struct tod_data_s {
};
extern struct tod_data_s tod_data; /* residing in arch-dependent file */
void reset_cputime();
void set_cputime(int mode);
intptr_t do_mmap(intptr_t addr0, size_t len0, int prot, int flags, int fd,
off_t off0);
void clear_host_pte(uintptr_t addr, size_t len);
typedef int32_t key_t;
int do_shmget(key_t key, size_t size, int shmflg);
struct process_vm;
int arch_map_vdso(struct process_vm *vm); /* arch dependent */
int arch_setup_vdso(void);
#define VDSO_MAXPAGES 2
struct vdso {
long busy;
int vdso_npages;
char vvar_is_global;
char hpet_is_global;
char pvti_is_global;
char padding;
long vdso_physlist[VDSO_MAXPAGES];
void *vvar_virt;
long vvar_phys;
void *hpet_virt;
long hpet_phys;
void *pvti_virt;
long pvti_phys;
};
struct cpu_mapping {
int cpu_number;
int hw_id;
};
struct get_cpu_mapping_req {
int busy; /* INOUT: */
int error; /* OUT: */
long buf_rpa; /* OUT: physical address of struct cpu_mapping */
int buf_elems; /* OUT: # of elements of buf */
int padding;
/* work for mcctrl */
#if 0
wait_queue_head_t wq;
#endif
};
#endif

71
kernel/include/sysfs.h Normal file
View File

@ -0,0 +1,71 @@
/**
* \file sysfs.h
* License details are found in the file LICENSE.
* \brief
* sysfs framework API definitions
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com> \par
* Copyright (C) 2015 RIKEN AICS
*/
/*
* HISTORY:
*/
#ifndef MCKERNEL_SYSFS_H
#define MCKERNEL_SYSFS_H
#define SYSFS_PATH_MAX 1024
/* for sysfs_unlinkf() */
#define SYSFS_UNLINK_KEEP_ANCESTOR 0x01
struct sysfs_ops {
ssize_t (*show)(struct sysfs_ops *ops, void *instance, void *buf,
size_t bufsize);
ssize_t (*store)(struct sysfs_ops *ops, void *instance, void *buf,
size_t bufsize);
void (*release)(struct sysfs_ops *ops, void *instance);
};
struct sysfs_handle {
long handle;
};
typedef struct sysfs_handle sysfs_handle_t;
struct sysfs_bitmap_param {
int nbits;
int padding;
void *ptr;
};
#define SYSFS_SPECIAL_OPS_MIN ((void *)1)
#define SYSFS_SPECIAL_OPS_MAX ((void *)1000)
#define SYSFS_SNOOPING_OPS_d32 ((void *)1)
#define SYSFS_SNOOPING_OPS_d64 ((void *)2)
#define SYSFS_SNOOPING_OPS_u32 ((void *)3)
#define SYSFS_SNOOPING_OPS_u64 ((void *)4)
#define SYSFS_SNOOPING_OPS_s ((void *)5)
#define SYSFS_SNOOPING_OPS_pbl ((void *)6)
#define SYSFS_SNOOPING_OPS_pb ((void *)7)
#define SYSFS_SNOOPING_OPS_u32K ((void *)8)
static inline int is_special_sysfs_ops(void *ops)
{
return (((long)SYSFS_SPECIAL_OPS_MIN <= (long)ops)
&& ((long)ops <= (long)SYSFS_SPECIAL_OPS_MAX));
}
extern int sysfs_createf(struct sysfs_ops *ops, void *instance, int mode,
const char *fmt, ...);
extern int sysfs_mkdirf(sysfs_handle_t *dirhp, const char *fmt, ...);
extern int sysfs_symlinkf(sysfs_handle_t targeth, const char *fmt, ...);
extern int sysfs_lookupf(sysfs_handle_t *objhp, const char *fmt, ...);
extern int sysfs_unlinkf(int flags, const char *fmt, ...);
extern void sysfs_init(void);
struct ihk_ikc_channel_desc;
extern void sysfss_packet_handler(struct ihk_ikc_channel_desc *ch, int msg,
int error, long arg1, long arg2, long arg3);
#endif /* MCKERNEL_SYSFS_H */

View File

@ -0,0 +1,88 @@
/**
* \file sysfs_msg.h
* License details are found in the file LICENSE.
* \brief
* message declarations for sysfs framework
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com> \par
* Copyright (C) 2015 RIKEN AICS
*/
/*
* HISTORY:
*/
#ifndef MCKERNEL_SYSFS_MSG_H
#define MCKERNEL_SYSFS_MSG_H
#define SYSFS_PATH_MAX 1024
struct sysfs_req_create_param {
int mode;
int error;
long client_ops;
long client_instance;
char path[SYSFS_PATH_MAX];
int padding;
int busy;
}; /* struct sysfs_req_create_param */
#define SYSFS_SPECIAL_OPS_MIN ((void *)1)
#define SYSFS_SPECIAL_OPS_MAX ((void *)1000)
#define SYSFS_SNOOPING_OPS_d32 ((void *)1)
#define SYSFS_SNOOPING_OPS_d64 ((void *)2)
#define SYSFS_SNOOPING_OPS_u32 ((void *)3)
#define SYSFS_SNOOPING_OPS_u64 ((void *)4)
#define SYSFS_SNOOPING_OPS_s ((void *)5)
#define SYSFS_SNOOPING_OPS_pbl ((void *)6)
#define SYSFS_SNOOPING_OPS_pb ((void *)7)
#define SYSFS_SNOOPING_OPS_u32K ((void *)8)
struct sysfs_req_mkdir_param {
int error;
int padding;
long handle;
char path[SYSFS_PATH_MAX];
int padding2;
int busy;
}; /* struct sysfs_req_mkdir_param */
struct sysfs_req_symlink_param {
int error;
int padding;
long target;
char path[SYSFS_PATH_MAX];
int padding2;
int busy;
}; /* struct sysfs_req_symlink_param */
struct sysfs_req_lookup_param {
int error;
int padding;
long handle;
char path[SYSFS_PATH_MAX];
int padding2;
int busy;
}; /* struct sysfs_req_lookup_param */
/* for sysfs_req_unlink_param.flags */
#define SYSFS_UNLINK_KEEP_ANCESTOR 0x01
struct sysfs_req_unlink_param {
int flags;
int error;
char path[SYSFS_PATH_MAX];
int padding;
int busy;
}; /* struct sysfs_req_unlink_param */
struct sysfs_req_setup_param {
int error;
int padding;
long buf_rpa;
long bufsize;
char padding3[SYSFS_PATH_MAX];
int padding2;
int busy;
}; /* struct sysfs_req_setup_param */
#endif /* MCKERNEL_SYSFS_MSG_H */

View File

@ -20,6 +20,10 @@
#define __TIME_H
#define NS_PER_SEC 1000000000UL
#define CLOCK_REALTIME 0
#define CLOCK_MONOTONIC 1
#define CLOCK_PROCESS_CPUTIME_ID 2
#define CLOCK_THREAD_CPUTIME_ID 3
typedef long int __time_t;
@ -49,5 +53,72 @@ struct timezone
int tz_dsttime; /* Nonzero if DST is ever in effect. */
};
#define ITIMER_REAL 0
#define ITIMER_VIRTUAL 1
#define ITIMER_PROF 2
struct itimerval {
struct timeval it_interval;
struct timeval it_value;
};
static inline void
ts_add(struct timespec *ats, const struct timespec *bts)
{
ats->tv_sec += bts->tv_sec;
ats->tv_nsec += bts->tv_nsec;
while(ats->tv_nsec >= 1000000000){
ats->tv_sec++;
ats->tv_nsec -= 1000000000;
}
}
static inline void
ts_sub(struct timespec *ats, const struct timespec *bts)
{
ats->tv_sec -= bts->tv_sec;
ats->tv_nsec -= bts->tv_nsec;
while(ats->tv_nsec < 0){
ats->tv_sec--;
ats->tv_nsec += 1000000000;
}
}
static inline void
tv_add(struct timeval *ats, const struct timeval *bts)
{
ats->tv_sec += bts->tv_sec;
ats->tv_usec += bts->tv_usec;
while(ats->tv_usec >= 1000000){
ats->tv_sec++;
ats->tv_usec -= 1000000;
}
}
static inline void
tv_sub(struct timeval *ats, const struct timeval *bts)
{
ats->tv_sec -= bts->tv_sec;
ats->tv_usec -= bts->tv_usec;
while(ats->tv_usec < 0){
ats->tv_sec--;
ats->tv_usec += 1000000;
}
}
static inline void
tv_to_ts(struct timespec *ats, const struct timeval *bts)
{
ats->tv_sec = bts->tv_sec;
ats->tv_nsec = bts->tv_usec * 1000;
}
static inline void
ts_to_tv(struct timeval *ats, const struct timespec *bts)
{
ats->tv_sec = bts->tv_sec;
ats->tv_usec = bts->tv_nsec / 1000;
}
#endif // __TIME_H

View File

@ -30,6 +30,7 @@
#include <init.h>
#include <cls.h>
#include <syscall.h>
#include <sysfs.h>
//#define IOCTL_FUNC_EXTENSION
#ifdef IOCTL_FUNC_EXTENSION
@ -207,6 +208,7 @@ static void time_init(void)
{
unsigned long tv_sec, tv_nsec;
unsigned long ns_per_kclock;
unsigned long tsc;
ihk_mc_get_boot_time(&tv_sec, &tv_nsec);
ns_per_kclock = ihk_mc_get_ns_per_tsc();
@ -216,6 +218,15 @@ static void time_init(void)
if (ns_per_kclock) {
tod_data.clocks_per_sec = (1000L * NS_PER_SEC) / ns_per_kclock;
tsc = rdtsc();
tod_data.origin.tv_sec -= tsc / tod_data.clocks_per_sec;
tod_data.origin.tv_nsec -= NS_PER_SEC * (tsc % tod_data.clocks_per_sec)
/ tod_data.clocks_per_sec;
if (tod_data.origin.tv_nsec < 0) {
--tod_data.origin.tv_sec;
tod_data.origin.tv_nsec += NS_PER_SEC;
}
}
if (!ns_per_kclock) {
@ -250,9 +261,70 @@ static void rest_init(void)
sched_init();
}
static void setup_remote_snooping_samples(void)
{
static long lvalue = 0xf123456789abcde0;
static char *svalue = "string(remote)";
int error;
struct sysfs_bitmap_param param;
error = sysfs_createf(SYSFS_SNOOPING_OPS_d32, &lvalue, 0444, "/sys/test/remote/d32");
if (error) {
panic("setup_remote_snooping_samples: d32");
}
error = sysfs_createf(SYSFS_SNOOPING_OPS_d64, &lvalue, 0444, "/sys/test/remote/d64");
if (error) {
panic("setup_remote_snooping_samples: d64");
}
error = sysfs_createf(SYSFS_SNOOPING_OPS_u32, &lvalue, 0444, "/sys/test/remote/u32");
if (error) {
panic("setup_remote_snooping_samples: u32");
}
error = sysfs_createf(SYSFS_SNOOPING_OPS_u64, &lvalue, 0444, "/sys/test/remote/u64");
if (error) {
panic("setup_remote_snooping_samples: u64");
}
error = sysfs_createf(SYSFS_SNOOPING_OPS_s, svalue, 0444, "/sys/test/remote/s");
if (error) {
panic("setup_remote_snooping_samples: s");
}
param.nbits = 40;
param.ptr = &lvalue;
error = sysfs_createf(SYSFS_SNOOPING_OPS_pbl, &param, 0444, "/sys/test/remote/pbl");
if (error) {
panic("setup_remote_snooping_samples: pbl");
}
param.nbits = 40;
param.ptr = &lvalue;
error = sysfs_createf(SYSFS_SNOOPING_OPS_pb, &param, 0444, "/sys/test/remote/pb");
if (error) {
panic("setup_remote_snooping_samples: pb");
}
error = sysfs_createf(SYSFS_SNOOPING_OPS_u32K, &lvalue, 0444, "/sys/test/remote/u32K");
if (error) {
panic("setup_remote_snooping_samples: u32K");
}
return;
} /* setup_remote_snooping_samples() */
static void populate_sysfs(void)
{
cpu_sysfs_setup();
setup_remote_snooping_samples();
} /* populate_sysfs() */
int host_ikc_inited = 0;
extern int num_processors;
extern void zero_tsc(void);
static void post_init(void)
{
@ -271,13 +343,12 @@ static void post_init(void)
ihk_mc_spinlock_init(&syscall_lock);
}
/* Zero TSC.
* All AP cores are wait spinning for ap_start() and they will zero
* their TSC immediatly. */
zero_tsc();
arch_setup_vdso();
arch_start_pvclock();
ap_start();
create_os_procfs_files();
sysfs_init();
populate_sysfs();
}
#ifdef DCFA_RUN
extern void user_main();
@ -290,9 +361,17 @@ extern void ibmic_cmd_init(void);
int main(void)
{
kmsg_init();
char *ptr;
int mode = 0;
kputs("MCK started.\n");
ptr = find_command_line("ksyslogd=");
if (ptr) {
mode = ptr[9] - 0x30;
if (mode < 0 || mode > 2) mode = 0;
}
kmsg_init(mode);
kputs("IHK/McKernel started.\n");
arch_init();
@ -314,7 +393,7 @@ int main(void)
futex_init();
kputs("MCK/IHK booted.\n");
kputs("IHK/McKernel booted.\n");
#ifdef DCFA_KMOD
mc_cmd_client_init();

View File

@ -17,6 +17,7 @@
#include <ihk/debug.h>
#include <ihk/ikc.h>
#include <ikc/master.h>
#include <arch/cpu.h>
//#define DEBUG_LISTENERS
@ -28,16 +29,6 @@
#define ekprintf(...) kprintf(__VA_ARGS__)
#endif
static unsigned long read_tsc(void)
{
unsigned int low, high;
asm volatile("rdtsc" : "=a"(low), "=d"(high));
return (low | ((unsigned long)high << 32));
}
void testmem(void *v, unsigned long size)
{
unsigned long i, st, ed, s = 0;

View File

@ -52,7 +52,8 @@ static struct ihk_page_allocator_desc *pa_allocator;
static unsigned long pa_start, pa_end;
static struct page *pa_pages;
extern int ihk_mc_pt_print_pte(struct page_table *pt, void *virt);
extern void unhandled_page_fault(struct thread *, void *, void *);
extern int interrupt_from_user(void *);
struct tlb_flush_entry tlb_flush_vector[IHK_TLB_FLUSH_IRQ_VECTOR_SIZE];
@ -155,13 +156,17 @@ void sbox_write(int offset, unsigned int value);
static void query_free_mem_interrupt_handler(void *priv)
{
#ifdef ATTACHED_MIC
dkprintf("query free mem handler!\n");
int pages = ihk_pagealloc_query_free(pa_allocator);
dkprintf("free pages: %d\n", pages);
kprintf("McKernel free pages: %d\n", pages);
if (find_command_line("memdebug")) {
extern void kmalloc_memcheck(void);
kmalloc_memcheck();
}
#ifdef ATTACHED_MIC
sbox_write(SBOX_SCRATCH0, pages);
sbox_write(SBOX_SCRATCH1, 1);
#endif
@ -209,61 +214,6 @@ void coredump(struct thread *thread, void *regs)
freecore(&coretable);
}
static void unhandled_page_fault(struct thread *thread, void *fault_addr, void *regs)
{
const uintptr_t address = (uintptr_t)fault_addr;
struct process_vm *vm = thread->vm;
struct vm_range *range;
char found;
unsigned long irqflags;
unsigned long error = ((struct x86_user_context *)regs)->gpr.error;
irqflags = kprintf_lock();
dkprintf("[%d] Page fault for 0x%lX\n",
ihk_mc_get_processor_id(), address);
dkprintf("%s for %s access in %s mode (reserved bit %s set), "
"it %s an instruction fetch\n",
(error & PF_PROT ? "protection fault" : "no page found"),
(error & PF_WRITE ? "write" : "read"),
(error & PF_USER ? "user" : "kernel"),
(error & PF_RSVD ? "was" : "wasn't"),
(error & PF_INSTR ? "was" : "wasn't"));
found = 0;
list_for_each_entry(range, &vm->vm_range_list, list) {
if (range->start <= address && range->end > address) {
found = 1;
dkprintf("address is in range, flag: 0x%X! \n",
range->flag);
ihk_mc_pt_print_pte(vm->address_space->page_table, (void*)address);
break;
}
}
if (!found) {
dkprintf("address is out of range! \n");
}
kprintf_unlock(irqflags);
/* TODO */
ihk_mc_debug_show_interrupt_context(regs);
//dkprintf("now dump a core file\n");
//coredump(proc, regs);
#ifdef DEBUG_PRINT_MEM
{
uint64_t *sp = (void *)REGS_GET_STACK_POINTER(regs);
kprintf("*rsp:%lx,*rsp+8:%lx,*rsp+16:%lx,*rsp+24:%lx,\n",
sp[0], sp[1], sp[2], sp[3]);
}
#endif
return;
}
void remote_flush_tlb_cpumask(struct process_vm *vm,
unsigned long addr, int cpu_id)
{
@ -285,9 +235,9 @@ void remote_flush_tlb_cpumask(struct process_vm *vm,
/* Take a copy of the cpu set so that we don't hold the lock
* all the way while interrupting other cores */
ihk_mc_spinlock_lock_noirq(&vm->cpu_set_lock);
memcpy(&_cpu_set, &vm->cpu_set, sizeof(cpu_set_t));
ihk_mc_spinlock_unlock_noirq(&vm->cpu_set_lock);
ihk_mc_spinlock_lock_noirq(&vm->address_space->cpu_set_lock);
memcpy(&_cpu_set, &vm->address_space->cpu_set, sizeof(cpu_set_t));
ihk_mc_spinlock_unlock_noirq(&vm->address_space->cpu_set_lock);
dkprintf("trying to aquire flush_entry->lock flush_ind: %d\n", flush_ind);
@ -319,6 +269,13 @@ void remote_flush_tlb_cpumask(struct process_vm *vm,
unsigned long tsc;
tsc = rdtsc() + 12884901888; /* 1.2GHz =>10 sec */
#endif
if (flush_entry->addr) {
flush_tlb_single(flush_entry->addr & PAGE_MASK);
}
/* Zero address denotes full TLB flush */
else {
flush_tlb();
}
/* Wait for all cores */
while (ihk_atomic_read(&flush_entry->pending) != 0) {
@ -369,6 +326,7 @@ static void page_fault_handler(void *fault_addr, uint64_t reason, void *regs)
struct thread *thread = cpu_local_var(current);
int error;
set_cputime(interrupt_from_user(regs)? 1: 2);
dkprintf("[%d]page_fault_handler(%p,%lx,%p)\n",
ihk_mc_get_processor_id(), fault_addr, reason, regs);
@ -388,10 +346,9 @@ static void page_fault_handler(void *fault_addr, uint64_t reason, void *regs)
// no return
}
kprintf("[%d]page_fault_handler(%p,%lx,%p):"
"fault vm failed. %d, TID: %d\n",
ihk_mc_get_processor_id(), fault_addr,
reason, regs, error, thread->tid);
kprintf("%s fault VM failed for TID: %d, addr: 0x%lx, "
"reason: %d, error: %d\n", __FUNCTION__,
thread->tid, fault_addr, reason, error);
unhandled_page_fault(thread, fault_addr, regs);
preempt_enable();
memset(&info, '\0', sizeof info);
@ -416,7 +373,10 @@ static void page_fault_handler(void *fault_addr, uint64_t reason, void *regs)
info._sifields._sigfault.si_addr = fault_addr;
set_signal(SIGSEGV, regs, &info);
}
check_signal(0, regs, 0);
if(interrupt_from_user(regs)){
cpu_enable_interrupt();
check_signal(0, regs, 0);
}
goto out;
}
@ -427,6 +387,7 @@ out:
ihk_mc_get_processor_id(), fault_addr, reason,
regs, error);
check_need_resched();
set_cputime(0);
return;
}
@ -474,8 +435,9 @@ static void page_allocator_init(void)
ihk_mc_reserve_arch_pages(pa_start, pa_end, reserve_pages);
kprintf("Available pages: %ld pages\n",
ihk_pagealloc_count(pa_allocator));
kprintf("Available memory: %ld bytes in %ld pages\n",
(ihk_pagealloc_count(pa_allocator) * PAGE_SIZE),
ihk_pagealloc_count(pa_allocator));
/* Notify the ihk to use my page allocator */
ihk_mc_set_page_allocator(&allocator);
@ -556,6 +518,9 @@ static void page_init(void)
static char *memdebug = NULL;
static void *___kmalloc(int size, enum ihk_mc_ap_flag flag);
static void ___kfree(void *ptr);
void register_kmalloc(void)
{
if(memdebug){
@ -685,60 +650,100 @@ void mem_init(void)
}
}
struct location {
struct location *next;
int line;
int cnt;
char file[0];
};
#define KMALLOC_TRACK_HASH_SHIFT (8)
#define KMALLOC_TRACK_HASH_SIZE (1 << KMALLOC_TRACK_HASH_SHIFT)
#define KMALLOC_TRACK_HASH_MASK (KMALLOC_TRACK_HASH_SIZE - 1)
struct alloc {
struct alloc *next;
struct malloc_header *p;
struct location *loc;
int size;
struct list_head kmalloc_track_hash[KMALLOC_TRACK_HASH_SIZE];
ihk_spinlock_t kmalloc_track_hash_locks[KMALLOC_TRACK_HASH_SIZE];
struct list_head kmalloc_addr_hash[KMALLOC_TRACK_HASH_SIZE];
ihk_spinlock_t kmalloc_addr_hash_locks[KMALLOC_TRACK_HASH_SIZE];
int kmalloc_track_initialized = 0;
int kmalloc_runcount = 0;
struct kmalloc_track_addr_entry {
void *addr;
int runcount;
struct list_head list; /* track_entry's list */
struct kmalloc_track_entry *entry;
struct list_head hash; /* address hash */
};
#define HASHNUM 129
struct kmalloc_track_entry {
char *file;
int line;
int size;
ihk_atomic_t alloc_count;
struct list_head hash;
struct list_head addr_list;
ihk_spinlock_t addr_list_lock;
};
static struct alloc *allochash[HASHNUM];
static struct location *lochash[HASHNUM];
static ihk_spinlock_t alloclock;
int runcount;
static unsigned char *page;
static int space;
static void *dalloc(unsigned long size)
void kmalloc_init(void)
{
void *r;
static int pos = 0;
unsigned long irqstate;
struct cpu_local_var *v = get_this_cpu_local_var();
irqstate = ihk_mc_spinlock_lock(&alloclock);
size = (size + 7) & 0xfffffffffffffff8L;
if (pos + size > space) {
page = allocate_pages(1, IHK_MC_AP_NOWAIT);
space = 4096;
pos = 0;
register_kmalloc();
INIT_LIST_HEAD(&v->free_list);
INIT_LIST_HEAD(&v->remote_free_list);
ihk_mc_spinlock_init(&v->remote_free_list_lock);
v->kmalloc_initialized = 1;
if (!kmalloc_track_initialized) {
int i;
memdebug = find_command_line("memdebug");
kmalloc_track_initialized = 1;
for (i = 0; i < KMALLOC_TRACK_HASH_SIZE; ++i) {
ihk_mc_spinlock_init(&kmalloc_track_hash_locks[i]);
INIT_LIST_HEAD(&kmalloc_track_hash[i]);
ihk_mc_spinlock_init(&kmalloc_addr_hash_locks[i]);
INIT_LIST_HEAD(&kmalloc_addr_hash[i]);
}
}
r = page + pos;
pos += size;
ihk_mc_spinlock_unlock(&alloclock, irqstate);
return r;
}
/* NOTE: Hash lock must be held */
struct kmalloc_track_entry *__kmalloc_track_find_entry(
int size, char *file, int line)
{
struct kmalloc_track_entry *entry_iter, *entry = NULL;
int hash = (strlen(file) + line + size) & KMALLOC_TRACK_HASH_MASK;
list_for_each_entry(entry_iter, &kmalloc_track_hash[hash], hash) {
if (!strcmp(entry_iter->file, file) &&
entry_iter->size == size &&
entry_iter->line == line) {
entry = entry_iter;
break;
}
}
if (entry) {
dkprintf("%s found entry %s:%d size: %d\n", __FUNCTION__,
file, line, size);
}
else {
dkprintf("%s couldn't find entry %s:%d size: %d\n", __FUNCTION__,
file, line, size);
}
return entry;
}
/* Top level routines called from macro */
void *_kmalloc(int size, enum ihk_mc_ap_flag flag, char *file, int line)
{
char *r = ___kmalloc(size, flag);
struct malloc_header *h;
unsigned long hash;
char *t;
struct location *lp;
struct alloc *ap;
unsigned long alcsize;
unsigned long chksize;
unsigned long irqflags;
struct kmalloc_track_entry *entry;
struct kmalloc_track_addr_entry *addr_entry;
int hash, addr_hash;
void *r = ___kmalloc(size, flag);
if (!memdebug)
return r;
@ -746,177 +751,177 @@ void *_kmalloc(int size, enum ihk_mc_ap_flag flag, char *file, int line)
if (!r)
return r;
h = ((struct malloc_header *)r) - 1;
alcsize = h->size * sizeof(struct malloc_header);
chksize = alcsize - size;
memset(r + size, '\x5a', chksize);
hash = (strlen(file) + line + size) & KMALLOC_TRACK_HASH_MASK;
irqflags = ihk_mc_spinlock_lock(&kmalloc_track_hash_locks[hash]);
for (hash = 0, t = file; *t; t++) {
hash <<= 1;
hash += *t;
entry = __kmalloc_track_find_entry(size, file, line);
if (!entry) {
entry = ___kmalloc(sizeof(*entry), IHK_MC_AP_NOWAIT);
if (!entry) {
kprintf("%s: ERROR: allocating tracking entry\n");
goto out;
}
entry->line = line;
entry->size = size;
ihk_atomic_set(&entry->alloc_count, 0);
ihk_mc_spinlock_init(&entry->addr_list_lock);
INIT_LIST_HEAD(&entry->addr_list);
entry->file = ___kmalloc(strlen(file) + 1, IHK_MC_AP_NOWAIT);
if (!entry->file) {
kprintf("%s: ERROR: allocating file string\n");
___kfree(entry);
ihk_mc_spinlock_unlock(&kmalloc_track_hash_locks[hash], irqflags);
goto out;
}
strcpy(entry->file, file);
entry->file[strlen(file)] = 0;
list_add(&entry->hash, &kmalloc_track_hash[hash]);
dkprintf("%s entry %s:%d size: %d added\n", __FUNCTION__,
file, line, size);
}
hash += line;
hash %= HASHNUM;
for (lp = lochash[hash]; lp; lp = lp->next)
if (lp->line == line &&
!strcmp(lp->file, file))
break;
if (!lp) {
lp = dalloc(sizeof(struct location) + strlen(file) + 1);
memset(lp, '\0', sizeof(struct location));
lp->line = line;
strcpy(lp->file, file);
do {
lp->next = lochash[hash];
} while (!compare_and_swap(lochash + hash, (unsigned long)lp->next, (unsigned long)lp));
ihk_mc_spinlock_unlock(&kmalloc_track_hash_locks[hash], irqflags);
ihk_atomic_inc(&entry->alloc_count);
/* Add new addr entry for this allocation entry */
addr_entry = ___kmalloc(sizeof(*addr_entry), IHK_MC_AP_NOWAIT);
if (!addr_entry) {
kprintf("%s: ERROR: allocating addr entry\n");
goto out;
}
hash = (unsigned long)h % HASHNUM;
do {
for (ap = allochash[hash]; ap; ap = ap->next)
if (!ap->p)
break;
} while (ap && !compare_and_swap(&ap->p, 0UL, (unsigned long)h));
if (!ap) {
ap = dalloc(sizeof(struct alloc));
memset(ap, '\0', sizeof(struct alloc));
ap->p = h;
do {
ap->next = allochash[hash];
} while (!compare_and_swap(allochash + hash, (unsigned long)ap->next, (unsigned long)ap));
}
addr_entry->addr = r;
addr_entry->runcount = kmalloc_runcount;
addr_entry->entry = entry;
ap->loc = lp;
ap->size = size;
ap->runcount = runcount;
irqflags = ihk_mc_spinlock_lock(&entry->addr_list_lock);
list_add(&addr_entry->list, &entry->addr_list);
ihk_mc_spinlock_unlock(&entry->addr_list_lock, irqflags);
return r;
}
/* Add addr entry to address hash */
addr_hash = ((unsigned long)r >> 5) & KMALLOC_TRACK_HASH_MASK;
irqflags = ihk_mc_spinlock_lock(&kmalloc_addr_hash_locks[addr_hash]);
list_add(&addr_entry->hash, &kmalloc_addr_hash[addr_hash]);
ihk_mc_spinlock_unlock(&kmalloc_addr_hash_locks[addr_hash], irqflags);
int _memcheck(void *ptr, char *msg, char *file, int line, int flags)
{
struct malloc_header *h = ((struct malloc_header *)ptr) - 1;
struct malloc_header *next;
unsigned long hash = (unsigned long)h % HASHNUM;
struct alloc *ap;
static unsigned long check = 0x5a5a5a5a5a5a5a5aUL;
unsigned long alcsize;
unsigned long chksize;
if (h->check != 0x5a5a5a5a) {
int i;
unsigned long max = 0;
unsigned long cur = (unsigned long)h;
struct alloc *maxap = NULL;
for (i = 0; i < HASHNUM; i++)
for (ap = allochash[i]; ap; ap = ap->next)
if ((unsigned long)ap->p < cur &&
(unsigned long)ap->p > max) {
max = (unsigned long)ap->p;
maxap = ap;
}
kprintf("%s: detect buffer overrun, alc=%s:%d size=%ld h=%p, s=%ld\n", msg, maxap->loc->file, maxap->loc->line, maxap->size, maxap->p, maxap->p->size);
kprintf("broken header: h=%p next=%p size=%ld cpu_id=%d\n", h, h->next, h->size, h->cpu_id);
}
for (ap = allochash[hash]; ap; ap = ap->next)
if (ap->p == h)
break;
if (!ap) {
if(file)
kprintf("%s: address not found, %s:%d p=%p\n", msg, file, line, ptr);
else
kprintf("%s: address not found p=%p\n", msg, ptr);
return 1;
}
alcsize = h->size * sizeof(struct malloc_header);
chksize = alcsize - ap->size;
if (chksize > 8)
chksize = 8;
next = (struct malloc_header *)((char *)ptr + alcsize);
if (next->check != 0x5a5a5a5a ||
memcmp((char *)ptr + ap->size, &check, chksize)) {
unsigned long buf = 0x5a5a5a5a5a5a5a5aUL;
unsigned char *p;
unsigned char *q;
memcpy(&buf, (char *)ptr + ap->size, chksize);
p = (unsigned char *)&(next->check);
q = (unsigned char *)&buf;
if (file)
kprintf("%s: broken, %s:%d alc=%s:%d %02x%02x%02x%02x%02x%02x%02x%02x %02x%02x%02x%02x size=%ld\n", msg, file, line, ap->loc->file, ap->loc->line, q[0], q[1], q[2], q[3], q[4], q[5], q[6], q[7], p[0], p[1], p[2], p[3], ap->size);
else
kprintf("%s: broken, alc=%s:%d %02x%02x%02x%02x%02x%02x%02x%02x %02x%02x%02x%02x size=%ld\n", msg, ap->loc->file, ap->loc->line, q[0], q[1], q[2], q[3], q[4], q[5], q[6], q[7], p[0], p[1], p[2], p[3], ap->size);
if (next->check != 0x5a5a5a5a)
kprintf("next->HEADER: next=%p size=%ld cpu_id=%d\n", next->next, next->size, next->cpu_id);
return 1;
}
if(flags & 1){
ap->p = NULL;
ap->loc = NULL;
ap->size = 0;
}
return 0;
}
int memcheckall()
{
int i;
struct alloc *ap;
int r = 0;
for(i = 0; i < HASHNUM; i++)
for(ap = allochash[i]; ap; ap = ap->next)
if(ap->p)
r |= _memcheck(ap->p + 1, "memcheck", NULL, 0, 2);
return r;
}
int freecheck(int runcount)
{
int i;
struct alloc *ap;
struct location *lp;
int r = 0;
for (i = 0; i < HASHNUM; i++)
for (lp = lochash[i]; lp; lp = lp->next)
lp->cnt = 0;
for (i = 0; i < HASHNUM; i++)
for (ap = allochash[i]; ap; ap = ap->next)
if (ap->p && ap->runcount == runcount) {
ap->loc->cnt++;
r++;
}
if (r) {
kprintf("memory leak?\n");
for (i = 0; i < HASHNUM; i++)
for (lp = lochash[i]; lp; lp = lp->next)
if (lp->cnt)
kprintf(" alc=%s:%d cnt=%d\n", lp->file, lp->line, lp->cnt);
}
dkprintf("%s addr_entry %p added\n", __FUNCTION__, r);
out:
return r;
}
void _kfree(void *ptr, char *file, int line)
{
if (memdebug)
_memcheck(ptr, "KFREE", file, line, 1);
unsigned long irqflags;
struct kmalloc_track_entry *entry;
struct kmalloc_track_addr_entry *addr_entry_iter, *addr_entry = NULL;
int hash;
if (!memdebug) {
goto out;
}
hash = ((unsigned long)ptr >> 5) & KMALLOC_TRACK_HASH_MASK;
irqflags = ihk_mc_spinlock_lock(&kmalloc_addr_hash_locks[hash]);
list_for_each_entry(addr_entry_iter,
&kmalloc_addr_hash[hash], hash) {
if (addr_entry_iter->addr == ptr) {
addr_entry = addr_entry_iter;
break;
}
}
if (addr_entry) {
list_del(&addr_entry->hash);
}
ihk_mc_spinlock_unlock(&kmalloc_addr_hash_locks[hash], irqflags);
if (!addr_entry) {
kprintf("%s: ERROR: kfree()ing invalid pointer\n", __FUNCTION__);
panic("panic");
}
entry = addr_entry->entry;
irqflags = ihk_mc_spinlock_lock(&entry->addr_list_lock);
list_del(&addr_entry->list);
ihk_mc_spinlock_unlock(&entry->addr_list_lock, irqflags);
dkprintf("%s addr_entry %p removed\n", __FUNCTION__, addr_entry->addr);
___kfree(addr_entry);
/* Do we need to remove tracking entry as well? */
if (!ihk_atomic_dec_and_test(&entry->alloc_count)) {
goto out;
}
hash = (strlen(entry->file) + entry->line + entry->size) &
KMALLOC_TRACK_HASH_MASK;
irqflags = ihk_mc_spinlock_lock(&kmalloc_track_hash_locks[hash]);
list_del(&entry->hash);
ihk_mc_spinlock_unlock(&kmalloc_track_hash_locks[hash], irqflags);
dkprintf("%s entry %s:%d size: %d removed\n", __FUNCTION__,
entry->file, entry->line, entry->size);
___kfree(entry->file);
___kfree(entry);
out:
___kfree(ptr);
}
void kmalloc_memcheck(void)
{
int i;
unsigned long irqflags;
struct kmalloc_track_entry *entry = NULL;
for (i = 0; i < KMALLOC_TRACK_HASH_SIZE; ++i) {
irqflags = ihk_mc_spinlock_lock(&kmalloc_track_hash_locks[i]);
list_for_each_entry(entry, &kmalloc_track_hash[i], hash) {
struct kmalloc_track_addr_entry *addr_entry = NULL;
int cnt = 0;
ihk_mc_spinlock_lock_noirq(&entry->addr_list_lock);
list_for_each_entry(addr_entry, &entry->addr_list, list) {
dkprintf("%s memory leak: %p @ %s:%d size: %d runcount: %d\n",
__FUNCTION__,
addr_entry->addr,
entry->file,
entry->line,
entry->size,
addr_entry->runcount);
if (kmalloc_runcount != addr_entry->runcount)
continue;
cnt++;
}
ihk_mc_spinlock_unlock_noirq(&entry->addr_list_lock);
if (!cnt)
continue;
kprintf("%s memory leak: %s:%d size: %d cnt: %d, runcount: %d\n",
__FUNCTION__,
entry->file,
entry->line,
entry->size,
cnt,
kmalloc_runcount);
}
ihk_mc_spinlock_unlock(&kmalloc_track_hash_locks[i], irqflags);
}
++kmalloc_runcount;
}
/* Redirection routines registered in alloc structure */
void *__kmalloc(int size, enum ihk_mc_ap_flag flag)
{
return kmalloc(size, flag);
@ -927,143 +932,199 @@ void __kfree(void *ptr)
kfree(ptr);
}
void kmalloc_init(void)
static void ___kmalloc_insert_chunk(struct list_head *free_list,
struct kmalloc_header *chunk)
{
struct cpu_local_var *v = get_this_cpu_local_var();
struct malloc_header *h = &v->free_list;
ihk_mc_spinlock_init(&v->free_list_lock);
int i;
struct kmalloc_header *chunk_iter, *next_chunk = NULL;
h->check = 0x5a5a5a5a;
h->next = &v->free_list;
h->size = 0;
register_kmalloc();
memdebug = find_command_line("memdebug");
for (i = 0; i < HASHNUM; i++) {
allochash[i] = NULL;
lochash[i] = NULL;
/* Find out where to insert */
list_for_each_entry(chunk_iter, free_list, list) {
if ((void *)chunk < (void *)chunk_iter) {
next_chunk = chunk_iter;
break;
}
}
page = allocate_pages(16, IHK_MC_AP_NOWAIT);
space = 16 * 4096;
ihk_mc_spinlock_init(&alloclock);
/* Add in front of next */
if (next_chunk) {
list_add_tail(&chunk->list, &next_chunk->list);
}
/* Add after the head */
else {
list_add(&chunk->list, free_list);
}
return;
}
static void ___kmalloc_init_chunk(struct kmalloc_header *h, int size)
{
h->size = size;
h->front_magic = 0x5c5c5c5c;
h->end_magic = 0x6d6d6d6d;
h->cpu_id = ihk_mc_get_processor_id();
}
static void ___kmalloc_consolidate_list(struct list_head *list)
{
struct kmalloc_header *chunk_iter, *chunk, *next_chunk;
reiterate:
chunk_iter = NULL;
chunk = NULL;
list_for_each_entry(next_chunk, list, list) {
if (chunk_iter && (((void *)chunk_iter + sizeof(struct kmalloc_header)
+ chunk_iter->size) == (void *)next_chunk)) {
chunk = chunk_iter;
break;
}
chunk_iter = next_chunk;
}
if (!chunk) {
return;
}
chunk->size += (next_chunk->size + sizeof(struct kmalloc_header));
list_del(&next_chunk->list);
goto reiterate;
}
void *___kmalloc(int size, enum ihk_mc_ap_flag flag)
void kmalloc_consolidate_free_list(void)
{
struct cpu_local_var *v = get_this_cpu_local_var();
struct malloc_header *h = &v->free_list, *prev, *p;
int u, req_page;
unsigned long flags;
struct kmalloc_header *chunk, *tmp;
unsigned long irqflags =
ihk_mc_spinlock_lock(&cpu_local_var(remote_free_list_lock));
if (size >= PAGE_SIZE * 4) {
/* Clean up remotely deallocated chunks */
list_for_each_entry_safe(chunk, tmp,
&cpu_local_var(remote_free_list), list) {
list_del(&chunk->list);
___kmalloc_insert_chunk(&cpu_local_var(free_list), chunk);
}
/* Free list lock ensures IRQs are disabled */
___kmalloc_consolidate_list(&cpu_local_var(free_list));
ihk_mc_spinlock_unlock(&cpu_local_var(remote_free_list_lock), irqflags);
}
#define KMALLOC_MIN_SHIFT (5)
#define KMALLOC_MIN_SIZE (1 << KMALLOC_TRACK_HASH_SHIFT)
#define KMALLOC_MIN_MASK (KMALLOC_MIN_SIZE - 1)
/* Actual low-level allocation routines */
static void *___kmalloc(int size, enum ihk_mc_ap_flag flag)
{
struct kmalloc_header *chunk_iter;
struct kmalloc_header *chunk = NULL;
int npages;
unsigned long kmalloc_irq_flags = cpu_disable_interrupt_save();
/* KMALLOC_MIN_SIZE bytes aligned size. */
if (size & KMALLOC_MIN_MASK) {
size = ((size + KMALLOC_MIN_SIZE - 1) & ~(KMALLOC_MIN_MASK));
}
chunk = NULL;
/* Find a chunk that is big enough */
list_for_each_entry(chunk_iter, &cpu_local_var(free_list), list) {
if (chunk_iter->size >= size) {
chunk = chunk_iter;
break;
}
}
split_and_return:
/* Did we find one? */
if (chunk) {
/* Do we need to split it? Only if there is enough space for
* another header and some actual content */
if (chunk->size > (size + sizeof(struct kmalloc_header))) {
struct kmalloc_header *leftover;
leftover = (struct kmalloc_header *)
((void *)chunk + sizeof(struct kmalloc_header) + size);
___kmalloc_init_chunk(leftover,
(chunk->size - size - sizeof(struct kmalloc_header)));
list_add(&leftover->list, &chunk->list);
chunk->size = size;
}
list_del(&chunk->list);
cpu_restore_interrupt(kmalloc_irq_flags);
return ((void *)chunk + sizeof(struct kmalloc_header));
}
/* Allocate new memory and add it to free list */
npages = (size + sizeof(struct kmalloc_header) + (PAGE_SIZE - 1))
>> PAGE_SHIFT;
chunk = ihk_mc_alloc_pages(npages, flag);
if (!chunk) {
cpu_restore_interrupt(kmalloc_irq_flags);
return NULL;
}
u = (size + sizeof(*h) - 1) / sizeof(*h);
___kmalloc_init_chunk(chunk,
(npages * PAGE_SIZE - sizeof(struct kmalloc_header)));
___kmalloc_insert_chunk(&cpu_local_var(free_list), chunk);
flags = ihk_mc_spinlock_lock(&v->free_list_lock);
prev = h;
h = h->next;
while (1) {
if (h == &v->free_list) {
req_page = ((u + 2) * sizeof(*h) + PAGE_SIZE - 1)
>> PAGE_SHIFT;
h = allocate_pages(req_page, flag);
if(h == NULL) {
kprintf("kmalloc(%#x,%#x): out of memory\n", size, flag);
ihk_mc_spinlock_unlock(&v->free_list_lock, flags);
return NULL;
}
h->check = 0x5a5a5a5a;
prev->next = h;
h->size = (req_page * PAGE_SIZE) / sizeof(*h) - 2;
/* Guard entry */
p = h + h->size + 1;
p->check = 0x5a5a5a5a;
p->next = &v->free_list;
p->size = 0;
h->next = p;
}
if (h->size >= u) {
if (h->size == u || h->size == u + 1) {
prev->next = h->next;
h->cpu_id = ihk_mc_get_processor_id();
ihk_mc_spinlock_unlock(&v->free_list_lock, flags);
return h + 1;
} else { /* Divide */
h->size -= u + 1;
p = h + h->size + 1;
p->check = 0x5a5a5a5a;
p->size = u;
p->cpu_id = ihk_mc_get_processor_id();
ihk_mc_spinlock_unlock(&v->free_list_lock, flags);
return p + 1;
}
}
prev = h;
h = h->next;
}
goto split_and_return;
}
void ___kfree(void *ptr)
static void ___kfree(void *ptr)
{
struct malloc_header *p = (struct malloc_header *)ptr;
struct cpu_local_var *v = get_cpu_local_var((--p)->cpu_id);
struct malloc_header *h = &v->free_list;
int combined = 0;
unsigned long flags;
struct kmalloc_header *chunk =
(struct kmalloc_header*)(ptr - sizeof(struct kmalloc_header));
unsigned long kmalloc_irq_flags = cpu_disable_interrupt_save();
flags = ihk_mc_spinlock_lock(&v->free_list_lock);
h = h->next;
while ((p < h || p > h->next) && h != &v->free_list) {
h = h->next;
/* Sanity check */
if (chunk->front_magic != 0x5c5c5c5c || chunk->end_magic != 0x6d6d6d6d) {
kprintf("%s: memory corruption at address 0x%p\n", __FUNCTION__, ptr);
panic("panic");
}
if (h + h->size + 1 == p && h->size != 0) {
combined = 1;
h->size += p->size + 1;
h->check = 0x5a5a5a5a;
/* Does this chunk belong to this CPU? */
if (chunk->cpu_id == ihk_mc_get_processor_id()) {
___kmalloc_insert_chunk(&cpu_local_var(free_list), chunk);
___kmalloc_consolidate_list(&cpu_local_var(free_list));
}
if (h->next == p + p->size + 1 && h->next->size != 0) {
if (combined) {
h->check = 0x5a5a5a5a;
h->size += h->next->size + 1;
h->next = h->next->next;
} else {
p->check = 0x5a5a5a5a;
p->size += h->next->size + 1;
p->next = h->next->next;
h->next = p;
}
} else if (!combined) {
p->next = h->next;
h->next = p;
else {
struct cpu_local_var *v = get_cpu_local_var(chunk->cpu_id);
unsigned long irqflags;
irqflags = ihk_mc_spinlock_lock(&v->remote_free_list_lock);
list_add(&chunk->list, &v->remote_free_list);
ihk_mc_spinlock_unlock(&v->remote_free_list_lock, irqflags);
}
ihk_mc_spinlock_unlock(&v->free_list_lock, flags);
cpu_restore_interrupt(kmalloc_irq_flags);
}
void print_free_list(void)
void ___kmalloc_print_free_list(struct list_head *list)
{
struct cpu_local_var *v = get_this_cpu_local_var();
struct malloc_header *h = &v->free_list;
struct kmalloc_header *chunk_iter;
unsigned long irqflags = kprintf_lock();
h = h->next;
kprintf("free_list : \n");
while (h != &v->free_list) {
kprintf(" %p : %p, %d ->\n", h, h->next, h->size);
h = h->next;
__kprintf("%s: [ \n", __FUNCTION__);
list_for_each_entry(chunk_iter, &cpu_local_var(free_list), list) {
__kprintf("%s: 0x%lx:%d (VA PFN: %lu, off: %lu)\n", __FUNCTION__,
(unsigned long)chunk_iter,
chunk_iter->size,
(unsigned long)chunk_iter >> PAGE_SHIFT,
(unsigned long)chunk_iter % PAGE_SIZE);
}
kprintf("\n");
__kprintf("%s: ] \n", __FUNCTION__);
kprintf_unlock(irqflags);
}

File diff suppressed because it is too large Load Diff

Some files were not shown because too many files have changed in this diff Show More