Compare commits

...

416 Commits
0.4.0 ... 1.1.0

Author SHA1 Message Date
d90900b6e6 Make executor code include executer/config.h
Make the code "executer/kernel/mcctrl/arch/x86_64/archdeps.c"
to include "executer/config.h" instead of
non-existent "executer/kernel/mcctrl/config.h".
2016-06-09 18:40:39 +09:00
6d9a88e9f4 binfmt_mcexec: support post-K specification 2016-06-08 09:53:39 +09:00
d0ee60f9e3 mcoverlayfs: supported only Linux kernel 4.0 2016-06-03 18:36:55 +09:00
14ec92518e KVM support: detect KVM and avoid touching unimplemented MSRs 2016-05-26 01:11:08 +09:00
435e2bdeb4 support for Linux 4.6: use get_user_pages_remote() 2016-05-24 09:39:04 +09:00
f06d8041e3 don't send SIGCONT when sending SIGSTOP derived from PTRACE_ATTACH
refs #747
2016-05-19 10:54:12 +09:00
9b35eaca42 remote_flush_tlb_cpumask() dead locking
refs #728
2016-05-10 14:02:25 +09:00
130b1f4327 update PAPI support. other process and child process monitoring. 2016-04-26 19:01:47 +09:00
921280f85c Docker support: use task_XX_vnr() functions for accessing correct namespace 2016-04-21 09:59:49 -07:00
d4a0b32f06 support large pages 2016-04-21 23:22:55 +09:00
b3bec32e99 update_process_page_table: refactor 2016-04-21 23:22:55 +09:00
2048980820 remove ihk_mc_pt_alloc_range() 2016-04-21 23:22:54 +09:00
176f6d23a9 ihk_mc_pt_virt_to_pagemap: refactor 2016-04-21 23:22:54 +09:00
328175547f Revert "fix REQ-37: remap_one_page: remove to check page size"
This reverts commit 6790126a23.

- reverted commit should remove a 'pgsize' check in remap_one_page()
  instead of a 'pgsize' check in pte_make_fileoff().
- In IA-32e, PTE format varies with page size. Therefore 'pgsize'
  parameter of pte_make_fileoff() is preferable.
2016-04-21 23:22:54 +09:00
e2e0fad849 arch_clear_host_user_space: set zero to args[2]
to avoid duplicated per_proc_list entry.
2016-04-21 23:22:54 +09:00
397bf3f4a6 wait_zombie: don't wait attached process
refs #726
2016-04-21 20:28:36 +09:00
aa77228453 resupport ptrace(PTRACE_ATTACH)
refs #733
2016-04-21 20:13:27 +09:00
82cb8f95ed update PAPI support. 2016-04-18 13:07:45 +09:00
3f2b4e7282 do_wait: unlink child from children_list if child terminated
refs #724
2016-04-14 10:25:12 +09:00
d6784bb4a5 update auto-generated files 2016-04-11 22:25:53 +09:00
1bb948f43b hwloc support 2016-04-11 22:25:27 +09:00
2a1823d52c vdso: set enable bit of pvti_msr 2016-04-11 22:20:39 +09:00
89943dc5ba vdso: set physical address at pvti_msr 2016-04-11 22:20:39 +09:00
fceb02a44a vdso: add zero clear for pvti 2016-04-11 22:20:38 +09:00
7298d8e179 vdso: correct pvti array element type
struct pvclock_vsyscall_time_info <-- struct pvclock_vcpu_time_info
2016-04-11 22:20:38 +09:00
6f32544dde vdso: add static cast 2016-04-11 22:20:38 +09:00
10d248b3cc mcexec: include config.h 2016-04-11 22:20:38 +09:00
fb32120659 make mcoverlayfs optional (default: enabled) 2016-04-02 15:43:35 -04:00
73de203c16 update auto-generated files 2016-03-28 22:57:45 +09:00
41bb2ab5e6 support vdso which borrows clocksource from linux 2016-03-28 22:57:44 +09:00
a587c8f5e5 x86: encode cpu# in IA32_TSC_AUX and size of GDTe#15 2016-03-28 22:57:44 +09:00
0c53a5ca35 add NOPHYS which means no physical memory 2016-03-28 22:57:44 +09:00
c760a01a79 add pte_get_attr() 2016-03-28 22:57:44 +09:00
a2c29e8abf correct the value of tod_data.origin
tod_data.origin should hold a time when TSC is zero.
2016-03-28 22:57:39 +09:00
18add6a9bd shmctl(IPC_RMID): fix wrong owner/creator checking (revised)
Don't check owner/creator of the segment in case of superuser.
2016-03-28 16:02:24 +09:00
a083e6c2bf Revert "shmctl(IPC_RMID): fix wrong owner/creator checking"
This reverts commit 8b5b075f4c.

The reverted commit modifies IPC_SET instead of IPC_RMID.
2016-03-28 16:00:39 +09:00
a2548f5421 Revert "fix REQ-42"
This reverts commit 4a0682bbc1.

The reverted commit appears to be wrong, for example:
- arch_range_check()'s arguments and parameters are mismatched.
- arch_range_check() implementation is not checking range.

Conflicts:
	kernel/syscall.c
2016-03-28 13:51:57 +09:00
6790126a23 fix REQ-37: remap_one_page: remove to check page size 2016-03-27 14:05:00 +09:00
1195549f41 fix REQ-19: some syscalls change how to access user space 2016-03-27 11:43:53 +09:00
b0096a2740 fix REQ-51 2016-03-26 12:23:51 +09:00
a11479eba8 fix REQ-48 2016-03-25 13:05:53 +09:00
12eaea401e fix REQ-46 2016-03-25 12:59:18 +09:00
31595b7409 fix REQ-43 2016-03-25 12:57:31 +09:00
4a0682bbc1 fix REQ-42 2016-03-24 19:14:50 +09:00
932a287437 fix REQ-40 2016-03-24 13:46:13 +09:00
670741ae40 fix REQ-39 2016-03-24 13:45:15 +09:00
70b27e06ff eclair: change default kernel to ./mckernel.img 2016-03-23 20:00:57 +09:00
4c38ddb623 update auto-generated files 2016-03-23 20:00:57 +09:00
6f00ddced6 move eclair from ihk repository 2016-03-23 20:00:57 +09:00
c0eecd63c9 update auto-generated files 2016-03-23 20:00:57 +09:00
1fd0b03e78 move config.h.in
from executer/kernel/mcctrl/config.h.in
to   executer/config.h.in
2016-03-23 20:00:57 +09:00
6c59de9300 expand AC_PROG_CC only once 2016-03-23 20:00:57 +09:00
b1309a5d53 map PIE at map_end instead of at user_start 2016-03-23 19:14:28 +09:00
489cd6d1a2 refactor prepare_process_ranges_args_envs() 2016-03-23 19:14:28 +09:00
c9cc4330c8 mincore: take into account pages cached in memobj 2016-03-23 19:14:28 +09:00
604f846cd2 mincore: check [start..start+len) is in user region 2016-03-23 19:14:28 +09:00
e939cf6862 mincore: cosmetic changes 2016-03-23 19:14:28 +09:00
72f2e5ebe0 shmobj: implement lookup_page method 2016-03-23 19:14:28 +09:00
bd7dddd415 fileobj: implement lookup_page method 2016-03-23 19:14:28 +09:00
fbd9dc878b memobj: add lookup_page method 2016-03-23 19:14:28 +09:00
d6c51ff997 treat memory devices as regular files,
to enable processes to mmap() /dev/zero
2016-03-23 19:14:27 +09:00
86ac51157c add error checks to shmctl(SHM_UNLOCK) 2016-03-23 19:14:27 +09:00
b73fa2b972 add error checks to shmctl(SHM_LOCK) 2016-03-23 19:14:27 +09:00
798f69bceb add has_cap_ipc_lock() 2016-03-23 19:14:27 +09:00
e8be52a1ff shm: trace the amount of locked segment per user 2016-03-23 19:14:27 +09:00
8b5b075f4c shmctl(IPC_RMID): fix wrong owner/creator checking
Don't check owner/creator of the segment in case of superuser.
2016-03-23 19:14:27 +09:00
b214fc278a add has_cap_sys_admin() 2016-03-23 19:14:27 +09:00
b3ae7f46bd add rlim_t (a type of rlim_cur and rlim_max) 2016-03-23 19:14:27 +09:00
48167d3223 shmget: add "shmflg" checks for SHM_HUGE* 2016-03-23 19:14:27 +09:00
d65135c040 move sys_shmget() into arch-dependent code 2016-03-23 19:14:27 +09:00
1761acc4c3 eliminate geteuid(), getegid() and getpid() 2016-03-23 19:04:32 +09:00
d4d93df032 mmap: add "flags" checks for MAP_HUGE* 2016-03-23 19:04:32 +09:00
261bddb999 add a member pgshift into struct vm_range
pgshift indicates a page size in the range.
2016-03-23 19:04:32 +09:00
1a3bc851af mprotect: return -ENOMEM if specified range is out of range 2016-03-23 19:04:32 +09:00
15f572ef9c mmap: return -ENOMEM if specified range is out of range 2016-03-23 19:04:32 +09:00
81690c5b5a mmap: cosmetic changes 2016-03-23 19:04:32 +09:00
832c0f9afd refactor copy_user_ranges() 2016-03-23 19:04:32 +09:00
f92cac7751 add type casting to the argument of getlong_user() 2016-03-23 19:04:32 +09:00
e74eb1dd51 add some prototypes to <memory.h> 2016-03-23 19:04:32 +09:00
8f7b9072ea refactor some copyin/copyout functions
- copy_from_user()
- getlong_user()
- getint_user()
- copy_to_user()
- setlong_user()
- setint_user()
2016-03-23 19:04:32 +09:00
4595aa3079 pte_visitor_t(): change "pgsize" into "pgshift" 2016-03-23 19:04:32 +09:00
807d294ac4 signalfd4: fix initialize 2016-06-03 20:58:02 +09:00
c947dd0d49 sysfs: support /sys/devices/system/cpu/online 2016-03-22 20:25:34 +09:00
d192e6c0fe modify PAPI support 2016-03-22 15:52:59 +09:00
7dbbcb362f add PAPI support 2016-03-22 15:27:19 +09:00
593cf98015 add ACSL annotation 2016-03-16 15:42:32 +09:00
8dd9f5ef3f support profil 2016-03-12 16:47:19 +09:00
0eaf058a4f mcexec: -lrt to Makefile.in for supporting clock_gettime() on SUSE 2016-03-12 05:24:14 +09:00
1aac2c8e23 add CPU timer initialization (refs #402)
There is no actual initialization in x86 now.
The initialization relies on hardware reset and Linux initialization.
2016-03-11 19:20:37 +09:00
70e8dd7979 remove initialization of TSC (refs #362) 2016-03-11 19:17:29 +09:00
eb0700359b fix REQ-36 2016-03-10 10:33:38 +09:00
3f16a9443e ptrace_report_signal: save debug regs before to send SIGCHLD to tracer 2016-03-09 22:29:51 +09:00
bf0cf0a346 fix REQ-31 2016-03-08 15:19:03 +09:00
14b868907b fix REQ-27 2016-03-07 18:52:08 +09:00
dbc778e4fa support getrusage (work in progress) 2016-03-07 17:06:44 +09:00
7fac03d4de sysfs: support /sys/devices/system/cpu/offline,online,possible,present 2016-03-04 13:48:06 +09:00
26c0180374 rwlock_reader_lock: fix lock list jammed up 2016-03-03 22:47:48 +09:00
8ebb3a4231 schedule: migration free last thread if terminated 2016-03-03 22:44:44 +09:00
f1f1ba9c8c mcs_rwlock_reader_lock: temporary fix 2016-03-01 19:11:42 +09:00
6ce00b5f0f sysfs: samples of snooping ops 2016-02-29 19:59:04 +09:00
4ec0e02a89 sysfs: add snooping ops 2016-02-29 19:23:01 +09:00
8f9192ac36 mcctrl: workaround for out-of-tree build (2/2)
- update auto-generated file
2016-02-29 19:18:08 +09:00
80ce123ab6 mcctrl: workaround for out-of-tree build (1/2) 2016-02-29 19:18:08 +09:00
1dc8513cd3 fix REQ-20 2016-02-26 16:18:30 +09:00
b0054643c0 REQ-18 2016-02-26 16:17:23 +09:00
972ff73ecf mcexec: fix readlink
refs #692
2016-02-25 16:08:42 +09:00
1f8a859b47 mcctrl: update auto-generated files 2016-02-24 21:34:48 +09:00
2601d8a36f mcctrl: use zap_page_range() instead of madvise() 2016-02-24 21:34:48 +09:00
a713c2fcaa fix REQ-16 2016-02-24 20:58:07 +09:00
c4c5e435cc fix REQ-12 2016-02-24 20:57:45 +09:00
853b56c784 mcreboot-smp-x86.sh: add mount to create /tmp/mcos/linux_proc from /proc 2016-02-24 19:24:37 +09:00
863a5c5e5f fix REQ-2, REQ-6, REQ-8 2016-02-23 16:32:17 +09:00
ebce1cb031 Merge branch 'master' of postpeta.pccluster.org:mckernel 2016-02-22 13:34:00 +09:00
fff7744907 mcklogd support 2016-02-22 13:32:20 +09:00
27c3ed7e96 remove debug print 2016-02-21 15:17:42 +09:00
e2b28da32f signal handler support gdb stepi command 2016-02-21 14:55:34 +09:00
2c50b716fd support setitimer/getitimer 2016-02-19 15:25:05 +09:00
307b2b8da5 clock_gettime: support clock_id CLOCK_PROCESS_CPUTIME_ID and CLOCK_THREAD_CPUTIME_ID 2016-02-18 17:43:13 +09:00
eba2be8a35 support times 2016-02-18 13:14:18 +09:00
a997af71be support tkill
refs #664
2016-02-17 12:48:12 +09:00
e7c37b8000 mcreboot-smp-x86.sh: fix Failed to mount /sys/devices/virtual/mcos/mcos0/sys 2016-02-16 16:05:40 +09:00
8c40f94aa8 /proc/<PID>/mem: support read/write 2016-02-16 13:21:29 +09:00
da13bd408a mcexec: add to initialize some structures (REQ-56)
refs #718
2016-02-15 18:20:58 +09:00
c328d26b8d procfs(/proc/<PID>/task/<TID>/stat): fix memory corruption
refs #722
2016-02-15 15:10:00 +09:00
6cda6792a9 process_msg_init_acked: don't use PA 2016-02-14 22:47:52 +09:00
2d3fda1d0b flatten_strings: fix align (REQ-1) 2016-02-14 22:36:58 +09:00
5d43c135db procfs: (temporary fix) unsupported files are closed 2016-02-10 17:10:54 +09:00
a866192db7 refactoring /proc 2016-02-10 08:11:02 +09:00
c0cc6ac6db Add skeleton for perf_event_open. 2016-02-09 14:54:53 +09:00
14c5bc08c2 mcexec: check Linux version from actual kernel tree instead of system wide include 2016-02-09 14:07:08 +09:00
7f01d273d0 mcctrl: fix out-of-tree build (not finding config.h) 2016-02-09 12:45:58 +09:00
137e0a799c mcexec: unshare and mount request through mcctrl 2016-02-08 16:27:03 +09:00
f214ff1b57 mcctrl: add MCEXEC_UP_SYS_MOUNT, MCEXEC_UP_SYS_UNSHARE 2016-02-08 16:00:52 +09:00
0ce698eb1f mcexec: support for /sys mounted by mcoverlayfs 2016-02-08 11:36:03 +09:00
e601248bdc procfs: fix mcos%d/PID/auxv size 2016-02-08 09:38:27 +09:00
d8eeab9b89 mcoverlayfs: enable out of tree compilation 2016-02-01 00:35:53 +09:00
fdf031ac16 procfs: chown procfs entries (temporary hack)
refs #651
refs #699
2016-01-28 16:29:46 +09:00
1ffe740153 sysfs sample 2016-01-26 18:08:25 +09:00
72968d613e support sysfs interface for mcctrl 2016-01-26 18:08:25 +09:00
2e98f875c3 sysfs: attempt to remove empty directories only 2016-01-26 18:08:25 +09:00
a6cb9a6b93 sysfs: lookup_i(): refactoring 2016-01-26 18:08:25 +09:00
da0a91b9f7 mcctrl: denote full path in /proc/PID/exe 2016-01-26 16:21:52 +09:00
f093786bec x86: populating PML4e and PDPTe is now lock-free 2016-01-25 09:17:06 +09:00
368f155328 sigaction: support SA_NODEFER
refs #698
2016-01-21 18:48:10 +09:00
425f920013 mcctrl: delete procfs entries recursively to avoid leaking 2016-01-21 18:15:59 +09:00
dbddf37579 set termsig to mcexec spawned process 2016-01-21 12:08:47 +09:00
fa7a5ccd11 support /proc/self/exe (needed for GDB to attach to an existing process) 2016-01-19 18:23:02 +09:00
172bf0a389 sched_setaffinity: add permission check 2016-01-15 12:05:18 +09:00
9bafd166e3 futex: support FUTEX_CLOCK_REALTIME 2016-01-14 16:18:49 +09:00
2e31b8abd1 clock_gettime: clock_id != CLOCK_REALTIME -> offload to linux 2016-01-13 14:04:06 +09:00
a42ee00101 NR_execve: initialize local variable 'shell'
refs #696
2016-01-13 11:16:19 +09:00
f6935b0869 ptrace_setsiginfo: update received siginfo 2016-01-11 17:37:29 +09:00
03a7763a5e ptrace_conf: set received siginfo to default siginfo 2016-01-11 17:10:30 +09:00
3a2f7b0106 clone: support CLONE_PARENT 2016-01-11 16:49:02 +09:00
2819ec2197 fix extra copy which might cause page faults 2016-01-06 21:12:57 +09:00
f7d81a9281 fix typo 2016-01-06 21:12:57 +09:00
914faf042d add missing kfree() for channel lookup table 2016-01-06 21:12:57 +09:00
75c6a94839 delete struct member 'type' from address_space structure 2016-01-06 20:17:00 +09:00
f7b5b48266 support x2apic 2016-01-06 13:53:02 +09:00
f9bd83c726 ptrace: fix PTRACE_GETREGSET, PTRACE_SETREGSET bug
refs #608
2015-12-28 19:45:50 +09:00
edc275ce4f delete free_list_lock 2015-12-28 11:31:42 +09:00
d00ea61d1a ptrace_wakeup_sig: fix thread lock 2015-12-28 10:33:07 +09:00
01117e92c9 append file path to symlink if link path is absolute
refs #643
2015-12-25 15:50:39 +09:00
d477096cb0 getrlimit, setrlimit: offload to linux when an unknown parameter was specified
refs #660
2015-12-25 15:35:33 +09:00
f44ddfa3b3 support sigtimedwait 2015-12-24 12:35:45 +09:00
e0acd254b1 do_process_vm_read_writev: use process hash for remote process search 2015-12-22 09:47:00 +09:00
d0507f7e9f process_read/write_vm(): fix LTP bugs 2015-12-18 15:58:51 +09:00
0f8b2aba22 reset signal handlers when execve called 2015-12-18 12:46:53 +09:00
7e5c7445e2 fix ptrace_detach bug
refs #662
2015-12-16 17:41:57 +09:00
a055fb525d sysfs sample 2015-12-16 13:42:30 +09:00
8cb72df663 support McKernel's sysfs tree 2015-12-16 13:42:30 +09:00
e805249651 add strrchr() 2015-12-16 13:42:30 +09:00
06a7889e1f chown root mcexec 2015-12-15 16:22:14 +09:00
20deed09f0 mcexec: support for /proc mounted by mcoverlayfs 2015-12-14 14:47:05 +09:00
bb81f84709 support PIE executable for PVAS 2015-12-14 11:05:28 +09:00
5c1dad1660 GDB: async-shell.exp
refs #650
2015-11-26 17:07:13 +09:00
7f2220b8e9 set '\0' termination to readlink result.
refs #643
2015-11-26 16:58:15 +09:00
65dda3f24e mcoverlayfs: support mount options(nocopyupw, nofscheck) 2015-11-25 15:34:58 +09:00
544971d665 modify for PVAS 2015-11-25 14:27:20 +09:00
dbddab4356 mcoverlayfs: add overlayfs of the original(kernel 4.0.9) 2015-11-25 13:23:49 +09:00
12eb8a9bb0 mcctrl: move mcctrl to executer/kernel/mcctrl 2015-11-24 15:42:04 +09:00
828a3ea57a futex(): support for cross address-space futexes 2015-11-24 14:58:04 +09:00
eb6de9d1de delete debug code 2015-11-13 15:10:14 +09:00
42c8ef6539 do_fork(): fix CLONE_PARENT_SETTID bug 2015-11-13 12:46:09 +09:00
780d4fc29b futex_wait(): support for FUTEX_CLOCK_REALTIME 2015-11-13 12:46:02 +09:00
94fcc5bb9a futex_wait: add to check signal 2015-11-12 09:38:36 +09:00
e822fc47dd fix dead locking when kill subthreads 2015-11-11 23:03:43 +09:00
26492a2895 vsyscall_gettimeofday: make timeval from TSC 2015-11-11 19:45:14 +09:00
1a5ff7f535 gettimeofday: gather variables into new struct 2015-11-11 18:31:33 +09:00
4c181d7fc0 smp-x86: add supports for dump analyzer 2015-11-09 16:06:55 +09:00
be78eb752e time_init: fix zero divide on KVM 2015-11-06 19:31:42 +09:00
0ad7c8ac50 nanosleep: fix arguments to be delegated 2015-11-06 19:31:42 +09:00
e9458a6cd3 fix ptrace02 failed 2015-10-30 16:59:03 +09:00
9e3b0b5866 bug fix 'GDB: missing parent-child relationship'
refs #641
2015-10-30 15:06:27 +09:00
0eaa27291a thread: move clear_child_tid, etc. to main structure 2015-10-29 11:01:27 +09:00
0b07dd1b79 support madvise(MADV_REMOVE) partially
This MADV_REMOVE works with a mapping which is
- created with shmat() and
- not sharing memobj with other mappings.
2015-10-28 18:41:28 +09:00
c25f8c7a39 support settimeofday() 2015-10-27 19:21:50 +09:00
9e53ae20d4 add memory barriers
- rmb()
- wmb()
2015-10-27 19:21:50 +09:00
09c9ee58d1 add 64bit atomic operations
- ihk_atomic64_t
- IHK_ATOMIC64_INIT()
- ihk_atomic64_read()
- ihk_atomic64_inc()
2015-10-27 19:21:50 +09:00
153a59a6f4 gettimeofday: avoid per-cpu data in calculation
Because it is difficult to safely update per-cpu data of other cpus in
settimeofday().
2015-10-27 19:21:50 +09:00
cad72a8562 when SIGXCPU or SIGXFSZ, set coredump bit to exit status 2015-10-22 20:57:37 +09:00
343bfbd30a rename back status field 2015-10-22 20:26:50 +09:00
4e4f1208f7 delete unused member 2015-10-19 20:12:26 +09:00
a325a78866 refactoring to send signal 2015-10-15 17:10:02 +09:00
6ae99454da delete debug print 2015-10-15 06:51:41 +09:00
04e193de13 refactoring process structures 2015-10-13 23:04:08 +09:00
2ca46fabfd support reader/writer lock 2015-10-02 14:05:10 +09:00
5b737b499d fix cmpxchgq operand 2015-10-02 14:04:05 +09:00
cb4f3a4d65 take into account args/envs' offset in page
- prepare_process_ranges_args_envs()
2015-10-01 21:08:42 +09:00
51789fcd38 initialize idle_vm for page faults 2015-10-01 21:08:35 +09:00
9f50c5dc3a mcexec_wait_syscall: handle request even if signaled (reworked) 2015-09-29 19:53:40 +09:00
cd905f7ad1 Revert "mcexec_wait_syscall: handle request even if signaled"
This reverts commit d862f345be.
2015-09-29 19:52:36 +09:00
79266f6b97 x86_issue_ipi: keep interrupt disabled while issuing IPI 2015-09-29 19:10:01 +09:00
a666b69c2c make x86_issue_ipi() call wait_icr_idle() 2015-09-29 19:10:01 +09:00
47e8552eba move wait_icr_idle() before x86_issue_ipi() 2015-09-29 19:10:00 +09:00
8dd9175411 schedule: fix null pointer dereference 2015-09-29 19:10:00 +09:00
f08e0c0054 guess whether MSR_PLATFORM_INFO exists or not 2015-09-29 19:10:00 +09:00
d862f345be mcexec_wait_syscall: handle request even if signaled 2015-09-24 21:35:30 +09:00
a14768c49a kmalloc: fix missing unlock on out-of-memory path 2015-09-18 21:26:15 +09:00
56e57775e7 clone: fix error message 2015-09-18 21:26:15 +09:00
b3b752ba41 nanosleep: use copy_from_user instead of direct access 2015-09-17 21:46:32 +09:00
7b32f2f73b nanosleep: fix tscs_rem underflow issue 2015-09-17 21:46:26 +09:00
ea5a1a8693 nanosleep: update *rem whenever signaled 2015-09-17 21:44:49 +09:00
92f8fb2b2b nanosleep: use copy_to_user instead of direct access 2015-09-17 21:44:49 +09:00
a3e440414d nanosleep: cosmetic change 2015-09-17 21:44:49 +09:00
10ba03ccea mcreboot-smp-x86.sh: fix querying free irq 2015-09-17 13:19:07 +09:00
ccb7c30a05 page_fault_handler(): reenable preempt after failed PF when process is exiting 2015-09-17 10:05:32 +09:00
7dfeb8e7ce create demand-paging mapping in case of MAP_SHARED
On current McKernel, only mappings for demand paging can be shared.
Therefore, if MAP_SHARED and MAP_ANONYMOUS are specified and
anon_on_demand is disabled, then mmap(2) should create a mapping which
is for demand paging and is entirely populated with physical pages.
2015-09-16 21:38:00 +09:00
b1b706453f vsyscall: send SIGSEGV to the caller if syscall fails
On CentOS 7 (RHEL 7?), "errno" isn't set when vsyscall_gettimeofday
fails. So, in such case, vsyscall_gettimeofday send SIGSEGV to the
caller to report failure of gettimeofday operation.
2015-09-16 21:37:11 +09:00
bd5708286d make sys_gettimeofday() use copy_to_user() 2015-09-16 21:26:32 +09:00
c8a13cf213 make gettimeofday ignore NULL parameter 2015-09-16 21:26:24 +09:00
5ad0a03d18 make gettimeofday handle second parameter (timezone) 2015-09-16 21:25:29 +09:00
3819eec03f cosmetic changes
- sys_gettimeofday()
2015-09-16 21:13:12 +09:00
40b8587a8a schedule(): sync CPU_FLAG_NEED_RESCHED flag with clone and migrate 2015-09-16 19:22:40 +09:00
e7b1115572 mcreboot-smp-x86.sh: introduction of ihk_ikc_irq_core argument 2015-09-14 17:30:25 +09:00
e1a01803d0 disable demand paging on ANONYMOUS mappings unless anon_on_demand kernel argument is passed 2015-09-14 17:26:37 +09:00
69f4b0e1ad gettimeofday()/nanosleep(): check arguments, return on pending signal 2015-09-14 17:05:30 +09:00
0909a5bed5 tracee context is broken when tracee call execve 2015-09-03 10:05:25 +09:00
9dd224385e When SIGSEGV occurred on a tracee process, a tracee process freezes. 2015-09-01 17:37:56 +09:00
4176c59fd3 using d_path for solution to file path. 2015-08-28 13:01:34 +09:00
afeee5432f When envp is NULL, execve is delayed. 2015-08-28 13:00:45 +09:00
9ae5bcf46e gettimeofday(): an implementation based on CPU invariant TSC support 2015-08-24 23:53:56 +02:00
b8f166e608 mcreboot-smp-x86.sh: handle resource allocation after unloading; mcstop+release-smp-x86.sh 2015-08-22 18:55:53 +09:00
c85a9b99e1 a couple of cosmetic changes of debug messages 2015-08-22 18:53:14 +09:00
7c816a6b73 an implementation of the Mellor-Crummey Scott (MCS) lock 2015-08-20 15:26:52 +09:00
5a0cd3f53f ptrace_detach when exiting
refs #590
2015-08-18 18:03:09 +09:00
9fa62adfe7 execve(): stay compliant with locked context switching 2015-08-10 14:18:11 +09:00
f0ab8ec89a sched_request_migrate(): change CPU flags atomically 2015-08-10 12:45:59 +09:00
f4cc82578d check_need_resched(): no thread migration in IRQ context 2015-08-10 12:43:35 +09:00
9ba40dc0ff schedule(): hold runq lock for the entire duration of context switching
releasing the runq lock after loading page tables but before the actual
context switch can leave execution in an inconsistent state if the current
process is descheduled from an IRQ between these two steps.
this patch holds the runq lock with IRQs disabled and makes the context
switch a single atomic operation.
2015-08-10 12:37:12 +09:00
8d6c97ea5c schedule(): disable auto thread migration 2015-08-07 16:07:31 +09:00
386f59000a mcreboot-smp-x86.sh.in: grant real user rw permission on /dev/mcos* 2015-08-07 13:33:44 +09:00
215cd370a1 ap_init(): clean up AP boot kernel messages 2015-08-07 10:57:59 +09:00
0a0e2c04a0 support for dynamically toggling time sharing when CPU is oversubscribed 2015-08-07 08:51:50 +09:00
aa191b87d3 schedule(): use XSAVE/XRSTOR and swap floating point registers in context switch 2015-08-07 08:41:00 +09:00
d5c243571f cpu_clear_and_set(): atomic CPU mask update in migration code 2015-08-06 10:49:55 +09:00
328e69a335 schedule(): do not preempt while holding spinlocks or while in offloaded syscall 2015-08-06 10:36:13 +09:00
b77755d0f7 obtain_clone_cpuid(): always start from CPU 0 and fill in cores linearly 2015-07-28 20:20:47 +09:00
d7bae14707 TEMPORARY: schedule(): move threads when core is explicitly oversubscribed 2015-07-28 20:12:58 +09:00
4e58d08f5c schedule_timeout(): give a chance to other process in spin sleep if CPU core is oversubscribed 2015-07-28 20:06:56 +09:00
9b1e691588 fix thread migration code (i.e., sched_setaffinity())
- moved migration code into idle() process and updated schedule() to detect
  when a thread has moved to another CPU in order to avoid doing housekeeping
  on behalf of the original one
- start CPU head from core 0
- keeps track of nested interrupts
2015-07-24 20:09:17 +09:00
3988b0fc61 keep track of IRQ context and don't do thread migration there 2015-07-23 16:56:58 +09:00
54eb345847 settid(): prevent modifying tid after thread migration 2015-07-23 16:51:24 +09:00
bbe7aef95b fix calling do_signal (argument lacked) 2015-07-17 10:18:43 +09:00
1ff4cf68c2 support SA_RESTART flag and restart syscall 2015-07-16 16:33:14 +09:00
1bc84d3feb modify to copy credentials 2015-07-13 15:29:26 +09:00
f7d78c8b7d sched_getaffinity(): return EINVAL for 0 length request (fixes LTP sched_getaffinity01) 2015-07-10 11:00:43 +09:00
7647c99cc2 do_migrate(): disable IRQ while holding migq_lock to avoid deadlocking with reschedule interrupts 2015-07-09 15:23:28 +09:00
43a774fbfc sched_setaffinity(): undo target core change, avoid abort on length mismatch 2015-07-09 11:00:26 +09:00
a029bcac37 mcreboot-smp-x86: find unused IRQ line and pass start vector to ihk_smp_x86.ko 2015-07-07 09:07:16 +09:00
bd913c503b sched_setaffinity(): find an actual target core 2015-07-03 11:59:52 +09:00
e838affde8 fix to compile error on CentOS 7 2015-07-02 17:08:35 +09:00
59ee251e1c fix /proc/pid/mem, /proc/pid/status, /proc/pid/cmdline 2015-07-02 00:22:35 +09:00
fa79db3bcc fix out of tree build 2015-07-01 23:58:50 +09:00
b7c5cba361 fix to compile on CentOS 6 2015-07-01 23:57:40 +09:00
382614ddae pstate: use MSR_NHM_TURBO_RATIO_LIMIT as maximum single-core turbo ratio 2015-07-01 22:18:38 +09:00
aa959c6b34 temporary fix for CentOS 6.x 2015-06-30 18:19:53 +09:00
aabc3d386d support a function to execute mcexec automatically. 2015-06-30 17:47:01 +09:00
4ebe778ede vm->exiting: deal with exit_group() and concurrent page faults 2015-06-25 16:04:04 +09:00
fbb776e4fb cpu init: support for no_turbo kernel argument 2015-06-25 12:18:27 +09:00
41b85281a4 mcctrl: introduction of RUS page hash to handle page refcounts properly 2015-05-31 15:42:39 +09:00
5532e3c663 mcreboot script for new IHK SMP-x86 I/F 2015-05-26 14:41:28 +09:00
2af2b1205f temporary fix for setfsuid/setfsgid 2015-05-19 06:27:59 +09:00
7d5a68be1b add PID and GID to /proc/pid/status
add /proc/pid/cmdline

refs #445
refs #447
2015-05-18 17:45:37 +09:00
f4162dff52 some signals set siginfo.si_code 2015-04-14 15:11:36 +09:00
a0d909af75 add supports for dump analyzer 2015-03-31 12:59:53 +09:00
63669b7f71 support /proc/pid/status for LTP mmap14 2015-03-28 14:20:07 +09:00
4946964ed0 update copyright notices 2015-03-27 14:50:09 +09:00
5f19842a6a support for process_vm_readv()/process_vm_writev() 2015-03-25 19:44:56 +09:00
9271d5346d add ACSL annotation to cpu.c 2015-03-25 15:54:08 +09:00
7bba05cfa4 Revise use of iov_base in ptrace_read_regset() and ptrace_write_regset(). 2015-03-20 20:33:40 +09:00
c2a1f933e8 Set tid (instead of pid) for ptrace event message of
PTRACE_EVENT_{FORK,VFORK,CLONE,VFORKDONE}.
Specify 2nd argument as pid (instead of -1) of function findthread_and_lock(),
to find tracee process in ptrace subroutines.
(gdb testsuite gdb.base/watch_thread_num.exp)
2015-03-20 13:22:00 +09:00
055769254d implement mlockall()/munlockall() for LTP syscall 2015-03-19 16:46:31 +09:00
786ae83380 add arch-dependent mman.h 2015-03-19 16:36:57 +09:00
8c662c83be implement mincore(2) for LTP 2015-03-19 16:32:03 +09:00
4698bc40c2 implement System V shared memory for LTP syscalls 2015-03-19 16:21:18 +09:00
f5d935b703 support signalfd4 step1 2015-03-18 17:35:43 +09:00
d53865ac5f change to check sequence of kill syscall, check sig num zero after uid checking 2015-03-18 12:59:05 +09:00
8934eb91a4 kill syscall check uid 2015-03-17 15:04:36 +09:00
ed6d94a358 syscall slowdown when repeat fork/exit/wait (LTP fork13) 2015-03-11 16:09:59 +09:00
fa923da0e3 add host PTE cleaning to execve(). refs #377
This removes a cause of LTP gethostid01's wrong behavior.
2015-03-10 18:23:50 +09:00
1f8265efbc check _PAGE_PWT and _PAGE_PCD directly instead of _PAGE_CACHE_WC 2015-03-07 02:12:48 +09:00
b553de7435 supports PTRACE_GETREGSET, PTRACE_SETREGSET.
supports PTRACE_GETFPREGS, PTRACE_SETFPREGS.

refs #421
2015-03-06 19:18:32 +09:00
6a82412d64 modify procfs to read inactive thread's files
However, the following files can be read only if the corresponding
thread is in active.
- /proc/<PID>/mem
- /proc/<PID>/task/<TID>/mem

refs #371
2015-03-05 21:41:24 +09:00
fa29c34995 expand the size of kstack to 12 KiB
When a procfs file belonging to a process which was in PS_TRACED status
was accessed, calling kprintf() from process_procfs_request() caused
stack overrun, and x86_cpu_local_variables was destroyed.
2015-03-05 20:30:33 +09:00
f84b5acf79 map entire buffer to read procfs
Reading data from procfs file more than 4096 byte caused a buffer
overrun in McKernel because the buffer was always mapped in McKernel
4096 byte regardless of actual buffer size.
2015-03-05 20:30:33 +09:00
8b24f60861 Combine range and memobj flags before arch_vrflag_to_ptattr() 2015-03-05 16:40:14 +09:00
f82bb284bb Make pager and devobj debug messages optional 2015-03-05 16:03:21 +09:00
bf12a5c45e Introduction of write-combined memory type mappings.
Introduction of VR_WRITE_COMBINED, PTATTR_WRITE_COMBINED and modification
to the memobj's get_page() interface so that Linux communicates back mapping
flags (such as write-combined).
2015-03-05 16:03:21 +09:00
ea5681232e x86 Page Attribute Table (PAT) MSR support.
Reconfigure PAT to permit write-combining memory type to be assigned
on a page-by-page basis. Changes PWT and PCD bit combinations in page
table entries so that they correspond to the following format:

  PAT
  |PCD
  ||PWT
  |||
  000 WB  Write Back (WB)
  001 WC  Write Combining (WC)
  010 UC- Uncached (UC-)
  011 UC  Uncacheable (UC)
2015-03-05 16:03:20 +09:00
e6011be1af create area to save fp regs
refs #421
2015-03-05 12:18:46 +09:00
9946ccd6b1 pipe free fork is implemented (LTP fork09) 2015-03-04 17:40:58 +09:00
daec7de828 implement /proc/stat
only for sysconf(_SC_NPROCESSORS_ONLN).  This enables Intel OpenMP
runtime to arrange threads with regard for CPU topology.

refs #291
2015-03-04 15:46:53 +09:00
9ad48083aa make PTRACE_POKETEXT use patch_process_vm() 2015-03-04 12:04:54 +09:00
2eac58aab3 add patch_process_vm(). (in progress)
This function patches specified range of specified user space even if
the range is not writable.

refs #401
2015-03-04 12:00:51 +09:00
22d8d169b6 change copy-out routines
- restrict copy_to_user() to only current process.
- add write_process_vm() to write specified process space.
2015-03-04 11:29:16 +09:00
8db54c2637 make GPE on CPL0 cause panic 2015-03-04 11:29:16 +09:00
063fa963c3 change copy-in routines
- restrict copy_from_user() to only current process.
- add read_process_vm() to read specified process space.
2015-03-04 11:29:15 +09:00
a6488adcc1 change parameter type of ihk_mc_pt_virt_to_phys()
- add type qualifier 'const' to virtual address parameter.
  that is, change parameter 'virt' from       'void *'
                                     to 'const void *'
2015-03-04 11:29:15 +09:00
2239a6b09b modify page_fault_process()
- change its argument from 'struct process *'
                        to 'struct process_vm *'.
- change its name from 'page_fault_process()'
                    to 'page_fault_process_vm()'.
- allow to resolve a fault on the process_vm of another process.
2015-03-04 11:29:15 +09:00
8c179d506a support PTRACE_ARCH_PRCTL.
refs #420
2015-03-03 14:22:57 +09:00
377341ce5f change debug output in debug/int3 handler, for struct x86_user_context. 2015-03-03 14:06:30 +09:00
8caeba7cba support PTRACE_GETSIGINFO and PTRACE_SETSIGINFO
refs #422
2015-03-03 09:54:57 +09:00
1d2f5d9893 set is_gpr_valid to initial user context 2015-02-27 14:47:43 +09:00
e4f47df3c3 initialize pstate, turbo mode and power/performance bias MSR registers
MSR_IA32_MISC_ENABLE, MSR_IA32_PERF_CTL and MSR_IA32_ENERGY_PERF_BIAS
are responsible for performance settings, this change enables McKernel
to perform on par with Linux when running the fwq benchmark.
2015-02-27 11:29:11 +09:00
4751055ee4 make ptrace(2) use lookup_user_context() 2015-02-26 17:43:10 +09:00
305ebfed0e add lookup_user_context(). refs #420 2015-02-26 17:43:10 +09:00
b66b950129 add x86_sregs into x86_user_context
x86_sregs contains the registers which are included in user_regs_struct
but not included in x86_basic_regs.
2015-02-26 17:43:10 +09:00
4aa8ba2eef sort x86_basic_regs into user_regs_struct's order 2015-02-26 17:43:10 +09:00
fab2c2aa97 wrap x86_regs with x86_user_context
and, rename x86_regs to x86_basic_regs.
2015-02-26 17:43:10 +09:00
026164eda4 fix PTRACE_ATTACH, PTRACE_DETACH, detach at tracer process terminated.
tracee process may have no parent, increment/decrement refcount.

refs #374
refs #280
2015-02-25 21:09:44 +09:00
e91d1e5b7b stack of signal handler is not 16 byte align
refs #429
2015-02-24 17:20:52 +09:00
73743eeeb0 temporary fix for waiting tracee blocked 2015-02-24 15:20:32 +09:00
c1c1fd578a modify file path of /proc files
LTP getsid02 mount06 msgctl08 msgget03 pause02 pipe07 readhead02
    swapon03 sysconf01 wait402
2015-02-24 11:33:49 +09:00
f35cc66d18 delete unused argument "ctx" from do_syscall
support waitid option "WNOWAIT"
2015-02-23 17:14:14 +09:00
d9cf1d49b1 support waitid
send SIGCHLD to parent when SIGSTOP or SIGCONT received

refs #425
refs #283
2015-02-22 20:05:30 +09:00
3d426ada01 use remap_pfn_range() in rus_vm_fault() for kernel versions newer than 3.0 2015-02-19 13:52:55 -08:00
0307f6a6cc implementation of sched_{setparam, getparam, setscheduler, getscheduler, get_priority_min, get_priority_max, rr_get_interval} system calls 2015-02-19 11:46:03 -08:00
0dee04f16b move parse_args() to after arch_init()
In attached-mic, bootparam is not mapped until arch_init() is finished.
In builtin-mic and builtin-x86, virtual address of bootparam is changed
in arch_init().
2015-02-18 20:49:46 +09:00
0e98e87b95 change type of kprintf_lock() to "unsigned long"
to match type of ihk_mc_spinlock_lock().
2015-02-18 20:49:46 +09:00
d35e60c1a3 add init_boot_processor_local() for arch_start() 2015-02-18 20:49:46 +09:00
037e17c4ed fix parsing of "osnum=" kargs 2015-02-18 16:44:14 +09:00
2baf274dac fix PTRACE_O_TRACEFORK, PTRACE_O_TRACEVFORK and PTRACE_O_TRACECLONE.
allocate debug registers area, for new process.
(gdb testsuite gdb.base/inferior-died.exp)

refs #266
refs #372
2015-02-18 16:20:23 +09:00
3b04043f2a change to throw signal SIGILL to SIGSEGV when GPE 2015-02-18 14:54:49 +09:00
c0edb6fe6f add new cpu state CPU_STATUS_RESERVED 2015-02-18 13:46:08 +09:00
bb137bc9bb make brk region just follow data region
This effectively reverts commit d70dd2338c.
2015-02-18 11:52:15 +09:00
16af976a71 support msync() system call. refs #382
Msync(2) of this version writes only the pages which the calling process
modified. Modifications of the other processes are not written.
2015-02-18 11:52:15 +09:00
6485578a7f sched_yield implementation 2015-02-17 16:20:51 -08:00
d2d0fc6721 The mcexec command became executable from a command-line at the same time 2015-02-17 18:33:38 +09:00
9574a28a5f The same CPU is assigned to a different process.
refs #423
2015-02-17 18:27:46 +09:00
dbe4ec3247 support PTRACE_O_TRACECLONE and PTRACE_O_TRACEEXEC. 2015-02-17 17:00:48 +09:00
99debc548f detach traced process, when tracer process terminate.
some fixes on PTRACE_DETACH.

refs #374
refs #280
2015-02-17 16:58:29 +09:00
fa15f6b106 support PTRACE_SYSCALL.
support PTRACE_O_TRACESYSGOOD.
ptrace_report_exec() calls ptrace_report_signal().

refs #265
2015-02-17 16:56:27 +09:00
8568a73f33 traced process should stop by any signal except for SIGKILL,
even if SIG_IGN.  (LTP ptrace01)
2015-02-17 16:51:29 +09:00
8b57b2ee57 change signal handling at mcexec 2015-02-15 17:54:11 +09:00
9a36e7b84a restart waitpid if it returns with EINTR. 2015-02-13 16:00:40 +09:00
d998691425 fix setpgid(0, 0) 2015-02-13 13:51:00 +09:00
8cdf70c500 Enable AVX extensions for processors that support it. 2015-02-12 17:51:50 -08:00
0e0bc548f6 fix mcexec SIG_IGN 2015-02-12 19:02:58 +09:00
d21ae28843 add dummy NUMA system calls. refs #405
ENOSYS system call handlers for the following.
- get_mempolicy()
- mbind()
- migrate_pages()
- move_pages()
- set_mempolicy()
2015-02-10 21:16:19 +09:00
a4a806bef7 support vsyscall_getcpu() vsyscall. refs #385
This version simply calls getcpu() system call, so that it's not fast.
2015-02-10 18:35:48 +09:00
d30d8fe71c support getcpu() system call. refs #385
It appeared on Linux(x86) in kernel 3.1.
2015-02-10 18:35:41 +09:00
a5bdd41c3d procfs: check parent entry to avoid page fault in procfs_exit() 2015-01-31 22:27:13 -08:00
5f5ab34559 support PTRACE_ATTACH.
fix PTRACE_TRACEME, PTRACE_DETACH.
2015-01-30 21:02:01 +09:00
b26fa4e87c wrong send signal to sender process when kill other process group (LTP kill10)
refs #404
2015-01-29 16:14:31 +09:00
bd5f43b119 support PTRACE_SINGLESTEP.
support debug/int3 exception.
2015-01-29 15:48:05 +09:00
f97f8dbab3 support PTRACE_PEEKTEXT and PTRACE_PEEKDATA.
support PTRACE_POKETEXT and PTRACE_POKEDATA.
  now, force write anywhere.
  read-only page must copy-on-write.
2015-01-29 15:02:15 +09:00
e30946f1f0 fix PTRACE_CONT may cause error.
refs #369
2015-01-29 14:10:31 +09:00
c3ade864d9 fix PTRACE_PEEKUSER, PTRACE_POKEUSER, PTRACE_GETREGS.
support PTRACE_SETREGS.
  In struct process, add 'unsigned long *ptrace_debugreg', instead of 'struct user *userp'.
  debug registers are read/written from/to ptrace_debugreg, save/restore in schedule().
  most general registers are proc->uctx.
  fs_base is proc->thread.tlsblock_base.
  gs_base,ds,es,fs,gs and orig_rax are uncompleted.
  other members in 'struct user' are ignored, same as Linux implementation.

refs #257
refs #373
refs #263
2015-01-29 14:08:38 +09:00
9c35935671 mcexec: fix memory allocation bug that crashes CentOS7 glibc 2015-01-27 16:55:30 +09:00
ed33ee65b2 CentOS7 spinlock, procfs and vm_munmap support (i.e., Linux kernel 3.10) 2015-01-27 16:55:28 +09:00
d04b5a09bd PTRACE_KILL omit sched_wakeup_process return
refs #369
2015-01-27 10:55:49 +09:00
08cc31f9bf support setrlimits/getrlimits, however this fix is these syscalls only.
checking resource process must implement it separately.

refs #330
2015-01-27 10:35:58 +09:00
cf2166f830 function enter_user_mode calls check_signal.
refs #392
2015-01-16 14:28:28 +09:00
765de119dc support PTRACE_O_TRACEFORK, PTRACE_O_TRACEVFORK, PTRACE_O_TRACEVFORKDONE.
to start with a SIGSTOP, do not set proc->ftn->status to PS_RUNNING in __runq_add_proc().
  change vfork() set CLONE_VFORK.

refs #266
refs #267
refs #372

support PTRACE_GETEVENTMSG.
  to store ptrace event, add 'unsigned long ptrace_eventmsg;' member in struct fork_tree_node.

refs #273
2015-01-14 10:43:18 +09:00
d46110b4d9 support PTRACE_DETACH.
change getppid() to use proc->ftn->ppid_parent->pid, for ptraced process.

refs #280
2015-01-08 12:39:52 +09:00
74f0aec478 skip copy_to_user() when r->ret is negative error number in mckernel_procfs_read().
refs #370
2015-01-08 12:38:06 +09:00
e3eb7e68bc Fix need to modify ihk/cokernel/Makefile when a file has been added under mckernel/arch (Bug#365) 2014-12-26 16:05:23 +09:00
912b8a886c do_kill distinguish PTRACE_CONT from kill. 2014-12-26 15:23:11 +09:00
e25d35a191 ihk_mc_init_ap(): cosmetics for reporting IKC, trampoline info 2014-12-25 11:05:52 +09:00
a9aad67541 IHK-SMP: boot scripts placeholder 2014-12-25 11:03:07 +09:00
cd6e663f48 handle VM_RESERVED (non-existing since Linux 3.7.0) and do_mmap_pgoff() (unexported since Linux 3.5.0) in mcctrl's syscall.c 2014-12-25 11:03:05 +09:00
5f095b3952 McKernel IHK SMP-x86 support (build system and config files) 2014-12-25 11:03:04 +09:00
811a275176 build scripts: support for separate build and source directories 2014-12-25 11:03:03 +09:00
b388f59ebd ihk_ikc_irq and ihk_ikc_irq_apicid 2014-12-25 11:03:01 +09:00
ff47261337 receive trampoline addr via parameter of arch_start() 2014-12-25 11:03:00 +09:00
a91bf9a13d ptrace: Make PTRACE_CONT/KILL debug print separated. 2014-12-24 12:39:29 +09:00
fcfa94cea1 ptrace: Add PTRACE_O_TRACEFORK (fake) support. 2014-12-24 12:39:13 +09:00
55f7ee1526 fix a warning
| mckernel/kernel/../arch/x86/kernel/memory.c: In function '__set_pt_page':
| mckernel/kernel/../arch/x86/kernel/memory.c:367:
|     warning: 'init_pt_lock_flags' may be used uninitialized in this function
2014-12-22 17:03:32 +09:00
b1b6fab7b8 fix a warning
| mckernel/kernel/host.c: In function 'syscall_packet_handler':
| mckernel/kernel/host.c:504:
|     warning: implicit declaration of function 'find_command_line'
2014-12-22 16:58:08 +09:00
391886a6f1 fix a warning
| mckernel/kernel/syscall.c: In function 'do_syscall':
| mckernel/kernel/syscall.c:187:
|     warning: 'irqstate' may be used uninitialized in this function
2014-12-22 16:58:07 +09:00
c810afe224 fix a warning
| mckernel/kernel/syscall.c: In function 'sys_madvise':
| mckernel/kernel/syscall.c:2108:
|     warning: 'range' may be used uninitialized in this function
2014-12-22 16:58:06 +09:00
5566ed1a63 fix a warning
| mckernel/executer/kernel/control.c: In function ‘release_handler’:
| mckernel/executer/kernel/control.c:264: warning: unused variable ‘c’
2014-12-22 16:58:05 +09:00
f0f91d2246 fix a warning
| mckernel/executer/kernel/control.c: In function ‘mcexec_debug_log’:
| mckernel/executer/kernel/control.c:252: warning: unused variable ‘c’
2014-12-22 16:58:04 +09:00
0942bf0ce0 make dkprintf() evaluate its parameters always
Parameters of dkprintf() should be evaluated even if dkprintf() is
disabled.  Because this enables to find expression of parameter obsolete
and to avoid unnecessary compiler warnings such as "unused variable".
2014-12-22 16:58:03 +09:00
9c94e90007 use ftn->tid instead of proc->tid 2014-12-22 16:58:02 +09:00
a6ac906105 use ftn->pid instead of proc->pid 2014-12-22 16:58:01 +09:00
d4ba4dc8b3 introduction of mckernel_procfs_file_operations; fix /proc/self path resolution;
implementation of /proc/self/pagemap (LTP mmap12)
2014-12-15 12:46:05 +09:00
815d907ca4 setpgid return -EACCES when the child process had already performed an execve (LTP setpgid03) 2014-12-09 14:01:20 +09:00
3c24315f91 support for /proc/mcos%d/PID/maps (without file info) (LTP mlock03) 2014-12-05 16:29:20 +09:00
25f108bf78 mckernel_procfs_read(): fix buffer allocation, offset check and return code 2014-12-05 16:27:48 +09:00
cc9d30efbf do_signal(): support for SIGSYS
as of POSIX.1-2001:
Signal  Value       Action  Comment
---------------------------------------------------
SIGSYS  12,31,12    Core    Bad argument to routine
2014-12-04 18:10:10 +09:00
af83f1be64 rlimit(RLIMIT_NOFILE): return one less to make sure sync pipe can be created (LTP fork09) 2014-12-04 17:40:00 +09:00
b2cab453f1 clone(): do not allow setting CLONE_THREAD and CLONE_VM separately
XXX: When CLONE_VM is set but CLONE_THREAD is not the new thread is
meant to have its own thread group, i.e., when calling exit_group()
the cloner thread wouldn't be killed. However, this is a problem on
the Linux side because we do not invoke clone in mcexec when threads
are created. Thus, currently no support for this combination is
provided.
2014-12-04 16:55:18 +09:00
8909597499 clone(): support for handling CLONE_SIGHAND and CLONE_VM flags separately 2014-12-04 16:55:17 +09:00
86f2a9067b getppid() implementation 2014-12-04 16:55:17 +09:00
a5889fb5df sigaction check signal number (LTP sigaction02) 2014-12-04 11:31:50 +09:00
f1a86cfbd3 when host mcexec down, syscall is hung up 2014-12-04 11:17:29 +09:00
c1cf630a94 mcexec: store full path to executable
required so that a forked process can obtain exec reference in the
Linux kernel even if executable was specified with relative path
and fork was called after changing the current working directory
2014-12-03 15:14:26 +09:00
8f30e16976 when mcexec is killed by SIGKILL, terminate mckernel process (BUG#259) 2014-11-27 16:13:52 +09:00
58e2e0a246 Use pidof in mcreboot script 2014-11-23 17:54:14 +09:00
ea02628f2b Add reboot and shutdown script for builtin-x86
It decides the number of cores for McKernel by looking into the
"SHIMOS: CPU Status:" line of dmesg. It sets the amount of memory for
McKernel to one fourth of the total memory obtained by "free -g".
2014-11-13 20:06:29 +09:00
89acf5c5d6 support for AT_RANDOM auxiliary entry on the process stack (needed for _dl_random in glibc) 2014-11-11 08:48:27 +09:00
ac8e2a0c40 handle VM_RESERVED (non-existing since Linux 3.7.0) and do_mmap_pgoff() (unexported since Linux 3.5.0) in mcctrl's syscall.c 2014-11-11 08:42:07 +09:00
ab7aa3354f repair signal implementation.
- Don't interrupt syscall with the ignored signal.
2014-11-07 07:55:30 +09:00
c4e0b84792 repair signal implementation.
- can not interrupt syscall
- can not receive SIGKILL
2014-10-31 16:34:59 +09:00
131 changed files with 29710 additions and 6732 deletions

View File

@ -3,10 +3,11 @@ SBINDIR = @SBINDIR@
MANDIR = @MANDIR@
all::
@(cd executer/kernel; make modules)
@(cd executer/kernel/mcctrl; make modules)
@(cd executer/kernel/mcoverlayfs; make modules)
@(cd executer/user; make)
@case "$(TARGET)" in \
attached-mic | builtin-x86 | builtin-mic) \
attached-mic | builtin-x86 | builtin-mic | smp-x86) \
(cd kernel; make) \
;; \
*) \
@ -16,10 +17,11 @@ all::
esac
install::
@(cd executer/kernel; make install)
@(cd executer/kernel/mcctrl; make install)
@(cd executer/kernel/mcoverlayfs; make install)
@(cd executer/user; make install)
@case "$(TARGET)" in \
attached-mic | builtin-x86 | builtin-mic) \
attached-mic | builtin-x86 | builtin-mic | smp-x86) \
(cd kernel; make install) \
;; \
*) \
@ -27,19 +29,40 @@ install::
exit 1 \
;; \
esac
if [ "$(TARGET)" = attached-mic ]; then \
@case "$(TARGET)" in \
attached-mic) \
mkdir -p -m 755 $(SBINDIR); \
install -m 755 arch/x86/tools/mcreboot-attached-mic.sh $(SBINDIR)/mcreboot; \
install -m 755 arch/x86/tools/mcshutdown-attached-mic.sh $(SBINDIR)/mcshutdown; \
mkdir -p -m 755 $(MANDIR)/man1; \
install -m 644 arch/x86/tools/mcreboot.1 $(MANDIR)/man1/mcreboot.1; \
fi
;; \
builtin-x86) \
mkdir -p -m 755 $(SBINDIR); \
install -m 755 arch/x86/tools/mcreboot-builtin-x86.sh $(SBINDIR)/mcreboot; \
install -m 755 arch/x86/tools/mcshutdown-builtin-x86.sh $(SBINDIR)/mcshutdown; \
mkdir -p -m 755 $(MANDIR)/man1; \
install -m 644 arch/x86/tools/mcreboot.1 $(MANDIR)/man1/mcreboot.1; \
;; \
smp-x86) \
mkdir -p -m 755 $(SBINDIR); \
install -m 755 arch/x86/tools/mcreboot-smp-x86.sh $(SBINDIR)/mcreboot.sh; \
install -m 755 arch/x86/tools/mcstop+release-smp-x86.sh $(SBINDIR)/mcstop+release.sh; \
mkdir -p -m 755 $(MANDIR)/man1; \
install -m 644 arch/x86/tools/mcreboot.1 $(MANDIR)/man1/mcreboot.1; \
;; \
*) \
echo "unknown target $(TARGET)" >&2 \
exit 1 \
;; \
esac
clean::
@(cd executer/kernel; make clean)
@(cd executer/kernel/mcctrl; make clean)
@(cd executer/kernel/mcoverlayfs; make clean)
@(cd executer/user; make clean)
@case "$(TARGET)" in \
attached-mic | builtin-x86 | builtin-mic) \
attached-mic | builtin-x86 | builtin-mic | smp-x86) \
(cd kernel; make clean) \
;; \
*) \

View File

@ -0,0 +1,2 @@
IHK_OBJS += cpu.o interrupt.o memory.o trampoline.o local.o context.o
IHK_OBJS += perfctr.o syscall.o vsyscall.o

View File

@ -10,7 +10,7 @@
* HISTORY
*/
#define X86_CPU_LOCAL_OFFSET_TSS 128
#define X86_CPU_LOCAL_OFFSET_TSS 176
#define X86_TSS_OFFSET_SP0 4
#define X86_CPU_LOCAL_OFFSET_SP0 \
(X86_CPU_LOCAL_OFFSET_TSS + X86_TSS_OFFSET_SP0)

File diff suppressed because it is too large Load Diff

View File

@ -15,7 +15,7 @@
#define dkprintf(...) kprintf(__VA_ARGS__)
#define ekprintf(...) kprintf(__VA_ARGS__)
#else
#define dkprintf(...)
#define dkprintf(...) do { if (0) kprintf(__VA_ARGS__); } while (0)
#define ekprintf(...) kprintf(__VA_ARGS__)
#endif
@ -78,15 +78,16 @@ int get_prstatus_size(void)
* \brief Fill a prstatus structure.
*
* \param head A pointer to a note structure.
* \param proc A pointer to the current process structure.
* \param thread A pointer to the current thread structure.
* \param regs0 A pointer to a x86_regs structure.
*/
void fill_prstatus(struct note *head, struct process *proc, void *regs0)
void fill_prstatus(struct note *head, struct thread *thread, void *regs0)
{
void *name;
struct elf_prstatus64 *prstatus;
struct x86_regs *regs = regs0;
struct x86_user_context *uctx = regs0;
struct x86_basic_regs *regs = &uctx->gpr;
register unsigned long _r12 asm("r12");
register unsigned long _r13 asm("r13");
register unsigned long _r14 asm("r14");
@ -159,11 +160,11 @@ int get_prpsinfo_size(void)
* \brief Fill a prpsinfo structure.
*
* \param head A pointer to a note structure.
* \param proc A pointer to the current process structure.
* \param thread A pointer to the current thread structure.
* \param regs A pointer to a x86_regs structure.
*/
void fill_prpsinfo(struct note *head, struct process *proc, void *regs)
void fill_prpsinfo(struct note *head, struct thread *thread, void *regs)
{
void *name;
struct elf_prpsinfo64 *prpsinfo;
@ -175,8 +176,8 @@ void fill_prpsinfo(struct note *head, struct process *proc, void *regs)
memcpy(name, "CORE", sizeof("CORE"));
prpsinfo = (struct elf_prpsinfo64 *)(name + align32(sizeof("CORE")));
prpsinfo->pr_state = proc->ftn->status;
prpsinfo->pr_pid = proc->ftn->pid;
prpsinfo->pr_state = thread->status;
prpsinfo->pr_pid = thread->proc->pid;
/*
We leave most of the fields unfilled.
@ -209,11 +210,11 @@ int get_auxv_size(void)
* \brief Fill an AUXV structure.
*
* \param head A pointer to a note structure.
* \param proc A pointer to the current process structure.
* \param thread A pointer to the current thread structure.
* \param regs A pointer to a x86_regs structure.
*/
void fill_auxv(struct note *head, struct process *proc, void *regs)
void fill_auxv(struct note *head, struct thread *thread, void *regs)
{
void *name;
void *auxv;
@ -224,7 +225,7 @@ void fill_auxv(struct note *head, struct process *proc, void *regs)
name = (void *) (head + 1);
memcpy(name, "CORE", sizeof("CORE"));
auxv = name + align32(sizeof("CORE"));
memcpy(auxv, proc->saved_auxv, sizeof(unsigned long) * AUXV_LEN);
memcpy(auxv, thread->proc->saved_auxv, sizeof(unsigned long) * AUXV_LEN);
}
/**
@ -242,23 +243,23 @@ int get_note_size(void)
* \brief Fill the NOTE segment.
*
* \param head A pointer to a note structure.
* \param proc A pointer to the current process structure.
* \param thread A pointer to the current thread structure.
* \param regs A pointer to a x86_regs structure.
*/
void fill_note(void *note, struct process *proc, void *regs)
void fill_note(void *note, struct thread *thread, void *regs)
{
fill_prstatus(note, proc, regs);
fill_prstatus(note, thread, regs);
note += get_prstatus_size();
fill_prpsinfo(note, proc, regs);
fill_prpsinfo(note, thread, regs);
note += get_prpsinfo_size();
fill_auxv(note, proc, regs);
fill_auxv(note, thread, regs);
}
/**
* \brief Generate an image of the core file.
*
* \param proc A pointer to the current process structure.
* \param thread A pointer to the current thread structure.
* \param regs A pointer to a x86_regs structure.
* \param coretable(out) An array of core chunks.
* \param chunks(out) Number of the entries of coretable.
@ -270,7 +271,18 @@ void fill_note(void *note, struct process *proc, void *regs)
* should be zero.
*/
int gencore(struct process *proc, void *regs,
/*@
@ requires \valid(thread);
@ requires \valid(regs);
@ requires \valid(coretable);
@ requires \valid(chunks);
@ behavior success:
@ ensures \result == 0;
@ assigns coretable;
@ behavior failure:
@ ensures \result == -1;
@*/
int gencore(struct thread *thread, void *regs,
struct coretable **coretable, int *chunks)
{
struct coretable *ct = NULL;
@ -278,7 +290,7 @@ int gencore(struct process *proc, void *regs,
Elf64_Phdr *ph = NULL;
void *note = NULL;
struct vm_range *range;
struct process_vm *vm = proc->vm;
struct process_vm *vm = thread->vm;
int segs = 1; /* the first one is for NOTE */
int notesize, phsize, alignednotesize;
unsigned int offset = 0;
@ -305,7 +317,7 @@ int gencore(struct process *proc, void *regs,
unsigned long p, phys;
int prevzero = 0;
for (p = range->start; p < range->end; p += PAGE_SIZE) {
if (ihk_mc_pt_virt_to_phys(proc->vm->page_table,
if (ihk_mc_pt_virt_to_phys(thread->vm->address_space->page_table,
(void *)p, &phys) != 0) {
prevzero = 1;
} else {
@ -325,7 +337,7 @@ int gencore(struct process *proc, void *regs,
dkprintf("we have %d segs and %d chunks.\n\n", segs, *chunks);
{
struct vm_regions region = proc->vm->region;
struct vm_regions region = thread->vm->region;
dkprintf("text: %lx-%lx\n", region.text_start, region.text_end);
dkprintf("data: %lx-%lx\n", region.data_start, region.data_end);
@ -363,7 +375,7 @@ int gencore(struct process *proc, void *regs,
goto fail;
}
memset(note, 0, alignednotesize);
fill_note(note, proc, regs);
fill_note(note, thread, regs);
/* program header for NOTE segment is exceptional */
ph[0].p_type = PT_NOTE;
@ -433,7 +445,7 @@ int gencore(struct process *proc, void *regs,
for (start = p = range->start;
p < range->end; p += PAGE_SIZE) {
if (ihk_mc_pt_virt_to_phys(proc->vm->page_table,
if (ihk_mc_pt_virt_to_phys(thread->vm->address_space->page_table,
(void *)p, &phys) != 0) {
if (prevzero == 0) {
/* We begin a new chunk */
@ -471,9 +483,9 @@ int gencore(struct process *proc, void *regs,
i++;
}
} else {
if ((proc->vm->region.user_start <= range->start) &&
(range->end <= proc->vm->region.user_end)) {
if (ihk_mc_pt_virt_to_phys(proc->vm->page_table,
if ((thread->vm->region.user_start <= range->start) &&
(range->end <= thread->vm->region.user_end)) {
if (ihk_mc_pt_virt_to_phys(thread->vm->address_space->page_table,
(void *)range->start, &phys) != 0) {
dkprintf("could not convert user virtual address %lx"
"to physical address", range->start);
@ -509,6 +521,10 @@ int gencore(struct process *proc, void *regs,
* \param coretable An array of core chunks.
*/
/*@
@ requires \valid(coretable);
@ assigns \nothing;
@*/
void freecore(struct coretable **coretable)
{
struct coretable *ct = *coretable;

View File

@ -0,0 +1,96 @@
/**
* \file arch-bitops.h
* License details are found in the file LICENSE.
* \brief
* Find last set bit in word.
* \author Taku Shimosawa <shimosawa@is.s.u-tokyo.ac.jp> \par
* Copyright (C) 2011 - 2012 Taku Shimosawa
*/
/*
* HISTORY
*/
#ifndef HEADER_X86_COMMON_ARCH_BITOPS_H
#define HEADER_X86_COMMON_ARCH_BITOPS_H
static inline int fls(int x)
{
int r;
asm("bsrl %1,%0\n\t"
"jnz 1f\n\t"
"movl $-1,%0\n"
"1:" : "=r" (r) : "rm" (x));
return r + 1;
}
/**
* ffs - find first set bit in word
* @x: the word to search
*
* This is defined the same way as the libc and compiler builtin ffs
* routines, therefore differs in spirit from the other bitops.
*
* ffs(value) returns 0 if value is 0 or the position of the first
* set bit if value is nonzero. The first (least significant) bit
* is at position 1.
*/
static inline int ffs(int x)
{
int r;
asm("bsfl %1,%0\n\t"
"jnz 1f\n\t"
"movl $-1,%0\n"
"1:" : "=r" (r) : "rm" (x));
return r + 1;
}
/**
* __ffs - find first set bit in word
* @word: The word to search
*
* Undefined if no bit exists, so code should check against 0 first.
*/
static inline unsigned long __ffs(unsigned long word)
{
asm("bsf %1,%0"
: "=r" (word)
: "rm" (word));
return word;
}
/**
* ffz - find first zero bit in word
* @word: The word to search
*
* Undefined if no zero exists, so code should check against ~0UL first.
*/
static inline unsigned long ffz(unsigned long word)
{
asm("bsf %1,%0"
: "=r" (word)
: "r" (~word));
return word;
}
#define ADDR (*(volatile long *)addr)
static inline void set_bit(int nr, volatile unsigned long *addr)
{
asm volatile("lock; btsl %1,%0"
: "+m" (ADDR)
: "Ir" (nr)
: "memory");
}
static inline void clear_bit(int nr, volatile unsigned long *addr)
{
asm volatile("lock; btrl %1,%0"
: "+m" (ADDR)
: "Ir" (nr)
: "memory");
}
#endif

View File

@ -0,0 +1,67 @@
/**
* \file futex.h
* Licence details are found in the file LICENSE.
*
* \brief
* Futex adaptation to McKernel
*
* \author Balazs Gerofi <bgerofi@riken.jp> \par
* Copyright (C) 2012 RIKEN AICS
*
*
* HISTORY:
*
*/
#ifndef _ARCH_FUTEX_H
#define _ARCH_FUTEX_H
#include <asm.h>
#define __futex_atomic_op1(insn, ret, oldval, uaddr, oparg) \
asm volatile("1:\t" insn "\n" \
"2:\t.section .fixup,\"ax\"\n" \
"3:\tmov\t%3, %1\n" \
"\tjmp\t2b\n" \
"\t.previous\n" \
_ASM_EXTABLE(1b, 3b) \
: "=r" (oldval), "=r" (ret), "+m" (*uaddr) \
: "i" (-EFAULT), "0" (oparg), "1" (0))
#define __futex_atomic_op2(insn, ret, oldval, uaddr, oparg) \
asm volatile("1:\tmovl %2, %0\n" \
"\tmovl\t%0, %3\n" \
"\t" insn "\n" \
"2:\tlock; cmpxchgl %3, %2\n" \
"\tjnz\t1b\n" \
"3:\t.section .fixup,\"ax\"\n" \
"4:\tmov\t%5, %1\n" \
"\tjmp\t3b\n" \
"\t.previous\n" \
_ASM_EXTABLE(1b, 4b) \
_ASM_EXTABLE(2b, 4b) \
: "=&a" (oldval), "=&r" (ret), \
"+m" (*uaddr), "=&r" (tem) \
: "r" (oparg), "i" (-EFAULT), "1" (0))
static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval,
int newval)
{
#ifdef __UACCESS__
if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
return -EFAULT;
#endif
asm volatile("1:\tlock; cmpxchgl %3, %1\n"
"2:\t.section .fixup, \"ax\"\n"
"3:\tmov %2, %0\n"
"\tjmp 2b\n"
"\t.previous\n"
_ASM_EXTABLE(1b, 3b)
: "=a" (oldval), "+m" (*uaddr)
: "i" (-EFAULT), "r" (newval), "0" (oldval)
: "memory"
);
return oldval;
}
#endif

View File

@ -5,15 +5,20 @@
#define __HEADER_X86_COMMON_ARCH_LOCK
#include <ihk/cpu.h>
#include <ihk/atomic.h>
//#define DEBUG_SPINLOCK
//#define DEBUG_MCS_RWLOCK
#ifdef DEBUG_SPINLOCK
#if defined(DEBUG_SPINLOCK) || defined(DEBUG_MCS_RWLOCK)
int __kprintf(const char *format, ...);
#endif
typedef int ihk_spinlock_t;
extern void preempt_enable(void);
extern void preempt_disable(void);
#define IHK_STATIC_SPINLOCK_FUNCS
static void ihk_mc_spinlock_init(ihk_spinlock_t *lock)
@ -22,7 +27,17 @@ static void ihk_mc_spinlock_init(ihk_spinlock_t *lock)
}
#define SPIN_LOCK_UNLOCKED 0
static void ihk_mc_spinlock_lock_noirq(ihk_spinlock_t *lock)
#ifdef DEBUG_SPINLOCK
#define ihk_mc_spinlock_lock_noirq(l) { \
__kprintf("[%d] call ihk_mc_spinlock_lock_noirq %p %s:%d\n", ihk_mc_get_processor_id(), (l), __FILE__, __LINE__); \
__ihk_mc_spinlock_lock_noirq(l); \
__kprintf("[%d] ret ihk_mc_spinlock_lock_noirq\n", ihk_mc_get_processor_id()); \
}
#else
#define ihk_mc_spinlock_lock_noirq __ihk_mc_spinlock_lock_noirq
#endif
static void __ihk_mc_spinlock_lock_noirq(ihk_spinlock_t *lock)
{
int inc = 0x00010000;
int tmp;
@ -41,10 +56,8 @@ static void ihk_mc_spinlock_lock_noirq(ihk_spinlock_t *lock)
: "+Q" (inc), "+m" (*lock), "=r" (tmp) : : "memory", "cc");
#endif
#ifdef DEBUG_SPINLOCK
__kprintf("[%d] trying to grab lock: 0x%lX\n",
ihk_mc_get_processor_id(), lock);
#endif
preempt_disable();
asm volatile("lock; xaddl %0, %1\n"
"movzwl %w0, %2\n\t"
"shrl $16, %0\n\t"
@ -60,36 +73,455 @@ static void ihk_mc_spinlock_lock_noirq(ihk_spinlock_t *lock)
:
: "memory", "cc");
#ifdef DEBUG_SPINLOCK
__kprintf("[%d] holding lock: 0x%lX\n", ihk_mc_get_processor_id(), lock);
#endif
}
static unsigned long ihk_mc_spinlock_lock(ihk_spinlock_t *lock)
#ifdef DEBUG_SPINLOCK
#define ihk_mc_spinlock_lock(l) ({ unsigned long rc;\
__kprintf("[%d] call ihk_mc_spinlock_lock %p %s:%d\n", ihk_mc_get_processor_id(), (l), __FILE__, __LINE__); \
rc = __ihk_mc_spinlock_lock(l);\
__kprintf("[%d] ret ihk_mc_spinlock_lock\n", ihk_mc_get_processor_id()); rc;\
})
#else
#define ihk_mc_spinlock_lock __ihk_mc_spinlock_lock
#endif
static unsigned long __ihk_mc_spinlock_lock(ihk_spinlock_t *lock)
{
unsigned long flags;
flags = cpu_disable_interrupt_save();
ihk_mc_spinlock_lock_noirq(lock);
__ihk_mc_spinlock_lock_noirq(lock);
return flags;
}
static void ihk_mc_spinlock_unlock_noirq(ihk_spinlock_t *lock)
#ifdef DEBUG_SPINLOCK
#define ihk_mc_spinlock_unlock_noirq(l) { \
__kprintf("[%d] call ihk_mc_spinlock_unlock_noirq %p %s:%d\n", ihk_mc_get_processor_id(), (l), __FILE__, __LINE__); \
__ihk_mc_spinlock_unlock_noirq(l); \
__kprintf("[%d] ret ihk_mc_spinlock_unlock_noirq\n", ihk_mc_get_processor_id()); \
}
#else
#define ihk_mc_spinlock_unlock_noirq __ihk_mc_spinlock_unlock_noirq
#endif
static void __ihk_mc_spinlock_unlock_noirq(ihk_spinlock_t *lock)
{
asm volatile ("lock incw %0" : "+m"(*lock) : : "memory", "cc");
preempt_enable();
}
static void ihk_mc_spinlock_unlock(ihk_spinlock_t *lock, unsigned long flags)
#ifdef DEBUG_SPINLOCK
#define ihk_mc_spinlock_unlock(l, f) { \
__kprintf("[%d] call ihk_mc_spinlock_unlock %p %s:%d\n", ihk_mc_get_processor_id(), (l), __FILE__, __LINE__); \
__ihk_mc_spinlock_unlock((l), (f)); \
__kprintf("[%d] ret ihk_mc_spinlock_unlock\n", ihk_mc_get_processor_id()); \
}
#else
#define ihk_mc_spinlock_unlock __ihk_mc_spinlock_unlock
#endif
static void __ihk_mc_spinlock_unlock(ihk_spinlock_t *lock, unsigned long flags)
{
ihk_mc_spinlock_unlock_noirq(lock);
__ihk_mc_spinlock_unlock_noirq(lock);
cpu_restore_interrupt(flags);
#ifdef DEBUG_SPINLOCK
__kprintf("[%d] released lock: 0x%lX\n", ihk_mc_get_processor_id(), lock);
}
/* An implementation of the Mellor-Crummey Scott (MCS) lock */
typedef struct mcs_lock_node {
unsigned long locked;
struct mcs_lock_node *next;
} __attribute__((aligned(64))) mcs_lock_node_t;
static void mcs_lock_init(struct mcs_lock_node *node)
{
node->locked = 0;
node->next = NULL;
}
static void mcs_lock_lock(struct mcs_lock_node *lock,
struct mcs_lock_node *node)
{
struct mcs_lock_node *pred;
node->next = NULL;
node->locked = 0;
pred = (struct mcs_lock_node *)xchg8((unsigned long *)&lock->next,
(unsigned long)node);
if (pred) {
node->locked = 1;
pred->next = node;
while (node->locked != 0) {
cpu_pause();
}
}
}
static void mcs_lock_unlock(struct mcs_lock_node *lock,
struct mcs_lock_node *node)
{
if (node->next == NULL) {
struct mcs_lock_node *old = (struct mcs_lock_node *)
atomic_cmpxchg8((unsigned long *)&lock->next,
(unsigned long)node, (unsigned long)0);
if (old == node) {
return;
}
while (node->next == NULL) {
cpu_pause();
}
}
node->next->locked = 0;
}
// reader/writer lock
typedef struct mcs_rwlock_node {
ihk_atomic_t count; // num of readers (use only common reader)
char type; // lock type
#define MCS_RWLOCK_TYPE_COMMON_READER 0
#define MCS_RWLOCK_TYPE_READER 1
#define MCS_RWLOCK_TYPE_WRITER 2
char locked; // lock
#define MCS_RWLOCK_LOCKED 1
#define MCS_RWLOCK_UNLOCKED 0
char dmy1; // unused
char dmy2; // unused
struct mcs_rwlock_node *next;
} __attribute__((aligned(64))) mcs_rwlock_node_t;
typedef struct mcs_rwlock_node_irqsave {
struct mcs_rwlock_node node;
unsigned long irqsave;
} __attribute__((aligned(64))) mcs_rwlock_node_irqsave_t;
typedef struct mcs_rwlock_lock {
struct mcs_rwlock_node reader; /* common reader lock */
struct mcs_rwlock_node *node; /* base */
} __attribute__((aligned(64))) mcs_rwlock_lock_t;
static void
mcs_rwlock_init(struct mcs_rwlock_lock *lock)
{
ihk_atomic_set(&lock->reader.count, 0);
lock->reader.type = MCS_RWLOCK_TYPE_COMMON_READER;
lock->node = NULL;
}
#ifdef DEBUG_MCS_RWLOCK
#define mcs_rwlock_writer_lock_noirq(l, n) { \
__kprintf("[%d] call mcs_rwlock_writer_lock_noirq %p %s:%d\n", ihk_mc_get_processor_id(), (l), __FILE__, __LINE__); \
__mcs_rwlock_writer_lock_noirq((l), (n)); \
__kprintf("[%d] ret mcs_rwlock_writer_lock_noirq\n", ihk_mc_get_processor_id()); \
}
#else
#define mcs_rwlock_writer_lock_noirq __mcs_rwlock_writer_lock_noirq
#endif
static void
__mcs_rwlock_writer_lock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node *node)
{
struct mcs_rwlock_node *pred;
preempt_disable();
node->type = MCS_RWLOCK_TYPE_WRITER;
node->next = NULL;
pred = (struct mcs_rwlock_node *)xchg8((unsigned long *)&lock->node,
(unsigned long)node);
if (pred) {
node->locked = MCS_RWLOCK_LOCKED;
pred->next = node;
while (node->locked != MCS_RWLOCK_UNLOCKED) {
cpu_pause();
}
}
}
static void
mcs_rwlock_unlock_readers(struct mcs_rwlock_lock *lock)
{
struct mcs_rwlock_node *p;
struct mcs_rwlock_node *f = NULL;
struct mcs_rwlock_node *n;
int breakf = 0;
ihk_atomic_inc(&lock->reader.count); // protect to unlock reader
for(p = &lock->reader; p->next; p = n){
n = p->next;
if(p->next->type == MCS_RWLOCK_TYPE_READER){
p->next = n->next;
if(lock->node == n){
struct mcs_rwlock_node *old;
old = (struct mcs_rwlock_node *)atomic_cmpxchg8(
(unsigned long *)&lock->node,
(unsigned long)n,
(unsigned long)p);
if(old != n){ // couldn't change
while (n->next == NULL) {
cpu_pause();
}
p->next = n->next;
}
else{
breakf = 1;
}
}
else if(p->next == NULL){
while (n->next == NULL) {
cpu_pause();
}
p->next = n->next;
}
if(f){
ihk_atomic_inc(&lock->reader.count);
n->locked = MCS_RWLOCK_UNLOCKED;
}
else
f = n;
n = p;
if(breakf)
break;
}
if(n->next == NULL && lock->node != n){
while (n->next == NULL && lock->node != n) {
cpu_pause();
}
}
}
f->locked = MCS_RWLOCK_UNLOCKED;
}
/* Debug shim: optionally trace calls; see DEBUG_MCS_RWLOCK above. */
#ifdef DEBUG_MCS_RWLOCK
#define mcs_rwlock_writer_unlock_noirq(l, n) { \
__kprintf("[%d] call mcs_rwlock_writer_unlock_noirq %p %s:%d\n", ihk_mc_get_processor_id(), (l), __FILE__, __LINE__); \
__mcs_rwlock_writer_unlock_noirq((l), (n)); \
__kprintf("[%d] ret mcs_rwlock_writer_unlock_noirq\n", ihk_mc_get_processor_id()); \
}
#else
#define mcs_rwlock_writer_unlock_noirq __mcs_rwlock_writer_unlock_noirq
#endif
/*
 * Release a write hold taken by __mcs_rwlock_writer_lock_noirq().
 * If we are the queue tail, CAS lock->node back to NULL and we are done.
 * Otherwise wait for our successor's link to appear, then either splice
 * the queue onto lock->reader and wake all queued readers, or hand the
 * lock directly to a successor writer.  Re-enables the preemption
 * disabled at lock time.
 */
static void
__mcs_rwlock_writer_unlock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node *node)
{
	if (node->next == NULL) {
		/* No visible successor: try to retire the queue. */
		struct mcs_rwlock_node *old = (struct mcs_rwlock_node *)
			atomic_cmpxchg8((unsigned long *)&lock->node,
					(unsigned long)node, (unsigned long)0);
		if (old == node) {
			goto out;
		}
		/* CAS lost: a successor is mid-enqueue; wait for its link. */
		while (node->next == NULL) {
			cpu_pause();
		}
	}
	if(node->next->type == MCS_RWLOCK_TYPE_READER){
		/* Successor is a reader: put the rest of the queue behind
		 * the common-reader node and wake the whole reader batch. */
		lock->reader.next = node->next;
		mcs_rwlock_unlock_readers(lock);
	}
	else{
		/* Successor is a writer: direct hand-off. */
		node->next->locked = MCS_RWLOCK_UNLOCKED;
	}
out:
	preempt_enable();
}
/* Debug shim: optionally trace calls; see DEBUG_MCS_RWLOCK above. */
#ifdef DEBUG_MCS_RWLOCK
#define mcs_rwlock_reader_lock_noirq(l, n) { \
__kprintf("[%d] call mcs_rwlock_reader_lock_noirq %p %s:%d\n", ihk_mc_get_processor_id(), (l), __FILE__, __LINE__); \
__mcs_rwlock_reader_lock_noirq((l), (n)); \
__kprintf("[%d] ret mcs_rwlock_reader_lock_noirq\n", ihk_mc_get_processor_id()); \
}
#else
#define mcs_rwlock_reader_lock_noirq __mcs_rwlock_reader_lock_noirq
#endif
/*
 * Atomically increment *v only while its value is non-zero.
 * Returns the value observed before the increment: non-zero means the
 * increment took effect, zero means the counter was (or became) zero
 * and was left untouched.
 */
static inline unsigned int
atomic_inc_ifnot0(ihk_atomic_t *v)
{
	unsigned int *counter = (unsigned int *)(&(v)->counter);

	for (;;) {
		unsigned int seen = *counter;

		if (seen == 0)
			return 0;
		/* CAS succeeds only if nobody changed the counter meanwhile. */
		if (atomic_cmpxchg4(counter, seen, seen + 1) == seen)
			return seen;
	}
}
/*
 * Acquire 'lock' for reading with preemption disabled (no IRQ handling).
 * Fast path: if the current tail is the embedded common-reader node,
 * join the active reader group by bumping its count; otherwise enqueue
 * like any MCS waiter and spin.  The matching preempt_enable() is in
 * __mcs_rwlock_reader_unlock_noirq().
 */
static void
__mcs_rwlock_reader_lock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node *node)
{
	struct mcs_rwlock_node *pred;

	preempt_disable();
	node->type = MCS_RWLOCK_TYPE_READER;
	node->next = NULL;
	/* NOTE(review): dmy1 records the locking CPU — presumably a debug
	 * aid; confirm against the struct definition. */
	node->dmy1 = ihk_mc_get_processor_id();
	/* Atomically append ourselves as the new queue tail. */
	pred = (struct mcs_rwlock_node *)xchg8((unsigned long *)&lock->node,
			(unsigned long)node);
	if (pred) {
		if(pred == &lock->reader){
			/* Tail was the common-reader node: readers hold the
			 * lock.  Try to join them (count must still be >0). */
			if(atomic_inc_ifnot0(&pred->count)){
				struct mcs_rwlock_node *old;
				/* Joined; try to swing the tail back to the
				 * common-reader node since we need no slot. */
				old = (struct mcs_rwlock_node *)atomic_cmpxchg8(
						(unsigned long *)&lock->node,
						(unsigned long)node,
						(unsigned long)pred);
				if (old == node) {
					goto out;	/* read lock held */
				}
				/* Someone enqueued behind us: wait for the
				 * link, then wake the reader batch (which
				 * includes us) and drop the extra count
				 * reference we took above. */
				while (node->next == NULL) {
					cpu_pause();
				}
				node->locked = MCS_RWLOCK_LOCKED;
				lock->reader.next = node;
				mcs_rwlock_unlock_readers(lock);
				ihk_atomic_dec(&pred->count);
				goto out;
			}
		}
		/* Ordinary MCS wait: link behind the predecessor and spin
		 * until a releaser unlocks us. */
		node->locked = MCS_RWLOCK_LOCKED;
		pred->next = node;
		while (node->locked != MCS_RWLOCK_UNLOCKED) {
			cpu_pause();
		}
	}
	else {
		/* Queue was empty: we are the first reader.  Hang ourselves
		 * off the common-reader node and let unlock_readers()
		 * establish reader ownership (count + unlock). */
		lock->reader.next = node;
		mcs_rwlock_unlock_readers(lock);
	}
out:
	return;
}
/* Debug shim: optionally trace calls; see DEBUG_MCS_RWLOCK above. */
#ifdef DEBUG_MCS_RWLOCK
#define mcs_rwlock_reader_unlock_noirq(l, n) { \
__kprintf("[%d] call mcs_rwlock_reader_unlock_noirq %p %s:%d\n", ihk_mc_get_processor_id(), (l), __FILE__, __LINE__); \
__mcs_rwlock_reader_unlock_noirq((l), (n)); \
__kprintf("[%d] ret mcs_rwlock_reader_unlock_noirq\n", ihk_mc_get_processor_id()); \
}
#else
#define mcs_rwlock_reader_unlock_noirq __mcs_rwlock_reader_unlock_noirq
#endif
/*
 * Drop a read hold.  Only the last reader out (count reaches zero) does
 * hand-off work: if nothing is queued it retires the common-reader node
 * from the tail; otherwise it wakes the next reader batch or the next
 * writer.  Always re-enables the preemption taken at lock time.
 */
static void
__mcs_rwlock_reader_unlock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node *node)
{
	/* Not the last reader: nothing to hand off. */
	if(ihk_atomic_dec_return(&lock->reader.count))
		goto out;
	if (lock->reader.next == NULL) {
		/* No visible waiter: try to clear the tail. */
		struct mcs_rwlock_node *old;
		old = (struct mcs_rwlock_node *)atomic_cmpxchg8(
				(unsigned long *)&lock->node,
				(unsigned long)&lock->reader,
				(unsigned long)0);
		if (old == &lock->reader) {
			goto out;
		}
		/* CAS lost: a waiter is mid-enqueue; wait for its link. */
		while (lock->reader.next == NULL) {
			cpu_pause();
		}
	}
	if(lock->reader.next->type == MCS_RWLOCK_TYPE_READER){
		/* Next waiter is a reader: wake the whole reader batch. */
		mcs_rwlock_unlock_readers(lock);
	}
	else{
		/* Next waiter is a writer: direct hand-off. */
		lock->reader.next->locked = MCS_RWLOCK_UNLOCKED;
	}
out:
	preempt_enable();
}
/* Debug shim: optionally trace calls; see DEBUG_MCS_RWLOCK above. */
#ifdef DEBUG_MCS_RWLOCK
#define mcs_rwlock_writer_lock(l, n) { \
__kprintf("[%d] call mcs_rwlock_writer_lock %p %s:%d\n", ihk_mc_get_processor_id(), (l), __FILE__, __LINE__); \
__mcs_rwlock_writer_lock((l), (n)); \
__kprintf("[%d] ret mcs_rwlock_writer_lock\n", ihk_mc_get_processor_id()); \
}
#else
#define mcs_rwlock_writer_lock __mcs_rwlock_writer_lock
#endif
/*
 * Writer lock, IRQ-save variant: disable local interrupts (stashing the
 * previous state in node->irqsave) *before* acquiring, so the critical
 * section cannot be re-entered from interrupt context on this CPU.
 */
static void
__mcs_rwlock_writer_lock(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node_irqsave *node)
{
	node->irqsave = cpu_disable_interrupt_save();
	__mcs_rwlock_writer_lock_noirq(lock, &node->node);
}
/* Debug shim: optionally trace calls; see DEBUG_MCS_RWLOCK above. */
#ifdef DEBUG_MCS_RWLOCK
#define mcs_rwlock_writer_unlock(l, n) { \
__kprintf("[%d] call mcs_rwlock_writer_unlock %p %s:%d\n", ihk_mc_get_processor_id(), (l), __FILE__, __LINE__); \
__mcs_rwlock_writer_unlock((l), (n)); \
__kprintf("[%d] ret mcs_rwlock_writer_unlock\n", ihk_mc_get_processor_id()); \
}
#else
#define mcs_rwlock_writer_unlock __mcs_rwlock_writer_unlock
#endif
/*
 * Writer unlock, IRQ-save variant: release the lock first, then restore
 * the interrupt state saved by __mcs_rwlock_writer_lock().
 */
static void
__mcs_rwlock_writer_unlock(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node_irqsave *node)
{
	__mcs_rwlock_writer_unlock_noirq(lock, &node->node);
	cpu_restore_interrupt(node->irqsave);
}
/* Debug shim: optionally trace calls; see DEBUG_MCS_RWLOCK above. */
#ifdef DEBUG_MCS_RWLOCK
#define mcs_rwlock_reader_lock(l, n) { \
__kprintf("[%d] call mcs_rwlock_reader_lock %p %s:%d\n", ihk_mc_get_processor_id(), (l), __FILE__, __LINE__); \
__mcs_rwlock_reader_lock((l), (n)); \
__kprintf("[%d] ret mcs_rwlock_reader_lock\n", ihk_mc_get_processor_id()); \
}
#else
#define mcs_rwlock_reader_lock __mcs_rwlock_reader_lock
#endif
/*
 * Reader lock, IRQ-save variant: disable local interrupts (stashing the
 * previous state in node->irqsave) before acquiring the read lock.
 */
static void
__mcs_rwlock_reader_lock(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node_irqsave *node)
{
	node->irqsave = cpu_disable_interrupt_save();
	__mcs_rwlock_reader_lock_noirq(lock, &node->node);
}
/* Debug shim: optionally trace calls; see DEBUG_MCS_RWLOCK above. */
#ifdef DEBUG_MCS_RWLOCK
#define mcs_rwlock_reader_unlock(l, n) { \
__kprintf("[%d] call mcs_rwlock_reader_unlock %p %s:%d\n", ihk_mc_get_processor_id(), (l), __FILE__, __LINE__); \
__mcs_rwlock_reader_unlock((l), (n)); \
__kprintf("[%d] ret mcs_rwlock_reader_unlock\n", ihk_mc_get_processor_id()); \
}
#else
#define mcs_rwlock_reader_unlock __mcs_rwlock_reader_unlock
#endif
/*
 * Reader unlock, IRQ-save variant: release the read hold first, then
 * restore the interrupt state saved by __mcs_rwlock_reader_lock().
 */
static void
__mcs_rwlock_reader_unlock(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node_irqsave *node)
{
	__mcs_rwlock_reader_unlock_noirq(lock, &node->node);
	cpu_restore_interrupt(node->irqsave);
}
#endif

View File

@ -5,6 +5,8 @@
* Define and declare memory management macros and functions
* \author Taku Shimosawa <shimosawa@is.s.u-tokyo.ac.jp> \par
* Copyright (C) 2011 - 2012 Taku Shimosawa
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com> \par
* Copyright (C) 2015 RIKEN AICS
*/
/*
* HISTORY
@ -20,6 +22,7 @@
#define USER_CS_ENTRY 6
#define USER_DS_ENTRY 7
#define GLOBAL_TSS_ENTRY 8
#define GETCPU_ENTRY 15
#define KERNEL_CS (KERNEL_CS_ENTRY * 8)
#define KERNEL_DS (KERNEL_DS_ENTRY * 8)
@ -38,10 +41,12 @@
#define LARGE_PAGE_P2ALIGN (LARGE_PAGE_SHIFT - PAGE_SHIFT)
#define USER_END 0x0000800000000000UL
#define TASK_UNMAPPED_BASE 0x00002AAAAAA00000UL
#define MAP_ST_START 0xffff800000000000UL
#define MAP_VMAP_START 0xfffff00000000000UL
#define MAP_FIXED_START 0xffffffff70000000UL
#define MAP_KERNEL_START 0xffffffff80000000UL
#define STACK_TOP(region) ((region)->user_end)
#define MAP_VMAP_SIZE 0x0000000100000000UL
@ -63,6 +68,8 @@
#define PF_PRESENT ((pte_t)0x01) /* entry is valid */
#define PF_WRITABLE ((pte_t)0x02)
#define PFLX_PWT ((pte_t)0x08)
#define PFLX_PCD ((pte_t)0x10)
#define PF_SIZE ((pte_t)0x80) /* entry points large page */
#define PFL4_PRESENT ((pte_t)0x01)
@ -72,8 +79,8 @@
#define PFL3_PRESENT ((pte_t)0x01)
#define PFL3_WRITABLE ((pte_t)0x02)
#define PFL3_USER ((pte_t)0x04)
#define PFL3_PWT ((pte_t)0x08)
#define PFL3_PCD ((pte_t)0x10)
#define PFL3_PWT PFLX_PWT
#define PFL3_PCD PFLX_PCD
#define PFL3_ACCESSED ((pte_t)0x20)
#define PFL3_DIRTY ((pte_t)0x40)
#define PFL3_SIZE ((pte_t)0x80) /* Used in 1G page */
@ -84,8 +91,8 @@
#define PFL2_PRESENT ((pte_t)0x01)
#define PFL2_WRITABLE ((pte_t)0x02)
#define PFL2_USER ((pte_t)0x04)
#define PFL2_PWT ((pte_t)0x08)
#define PFL2_PCD ((pte_t)0x10)
#define PFL2_PWT PFLX_PWT
#define PFL2_PCD PFLX_PCD
#define PFL2_ACCESSED ((pte_t)0x20)
#define PFL2_DIRTY ((pte_t)0x40)
#define PFL2_SIZE ((pte_t)0x80) /* Used in 2M page */
@ -96,8 +103,8 @@
#define PFL1_PRESENT ((pte_t)0x01)
#define PFL1_WRITABLE ((pte_t)0x02)
#define PFL1_USER ((pte_t)0x04)
#define PFL1_PWT ((pte_t)0x08)
#define PFL1_PCD ((pte_t)0x10)
#define PFL1_PWT PFLX_PWT
#define PFL1_PCD PFLX_PCD
#define PFL1_ACCESSED ((pte_t)0x20)
#define PFL1_DIRTY ((pte_t)0x40)
#define PFL1_IGNORED_11 ((pte_t)1 << 11)
@ -117,6 +124,25 @@
#define PTE_NULL ((pte_t)0)
typedef unsigned long pte_t;
/*
* pagemap kernel ABI bits
*/
#define PM_ENTRY_BYTES sizeof(uint64_t)
#define PM_STATUS_BITS 3
#define PM_STATUS_OFFSET (64 - PM_STATUS_BITS)
#define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET)
#define PM_STATUS(nr) (((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK)
#define PM_PSHIFT_BITS 6
#define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS)
#define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET)
#define PM_PSHIFT(x) (((uint64_t) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
#define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1)
#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK)
#define PM_PRESENT PM_STATUS(4LL)
#define PM_SWAP PM_STATUS(2LL)
/* For easy conversion, it is better to be the same as architecture's ones */
enum ihk_mc_pt_attribute {
PTATTR_ACTIVE = 0x01,
@ -128,8 +154,11 @@ enum ihk_mc_pt_attribute {
PTATTR_NO_EXECUTE = 0x8000000000000000,
PTATTR_UNCACHABLE = 0x10000,
PTATTR_FOR_USER = 0x20000,
PTATTR_WRITE_COMBINED = 0x40000,
};
enum ihk_mc_pt_attribute attr_mask;
static inline int pte_is_null(pte_t *ptep)
{
return (*ptep == PTE_NULL);
@ -185,6 +214,33 @@ static inline off_t pte_get_off(pte_t *ptep, size_t pgsize)
return (off_t)(*ptep & PAGE_MASK);
}
static inline enum ihk_mc_pt_attribute pte_get_attr(pte_t *ptep, size_t pgsize)
{
enum ihk_mc_pt_attribute attr;
attr = *ptep & attr_mask;
if (*ptep & PFLX_PWT) {
if (*ptep & PFLX_PCD) {
attr |= PTATTR_UNCACHABLE;
}
else {
attr |= PTATTR_WRITE_COMBINED;
}
}
if (((pgsize == PTL2_SIZE) && (*ptep & PFL2_SIZE))
|| ((pgsize == PTL3_SIZE) && (*ptep & PFL3_SIZE))) {
attr |= PTATTR_LARGEPAGE;
}
return attr;
} /* pte_get_attr() */
static inline void pte_make_null(pte_t *ptep, size_t pgsize)
{
*ptep = PTE_NULL;
return;
}
static inline void pte_make_fileoff(off_t off,
enum ihk_mc_pt_attribute ptattr, size_t pgsize, pte_t *ptep)
{
@ -216,6 +272,36 @@ static inline void pte_xchg(pte_t *ptep, pte_t *valp)
#define pte_xchg(p,vp) do { *(vp) = xchg((p), *(vp)); } while (0)
#endif
static inline void pte_clear_dirty(pte_t *ptep, size_t pgsize)
{
uint64_t mask;
switch (pgsize) {
default: /* through */
case PTL1_SIZE: mask = ~PFL1_DIRTY; break;
case PTL2_SIZE: mask = ~PFL2_DIRTY; break;
case PTL3_SIZE: mask = ~PFL3_DIRTY; break;
}
asm volatile ("lock andq %0,%1" :: "r"(mask), "m"(*ptep));
return;
}
static inline void pte_set_dirty(pte_t *ptep, size_t pgsize)
{
uint64_t mask;
switch (pgsize) {
default: /* through */
case PTL1_SIZE: mask = PFL1_DIRTY; break;
case PTL2_SIZE: mask = PFL2_DIRTY; break;
case PTL3_SIZE: mask = PFL3_DIRTY; break;
}
asm volatile ("lock orq %0,%1" :: "r"(mask), "m"(*ptep));
return;
}
struct page_table;
void set_pte(pte_t *ppte, unsigned long phys, enum ihk_mc_pt_attribute attr);
pte_t *get_pte(struct page_table *pt, void *virt, enum ihk_mc_pt_attribute attr);
@ -227,8 +313,9 @@ void flush_tlb_single(unsigned long addr);
void *map_fixed_area(unsigned long phys, unsigned long size, int uncachable);
#define AP_TRAMPOLINE 0x10000
#define AP_TRAMPOLINE_SIZE 0x4000
extern unsigned long ap_trampoline;
//#define AP_TRAMPOLINE 0x10000
#define AP_TRAMPOLINE_SIZE 0x2000
/* Local is cachable */
#define IHK_IKC_QUEUE_PT_ATTR (PTATTR_NO_EXECUTE | PTATTR_WRITABLE | PTATTR_UNCACHABLE)

View File

@ -0,0 +1,18 @@
/**
* \file auxvec.h
* License details are found in the file LICENSE.
* \brief
* Declare architecture-dependent constants for auxiliary vector
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com>
* Copyright (C) 2016 RIKEN AICS
*/
/*
* HISTORY
*/
#ifndef ARCH_AUXVEC_H
#define ARCH_AUXVEC_H
#define AT_SYSINFO_EHDR 33
#endif

View File

@ -0,0 +1,37 @@
/**
* \file cpu.h
* License details are found in the file LICENSE.
* \brief
* Declare architecture-dependent types and functions to control CPU.
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com>
* Copyright (C) 2015 RIKEN AICS
*/
/*
* HISTORY
*/
#ifndef ARCH_CPU_H
#define ARCH_CPU_H
#include <ihk/cpu.h>
/*
 * Read/write memory barriers.  Both expand to barrier() only, i.e. a
 * compiler reordering fence with no CPU fence instruction —
 * NOTE(review): presumably relying on x86's strong (TSO) ordering;
 * confirm before reusing on another architecture.
 */
static inline void rmb(void)
{
	barrier();
}

static inline void wmb(void)
{
	barrier();
}
/*
 * Read the CPU's 64-bit time-stamp counter.
 *
 * Declared static inline (not just static) to match the sibling
 * rmb()/wmb() helpers above: a plain static function defined in a
 * header is duplicated in, and triggers unused-function warnings for,
 * every translation unit that includes it.  __asm__/__volatile__ are
 * used instead of the GNU 'asm' keyword so the header also compiles in
 * strict ISO modes (-std=c11).
 *
 * Returns: the raw TSC value (EDX:EAX combined).
 */
static inline unsigned long read_tsc(void)
{
	unsigned int low, high;

	/* RDTSC delivers the counter split across EDX:EAX. */
	__asm__ __volatile__("rdtsc" : "=a"(low), "=d"(high));
	return (low | ((unsigned long)high << 32));
}
#endif /* ARCH_CPU_H */

View File

@ -0,0 +1,16 @@
#ifndef __ARCH_MM_H
#define __ARCH_MM_H
struct process_vm;
/*
 * TLB-flush hooks: both are intentionally empty on this architecture.
 * flush_nfo_tlb() now takes (void) — in C, empty parentheses declare a
 * function with unspecified arguments, not a zero-argument prototype,
 * which defeats compile-time call checking.
 */
static inline void
flush_nfo_tlb(void)
{
	/* no-op on this architecture */
}

/*
 * Per-address-space variant; 'vm' is accepted for interface symmetry
 * but unused.
 */
static inline void
flush_nfo_tlb_mm(struct process_vm *vm)
{
	/* no-op on this architecture */
}
#endif

View File

@ -0,0 +1,40 @@
/**
* \file mman.h
* License details are found in the file LICENSE.
* \brief
* memory management declarations
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com> \par
* Copyright (C) 2015 RIKEN AICS
*/
/*
* HISTORY:
*/
#ifndef HEADER_ARCH_MMAN_H
#define HEADER_ARCH_MMAN_H
/*
* mapping flags
*/
#define MAP_32BIT 0x40
#define MAP_GROWSDOWN 0x0100
#define MAP_DENYWRITE 0x0800
#define MAP_EXECUTABLE 0x1000
#define MAP_LOCKED 0x2000
#define MAP_NORESERVE 0x4000
#define MAP_POPULATE 0x8000
#define MAP_NONBLOCK 0x00010000
#define MAP_STACK 0x00020000
#define MAP_HUGETLB 0x00040000
#define MAP_HUGE_SHIFT 26
#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT)
#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT)
/*
* for mlockall()
*/
#define MCL_CURRENT 0x01
#define MCL_FUTURE 0x02
#endif /* HEADER_ARCH_MMAN_H */

View File

@ -0,0 +1,46 @@
/**
* \file shm.h
* License details are found in the file LICENSE.
* \brief
* header file for System V shared memory
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com> \par
* Copyright (C) 2015 RIKEN AICS
*/
/*
* HISTORY:
*/
#ifndef HEADER_ARCH_SHM_H
#define HEADER_ARCH_SHM_H
/* shmflg */
#define SHM_HUGE_SHIFT 26
#define SHM_HUGE_2MB (21 << SHM_HUGE_SHIFT)
#define SHM_HUGE_1GB (30 << SHM_HUGE_SHIFT)
struct ipc_perm {
key_t key;
uid_t uid;
gid_t gid;
uid_t cuid;
gid_t cgid;
uint16_t mode;
uint8_t padding[2];
uint16_t seq;
uint8_t padding2[22];
};
struct shmid_ds {
struct ipc_perm shm_perm;
size_t shm_segsz;
time_t shm_atime;
time_t shm_dtime;
time_t shm_ctime;
pid_t shm_cpid;
pid_t shm_lpid;
uint64_t shm_nattch;
uint8_t padding[12];
int init_pgshift;
};
#endif /* HEADER_ARCH_SHM_H */

View File

@ -22,7 +22,7 @@
* - 4096 : kernel stack
*/
#define X86_CPU_LOCAL_OFFSET_TSS 128
#define X86_CPU_LOCAL_OFFSET_TSS 176
#define X86_CPU_LOCAL_OFFSET_KSTACK 16
#define X86_CPU_LOCAL_OFFSET_USTACK 24
@ -39,10 +39,13 @@ struct x86_cpu_local_variables {
struct x86_desc_ptr gdt_ptr;
unsigned short pad[3];
/* 48 */
uint64_t gdt[10];
/* 128 */
uint64_t gdt[16];
/* 176 */
struct tss64 tss;
/* 280 */
unsigned long paniced;
uint64_t panic_regs[21];
/* 456 */
} __attribute__((packed));
struct x86_cpu_local_variables *get_x86_cpu_local_variable(int id);

View File

@ -1,40 +1,7 @@
#ifndef _ASM_GENERIC_ERRNO_BASE_H
#define _ASM_GENERIC_ERRNO_BASE_H
#ifndef _ERRNO_BASE_H
#define _ERRNO_BASE_H
#define EPERM 1 /* Operation not permitted */
#define ENOENT 2 /* No such file or directory */
#define ESRCH 3 /* No such process */
#define EINTR 4 /* Interrupted system call */
#define EIO 5 /* I/O error */
#define ENXIO 6 /* No such device or address */
#define E2BIG 7 /* Argument list too long */
#define ENOEXEC 8 /* Exec format error */
#define EBADF 9 /* Bad file number */
#define ECHILD 10 /* No child processes */
#define EAGAIN 11 /* Try again */
#define ENOMEM 12 /* Out of memory */
#define EACCES 13 /* Permission denied */
#define EFAULT 14 /* Bad address */
#define ENOTBLK 15 /* Block device required */
#define EBUSY 16 /* Device or resource busy */
#define EEXIST 17 /* File exists */
#define EXDEV 18 /* Cross-device link */
#define ENODEV 19 /* No such device */
#define ENOTDIR 20 /* Not a directory */
#define EISDIR 21 /* Is a directory */
#define EINVAL 22 /* Invalid argument */
#define ENFILE 23 /* File table overflow */
#define EMFILE 24 /* Too many open files */
#define ENOTTY 25 /* Not a typewriter */
#define ETXTBSY 26 /* Text file busy */
#define EFBIG 27 /* File too large */
#define ENOSPC 28 /* No space left on device */
#define ESPIPE 29 /* Illegal seek */
#define EROFS 30 /* Read-only file system */
#define EMLINK 31 /* Too many links */
#define EPIPE 32 /* Broken pipe */
#define EDOM 33 /* Math argument out of domain of func */
#define ERANGE 34 /* Math result not representable */
#include <generic-errno.h>
#define EDEADLK 35 /* Resource deadlock would occur */
#define ENAMETOOLONG 36 /* File name too long */
@ -141,29 +108,4 @@
#define ERFKILL 132 /* Operation not possible due to RF-kill */
#ifdef __KERNEL__
/* Should never be seen by user programs */
#define ERESTARTSYS 512
#define ERESTARTNOINTR 513
#define ERESTARTNOHAND 514 /* restart if no handler.. */
#define ENOIOCTLCMD 515 /* No ioctl command */
#define ERESTART_RESTARTBLOCK 516 /* restart by calling sys_restart_syscall */
/* Defined for the NFSv3 protocol */
#define EBADHANDLE 521 /* Illegal NFS file handle */
#define ENOTSYNC 522 /* Update synchronization mismatch */
#define EBADCOOKIE 523 /* Cookie is stale */
#define ENOTSUPP 524 /* Operation is not supported */
#define ETOOSMALL 525 /* Buffer or request is too small */
#define ESERVERFAULT 526 /* An untranslatable error occurred */
#define EBADTYPE 527 /* Type not supported by server */
#define EJUKEBOX 528 /* Request initiated, but will not complete before timeout */
#define EIOCBQUEUED 529 /* iocb queued, will get completion event */
#define EIOCBRETRY 530 /* iocb queued, will trigger a retry */
#endif
#endif

View File

@ -13,6 +13,10 @@
#ifndef HEADER_X86_COMMON_IHK_ATOMIC_H
#define HEADER_X86_COMMON_IHK_ATOMIC_H
/***********************************************************************
* ihk_atomic_t
*/
typedef struct {
int counter;
} ihk_atomic_t;
@ -95,6 +99,30 @@ static inline int ihk_atomic_sub_return(int i, ihk_atomic_t *v)
#define ihk_atomic_inc_return(v) (ihk_atomic_add_return(1, v))
#define ihk_atomic_dec_return(v) (ihk_atomic_sub_return(1, v))
/***********************************************************************
* ihk_atomic64_t
*/
/* 64-bit atomic counter (x86-64: naturally atomic aligned load). */
typedef struct {
	long counter64;
} ihk_atomic64_t;

/* Static initializer, e.g. ihk_atomic64_t a = IHK_ATOMIC64_INIT(0); */
#define IHK_ATOMIC64_INIT(i) { .counter64 = (i) }

/* Read the counter; the volatile cast forces a fresh load each call. */
static inline long ihk_atomic64_read(const ihk_atomic64_t *v)
{
	return *(volatile long *)&(v)->counter64;
}

/* Atomically increment the counter (LOCK INCQ). */
static inline void ihk_atomic64_inc(ihk_atomic64_t *v)
{
	asm volatile ("lock incq %0" : "+m"(v->counter64));
}
/***********************************************************************
* others
*/
/*
* Note: no "lock" prefix even on SMP: xchg always implies lock anyway
* Note 2: xchg has side effect, so that attribute volatile is necessary,
@ -112,6 +140,17 @@ static inline int ihk_atomic_sub_return(int i, ihk_atomic_t *v)
__x; \
})
/*
 * Atomically exchange the 8-byte value at *ptr with x; returns the
 * previous contents.  XCHG with a memory operand is implicitly locked
 * on x86, so no LOCK prefix is needed.
 */
static inline unsigned long xchg8(unsigned long *ptr, unsigned long x)
{
	unsigned long __x = (x);

	asm volatile("xchgq %0,%1"
			: "=r" (__x)
			: "m" (*(volatile unsigned long*)(ptr)), "0" (__x)
			: "memory");
	return __x;
}
#define __xchg(x, ptr, size) \
({ \
__typeof(*(ptr)) __x = (x); \
@ -150,5 +189,30 @@ static inline int ihk_atomic_sub_return(int i, ihk_atomic_t *v)
#define xchg(ptr, v) \
__xchg((v), (ptr), sizeof(*ptr))
/*
 * 8-byte compare-and-swap: if *addr == oldval, store newval.
 * Returns the value actually found at *addr; equal to oldval iff the
 * swap took place.
 */
static inline unsigned long atomic_cmpxchg8(unsigned long *addr,
		unsigned long oldval,
		unsigned long newval)
{
	asm volatile("lock; cmpxchgq %2, %1\n"
			: "=a" (oldval), "+m" (*addr)
			: "r" (newval), "0" (oldval)
			: "memory"
			);
	return oldval;
}
/*
 * 4-byte compare-and-swap: if *addr == oldval, store newval.
 * Returns the value actually found at *addr; equal to oldval iff the
 * swap took place.
 */
static inline unsigned long atomic_cmpxchg4(unsigned int *addr,
		unsigned int oldval,
		unsigned int newval)
{
	asm volatile("lock; cmpxchgl %2, %1\n"
			: "=a" (oldval), "+m" (*addr)
			: "r" (newval), "0" (oldval)
			: "memory"
			);
	return oldval;
}
#endif

View File

@ -22,19 +22,35 @@ struct x86_kregs {
};
typedef struct x86_kregs ihk_mc_kernel_context_t;
/* XXX: User context should contain floating point registers */
typedef struct x86_regs ihk_mc_user_context_t;
struct x86_user_context {
struct x86_sregs sr;
#define ihk_mc_syscall_arg0(uc) (uc)->rdi
#define ihk_mc_syscall_arg1(uc) (uc)->rsi
#define ihk_mc_syscall_arg2(uc) (uc)->rdx
#define ihk_mc_syscall_arg3(uc) (uc)->r10
#define ihk_mc_syscall_arg4(uc) (uc)->r8
#define ihk_mc_syscall_arg5(uc) (uc)->r9
/* 16-byte boundary here */
uint8_t is_gpr_valid;
uint8_t is_sr_valid;
uint8_t spare_flags6;
uint8_t spare_flags5;
uint8_t spare_flags4;
uint8_t spare_flags3;
uint8_t spare_flags2;
uint8_t spare_flags1;
struct x86_basic_regs gpr; /* must be last */
/* 16-byte boundary here */
};
typedef struct x86_user_context ihk_mc_user_context_t;
#define ihk_mc_syscall_ret(uc) (uc)->rax
#define ihk_mc_syscall_arg0(uc) (uc)->gpr.rdi
#define ihk_mc_syscall_arg1(uc) (uc)->gpr.rsi
#define ihk_mc_syscall_arg2(uc) (uc)->gpr.rdx
#define ihk_mc_syscall_arg3(uc) (uc)->gpr.r10
#define ihk_mc_syscall_arg4(uc) (uc)->gpr.r8
#define ihk_mc_syscall_arg5(uc) (uc)->gpr.r9
#define ihk_mc_syscall_pc(uc) (uc)->rip
#define ihk_mc_syscall_sp(uc) (uc)->rsp
#define ihk_mc_syscall_ret(uc) (uc)->gpr.rax
#define ihk_mc_syscall_pc(uc) (uc)->gpr.rip
#define ihk_mc_syscall_sp(uc) (uc)->gpr.rsp
#endif

View File

@ -31,9 +31,5 @@ typedef int64_t off_t;
#define NULL ((void *)0)
#define BITS_PER_LONG_SHIFT 6
#define BITS_PER_LONG (1 << BITS_PER_LONG_SHIFT)
#endif

View File

@ -0,0 +1,17 @@
/**
* \file prctl.h
* License details are found in the file LICENSE.
*/
/*
* HISTORY
*/
#ifndef __ARCH_PRCTL_H
#define __ARCH_PRCTL_H
#define ARCH_SET_GS 0x1001
#define ARCH_SET_FS 0x1002
#define ARCH_GET_FS 0x1003
#define ARCH_GET_GS 0x1004
#endif

View File

@ -6,6 +6,8 @@
* Machine Specific Registers (MSR)
* \author Taku Shimosawa <shimosawa@is.s.u-tokyo.ac.jp> \par
* Copyright (C) 2011 - 2012 Taku Shimosawa
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com> \par
* Copyright (C) 2015 RIKEN AICS
*/
/*
* HISTORY
@ -16,7 +18,31 @@
#include <types.h>
#define RFLAGS_CF (1 << 0)
#define RFLAGS_PF (1 << 2)
#define RFLAGS_AF (1 << 4)
#define RFLAGS_ZF (1 << 6)
#define RFLAGS_SF (1 << 7)
#define RFLAGS_TF (1 << 8)
#define RFLAGS_IF (1 << 9)
#define RFLAGS_DF (1 << 10)
#define RFLAGS_OF (1 << 11)
#define RFLAGS_IOPL (3 << 12)
#define RFLAGS_NT (1 << 14)
#define RFLAGS_RF (1 << 16)
#define RFLAGS_VM (1 << 17)
#define RFLAGS_AC (1 << 18)
#define RFLAGS_VIF (1 << 19)
#define RFLAGS_VIP (1 << 20)
#define RFLAGS_ID (1 << 21)
#define DB6_B0 (1 << 0)
#define DB6_B1 (1 << 1)
#define DB6_B2 (1 << 2)
#define DB6_B3 (1 << 3)
#define DB6_BD (1 << 13)
#define DB6_BS (1 << 14)
#define DB6_BT (1 << 15)
#define MSR_EFER 0xc0000080
#define MSR_STAR 0xc0000081
@ -26,6 +52,14 @@
#define MSR_GS_BASE 0xc0000101
#define MSR_IA32_APIC_BASE 0x000000001b
#define MSR_PLATFORM_INFO 0x000000ce
#define MSR_IA32_PERF_CTL 0x00000199
#define MSR_IA32_MISC_ENABLE 0x000001a0
#define MSR_IA32_ENERGY_PERF_BIAS 0x000001b0
#define MSR_NHM_TURBO_RATIO_LIMIT 0x000001ad
#define MSR_IA32_CR_PAT 0x00000277
#define MSR_IA32_XSS 0xda0
#define CVAL(event, mask) \
((((event) & 0xf00) << 24) | ((mask) << 8) | ((event) & 0xff))
@ -37,6 +71,25 @@
#define MSR_PERF_CTL_0 0xc0010000
#define MSR_PERF_CTR_0 0xc0010004
static unsigned long xgetbv(unsigned int index)
{
unsigned int low, high;
asm volatile("xgetbv" : "=a" (low), "=d" (high) : "c" (index));
return low | ((unsigned long)high << 32);
}
static void xsetbv(unsigned int index, unsigned long val)
{
unsigned int low, high;
low = val;
high = val >> 32;
asm volatile("xsetbv" : : "a" (low), "d" (high), "c" (index));
}
static void wrmsr(unsigned int idx, unsigned long value){
unsigned int high, low;
@ -135,10 +188,19 @@ struct tss64 {
unsigned short iomap_address;
} __attribute__((packed));
struct x86_regs {
unsigned long r15, r14, r13, r12, r11, r10, r9, r8;
unsigned long rdi, rsi, rdx, rcx, rbx, rax, rbp;
unsigned long error, rip, cs, rflags, rsp, ss;
struct x86_basic_regs {
unsigned long r15, r14, r13, r12, rbp, rbx, r11, r10;
unsigned long r9, r8, rax, rcx, rdx, rsi, rdi, error;
unsigned long rip, cs, rflags, rsp, ss;
};
struct x86_sregs {
unsigned long fs_base;
unsigned long gs_base;
unsigned long ds;
unsigned long es;
unsigned long fs;
unsigned long gs;
};
#define REGS_GET_STACK_POINTER(regs) (((struct x86_regs *)regs)->rsp)
@ -162,7 +224,72 @@ enum x86_pf_error_code {
PF_RSVD = 1 << 3,
PF_INSTR = 1 << 4,
PF_PATCH = 1 << 29,
PF_POPULATE = 1 << 30,
};
struct i387_fxsave_struct {
unsigned short cwd;
unsigned short swd;
unsigned short twd;
unsigned short fop;
union {
struct {
unsigned long rip;
unsigned long rdp;
};
struct {
unsigned int fip;
unsigned int fcs;
unsigned int foo;
unsigned int fos;
};
};
unsigned int mxcsr;
unsigned int mxcsr_mask;
unsigned int st_space[32];
unsigned int xmm_space[64];
unsigned int padding[12];
union {
unsigned int padding1[12];
unsigned int sw_reserved[12];
};
} __attribute__((aligned(16)));
struct ymmh_struct {
unsigned int ymmh_space[64];
};
struct lwp_struct {
unsigned char reserved[128];
};
struct bndreg {
unsigned long lower_bound;
unsigned long upper_bound;
} __attribute__((packed));
struct bndcsr {
unsigned long bndcfgu;
unsigned long bndstatus;
} __attribute__((packed));
struct xsave_hdr_struct {
unsigned long xstate_bv;
unsigned long xcomp_bv;
unsigned long reserved[6];
} __attribute__((packed));
struct xsave_struct {
struct i387_fxsave_struct i387;
struct xsave_hdr_struct xsave_hdr;
struct ymmh_struct ymmh;
struct lwp_struct lwp;
struct bndreg bndreg[4];
struct bndcsr bndcsr;
} __attribute__ ((packed, aligned (64)));
typedef struct xsave_struct fp_regs_struct;
#endif

View File

@ -90,10 +90,6 @@ enum __rlimit_resource
#define RLIM_NLIMITS __RLIM_NLIMITS
};
struct rlimit {
uint64_t rlim_cur; /* Soft limit */
uint64_t rlim_max; /* Hard limit (ceiling for rlim_cur) */
};
#include <generic-rlimit.h>
#endif

View File

@ -20,12 +20,13 @@
* syscall_name[] only, no handler exists.
*/
SYSCALL_DELEGATED(0, read)
SYSCALL_HANDLED(0, read)
SYSCALL_DELEGATED(1, write)
SYSCALL_DELEGATED(2, open)
SYSCALL_DELEGATED(3, close)
SYSCALL_HANDLED(3, close)
SYSCALL_DELEGATED(4, stat)
SYSCALL_DELEGATED(5, fstat)
SYSCALL_DELEGATED(7, poll)
SYSCALL_DELEGATED(8, lseek)
SYSCALL_HANDLED(9, mmap)
SYSCALL_HANDLED(10, mprotect)
@ -34,15 +35,24 @@ SYSCALL_HANDLED(12, brk)
SYSCALL_HANDLED(13, rt_sigaction)
SYSCALL_HANDLED(14, rt_sigprocmask)
SYSCALL_HANDLED(15, rt_sigreturn)
SYSCALL_DELEGATED(16, ioctl)
SYSCALL_HANDLED(16, ioctl)
SYSCALL_DELEGATED(17, pread64)
SYSCALL_DELEGATED(18, pwrite64)
SYSCALL_DELEGATED(20, writev)
SYSCALL_DELEGATED(21, access)
SYSCALL_DELEGATED(23, select)
SYSCALL_HANDLED(24, sched_yield)
SYSCALL_HANDLED(25, mremap)
SYSCALL_HANDLED(26, msync)
SYSCALL_HANDLED(27, mincore)
SYSCALL_HANDLED(28, madvise)
SYSCALL_HANDLED(29, shmget)
SYSCALL_HANDLED(30, shmat)
SYSCALL_HANDLED(31, shmctl)
SYSCALL_HANDLED(34, pause)
SYSCALL_HANDLED(35, nanosleep)
SYSCALL_HANDLED(36, getitimer)
SYSCALL_HANDLED(38, setitimer)
SYSCALL_HANDLED(39, getpid)
SYSCALL_HANDLED(56, clone)
SYSCALL_DELEGATED(57, fork)
@ -52,43 +62,89 @@ SYSCALL_HANDLED(60, exit)
SYSCALL_HANDLED(61, wait4)
SYSCALL_HANDLED(62, kill)
SYSCALL_DELEGATED(63, uname)
SYSCALL_DELEGATED(65, semop)
SYSCALL_HANDLED(67, shmdt)
SYSCALL_DELEGATED(69, msgsnd)
SYSCALL_DELEGATED(70, msgrcv)
SYSCALL_DELEGATED(72, fcntl)
SYSCALL_DELEGATED(79, getcwd)
SYSCALL_DELEGATED(89, readlink)
SYSCALL_DELEGATED(96, gettimeofday)
SYSCALL_HANDLED(96, gettimeofday)
SYSCALL_HANDLED(97, getrlimit)
SYSCALL_HANDLED(98, getrusage)
SYSCALL_HANDLED(100, times)
SYSCALL_HANDLED(101, ptrace)
SYSCALL_DELEGATED(102, getuid)
SYSCALL_DELEGATED(104, getgid)
SYSCALL_DELEGATED(107, geteuid)
SYSCALL_DELEGATED(108, getegid)
SYSCALL_HANDLED(102, getuid)
SYSCALL_HANDLED(104, getgid)
SYSCALL_HANDLED(105, setuid)
SYSCALL_HANDLED(106, setgid)
SYSCALL_HANDLED(107, geteuid)
SYSCALL_HANDLED(108, getegid)
SYSCALL_HANDLED(109, setpgid)
SYSCALL_DELEGATED(110, getppid)
SYSCALL_HANDLED(110, getppid)
SYSCALL_DELEGATED(111, getpgrp)
SYSCALL_HANDLED(113, setreuid)
SYSCALL_HANDLED(114, setregid)
SYSCALL_HANDLED(117, setresuid)
SYSCALL_HANDLED(118, getresuid)
SYSCALL_HANDLED(119, setresgid)
SYSCALL_HANDLED(120, getresgid)
SYSCALL_HANDLED(122, setfsuid)
SYSCALL_HANDLED(123, setfsgid)
SYSCALL_HANDLED(127, rt_sigpending)
SYSCALL_HANDLED(128, rt_sigtimedwait)
SYSCALL_HANDLED(129, rt_sigqueueinfo)
SYSCALL_HANDLED(130, rt_sigsuspend)
SYSCALL_HANDLED(131, sigaltstack)
SYSCALL_HANDLED(142, sched_setparam)
SYSCALL_HANDLED(143, sched_getparam)
SYSCALL_HANDLED(144, sched_setscheduler)
SYSCALL_HANDLED(145, sched_getscheduler)
SYSCALL_HANDLED(146, sched_get_priority_max)
SYSCALL_HANDLED(147, sched_get_priority_min)
SYSCALL_HANDLED(148, sched_rr_get_interval)
SYSCALL_HANDLED(149, mlock)
SYSCALL_HANDLED(150, munlock)
SYSCALL_HANDLED(151, mlockall)
SYSCALL_HANDLED(152, munlockall)
SYSCALL_HANDLED(158, arch_prctl)
SYSCALL_HANDLED(160, setrlimit)
SYSCALL_HANDLED(164, settimeofday)
SYSCALL_HANDLED(186, gettid)
SYSCALL_HANDLED(200, tkill)
SYSCALL_DELEGATED(201, time)
SYSCALL_HANDLED(202, futex)
SYSCALL_HANDLED(203, sched_setaffinity)
SYSCALL_HANDLED(204, sched_getaffinity)
SYSCALL_DELEGATED(208, io_getevents)
SYSCALL_HANDLED(216, remap_file_pages)
SYSCALL_DELEGATED(217, getdents64)
SYSCALL_HANDLED(218, set_tid_address)
SYSCALL_DELEGATED(220, semtimedop)
SYSCALL_HANDLED(228, clock_gettime)
SYSCALL_DELEGATED(230, clock_nanosleep)
SYSCALL_HANDLED(231, exit_group)
SYSCALL_DELEGATED(232, epoll_wait)
SYSCALL_HANDLED(234, tgkill)
SYSCALL_HANDLED(237, mbind)
SYSCALL_HANDLED(238, set_mempolicy)
SYSCALL_HANDLED(239, get_mempolicy)
SYSCALL_HANDLED(247, waitid)
SYSCALL_HANDLED(256, migrate_pages)
SYSCALL_DELEGATED(270, pselect6)
SYSCALL_DELEGATED(271, ppoll)
SYSCALL_HANDLED(273, set_robust_list)
SYSCALL_HANDLED(279, move_pages)
SYSCALL_DELEGATED(281, epoll_pwait)
SYSCALL_HANDLED(282, signalfd)
SYSCALL_HANDLED(289, signalfd4)
SYSCALL_HANDLED(298, perf_event_open)
#ifdef DCFA_KMOD
SYSCALL_HANDLED(303, mod_call)
#endif
SYSCALL_HANDLED(309, getcpu)
SYSCALL_HANDLED(310, process_vm_readv)
SYSCALL_HANDLED(311, process_vm_writev)
SYSCALL_HANDLED(601, pmc_init)
SYSCALL_HANDLED(602, pmc_start)
SYSCALL_HANDLED(603, pmc_stop)

View File

@ -13,7 +13,7 @@
* 2013/?? - bgerofi + shimosawa: handle rsp correctly for nested interrupts
*/
#define X86_CPU_LOCAL_OFFSET_TSS 128
#define X86_CPU_LOCAL_OFFSET_TSS 176
#define X86_TSS_OFFSET_SP0 4
#define X86_CPU_LOCAL_OFFSET_SP0 \
(X86_CPU_LOCAL_OFFSET_TSS + X86_TSS_OFFSET_SP0)
@ -24,39 +24,56 @@
#define USER_CS (48 + 3)
#define USER_DS (56 + 3)
#define PUSH_ALL_REGS \
pushq %rbp; \
pushq %rax; \
pushq %rbx; \
pushq %rcx; \
pushq %rdx; \
pushq %rsi; \
pushq %rdi; \
pushq %r8; \
pushq %r9; \
pushq %r10; \
pushq %r11; \
pushq %r12; \
pushq %r13; \
pushq %r14; \
pushq %r15;
#define POP_ALL_REGS \
popq %r15; \
popq %r14; \
popq %r13; \
popq %r12; \
popq %r11; \
popq %r10; \
popq %r9; \
popq %r8; \
popq %rdi; \
popq %rsi; \
popq %rdx; \
popq %rcx; \
popq %rbx; \
popq %rax; \
popq %rbp
/* struct x86_user_context */
#define X86_SREGS_BASE (0)
#define X86_SREGS_SIZE 48
#define X86_FLAGS_BASE (X86_SREGS_BASE + X86_SREGS_SIZE)
#define X86_FLAGS_SIZE 8
#define X86_REGS_BASE (X86_FLAGS_BASE + X86_FLAGS_SIZE)
#define RAX_OFFSET (X86_REGS_BASE + 80)
#define ERROR_OFFSET (X86_REGS_BASE + 120)
#define RSP_OFFSET (X86_REGS_BASE + 152)
#define PUSH_ALL_REGS \
pushq %rdi; \
pushq %rsi; \
pushq %rdx; \
pushq %rcx; \
pushq %rax; \
pushq %r8; \
pushq %r9; \
pushq %r10; \
pushq %r11; \
pushq %rbx; \
pushq %rbp; \
pushq %r12; \
pushq %r13; \
pushq %r14; \
pushq %r15; \
pushq $1; /* is_gpr_valid is set, and others are cleared */ \
subq $X86_FLAGS_BASE,%rsp /* for x86_sregs, etc. */
#define POP_ALL_REGS \
movq $0,X86_FLAGS_BASE(%rsp); /* clear all flags */ \
addq $X86_REGS_BASE,%rsp; /* discard x86_sregs, flags, etc. */ \
popq %r15; \
popq %r14; \
popq %r13; \
popq %r12; \
popq %rbp; \
popq %rbx; \
popq %r11; \
popq %r10; \
popq %r9; \
popq %r8; \
popq %rax; \
popq %rcx; \
popq %rdx; \
popq %rsi; \
popq %rdi
.data
.globl generic_common_handlers
generic_common_handlers:
@ -75,7 +92,7 @@ vector=vector+1
common_interrupt:
PUSH_ALL_REGS
movq 120(%rsp), %rdi
movq ERROR_OFFSET(%rsp), %rdi
movq %rsp, %rsi
call handle_interrupt /* Enter C code */
POP_ALL_REGS
@ -91,7 +108,7 @@ page_fault:
cld
PUSH_ALL_REGS
movq %cr2, %rdi
movq 120(%rsp),%rsi
movq ERROR_OFFSET(%rsp),%rsi
movq %rsp,%rdx
movq __page_fault_handler_address(%rip), %rax
andq %rax, %rax
@ -113,10 +130,53 @@ general_protection_exception:
addq $8, %rsp
iretq
.globl nmi
nmi:
#define PANICED 232
#define PANIC_REGS 240
movq %rax,%gs:PANIC_REGS+0x00
movq %rbx,%gs:PANIC_REGS+0x08
movq %rcx,%gs:PANIC_REGS+0x10
movq %rdx,%gs:PANIC_REGS+0x18
movq %rsi,%gs:PANIC_REGS+0x20
movq %rdi,%gs:PANIC_REGS+0x28
movq %rbp,%gs:PANIC_REGS+0x30
movq 0x18(%rsp),%rax /* rsp */
movq %rax,%gs:PANIC_REGS+0x38
movq %r8, %gs:PANIC_REGS+0x40
movq %r9, %gs:PANIC_REGS+0x48
movq %r10,%gs:PANIC_REGS+0x50
movq %r11,%gs:PANIC_REGS+0x58
movq %r12,%gs:PANIC_REGS+0x60
movq %r13,%gs:PANIC_REGS+0x68
movq %r14,%gs:PANIC_REGS+0x70
movq %r15,%gs:PANIC_REGS+0x78
movq 0x00(%rsp),%rax /* rip */
movq %rax,%gs:PANIC_REGS+0x80
movq 0x10(%rsp),%rax /* rflags */
movl %eax,%gs:PANIC_REGS+0x88
movq 0x08(%rsp),%rax /* cs */
movl %eax,%gs:PANIC_REGS+0x8C
movq 0x20(%rsp),%rax /* ss */
movl %eax,%gs:PANIC_REGS+0x90
xorq %rax,%rax
movw %ds,%ax
movl %eax,%gs:PANIC_REGS+0x94
movw %es,%ax
movl %eax,%gs:PANIC_REGS+0x98
movw %fs,%ax
movl %eax,%gs:PANIC_REGS+0x9C
movw %gs,%ax
movl %eax,%gs:PANIC_REGS+0xA0
movq $1,%gs:PANICED
1:
hlt
jmp 1b
.globl x86_syscall
x86_syscall:
cld
movq %rsp, %gs:24
movq %rsp, %gs:X86_CPU_LOCAL_OFFSET_USTACK
movq %gs:(X86_CPU_LOCAL_OFFSET_SP0), %rsp
pushq $(USER_DS)
@ -124,21 +184,19 @@ x86_syscall:
pushq %r11
pushq $(USER_CS)
pushq %rcx
pushq $0
movq %gs:24, %rcx
movq %rcx, 32(%rsp)
pushq %rax /* error code (= system call number) */
PUSH_ALL_REGS
movq 104(%rsp), %rdi
movq %gs:X86_CPU_LOCAL_OFFSET_USTACK, %rcx
movq %rcx, RSP_OFFSET(%rsp)
movq RAX_OFFSET(%rsp), %rdi
movw %ss, %ax
movw %ax, %ds
movq %rsp, %rsi
callq *__x86_syscall_handler(%rip)
1:
movq %rax, 104(%rsp)
movq %rax, RAX_OFFSET(%rsp)
POP_ALL_REGS
#ifdef USE_SYSRET
movq 8(%rsp), %rcx
movq 24(%rsp), %r11
movq 32(%rsp), %rsp
sysretq
#else
@ -147,7 +205,35 @@ x86_syscall:
#endif
.globl enter_user_mode
enter_user_mode:
enter_user_mode:
callq release_runq_lock
movq $0, %rdi
movq %rsp, %rsi
call check_signal
movq $0, %rdi
call set_cputime
POP_ALL_REGS
addq $8, %rsp
iretq
.globl debug_exception
debug_exception:
cld
pushq $0 /* error */
PUSH_ALL_REGS
movq %rsp, %rdi
call debug_handler
POP_ALL_REGS
addq $8, %rsp
iretq
.globl int3_exception
int3_exception:
cld
pushq $0 /* error */
PUSH_ALL_REGS
movq %rsp, %rdi
call int3_handler
POP_ALL_REGS
addq $8, %rsp
iretq

View File

@ -6,6 +6,8 @@
* resides in memory.
* \author Taku Shimosawa <shimosawa@is.s.u-tokyo.ac.jp> \par
* Copyright (C) 2011 - 2012 Taku Shimosawa
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com> \par
* Copyright (C) 2015 RIKEN AICS
*/
/*
* HISTORY
@ -19,26 +21,37 @@
#include <registers.h>
#include <string.h>
#define LOCALS_SPAN (4 * PAGE_SIZE)
struct x86_cpu_local_variables *locals;
size_t x86_cpu_local_variables_span = LOCALS_SPAN; /* for debugger */
void init_processors_local(int max_id)
{
size_t size;
size = LOCALS_SPAN * max_id;
/* Is contiguous allocating adequate?? */
locals = ihk_mc_alloc_pages(max_id, IHK_MC_AP_CRITICAL);
memset(locals, 0, PAGE_SIZE * max_id);
locals = ihk_mc_alloc_pages(size/PAGE_SIZE, IHK_MC_AP_CRITICAL);
memset(locals, 0, size);
kprintf("locals = %p\n", locals);
}
/*@
@ requires \valid(id);
@ ensures \result == locals + (LOCALS_SPAN * id);
@ assigns \nothing;
@*/
struct x86_cpu_local_variables *get_x86_cpu_local_variable(int id)
{
return (struct x86_cpu_local_variables *)
((char *)locals + (id << PAGE_SHIFT));
((char *)locals + (LOCALS_SPAN * id));
}
static void *get_x86_cpu_local_kstack(int id)
{
return ((char *)locals + ((id + 1) << PAGE_SHIFT));
return ((char *)locals + (LOCALS_SPAN * (id + 1)));
}
struct x86_cpu_local_variables *get_x86_this_cpu_local(void)
@ -80,7 +93,20 @@ void assign_processor_id(void)
v->processor_id = id;
}
void init_boot_processor_local(void)
{
static struct x86_cpu_local_variables avar;
memset(&avar, -1, sizeof(avar));
set_gs_base(&avar);
return;
}
/** IHK **/
/*@
@ ensures \result == %gs;
@ assigns \nothing;
*/
int ihk_mc_get_processor_id(void)
{
int id;
@ -90,6 +116,10 @@ int ihk_mc_get_processor_id(void)
return id;
}
/*@
@ ensures \result == (locals + (LOCALS_SPAN * %gs))->apic_id;
@ assigns \nothing;
*/
int ihk_mc_get_hardware_processor_id(void)
{
struct x86_cpu_local_variables *v = get_x86_this_cpu_local();

File diff suppressed because it is too large Load Diff

View File

@ -14,17 +14,28 @@
#include <registers.h>
extern unsigned int *x86_march_perfmap;
extern int running_on_kvm(void);
#define X86_CR4_PCE 0x00000100
void x86_init_perfctr(void)
{
unsigned long reg;
unsigned long value = 0;
/* Do not do it on KVM */
if (running_on_kvm()) return;
/* Allow PMC to be read from user space */
asm volatile("movq %%cr4, %0" : "=r"(reg));
reg |= X86_CR4_PCE;
asm volatile("movq %0, %%cr4" : : "r"(reg));
/* Enable PMC Control */
value = rdmsr(MSR_PERF_GLOBAL_CTRL);
value |= X86_IA32_PERF_COUNTERS_MASK;
value |= X86_IA32_FIXED_PERF_COUNTERS_MASK;
wrmsr(MSR_PERF_GLOBAL_CTRL, value);
}
static int set_perfctr_x86_direct(int counter, int mode, unsigned int value)
@ -33,20 +44,51 @@ static int set_perfctr_x86_direct(int counter, int mode, unsigned int value)
return -EINVAL;
}
if (mode & PERFCTR_USER_MODE) {
// clear mode flags
value &= ~(3 << 16);
// set mode flags
if(mode & PERFCTR_USER_MODE) {
value |= 1 << 16;
}
if (mode & PERFCTR_KERNEL_MODE) {
}
if(mode & PERFCTR_KERNEL_MODE) {
value |= 1 << 17;
}
}
// wrmsr(MSR_PERF_GLOBAL_CTRL, 0);
value |= (1 << 22) | (1 << 18); /* EN */
value |= (1 << 20); /* Enable overflow interrupt */
wrmsr(MSR_IA32_PERFEVTSEL0 + counter, value);
kprintf("wrmsr: %d <= %x\n", MSR_PERF_GLOBAL_CTRL, 0);
//kprintf("wrmsr: %d <= %x\n", MSR_PERF_GLOBAL_CTRL, 0);
kprintf("wrmsr: %d <= %x\n", MSR_IA32_PERFEVTSEL0 + counter, value);
return 0;
}
static int set_pmc_x86_direct(int counter, unsigned long val)
{
unsigned long cnt_bit = 0;
if (counter < 0) {
return -EINVAL;
}
cnt_bit = 1UL << counter;
if ( cnt_bit & X86_IA32_PERF_COUNTERS_MASK ) {
// set generic pmc
wrmsr(MSR_IA32_PMC0 + counter, val);
}
else if ( cnt_bit & X86_IA32_FIXED_PERF_COUNTERS_MASK ) {
// set fixed pmc
wrmsr(MSR_IA32_FIXED_CTR0 + counter - X86_IA32_BASE_FIXED_PERF_COUNTERS, val);
}
else {
return -EINVAL;
}
return 0;
}
@ -57,6 +99,45 @@ static int set_perfctr_x86(int counter, int event, int mask, int inv, int count,
CVAL2(event, mask, inv, count));
}
static int set_fixed_counter(int counter, int mode)
{
unsigned long value = 0;
unsigned int ctr_mask = 0x7;
int counter_idx = counter - X86_IA32_BASE_FIXED_PERF_COUNTERS ;
unsigned int set_val = 0;
if (counter_idx < 0 || counter_idx >= X86_IA32_NUM_FIXED_PERF_COUNTERS) {
return -EINVAL;
}
// clear specified fixed counter info
value = rdmsr(MSR_PERF_FIXED_CTRL);
ctr_mask <<= counter_idx * 4;
value &= ~ctr_mask;
if (mode & PERFCTR_USER_MODE) {
set_val |= 1 << 1;
}
if (mode & PERFCTR_KERNEL_MODE) {
set_val |= 1;
}
set_val <<= counter_idx * 4;
value |= set_val;
wrmsr(MSR_PERF_FIXED_CTRL, value);
return 0;
}
int ihk_mc_perfctr_init_raw(int counter, unsigned int code, int mode)
{
if (counter < 0 || counter >= X86_IA32_NUM_PERF_COUNTERS) {
return -EINVAL;
}
return set_perfctr_x86_direct(counter, mode, code);
}
int ihk_mc_perfctr_init(int counter, enum ihk_perfctr_type type, int mode)
{
if (counter < 0 || counter >= X86_IA32_NUM_PERF_COUNTERS) {
@ -78,14 +159,15 @@ extern void x86_march_perfctr_start(unsigned long counter_mask);
int ihk_mc_perfctr_start(unsigned long counter_mask)
{
unsigned int value = 0;
unsigned long value = 0;
unsigned long mask = X86_IA32_PERF_COUNTERS_MASK | X86_IA32_FIXED_PERF_COUNTERS_MASK;
#ifdef HAVE_MARCH_PERFCTR_START
x86_march_perfctr_start(counter_mask);
#endif
counter_mask &= ((1 << X86_IA32_NUM_PERF_COUNTERS) - 1);
counter_mask &= mask;
value = rdmsr(MSR_PERF_GLOBAL_CTRL);
value |= counter_mask;
value |= counter_mask;
wrmsr(MSR_PERF_GLOBAL_CTRL, value);
return 0;
@ -93,9 +175,10 @@ int ihk_mc_perfctr_start(unsigned long counter_mask)
int ihk_mc_perfctr_stop(unsigned long counter_mask)
{
unsigned int value;
unsigned long value;
unsigned long mask = X86_IA32_PERF_COUNTERS_MASK | X86_IA32_FIXED_PERF_COUNTERS_MASK;
counter_mask &= ((1 << X86_IA32_NUM_PERF_COUNTERS) - 1);
counter_mask &= mask;
value = rdmsr(MSR_PERF_GLOBAL_CTRL);
value &= ~counter_mask;
wrmsr(MSR_PERF_GLOBAL_CTRL, value);
@ -103,17 +186,48 @@ int ihk_mc_perfctr_stop(unsigned long counter_mask)
return 0;
}
int ihk_mc_perfctr_reset(int counter)
// init for fixed counter
int ihk_mc_perfctr_fixed_init(int counter, int mode)
{
if (counter < 0 || counter >= X86_IA32_NUM_PERF_COUNTERS) {
unsigned long value = 0;
unsigned int ctr_mask = 0x7;
int counter_idx = counter - X86_IA32_BASE_FIXED_PERF_COUNTERS ;
unsigned int set_val = 0;
if (counter_idx < 0 || counter_idx >= X86_IA32_NUM_FIXED_PERF_COUNTERS) {
return -EINVAL;
}
wrmsr(MSR_IA32_PMC0 + counter, 0);
// clear specified fixed counter info
value = rdmsr(MSR_PERF_FIXED_CTRL);
ctr_mask <<= counter_idx * 4;
value &= ~ctr_mask;
if (mode & PERFCTR_USER_MODE) {
set_val |= 1 << 1;
}
if (mode & PERFCTR_KERNEL_MODE) {
set_val |= 1;
}
set_val <<= counter_idx * 4;
value |= set_val;
wrmsr(MSR_PERF_FIXED_CTRL, value);
return 0;
}
int ihk_mc_perfctr_reset(int counter)
{
return set_pmc_x86_direct(counter, 0);
}
int ihk_mc_perfctr_set(int counter, unsigned long val)
{
return set_pmc_x86_direct(counter, val);
}
int ihk_mc_perfctr_read_mask(unsigned long counter_mask, unsigned long *value)
{
int i, j;
@ -129,10 +243,77 @@ int ihk_mc_perfctr_read_mask(unsigned long counter_mask, unsigned long *value)
unsigned long ihk_mc_perfctr_read(int counter)
{
if (counter < 0 || counter >= X86_IA32_NUM_PERF_COUNTERS) {
unsigned long retval = 0;
unsigned long cnt_bit = 0;
if (counter < 0) {
return -EINVAL;
}
return rdpmc(counter);
cnt_bit = 1UL << counter;
if ( cnt_bit & X86_IA32_PERF_COUNTERS_MASK ) {
// read generic pmc
retval = rdpmc(counter);
}
else if ( cnt_bit & X86_IA32_FIXED_PERF_COUNTERS_MASK ) {
// read fixed pmc
retval = rdpmc((1 << 30) + (counter - X86_IA32_BASE_FIXED_PERF_COUNTERS));
}
else {
retval = -EINVAL;
}
return retval;
}
// read by rdmsr
unsigned long ihk_mc_perfctr_read_msr(int counter)
{
unsigned int idx = 0;
unsigned long retval = 0;
unsigned long cnt_bit = 0;
if (counter < 0) {
return -EINVAL;
}
cnt_bit = 1UL << counter;
if ( cnt_bit & X86_IA32_PERF_COUNTERS_MASK ) {
// read generic pmc
idx = MSR_IA32_PMC0 + counter;
retval = (unsigned long) rdmsr(idx);
}
else if ( cnt_bit & X86_IA32_FIXED_PERF_COUNTERS_MASK ) {
// read fixed pmc
idx = MSR_IA32_FIXED_CTR0 + counter;
retval = (unsigned long) rdmsr(idx);
}
else {
retval = -EINVAL;
}
return retval;
}
int ihk_mc_perfctr_alloc_counter(unsigned long pmc_status)
{
int i = 0;
int ret = -1;
// find avail generic counter
for(i = 0; i < X86_IA32_NUM_PERF_COUNTERS; i++) {
if(!(pmc_status & (1 << i))) {
ret = i;
pmc_status |= (1 << i);
break;
}
}
if(ret < 0){
return ret;
}
return ret;
}

File diff suppressed because it is too large Load Diff

View File

@ -5,6 +5,8 @@
* implements x86's vsyscall
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com> \par
* Copyright (C) 2014 Hitachi, Ltd.
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com> \par
* Copyright (C) 2015 RIKEN AICS
*/
/*
* HISTORY:
@ -16,20 +18,93 @@
*/
#include <syscall.h>
#include <ihk/atomic.h>
#include <arch/cpu.h>
extern int vsyscall_gettimeofday(void *tv, void *tz)
extern int vsyscall_gettimeofday(struct timeval *tv, void *tz)
__attribute__ ((section (".vsyscall.gettimeofday")));
int vsyscall_gettimeofday(void *tv, void *tz)
struct tod_data_s tod_data
__attribute__ ((section(".vsyscall.gettimeofday.data"))) = {
.do_local = 0,
.version = IHK_ATOMIC64_INIT(0),
};
static inline void cpu_pause_for_vsyscall(void)
{
asm volatile ("pause" ::: "memory");
return;
} /* cpu_pause_for_vsyscall() */
static inline void calculate_time_from_tsc(struct timespec *ts)
{
long ver;
unsigned long current_tsc;
__time_t sec_delta;
long ns_delta;
for (;;) {
while ((ver = ihk_atomic64_read(&tod_data.version)) & 1) {
/* settimeofday() is in progress */
cpu_pause_for_vsyscall();
}
rmb();
*ts = tod_data.origin;
rmb();
if (ver == ihk_atomic64_read(&tod_data.version)) {
break;
}
/* settimeofday() has intervened */
cpu_pause_for_vsyscall();
}
current_tsc = rdtsc();
sec_delta = current_tsc / tod_data.clocks_per_sec;
ns_delta = NS_PER_SEC * (current_tsc % tod_data.clocks_per_sec)
/ tod_data.clocks_per_sec;
/* calc. of ns_delta overflows if clocks_per_sec exceeds 18.44 GHz */
ts->tv_sec += sec_delta;
ts->tv_nsec += ns_delta;
if (ts->tv_nsec >= NS_PER_SEC) {
ts->tv_nsec -= NS_PER_SEC;
++ts->tv_sec;
}
return;
} /* calculate_time_from_tsc() */
int vsyscall_gettimeofday(struct timeval *tv, void *tz)
{
int error;
struct timespec ats;
if (!tv && !tz) {
/* nothing to do */
return 0;
}
/* Do it locally if supported */
if (!tz && tod_data.do_local) {
calculate_time_from_tsc(&ats);
tv->tv_sec = ats.tv_sec;
tv->tv_usec = ats.tv_nsec / 1000;
return 0;
}
/* Otherwise syscall */
asm ("syscall" : "=a" (error)
: "a" (__NR_gettimeofday), "D" (tv), "S" (tz)
: "%rcx", "%r11", "memory");
if (error) {
*(int *)0 = 0; /* i.e. raise(SIGSEGV) */
}
return error;
}
} /* vsyscall_gettimeofday() */
extern long vsyscall_time(void *tp)
__attribute__ ((section (".vsyscall.time")));
@ -58,3 +133,17 @@ long vsyscall_time(void *tp)
return t;
}
extern int vsyscall_getcpu(unsigned *cpup, unsigned *nodep, void *tcachep)
__attribute__ ((section (".vsyscall.getcpu")));
int vsyscall_getcpu(unsigned *cpup, unsigned *nodep, void *tcachep)
{
int error;
asm ("syscall" : "=a" (error)
: "a" (__NR_getcpu), "D" (cpup), "S" (nodep), "d" (tcachep)
: "%rcx", "%r11", "memory");
return error;
}

View File

@ -0,0 +1,46 @@
#!/bin/bash -x
# \file arch/x86/tools/mcreboot-builtin-x86.sh.in
# License details are found in the file LICENSE.
# \brief
# mckernel boot script
# \author Masamichi Takagi <masamichi.takagi@riken.jp> \par
# Copyright (C) 2014 RIKEN AICS
# HISTORY:
#
prefix="@prefix@"
BINDIR="@BINDIR@"
SBINDIR="@SBINDIR@"
KMODDIR="@KMODDIR@"
KERNDIR="@KERNDIR@"
kill -9 `pidof mcexec`
if lsmod | grep mcctrl > /dev/null 2>&1; then
rmmod mcctrl || exit 1
fi
if lsmod | grep dcfa > /dev/null 2>&1; then
rmmod dcfa || exit 1
fi
if lsmod | grep ihk_builtin > /dev/null 2>&1; then
rmmod ihk_builtin || exit 1
fi
if lsmod | grep ihk > /dev/null 2>&1; then
rmmod ihk || exit 1
fi
insmod "$KMODDIR/ihk.ko" &&
insmod "$KMODDIR/ihk_builtin.ko" &&
"$SBINDIR/ihkconfig" 0 create &&
NCORE=`dmesg | grep -E 'SHIMOS: CPU Status:'|awk '{split($0,a," "); for (i = 1; i <= length(a); i++) { if(a[i] ~ /2/) {count++}} print count;}'`
MEM=`free -g | grep -E 'Mem:' | awk '{print int($2/4)}'`
"$SBINDIR/ihkosctl" 0 alloc "$NCORE" "$MEM"g &&
"$SBINDIR/ihkosctl" 0 load "$KERNDIR/mckernel.img" &&
"$SBINDIR/ihkosctl" 0 kargs hidos osnum=0 &&
"$SBINDIR/ihkosctl" 0 boot &&
sleep 1 &&
"$SBINDIR/ihkosctl" 0 kmsg &&
insmod "$KMODDIR/mcctrl.ko" &&
sleep 1 &&
"$SBINDIR/ihkosctl" 0 kmsg &&
exit 0

View File

@ -0,0 +1,199 @@
#!/bin/bash
# IHK SMP-x86 example boot script.
# author: Balazs Gerofi <bgerofi@riken.jp>
# Copyright (C) 2014 RIKEN AICS
#
# This is an example script for loading IHK, configuring a partition and
# booting McKernel on it.
# The script reserves half of the CPU cores and 512MB of RAM from NUMA node 0
# when IHK is loaded for the first time, otherwise it destroys the current
# McKernel instance and reboots it using the same set of resources as it used
# previously.
# Note that the script does not output anything unless an error occurs.
prefix="@prefix@"
BINDIR="@BINDIR@"
SBINDIR="@SBINDIR@"
KMODDIR="@KMODDIR@"
KERNDIR="@KERNDIR@"
ENABLE_MCOVERLAYFS="@ENABLE_MCOVERLAYFS@"
INTERVAL=1
LOGMODE=0
while getopts :i:k: OPT
do
case ${OPT} in
i) INTERVAL=${OPTARG}
expr "${INTERVAL}" + 1 > /dev/null 2>&1
if [ $? -ge 2 ]
then
echo "invalid -i value"
exit 1
fi
if [ ${INTERVAL} -le 0 ]
then
echo "invalid -i value"
exit 1
fi
;;
k) LOGMODE=${OPTARG}
expr "${LOGMODE}" + 1 > /dev/null 2>&1
if [ $? -ge 2 ]
then
echo "invalid -k value"
exit 1
fi
if [ ${LOGMODE} -lt 0 -o ${LOGMODE} -gt 2 ]
then
echo "invalid -k value"
exit 1
fi
;;
*) echo "invalid option -${OPT}"
exit 1
esac
done
mem="512M@0"
cpus=""
ihk_ikc_irq_core=0
release=`uname -r`
major=`echo ${release} | sed -e 's/^\([0-9]*\).*/\1/'`
minor=`echo ${release} | sed -e 's/^[0-9]*.\([0-9]*\).*/\1/'`
patch=`echo ${release} | sed -e 's/^[0-9]*.[0-9]*.\([0-9]*\).*/\1/'`
linux_version_code=`expr \( ${major} \* 65536 \) + \( ${minor} \* 256 \) + ${patch}`
rhel_release=`echo ${release} | sed -e 's/^[0-9]*.[0-9]*.[0-9]*-\([0-9]*\).*/\1/'`
if [ "${release}" == "${rhel_release}" ]; then rhel_release=""; fi
if [ "${ENABLE_MCOVERLAYFS}" == "yes" ]; then
enable_mcoverlay=`if ( [ ${linux_version_code} -ge 262144 ] && [ ${linux_version_code} -lt 262400 ] ); then echo "yes"; else echo "no"; fi`
else
enable_mcoverlay=no
fi
if [ "$cpus" == "" ]; then
# Get the number of CPUs on NUMA node 0
nr_cpus=`lscpu --parse | awk -F"," '{if ($4 == 0) print $4}' | wc -l`
# Use the second half of the cores
let nr_cpus="$nr_cpus / 2"
cpus=`lscpu --parse | awk -F"," '{if ($4 == 0) print $1}' | tail -n $nr_cpus | xargs echo -n | sed 's/ /,/g'`
if [ "$cpus" == "" ]; then echo "error: no available CPUs on NUMA node 0?"; exit; fi
fi
# Remove delegator if loaded
if [ "`lsmod | grep mcctrl`" != "" ]; then
if ! rmmod mcctrl; then echo "error: removing mcctrl"; exit; fi
fi
# Remove mcoverlay if loaded
if [ "$enable_mcoverlay" == "yes" ]; then
if [ "`lsmod | grep mcoverlay`" != "" ]; then
if [ "`cat /proc/mounts | grep /tmp/mcos/mcos0_sys`" != "" ]; then umount -l /tmp/mcos/mcos0_sys; fi
if [ "`cat /proc/mounts | grep /tmp/mcos/mcos0_proc`" != "" ]; then umount -l /tmp/mcos/mcos0_proc; fi
if [ "`cat /proc/mounts | grep /tmp/mcos/linux_proc`" != "" ]; then umount -l /tmp/mcos/linux_proc; fi
if [ "`cat /proc/mounts | grep /tmp/mcos`" != "" ]; then umount -l /tmp/mcos; fi
if [ -e /tmp/mcos ]; then rm -rf /tmp/mcos; fi
if ! rmmod mcoverlay; then echo "error: removing mcoverlay"; exit; fi
fi
fi
# Load IHK if not loaded
if [ "`lsmod | grep ihk`" == "" ]; then
if ! insmod ${KMODDIR}/ihk.ko; then echo "error: loading ihk"; exit; fi;
fi
# Load IHK-SMP if not loaded and reserve CPUs and memory
if [ "`lsmod | grep ihk_smp_x86`" == "" ]; then
ihk_irq=""
for i in `seq 64 255`; do
if [ ! -d /proc/irq/$i ] && [ "`cat /proc/interrupts | grep ":" | awk '{print $1}' | grep -o '[0-9]*' | grep -e '^$i$'`" == "" ]; then
ihk_irq=$i
break
fi
done
if [ "$ihk_irq" == "" ]; then echo "error: no IRQ available"; exit; fi
if ! insmod ${KMODDIR}/ihk-smp-x86.ko ihk_start_irq=$ihk_irq ihk_ikc_irq_core=$ihk_ikc_irq_core; then echo "error: loading ihk-smp-x86"; exit; fi;
if ! ${SBINDIR}/ihkconfig 0 reserve cpu ${cpus}; then echo "error: reserving CPUs"; exit; fi
if ! ${SBINDIR}/ihkconfig 0 reserve mem ${mem}; then echo "error: reserving memory"; exit; fi
# If loaded, but no resources allocated, get CPUs and memory
else
if ! ${SBINDIR}/ihkconfig 0 query cpu > /dev/null; then echo "error: querying cpus"; exit; fi
cpus_allocated=`${SBINDIR}/ihkosctl 0 query cpu`
if [ "$cpus_allocated" == "" ]; then
if ! ${SBINDIR}/ihkconfig 0 reserve cpu ${cpus}; then echo "error: reserving CPUs"; exit; fi
fi
if ! ${SBINDIR}/ihkosctl 0 query mem > /dev/null; then echo "error: querying memory"; exit; fi
mem_allocated=`${SBINDIR}/ihkosctl 0 query mem`
if [ "$mem_allocated" == "" ]; then
if ! ${SBINDIR}/ihkconfig 0 reserve mem ${mem}; then echo "error: reserving memory"; exit; fi
fi
fi
# Check for existing OS instance and destroy
if [ -c /dev/mcos0 ]; then
# Query CPU cores and memory of OS instance so that the same values are used as previously
if ! ${SBINDIR}/ihkosctl 0 query cpu > /dev/null; then echo "error: querying cpus"; exit; fi
cpus=`${SBINDIR}/ihkosctl 0 query cpu`
if ! ${SBINDIR}/ihkosctl 0 query mem > /dev/null; then echo "error: querying memory"; exit; fi
mem=`${SBINDIR}/ihkosctl 0 query mem`
if ! ${SBINDIR}/ihkconfig 0 destroy 0; then echo "warning: destroy failed"; fi
else
# Otherwise query IHK-SMP for resources
if ! ${SBINDIR}/ihkconfig 0 query cpu > /dev/null; then echo "error: querying cpus"; exit; fi
cpus=`${SBINDIR}/ihkconfig 0 query cpu`
if ! ${SBINDIR}/ihkconfig 0 query mem > /dev/null; then echo "error: querying memory"; exit; fi
mem=`${SBINDIR}/ihkconfig 0 query mem`
fi
if ! ${SBINDIR}/ihkconfig 0 create; then echo "error: create"; exit; fi
if ! ${SBINDIR}/ihkosctl 0 assign cpu ${cpus}; then echo "error: assign CPUs"; exit; fi
if ! ${SBINDIR}/ihkosctl 0 assign mem ${mem}; then echo "error: assign memory"; exit; fi
if ! ${SBINDIR}/ihkosctl 0 load ${KERNDIR}/mckernel.img; then echo "error: loading kernel image"; exit; fi
if ! ${SBINDIR}/ihkosctl 0 kargs "hidos ksyslogd=${LOGMODE}"; then echo "error: setting kernel arguments"; exit; fi
if ! ${SBINDIR}/ihkosctl 0 boot; then echo "error: booting"; exit; fi
if ! insmod ${KMODDIR}/mcctrl.ko; then echo "error: inserting mcctrl.ko"; exit; fi
if ! chown `logname` /dev/mcd* /dev/mcos*; then echo "error: chowning device files"; exit; fi
if [ "$enable_mcoverlay" == "yes" ]; then
if [ ! -e /tmp/mcos ]; then mkdir -p /tmp/mcos; fi
if ! mount -t tmpfs tmpfs /tmp/mcos; then echo "error: mount /tmp/mcos"; exit; fi
if [ ! -e /tmp/mcos/linux_proc ]; then mkdir -p /tmp/mcos/linux_proc; fi
if ! mount --bind /proc /tmp/mcos/linux_proc; then echo "error: mount /tmp/mcos/linux_proc"; exit; fi
if ! insmod ${KMODDIR}/mcoverlay.ko; then echo "error: inserting mcoverlay.ko"; exit; fi
while [ ! -e /proc/mcos0 ]
do
sleep 1
done
if [ ! -e /tmp/mcos/mcos0_proc ]; then mkdir -p /tmp/mcos/mcos0_proc; fi
if [ ! -e /tmp/mcos/mcos0_proc_upper ]; then mkdir -p /tmp/mcos/mcos0_proc_upper; fi
if [ ! -e /tmp/mcos/mcos0_proc_work ]; then mkdir -p /tmp/mcos/mcos0_proc_work; fi
if ! mount -t mcoverlay mcoverlay -o lowerdir=/proc/mcos0:/proc,upperdir=/tmp/mcos/mcos0_proc_upper,workdir=/tmp/mcos/mcos0_proc_work,nocopyupw,nofscheck /tmp/mcos/mcos0_proc; then echo "error: mount /tmp/mcos/mcos0_proc"; exit; fi
mount --make-rprivate /proc
while [ ! -e /sys/devices/virtual/mcos/mcos0/sys ]
do
sleep 1
done
if [ ! -e /tmp/mcos/mcos0_sys ]; then mkdir -p /tmp/mcos/mcos0_sys; fi
if [ ! -e /tmp/mcos/mcos0_sys_upper ]; then mkdir -p /tmp/mcos/mcos0_sys_upper; fi
if [ ! -e /tmp/mcos/mcos0_sys_work ]; then mkdir -p /tmp/mcos/mcos0_sys_work; fi
if ! mount -t mcoverlay mcoverlay -o lowerdir=/sys/devices/virtual/mcos/mcos0/sys:/sys,upperdir=/tmp/mcos/mcos0_sys_upper,workdir=/tmp/mcos/mcos0_sys_work,nocopyupw,nofscheck /tmp/mcos/mcos0_sys; then echo "error: mount /tmp/mcos/mcos0_sys"; exit; fi
mount --make-rprivate /sys
for cpuid in `find /sys/devices/system/cpu/* -maxdepth 0 -name "cpu[0123456789]*" -printf "%f "`; do
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/devices/system/cpu/$cpuid" ]; then
rm -rf /tmp/mcos/mcos0_sys/devices/system/cpu/$cpuid
fi
done
for cpuid in `find /sys/bus/cpu/devices/* -maxdepth 0 -name "cpu[0123456789]*" -printf "%f "`; do
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/bus/cpu/devices/$cpuid" ]; then
rm -rf /tmp/mcos/mcos0_sys/bus/cpu/devices/$cpuid
fi
done
fi
if [ ${LOGMODE} -ne 0 ]
then
SBINDIR=${SBINDIR} ${SBINDIR}/mcklogd -i ${INTERVAL}
fi

View File

@ -0,0 +1,16 @@
#!/bin/bash
# \file arch/x86/tools/mcshutdown-attached-mic.sh.in
# License details are found in the file LICENSE.
# \brief
# mckernel shutdown script
#
# \author McKernel Development Team
#
prefix="@prefix@"
BINDIR="@BINDIR@"
SBINDIR="@SBINDIR@"
KMODDIR="@KMODDIR@"
KERNDIR="@KERNDIR@"
"$SBINDIR/ihkosctl" 0 shutdown

View File

@ -0,0 +1,47 @@
#!/bin/bash
# IHK SMP-x86 example McKernel unload script.
# author: Balazs Gerofi <bgerofi@riken.jp>
# Copyright (C) 2015 RIKEN AICS
#
# This is an example script for destroying McKernel and releasing IHK resources
# Note that the script does no output anything unless an error occurs.
prefix="@prefix@"
BINDIR="@BINDIR@"
SBINDIR="@SBINDIR@"
KMODDIR="@KMODDIR@"
KERNDIR="@KERNDIR@"
mem=""
cpus=""
# No SMP module? Exit.
if [ "`lsmod | grep ihk_smp_x86`" == "" ]; then exit; fi
# Remove delegator if loaded
if [ "`lsmod | grep mcctrl`" != "" ]; then
if ! rmmod mcctrl; then echo "error: removing mcctrl"; exit; fi
fi
# Destroy all LWK instances
for i in /dev/mcos*; do
ind=`echo $i|cut -c10-`;
if ! ${SBINDIR}/ihkconfig 0 destroy $ind; then echo "error: destroying LWK instance $ind failed"; exit; fi
done
# Query IHK-SMP resources and release them
if ! ${SBINDIR}/ihkconfig 0 query cpu > /dev/null; then echo "error: querying cpus"; exit; fi
cpus=`${SBINDIR}/ihkconfig 0 query cpu`
if ! ${SBINDIR}/ihkconfig 0 release cpu $cpus > /dev/null; then echo "error: releasing CPUs"; exit; fi
if ! ${SBINDIR}/ihkconfig 0 query mem > /dev/null; then echo "error: querying memory"; exit; fi
mem=`${SBINDIR}/ihkconfig 0 query mem`
if ! ${SBINDIR}/ihkconfig 0 release mem $mem > /dev/null; then echo "error: releasing memory"; exit; fi
# Remove SMP module
if [ "`lsmod | grep ihk_smp_x86`" != "" ]; then
if ! rmmod ihk_smp_x86; then echo "error: removing ihk_smp_x86"; exit; fi
fi

5013
configure vendored

File diff suppressed because it is too large Load Diff

View File

@ -24,13 +24,30 @@ AC_ARG_WITH([kernelsrc],
AC_ARG_WITH([target],
AC_HELP_STRING(
[--with-target={attached-mic | builtin-mic | builtin-x86}],[target, default is attached-mic]),
[--with-target={attached-mic | builtin-mic | builtin-x86 | smp-x86}],[target, default is attached-mic]),
[WITH_TARGET=$withval],[WITH_TARGET=yes])
AC_ARG_WITH([system_map],
AS_HELP_STRING(
[--with-system_map=path],[Path to 'System.map file', default is /boot/System.map-uname_r]),
[WITH_SYSTEM_MAP=$withval],[WITH_SYSTEM_MAP=yes])
AC_ARG_ENABLE([dcfa],
[AS_HELP_STRING(
[--enable-dcfa],[Enable DCFA modules])],[],[enable_dcfa=no])
AC_ARG_ENABLE([memdump],
AC_HELP_STRING([--enable-memdump],
[enable dumping memory and analyzing a dump]),
[ENABLE_MEMDUMP=$enableval],
[ENABLE_MEMDUMP=default])
AC_ARG_ENABLE([mcoverlayfs],
AC_HELP_STRING([--enable-mcoverlayfs],
[enable mcoverlayfs implementation]),
[ENABLE_MCOVERLAYFS=$enableval],
[ENABLE_MCOVERLAYFS=yes])
case "X$WITH_KERNELSRC" in
Xyes | Xno | X)
WITH_KERNELSRC='/lib/modules/`uname -r`/build'
@ -49,9 +66,26 @@ fi
test "x$prefix" = xNONE && prefix="$ac_default_prefix"
case $WITH_TARGET in
attached-mic)
attached-mic|builtin-x86|smp-x86)
ARCH=`uname -m`
AC_PROG_CC
XCC=$CC
;;
builtin-mic)
ARCH=k1om
AC_CHECK_PROG(XCC,
[x86_64-$ARCH-linux-gcc],
[x86_64-$ARCH-linux-gcc],
[no])
CC=$XCC
;;
*)
AC_MSG_ERROR([target $WITH_TARGET is unknwon])
;;
esac
case $WITH_TARGET in
attached-mic)
if test "X$KERNDIR" = X; then
KERNDIR="$prefix/attached/kernel"
fi
@ -69,12 +103,6 @@ case $WITH_TARGET in
fi
;;
builtin-mic)
ARCH=k1om
AC_CHECK_PROG(XCC,
[x86_64-$ARCH-linux-gcc],
[x86_64-$ARCH-linux-gcc],
[no])
CC=$XCC
if test "X$KERNDIR" = X; then
KERNDIR="$prefix/attached/kernel"
fi
@ -92,9 +120,6 @@ case $WITH_TARGET in
fi
;;
builtin-x86)
ARCH=`uname -m`
AC_PROG_CC
XCC=$CC
if test "X$KERNDIR" = X; then
KERNDIR="$prefix/attached/kernel"
fi
@ -111,6 +136,23 @@ case $WITH_TARGET in
MANDIR="$prefix/attached/man"
fi
;;
smp-x86)
if test "X$KERNDIR" = X; then
KERNDIR="$prefix/smp-x86/kernel"
fi
if test "X$BINDIR" = X; then
BINDIR="$prefix/bin"
fi
if test "X$SBINDIR" = X; then
SBINDIR="$prefix/sbin"
fi
if test "X$KMODDIR" = X; then
KMODDIR="$prefix/kmod"
fi
if test "X$MANDIR" = X; then
MANDIR="$prefix/smp-x86/man"
fi
;;
*)
AC_MSG_ERROR([target $WITH_TARGET is unknwon])
;;
@ -119,6 +161,116 @@ esac
KDIR="$WITH_KERNELSRC"
TARGET="$WITH_TARGET"
MCCTRL_LINUX_SYMTAB=""
case "X$WITH_SYSTEM_MAP" in
Xyes | Xno | X)
MCCTRL_LINUX_SYMTAB=""
;;
*)
MCCTRL_LINUX_SYMTAB="$WITH_SYSTEM_MAP"
;;
esac
AC_MSG_CHECKING([[for System.map]])
if test -f "$MCCTRL_LINUX_SYMTAB"; then
MCCTRL_LINUX_SYMTAB="$MCCTRL_LINUX_SYMTAB"
elif test -f "/boot/System.map-`uname -r`"; then
MCCTRL_LINUX_SYMTAB="/boot/System.map-`uname -r`"
elif test -f "$KDIR/System.map"; then
MCCTRL_LINUX_SYMTAB="$KDIR/System.map"
fi
if test "$MCCTRL_LINUX_SYMTAB" == ""; then
AC_MSG_ERROR([could not find])
fi
if test -z "`eval cat $MCCTRL_LINUX_SYMTAB`"; then
AC_MSG_ERROR([could not read System.map file, no read permission?])
fi
AC_MSG_RESULT([$MCCTRL_LINUX_SYMTAB])
MCCTRL_LINUX_SYMTAB_CMD="cat $MCCTRL_LINUX_SYMTAB"
# MCCTRL_FIND_KSYM(SYMBOL)
# ------------------------------------------------------
# Search System.map for address of the given symbol and
# do one of three things in config.h:
# If not found, leave MCCTRL_KSYM_foo undefined
# If found to be exported, "#define MCCTRL_KSYM_foo 0"
# If found not to be exported, "#define MCCTRL_KSYM_foo 0x<value>"
AC_DEFUN([MCCTRL_FIND_KSYM],[
AC_MSG_CHECKING([[System.map for symbol $1]])
mcctrl_addr=`eval $MCCTRL_LINUX_SYMTAB_CMD | grep " $1\$" | cut -d\ -f1`
if test -z $mcctrl_addr; then
AC_MSG_RESULT([not found])
else
mcctrl_result=$mcctrl_addr
mcctrl_addr="0x$mcctrl_addr"
m4_ifval([$2],[],[
if `eval $MCCTRL_LINUX_SYMTAB_CMD | grep " __ksymtab_$1\$" >/dev/null`; then
mcctrl_result="exported"
mcctrl_addr="0"
fi
])
AC_MSG_RESULT([$mcctrl_result])
AC_DEFINE_UNQUOTED(MCCTRL_KSYM_[]$1,$mcctrl_addr,[Define to address of kernel symbol $1, or 0 if exported])
fi
])
MCCTRL_FIND_KSYM([sys_mount])
MCCTRL_FIND_KSYM([sys_unshare])
MCCTRL_FIND_KSYM([zap_page_range])
MCCTRL_FIND_KSYM([vdso_image_64])
MCCTRL_FIND_KSYM([vdso_start])
MCCTRL_FIND_KSYM([vdso_end])
MCCTRL_FIND_KSYM([vdso_pages])
MCCTRL_FIND_KSYM([__vvar_page])
MCCTRL_FIND_KSYM([hpet_address])
MCCTRL_FIND_KSYM([hv_clock])
MCCTRL_FIND_KSYM([sys_readlink])
case $ENABLE_MEMDUMP in
yes|no|auto)
;;
default)
if test "x$WITH_TARGET" = "xsmp-x86" ; then
ENABLE_MEMDUMP=auto
else
ENABLE_MEMDUMP=no
fi
;;
*)
AC_MSG_ERROR([unknown memdump argument: $ENABLE_MEMDUMP])
;;
esac
if test "x$ENABLE_MEMDUMP" != "xno" ; then
enableval=yes
AC_CHECK_LIB([bfd],[bfd_init],[],[enableval=no])
AC_CHECK_HEADER([bfd.h],[],[enableval=no])
if test "x$ENABLE_MEMDUMP" = "xyes" -a "x$enableval" = "xno" ; then
AC_MSG_ERROR([memdump feature needs bfd.h and libbfd a.k.a bunutils-devel])
fi
ENABLE_MEMDUMP=$enableval
fi
if test "x$ENABLE_MEMDUMP" = "xyes" ; then
AC_MSG_NOTICE([memdump feature is enabled])
AC_DEFINE([ENABLE_MEMDUMP],[1],[whether memdump feature is enabled])
uncomment_if_ENABLE_MEMDUMP=''
else
AC_MSG_NOTICE([memdump feature is disabled])
uncomment_if_ENABLE_MEMDUMP='#'
fi
if test "x$ENABLE_MCOVERLAYFS" = "xyes" ; then
AC_DEFINE([ENABLE_MCOVERLAYFS],[1],[whether mcoverlayfs is enabled])
AC_MSG_NOTICE([mcoverlayfs is enabled])
else
AC_MSG_NOTICE([mcoverlayfs is disabled])
fi
AC_SUBST(CC)
AC_SUBST(XCC)
AC_SUBST(ARCH)
@ -129,6 +281,7 @@ AC_SUBST(SBINDIR)
AC_SUBST(KMODDIR)
AC_SUBST(KERNDIR)
AC_SUBST(MANDIR)
AC_SUBST(ENABLE_MCOVERLAYFS)
AC_SUBST(IHK_VERSION)
AC_SUBST(MCKERNEL_VERSION)
@ -136,15 +289,23 @@ AC_SUBST(DCFA_VERSION)
AC_SUBST(IHK_RELEASE_DATE)
AC_SUBST(MCKERNEL_RELEASE_DATE)
AC_SUBST(DCFA_RESEASE_DATE)
AC_SUBST(uncomment_if_ENABLE_MEMDUMP)
AC_CONFIG_HEADERS([executer/config.h])
AC_CONFIG_FILES([
Makefile
executer/user/Makefile
executer/kernel/Makefile
executer/kernel/mcctrl/Makefile
executer/kernel/mcctrl/arch/x86_64/Makefile
executer/kernel/mcoverlayfs/Makefile
kernel/Makefile
kernel/Makefile.build
arch/x86/tools/mcreboot-attached-mic.sh
arch/x86/tools/mcshutdown-attached-mic.sh
arch/x86/tools/mcreboot-builtin-x86.sh
arch/x86/tools/mcreboot-smp-x86.sh
arch/x86/tools/mcstop+release-smp-x86.sh
arch/x86/tools/mcshutdown-builtin-x86.sh
arch/x86/tools/mcreboot.1:arch/x86/tools/mcreboot.1in
])

91
executer/config.h.in Normal file
View File

@ -0,0 +1,91 @@
/* executer/config.h.in. Generated from configure.ac by autoheader. */
/* whether mcoverlayfs is enabled */
#undef ENABLE_MCOVERLAYFS
/* whether memdump feature is enabled */
#undef ENABLE_MEMDUMP
/* Define to 1 if you have the <inttypes.h> header file. */
#undef HAVE_INTTYPES_H
/* Define to 1 if you have the `bfd' library (-lbfd). */
#undef HAVE_LIBBFD
/* Define to 1 if you have the <memory.h> header file. */
#undef HAVE_MEMORY_H
/* Define to 1 if you have the <stdint.h> header file. */
#undef HAVE_STDINT_H
/* Define to 1 if you have the <stdlib.h> header file. */
#undef HAVE_STDLIB_H
/* Define to 1 if you have the <strings.h> header file. */
#undef HAVE_STRINGS_H
/* Define to 1 if you have the <string.h> header file. */
#undef HAVE_STRING_H
/* Define to 1 if you have the <sys/stat.h> header file. */
#undef HAVE_SYS_STAT_H
/* Define to 1 if you have the <sys/types.h> header file. */
#undef HAVE_SYS_TYPES_H
/* Define to 1 if you have the <unistd.h> header file. */
#undef HAVE_UNISTD_H
/* Define to address of kernel symbol __vvar_page, or 0 if exported */
#undef MCCTRL_KSYM___vvar_page
/* Define to address of kernel symbol hpet_address, or 0 if exported */
#undef MCCTRL_KSYM_hpet_address
/* Define to address of kernel symbol hv_clock, or 0 if exported */
#undef MCCTRL_KSYM_hv_clock
/* Define to address of kernel symbol sys_mount, or 0 if exported */
#undef MCCTRL_KSYM_sys_mount
/* Define to address of kernel symbol sys_readlink, or 0 if exported */
#undef MCCTRL_KSYM_sys_readlink
/* Define to address of kernel symbol sys_unshare, or 0 if exported */
#undef MCCTRL_KSYM_sys_unshare
/* Define to address of kernel symbol vdso_end, or 0 if exported */
#undef MCCTRL_KSYM_vdso_end
/* Define to address of kernel symbol vdso_image_64, or 0 if exported */
#undef MCCTRL_KSYM_vdso_image_64
/* Define to address of kernel symbol vdso_pages, or 0 if exported */
#undef MCCTRL_KSYM_vdso_pages
/* Define to address of kernel symbol vdso_start, or 0 if exported */
#undef MCCTRL_KSYM_vdso_start
/* Define to address of kernel symbol zap_page_range, or 0 if exported */
#undef MCCTRL_KSYM_zap_page_range
/* Define to the address where bug reports for this package should be sent. */
#undef PACKAGE_BUGREPORT
/* Define to the full name of this package. */
#undef PACKAGE_NAME
/* Define to the full name and version of this package. */
#undef PACKAGE_STRING
/* Define to the one symbol short name of this package. */
#undef PACKAGE_TARNAME
/* Define to the home page for this package. */
#undef PACKAGE_URL
/* Define to the version of this package. */
#undef PACKAGE_VERSION
/* Define to 1 if you have the ANSI C header files. */
#undef STDC_HEADERS

View File

@ -38,6 +38,9 @@
#define MCEXEC_UP_SEND_SIGNAL 0x30a02906
#define MCEXEC_UP_GET_CPU 0x30a02907
#define MCEXEC_UP_STRNCPY_FROM_USER 0x30a02908
#define MCEXEC_UP_NEW_PROCESS 0x30a02909
#define MCEXEC_UP_GET_CRED 0x30a0290a
#define MCEXEC_UP_GET_CREDV 0x30a0290b
#define MCEXEC_UP_PREPARE_DMA 0x30a02910
#define MCEXEC_UP_FREE_DMA 0x30a02911
@ -45,6 +48,11 @@
#define MCEXEC_UP_OPEN_EXEC 0x30a02912
#define MCEXEC_UP_CLOSE_EXEC 0x30a02913
#define MCEXEC_UP_SYS_MOUNT 0x30a02914
#define MCEXEC_UP_SYS_UNSHARE 0x30a02915
#define MCEXEC_UP_DEBUG_LOG 0x40000000
#define MCEXEC_UP_TRANSFER_TO_REMOTE 0
#define MCEXEC_UP_TRANSFER_FROM_REMOTE 1
@ -67,6 +75,7 @@ struct program_image_section {
};
#define SHELL_PATH_MAX_LEN 1024
#define MCK_RLIM_MAX 20
struct program_load_desc {
int num_sections;
@ -76,6 +85,10 @@ struct program_load_desc {
int err;
int stack_prot;
int pgid;
int cred[8];
int reloc;
char enable_vdso;
char padding[7];
unsigned long entry;
unsigned long user_start;
unsigned long user_end;
@ -90,8 +103,7 @@ struct program_load_desc {
unsigned long args_len;
char *envs;
unsigned long envs_len;
unsigned long rlimit_stack_cur;
unsigned long rlimit_stack_max;
struct rlimit rlimit[MCK_RLIM_MAX];
unsigned long interp_align;
char shell_path[SHELL_PATH_MAX_LEN];
struct program_image_section sections[0];
@ -156,4 +168,20 @@ struct signal_desc {
char info[128];
};
/* Argument of MCEXEC_UP_NEW_PROCESS: PID of the newly created process. */
struct newprocess_desc {
int pid;
};
/* Argument of MCEXEC_UP_SYS_MOUNT: mirrors the mount(2) parameters.
 * Pointer fields are user-space addresses passed through to sys_mount(). */
struct sys_mount_desc {
char *dev_name;
char *dir_name;
char *type;
unsigned long flags;
void *data;
};
/* Argument of MCEXEC_UP_SYS_UNSHARE: mirrors the unshare(2) flags word. */
struct sys_unshare_desc {
unsigned long unshare_flags;
};
#endif

View File

@ -1,25 +0,0 @@
KDIR ?= @KDIR@
ARCH ?= @ARCH@
src = @abs_srcdir@
KMODDIR=@KMODDIR@
IHK_BASE=$(src)/../../../ihk
obj-m += mcctrl.o
ccflags-y := -I$(IHK_BASE)/linux/include -I$(IHK_BASE)/ikc/include -I$(IHK_BASE)/include -I$(src)/../include
mcctrl-y := driver.o control.o ikc.o syscall.o procfs.o
KBUILD_EXTRA_SYMBOLS = @abs_builddir@/../../../ihk/linux/core/Module.symvers
.PHONY: clean install modules
modules:
$(MAKE) -C $(KDIR) M=$(PWD) SUBDIRS=$(PWD) ARCH=$(ARCH) modules
clean:
$(RM) .*.cmd *.mod.c *.o *.ko* Module.symvers modules.order -r .tmp*
install:
mkdir -p -m 755 $(KMODDIR)
install -m 644 mcctrl.ko $(KMODDIR)

View File

@ -0,0 +1,27 @@
KDIR ?= @KDIR@
ARCH ?= @ARCH@
src = @abs_srcdir@
KMODDIR=@KMODDIR@
BINDIR=@BINDIR@
IHK_BASE=$(src)/../../../../ihk
obj-m += mcctrl.o
ccflags-y := -I$(IHK_BASE)/linux/include -I$(IHK_BASE)/linux/include/ihk/arch/$(ARCH) -I$(IHK_BASE)/ikc/include -I$(IHK_BASE)/ikc/include/ikc/arch/$(ARCH) -I$(IHK_BASE)/include -I$(IHK_BASE)/include/arch/$(ARCH) -I$(src)/../../include -mcmodel=kernel -mno-red-zone -DMCEXEC_PATH=\"$(BINDIR)/mcexec\" -I@abs_builddir@
mcctrl-y := driver.o control.o ikc.o syscall.o procfs.o binfmt_mcexec.o
mcctrl-y += sysfs.o sysfs_files.o arch/$(ARCH)/archdeps.o
KBUILD_EXTRA_SYMBOLS = @abs_builddir@/../../../../ihk/linux/core/Module.symvers
.PHONY: clean install modules
modules:
$(MAKE) -C $(KDIR) M=$(PWD) SUBDIRS=$(PWD) ARCH=$(ARCH) modules
clean:
$(RM) .*.cmd *.mod.c *.o *.ko* Module.symvers modules.order -r .tmp*
install:
mkdir -p -m 755 $(KMODDIR)
install -m 644 mcctrl.ko $(KMODDIR)

View File

@ -0,0 +1 @@
# dummy file

View File

@ -0,0 +1,194 @@
#include <linux/version.h>
#include "../../../../config.h"
#include "../../mcctrl.h"
#ifdef MCCTRL_KSYM_vdso_image_64
#if MCCTRL_KSYM_vdso_image_64
struct vdso_image *vdso_image = (void *)MCCTRL_KSYM_vdso_image_64;
#endif
#endif
#ifdef MCCTRL_KSYM_vdso_start
#if MCCTRL_KSYM_vdso_start
void *vdso_start = (void *)MCCTRL_KSYM_vdso_start;
#endif
#endif
#ifdef MCCTRL_KSYM_vdso_end
#if MCCTRL_KSYM_vdso_end
void *vdso_end = (void *)MCCTRL_KSYM_vdso_end;
#endif
#endif
#ifdef MCCTRL_KSYM_vdso_pages
#if MCCTRL_KSYM_vdso_pages
struct page **vdso_pages = (void *)MCCTRL_KSYM_vdso_pages;
#endif
#endif
#ifdef MCCTRL_KSYM___vvar_page
#if MCCTRL_KSYM___vvar_page
void *__vvar_page = (void *)MCCTRL_KSYM___vvar_page;
#endif
#endif
long *hpet_addressp
#ifdef MCCTRL_KSYM_hpet_address
#if MCCTRL_KSYM_hpet_address
= (void *)MCCTRL_KSYM_hpet_address;
#else
= &hpet_address;
#endif
#else
= NULL;
#endif
void **hv_clockp
#ifdef MCCTRL_KSYM_hv_clock
#if MCCTRL_KSYM_hv_clock
= (void *)MCCTRL_KSYM_hv_clock;
#else
= &hv_clock;
#endif
#else
= NULL;
#endif
/* Arch-independent helper (defined elsewhere); returns the reserved start
 * address or a negative error value encoded as unsigned long. */
unsigned long
reserve_user_space_common(struct mcctrl_usrdata *usrdata, unsigned long start, unsigned long end);
/*
 * reserve_user_space() - reserve the virtual address range that the
 * McKernel process will occupy inside the calling mcexec process.
 *
 * @usrdata: per-OS mcctrl state.
 * @startp:  out: start of the reserved range.
 * @endp:    out: end of the reserved range.
 *
 * Returns 0 on success; on failure the error value produced by
 * reserve_user_space_common() is returned.
 */
int
reserve_user_space(struct mcctrl_usrdata *usrdata, unsigned long *startp, unsigned long *endp)
{
struct vm_area_struct *vma;
unsigned long start = 0L;
unsigned long end;
/* Default end of the reservation (128 TiB, x86_64 canonical user top). */
#define DESIRED_USER_END 0x800000000000
#define GAP_FOR_MCEXEC 0x008000000000UL
end = DESIRED_USER_END;
down_write(&current->mm->mmap_sem);
vma = find_vma(current->mm, 0);
if (vma) {
/* Stop a GAP_FOR_MCEXEC-aligned gap below mcexec's lowest mapping. */
end = (vma->vm_start - GAP_FOR_MCEXEC) & ~(GAP_FOR_MCEXEC - 1);
}
/* NOTE(review): on kernels older than 3.5 mmap_sem is apparently kept
 * held across reserve_user_space_common() — confirm this is required. */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,5,0)
up_write(&current->mm->mmap_sem);
#endif
start = reserve_user_space_common(usrdata, start, end);
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,5,0)
up_write(&current->mm->mmap_sem);
#endif
if (IS_ERR_VALUE(start)) {
return start;
}
*startp = start;
*endp = end;
return 0;
}
/*
 * get_vdso_info() - populate a McKernel-side struct vdso with the physical
 * addresses of the host Linux vDSO, VVAR, HPET and pvclock pages.
 *
 * @os:       IHK OS handle of the McKernel instance.
 * @vdso_rpa: remote (McKernel) physical address of the struct vdso to fill.
 *
 * The remote structure is mapped into the host kernel, filled in according
 * to the running kernel version (the vDSO layout changed several times),
 * and vdso->busy is cleared last so the McKernel side can poll on it.
 */
void get_vdso_info(ihk_os_t os, long vdso_rpa)
{
ihk_device_t dev = ihk_os_to_dev(os);
long vdso_pa;
struct vdso *vdso;
size_t size;
int i;
/* Map the remote struct vdso into this kernel's address space. */
vdso_pa = ihk_device_map_memory(dev, vdso_rpa, sizeof(*vdso));
vdso = ihk_device_map_virtual(dev, vdso_pa, sizeof(*vdso), NULL, 0);
memset(vdso, 0, sizeof(*vdso));
/* VDSO pages */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,16,0)
/* >= 3.16: vdso_image_64 describes the vDSO as one contiguous blob. */
size = vdso_image->size;
vdso->vdso_npages = size >> PAGE_SHIFT;
if (vdso->vdso_npages > VDSO_MAXPAGES) {
/* Too large for the fixed physlist: report "no vDSO" instead. */
vdso->vdso_npages = 0;
goto out;
}
for (i = 0; i < vdso->vdso_npages; ++i) {
vdso->vdso_physlist[i] = virt_to_phys(
vdso_image->data + (i * PAGE_SIZE));
}
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,23)
/* Older kernels expose vdso_start/vdso_end and a vdso_pages array. */
size = vdso_end - vdso_start;
size = (size + PAGE_SIZE - 1) & PAGE_MASK;
vdso->vdso_npages = size >> PAGE_SHIFT;
if (vdso->vdso_npages > VDSO_MAXPAGES) {
vdso->vdso_npages = 0;
goto out;
}
for (i = 0; i < vdso->vdso_npages; ++i) {
vdso->vdso_physlist[i] = page_to_phys(vdso_pages[i]);
}
#endif
/* VVAR page: its virtual position relative to the vDSO varies by version. */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,5,0)
vdso->vvar_is_global = 0;
vdso->vvar_virt = (void *)(-3 * PAGE_SIZE);
vdso->vvar_phys = virt_to_phys(__vvar_page);
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,17,0)
vdso->vvar_is_global = 0;
vdso->vvar_virt = (void *)(-2 * PAGE_SIZE);
vdso->vvar_phys = virt_to_phys(__vvar_page);
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,16,0)
vdso->vvar_is_global = 0;
vdso->vvar_virt = (void *)(vdso->vdso_npages * PAGE_SIZE);
vdso->vvar_phys = virt_to_phys(__vvar_page);
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,1,0)
/* Pre-3.16: VVAR lives at a global fixmap address. */
vdso->vvar_is_global = 1;
vdso->vvar_virt = (void *)fix_to_virt(VVAR_PAGE);
vdso->vvar_phys = virt_to_phys(__vvar_page);
#endif
/* HPET page: only exported when the platform has an HPET mapped. */
if (hpet_addressp && *hpet_addressp) {
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,5,0)
vdso->hpet_is_global = 0;
vdso->hpet_virt = (void *)(-2 * PAGE_SIZE);
vdso->hpet_phys = *hpet_addressp;
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,17,0)
vdso->hpet_is_global = 0;
vdso->hpet_virt = (void *)(-1 * PAGE_SIZE);
vdso->hpet_phys = *hpet_addressp;
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,16,0)
vdso->hpet_is_global = 0;
vdso->hpet_virt = (void *)((vdso->vdso_npages + 1) * PAGE_SIZE);
vdso->hpet_phys = *hpet_addressp;
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,23)
vdso->hpet_is_global = 1;
vdso->hpet_virt = (void *)fix_to_virt(VSYSCALL_HPET);
vdso->hpet_phys = *hpet_addressp;
#endif
}
/* struct pvlock_vcpu_time_info table (KVM paravirtual clock, if present) */
if (hv_clockp && *hv_clockp) {
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,5,0)
vdso->pvti_is_global = 0;
vdso->pvti_virt = (void *)(-1 * PAGE_SIZE);
vdso->pvti_phys = virt_to_phys(*hv_clockp);
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,8,0)
vdso->pvti_is_global = 1;
vdso->pvti_virt = (void *)fix_to_virt(PVCLOCK_FIXMAP_BEGIN);
vdso->pvti_phys = virt_to_phys(*hv_clockp);
#endif
}
out:
/* Publish: make all stores visible before clearing the busy flag. */
wmb();
vdso->busy = 0;
ihk_device_unmap_virtual(dev, vdso, sizeof(*vdso));
ihk_device_unmap_memory(dev, vdso_pa, sizeof(*vdso));
return;
} /* get_vdso_info() */

View File

@ -0,0 +1,261 @@
#include <linux/module.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/binfmts.h>
#include <linux/elfcore.h>
#include <linux/elf.h>
#include <linux/init.h>
#include <linux/file.h>
#include <linux/err.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/version.h>
#include "mcctrl.h"
/*
 * pathcheck() - test whether @file lies under one of the directory
 * prefixes in the colon-separated white-list @list.
 *
 * An empty list accepts every path.  For each entry, trailing '/'
 * characters are ignored and the entry matches when @file begins with
 * the entry followed immediately by a '/'.
 *
 * Returns 1 when @file is accepted, 0 otherwise.
 */
static int pathcheck(const char *file, const char *list)
{
	const char *seg;
	const char *seg_end;
	size_t len;

	/* An empty white-list accepts everything. */
	if (*list == '\0')
		return 1;

	seg = list;
	for (;;) {
		seg_end = strchr(seg, ':');
		if (seg_end == NULL)
			seg_end = strchr(seg, '\0');

		/* Entry length with trailing '/' characters stripped. */
		len = seg_end - seg;
		while (len > 0 && seg[len - 1] == '/')
			len--;

		if (strncmp(file, seg, len) == 0 && file[len] == '/')
			return 1;

		if (*seg_end == '\0')
			break;
		seg = seg_end + 1;
	}
	return 0;
}
static int load_elf(struct linux_binprm *bprm
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,8,0)
, struct pt_regs *regs
#endif
)
{
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,36)
const
#endif
char *wp;
char *cp;
struct file *file;
int rc;
struct elfhdr *elf_ex = (struct elfhdr *)bprm->buf;
typedef struct {
char *name;
char *val;
int l;
} envdata;
envdata env[] = {
{.name = "MCEXEC_WL"},
#define env_mcexec_wl (env[0].val)
{.name = NULL}
};
envdata *ep;
unsigned long off = 0;
struct page *page;
char *addr = NULL;
int i;
unsigned long p;
int st;
int mode;
int cnt[2];
char buf[32];
int l;
int pass;
char pbuf[1024];
const char *path;
if(bprm->envc == 0)
return -ENOEXEC;
if(memcmp(elf_ex->e_ident, ELFMAG, SELFMAG) != 0)
return -ENOEXEC;
if(elf_ex->e_type != ET_EXEC && elf_ex->e_type != ET_DYN)
return -ENOEXEC;
if(elf_ex->e_ident[EI_CLASS] != ELFCLASS64)
return -ENOEXEC;
path = d_path(&bprm->file->f_path, pbuf, 1024);
if(!path || IS_ERR(path))
path = bprm->interp;
cp = strrchr(path, '/');
if(!cp ||
!strcmp(cp, "/mcexec") ||
!strcmp(cp, "/ihkosctl") ||
!strcmp(cp, "/ihkconfig"))
return -ENOEXEC;
cnt[0] = bprm->argc;
cnt[1] = bprm->envc;
for(pass = 0; pass < 2; pass++){
p = bprm->p;
mode = cnt[0] == 0? 1: 0;
if(pass == 1){
for(ep = env; ep->name; ep++){
if(ep->l)
ep->val = kmalloc(ep->l, GFP_KERNEL);
}
}
ep = NULL;
l = 0;
for(i = 0, st = 0; mode != 2;){
if(st == 0){
off = p & ~PAGE_MASK;
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,6,0)
rc = get_user_pages_remote(current, bprm->mm,
bprm->p, 1, 0, 1,
&page, NULL);
#else
rc = get_user_pages(current, bprm->mm,
bprm->p, 1, 0, 1,
&page, NULL);
#endif
if(rc <= 0)
return -EFAULT;
addr = kmap_atomic(page
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,4,0)
, KM_USER0
#endif
);
st = 1;
}
if(addr[off]){
if(mode == 1){
if(ep){
if(pass == 1)
ep->val[l] = addr[off];
l++;
}
else if(addr[off] == '='){
if(l < 32)
buf[l] = '\0';
buf[31] = '\0';
for(ep = env; ep->name; ep++)
if(!strcmp(ep->name, buf))
break;
if(ep->name)
l = 0;
else
ep = NULL;
}
else{
if(l < 32)
buf[l] = addr[off];
l++;
}
}
}
else{
if(mode == 1 && ep){
if(pass == 0){
ep->l = l + 1;
}
else{
ep->val[l] = '\0';
}
}
ep = NULL;
l = 0;
i++;
if(i == cnt[mode]){
i = 0;
mode++;
}
}
off++;
p++;
if(off == PAGE_SIZE || mode == 2){
kunmap_atomic(addr
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,4,0)
, KM_USER0
#endif
);
put_page(page);
st = 0;
}
}
}
if(env_mcexec_wl)
rc = !pathcheck(path, env_mcexec_wl);
else
rc = 1;
for(ep = env; ep->name; ep++)
if(ep->val)
kfree(ep->val);
if(rc)
return -ENOEXEC;
file = open_exec(MCEXEC_PATH);
if (IS_ERR(file))
return -ENOEXEC;
rc = remove_arg_zero(bprm);
if (rc){
fput(file);
return rc;
}
rc = copy_strings_kernel(1, &bprm->interp, bprm);
if (rc < 0){
fput(file);
return rc;
}
bprm->argc++;
wp = MCEXEC_PATH;
rc = copy_strings_kernel(1, &wp, bprm);
if (rc){
fput(file);
return rc;
}
bprm->argc++;
rc = bprm_change_interp(MCEXEC_PATH, bprm);
if (rc < 0){
fput(file);
return rc;
}
allow_write_access(bprm->file);
fput(bprm->file);
bprm->file = file;
rc = prepare_binprm(bprm);
if (rc < 0){
return rc;
}
return search_binary_handler(bprm
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,8,0)
, regs
#endif
);
}
/* binfmt handler registration: makes load_elf() a candidate for every exec. */
static struct linux_binfmt mcexec_format = {
.module = THIS_MODULE,
.load_binary = load_elf,
};
/* Register the mcexec binfmt at module load time. */
void __init binfmt_mcexec_init(void)
{
insert_binfmt(&mcexec_format);
}
/* Unregister the mcexec binfmt at module unload time. */
void __exit binfmt_mcexec_exit(void)
{
unregister_binfmt(&mcexec_format);
}

View File

@ -31,18 +31,43 @@
#include <linux/gfp.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/version.h>
#include <asm/uaccess.h>
#include <asm/delay.h>
#include <asm/msr.h>
#include <asm/io.h>
#include "../../config.h"
#include "mcctrl.h"
//#define DEBUG
#ifdef DEBUG
#define dprintk printk
#else
#define dprintk(...)
#endif
#ifdef MCCTRL_KSYM_sys_unshare
#if MCCTRL_KSYM_sys_unshare
typedef int (*int_star_fn_ulong_t)(unsigned long);
int (*mcctrl_sys_unshare)(unsigned long unshare_flags) =
(int_star_fn_ulong_t)
MCCTRL_KSYM_sys_unshare;
#else // exported
int (*mcctrl_sys_unshare)(unsigned long unshare_flags) = NULL;
#endif
#endif
#ifdef MCCTRL_KSYM_sys_mount
#if MCCTRL_KSYM_sys_mount
typedef int (*int_star_fn_char_char_char_ulong_void_t)(char *, char *, char *, unsigned long, void *);
int (*mcctrl_sys_mount)(char *dev_name,char *dir_name, char *type, unsigned long flags, void *data) =
(int_star_fn_char_char_char_ulong_void_t)
MCCTRL_KSYM_sys_mount;
#else // exported
int (*mcctrl_sys_mount)(char *dev_name,char *dir_name, char *type, unsigned long flags, void *data) = NULL;
#endif
#endif
//static DECLARE_WAIT_QUEUE_HEAD(wq_prepare);
//extern struct mcctrl_channel *channels;
int mcctrl_ikc_set_recv_cpu(ihk_os_t os, int cpu);
@ -99,10 +124,10 @@ static long mcexec_prepare_image(ihk_os_t os,
pdesc->args = (void*)virt_to_phys(args);
printk("args: 0x%lX\n", (unsigned long)pdesc->args);
printk("argc: %d\n", *(int*)args);
printk("argc: %ld\n", *(long *)args);
pdesc->envs = (void*)virt_to_phys(envs);
printk("envs: 0x%lX\n", (unsigned long)pdesc->envs);
printk("envc: %d\n", *(int*)envs);
printk("envc: %ld\n", *(long *)envs);
isp.msg = SCD_MSG_PREPARE_PROCESS;
isp.ref = pdesc->cpu;
@ -242,19 +267,72 @@ int mcexec_transfer_image(ihk_os_t os, struct remote_transfer *__user upt)
//extern unsigned long last_thread_exec;
/* Context passed to release_handler(): the McKernel PID to clean up. */
struct handlerinfo {
int pid;
};
/*
 * mcexec_debug_log() - forward a MCEXEC_UP_DEBUG_LOG ioctl to McKernel.
 * @arg is passed through verbatim as the SCD_MSG_DEBUG_LOG argument.
 * Always returns 0; the IKC send result is not checked.
 */
static long mcexec_debug_log(ihk_os_t os, unsigned long arg)
{
struct ikc_scd_packet isp;
memset(&isp, '\0', sizeof isp);
isp.msg = SCD_MSG_DEBUG_LOG;
isp.arg = arg;
mcctrl_ikc_send(os, 0, &isp);
return 0;
}
/*
 * release_handler() - OS release callback registered per mcexec process.
 * Tells McKernel to clean up the process (SCD_MSG_CLEANUP_PROCESS) and
 * removes its /proc entries.  @param is the struct handlerinfo allocated
 * by the registering ioctl; ownership transfers here and it is freed.
 */
static void release_handler(ihk_os_t os, void *param)
{
struct handlerinfo *info = param;
struct ikc_scd_packet isp;
int os_ind = ihk_host_os_get_index(os);
memset(&isp, '\0', sizeof isp);
isp.msg = SCD_MSG_CLEANUP_PROCESS;
isp.pid = info->pid;
mcctrl_ikc_send(os, 0, &isp);
if(os_ind >= 0)
delete_pid_entry(os_ind, info->pid);
kfree(param);
}
/*
 * mcexec_newprocess() - handle MCEXEC_UP_NEW_PROCESS: remember the PID of
 * a newly created McKernel process and arrange for release_handler() to
 * clean it up when @file (the mcctrl device fd) is released.
 *
 * Returns 0 on success, -EFAULT on a bad user pointer, -ENOMEM when the
 * handler context cannot be allocated.
 */
static long mcexec_newprocess(ihk_os_t os,
		struct newprocess_desc *__user udesc,
		struct file *file)
{
	struct newprocess_desc desc;
	struct handlerinfo *info;

	if (copy_from_user(&desc, udesc, sizeof(struct newprocess_desc))) {
		return -EFAULT;
	}
	info = kmalloc(sizeof(struct handlerinfo), GFP_KERNEL);
	/* fix: kmalloc() can fail; the original dereferenced info unchecked */
	if (!info) {
		return -ENOMEM;
	}
	info->pid = desc.pid;
	/* info ownership passes to release_handler(), which kfree()s it. */
	ihk_os_register_release_handler(file, release_handler, info);
	return 0;
}
static long mcexec_start_image(ihk_os_t os,
struct program_load_desc * __user udesc)
struct program_load_desc * __user udesc,
struct file *file)
{
struct program_load_desc desc;
struct ikc_scd_packet isp;
struct mcctrl_channel *c;
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
struct handlerinfo *info;
if (copy_from_user(&desc, udesc,
sizeof(struct program_load_desc))) {
return -EFAULT;
}
info = kmalloc(sizeof(struct handlerinfo), GFP_KERNEL);
info->pid = desc.pid;
ihk_os_register_release_handler(file, release_handler, info);
c = usrdata->channels + desc.cpu;
mcctrl_ikc_set_recv_cpu(os, desc.cpu);
@ -366,10 +444,10 @@ retry_alloc:
init_waitqueue_head(&wqhln->wq_syscall);
list_add_tail(&wqhln->list, &c->wq_list);
}
ihk_ikc_spinlock_unlock(&c->wq_list_lock, flags);
wqhln->req = 1;
wake_up(&wqhln->wq_syscall);
ihk_ikc_spinlock_unlock(&c->wq_list_lock, flags);
return 0;
}
@ -427,7 +505,7 @@ retry_alloc:
irqflags = ihk_ikc_spinlock_lock(&c->wq_list_lock);
/* First see if there is one wait queue already */
list_for_each_entry(wqhln_iter, &c->wq_list, list) {
if (wqhln_iter->pid == current->tgid) {
if (wqhln_iter->pid == task_tgid_vnr(current)) {
kfree(wqhln);
wqhln = wqhln_iter;
list_del(&wqhln->list);
@ -439,22 +517,23 @@ retry_alloc:
ret = wait_event_interruptible(wqhln->wq_syscall, wqhln->req);
if (ret) {
return -EINTR;
}
/* Remove per-process wait queue head */
irqflags = ihk_ikc_spinlock_lock(&c->wq_list_lock);
list_del(&wqhln->list);
ihk_ikc_spinlock_unlock(&c->wq_list_lock, irqflags);
if (ret && !wqhln->req) {
kfree(wqhln);
return -EINTR;
}
kfree(wqhln);
if (c->param.request_va->number == 61 &&
c->param.request_va->args[0] == swd.pid) {
dprintk("pid: %d, tid: %d: SC %d, swd.cpu: %d, WARNING: wait4() for self?\n",
current->tgid,
current->pid,
task_tgid_vnr(current),
task_pid_vnr(current);
c->param.request_va->number,
swd.cpu);
@ -471,12 +550,12 @@ printk("mcexec_wait_syscall:stray wakeup\n");
#else
while (1) {
c = usrdata->channels + swd.cpu;
rdtscll(s);
ihk_get_tsc(s);
if (!usrdata->remaining_job) {
while (!(*c->param.doorbell_va)) {
mb();
cpu_relax();
rdtscll(w);
ihk_get_tsc(w);
if (w > s + 1024UL * 1024 * 1024 * 10) {
return -EINTR;
}
@ -723,7 +802,7 @@ long mcexec_ret_syscall(ihk_os_t os, struct syscall_ret_desc *__user arg)
}
LIST_HEAD(mckernel_exec_files);
spinlock_t mckernel_exec_file_lock = SPIN_LOCK_UNLOCKED;
DEFINE_SPINLOCK(mckernel_exec_file_lock);
struct mckernel_exec_file {
@ -733,17 +812,75 @@ struct mckernel_exec_file {
struct list_head list;
};
/*
 * On kernels >= 3.5 the current_*id() accessors return a wrapper struct;
 * GUIDVAL() extracts the raw integer on either side of that change.
 */
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,5,0)
#define GUIDVAL(x) (x)
#else
#define GUIDVAL(x) ((x).val)
#endif
/*
 * mcexec_getcred() - write the caller's eight credential ids into the
 * physical page at @phys, in the order: uid, euid, suid, fsuid, gid,
 * egid, sgid, fsgid.  Always returns 0.
 */
int
mcexec_getcred(unsigned long phys)
{
int *virt = phys_to_virt(phys);
virt[0] = GUIDVAL(current_uid());
virt[1] = GUIDVAL(current_euid());
virt[2] = GUIDVAL(current_suid());
virt[3] = GUIDVAL(current_fsuid());
virt[4] = GUIDVAL(current_gid());
virt[5] = GUIDVAL(current_egid());
virt[6] = GUIDVAL(current_sgid());
virt[7] = GUIDVAL(current_fsgid());
return 0;
}
/*
 * mcexec_getcredv() - copy the caller's eight credential ids to the
 * user buffer @virt (same order as mcexec_getcred()).
 * Returns 0 on success, -EFAULT on a bad user pointer.
 */
int
mcexec_getcredv(int __user *virt)
{
int wk[8];
wk[0] = GUIDVAL(current_uid());
wk[1] = GUIDVAL(current_euid());
wk[2] = GUIDVAL(current_suid());
wk[3] = GUIDVAL(current_fsuid());
wk[4] = GUIDVAL(current_gid());
wk[5] = GUIDVAL(current_egid());
wk[6] = GUIDVAL(current_sgid());
wk[7] = GUIDVAL(current_fsgid());
if(copy_to_user(virt, wk, sizeof(int) * 8))
return -EFAULT;
return 0;
}
int mcexec_open_exec(ihk_os_t os, char * __user filename)
{
struct file *file;
struct mckernel_exec_file *mcef;
struct mckernel_exec_file *mcef_iter;
int retval;
int os_ind = ihk_host_os_get_index(os);
char *pathbuf, *fullpath;
if (os_ind < 0) {
return EINVAL;
}
pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY);
if (!pathbuf) {
return ENOMEM;
}
file = open_exec(filename);
retval = PTR_ERR(file);
if (IS_ERR(file)) {
goto out_return;
goto out_error_free;
}
fullpath = d_path(&file->f_path, pathbuf, PATH_MAX);
if (IS_ERR(fullpath)) {
retval = PTR_ERR(fullpath);
goto out_error_free;
}
mcef = kmalloc(sizeof(*mcef), GFP_KERNEL);
@ -755,30 +892,37 @@ int mcexec_open_exec(ihk_os_t os, char * __user filename)
spin_lock_irq(&mckernel_exec_file_lock);
/* Find previous file (if exists) and drop it */
list_for_each_entry(mcef_iter, &mckernel_exec_files, list) {
if (mcef_iter->os == os && mcef_iter->pid == current->tgid) {
if (mcef_iter->os == os && mcef_iter->pid == task_tgid_vnr(current)) {
allow_write_access(mcef_iter->fp);
fput(mcef_iter->fp);
list_del(&mcef_iter->list);
kfree(mcef_iter);
dprintk("%d open_exec dropped previous executable \n", (int)current->tgid);
break;
}
}
/* Add new exec file to the list */
mcef->os = os;
mcef->pid = current->tgid;
mcef->pid = task_tgid_vnr(current);
mcef->fp = file;
list_add_tail(&mcef->list, &mckernel_exec_files);
/* Create /proc/self/exe entry */
add_pid_entry(os_ind, task_tgid_vnr(current));
proc_exe_link(os_ind, task_tgid_vnr(current), fullpath);
spin_unlock(&mckernel_exec_file_lock);
dprintk("%d open_exec and holding file: %s\n", (int)current->tgid, filename);
dprintk("%d open_exec and holding file: %s\n", (int)task_tgid_vnr(current), filename);
kfree(pathbuf);
return 0;
out_put_file:
fput(file);
out_return:
out_error_free:
kfree(pathbuf);
return -retval;
}
@ -787,19 +931,25 @@ int mcexec_close_exec(ihk_os_t os)
{
struct mckernel_exec_file *mcef = NULL;
int found = 0;
int os_ind = ihk_host_os_get_index(os);
if (os_ind < 0) {
return EINVAL;
}
spin_lock_irq(&mckernel_exec_file_lock);
list_for_each_entry(mcef, &mckernel_exec_files, list) {
if (mcef->os == os && mcef->pid == current->tgid) {
if (mcef->os == os && mcef->pid == task_tgid_vnr(current)) {
allow_write_access(mcef->fp);
fput(mcef->fp);
list_del(&mcef->list);
kfree(mcef);
found = 1;
dprintk("%d close_exec dropped executable \n", (int)current->tgid);
dprintk("%d close_exec dropped executable \n", (int)task_tgid_vnr(current));
break;
}
}
spin_unlock(&mckernel_exec_file_lock);
return (found ? 0 : EINVAL);
@ -857,7 +1007,69 @@ long mcexec_strncpy_from_user(ihk_os_t os, struct strncpy_from_user_desc * __use
return 0;
}
long __mcctrl_control(ihk_os_t os, unsigned int req, unsigned long arg)
/*
 * mcexec_sys_mount() - perform mount(2) on behalf of a McKernel process.
 *
 * The caller's credentials are temporarily promoted with CAP_SYS_ADMIN
 * around the call and reverted afterwards.  The desc pointer fields are
 * user-space addresses handed straight to sys_mount().
 *
 * Returns the sys_mount() result, -EFAULT on a bad user pointer or when
 * the sys_mount symbol was not resolved at configure time, -ENOMEM when
 * credentials cannot be prepared.
 */
long mcexec_sys_mount(struct sys_mount_desc *__user arg)
{
struct sys_mount_desc desc;
struct cred *promoted;
const struct cred *original;
int ret;
if (copy_from_user(&desc, arg, sizeof(desc))) {
return -EFAULT;
}
promoted = prepare_creds();
if (!promoted) {
return -ENOMEM;
}
/* Temporarily gain CAP_SYS_ADMIN for the mount call. */
cap_raise(promoted->cap_effective, CAP_SYS_ADMIN);
original = override_creds(promoted);
#if MCCTRL_KSYM_sys_mount
ret = mcctrl_sys_mount(desc.dev_name, desc.dir_name, desc.type,
desc.flags, desc.data);
#else
/* sys_mount address unavailable: configured without the ksym. */
ret = -EFAULT;
#endif
revert_creds(original);
put_cred(promoted);
return ret;
}
/*
 * mcexec_sys_unshare() - perform unshare(2) on behalf of a McKernel
 * process, with credentials temporarily promoted to CAP_SYS_ADMIN
 * (mirrors mcexec_sys_mount()).
 *
 * Returns the sys_unshare() result, -EFAULT on a bad user pointer or
 * when the sys_unshare symbol was not resolved at configure time,
 * -ENOMEM when credentials cannot be prepared.
 */
long mcexec_sys_unshare(struct sys_unshare_desc *__user arg)
{
struct sys_unshare_desc desc;
struct cred *promoted;
const struct cred *original;
int ret;
if (copy_from_user(&desc, arg, sizeof(desc))) {
return -EFAULT;
}
promoted = prepare_creds();
if (!promoted) {
return -ENOMEM;
}
/* Temporarily gain CAP_SYS_ADMIN for the unshare call. */
cap_raise(promoted->cap_effective, CAP_SYS_ADMIN);
original = override_creds(promoted);
#if MCCTRL_KSYM_sys_unshare
ret = mcctrl_sys_unshare(desc.unshare_flags);
#else
/* sys_unshare address unavailable: configured without the ksym. */
ret = -EFAULT;
#endif
revert_creds(original);
put_cred(promoted);
return ret;
}
long __mcctrl_control(ihk_os_t os, unsigned int req, unsigned long arg,
struct file *file)
{
switch (req) {
case MCEXEC_UP_PREPARE_IMAGE:
@ -867,7 +1079,7 @@ long __mcctrl_control(ihk_os_t os, unsigned int req, unsigned long arg)
return mcexec_transfer_image(os, (struct remote_transfer *)arg);
case MCEXEC_UP_START_IMAGE:
return mcexec_start_image(os, (struct program_load_desc *)arg);
return mcexec_start_image(os, (struct program_load_desc *)arg, file);
case MCEXEC_UP_WAIT_SYSCALL:
return mcexec_wait_syscall(os, (struct syscall_wait_desc *)arg);
@ -888,6 +1100,10 @@ long __mcctrl_control(ihk_os_t os, unsigned int req, unsigned long arg)
return mcexec_strncpy_from_user(os,
(struct strncpy_from_user_desc *)arg);
case MCEXEC_UP_NEW_PROCESS:
return mcexec_newprocess(os, (struct newprocess_desc *)arg,
file);
case MCEXEC_UP_OPEN_EXEC:
return mcexec_open_exec(os, (char *)arg);
@ -899,6 +1115,21 @@ long __mcctrl_control(ihk_os_t os, unsigned int req, unsigned long arg)
case MCEXEC_UP_FREE_DMA:
return mcexec_free_region(os, (unsigned long *)arg);
case MCEXEC_UP_GET_CRED:
return mcexec_getcred((unsigned long)arg);
case MCEXEC_UP_GET_CREDV:
return mcexec_getcredv((int *)arg);
case MCEXEC_UP_SYS_MOUNT:
return mcexec_sys_mount((struct sys_mount_desc *)arg);
case MCEXEC_UP_SYS_UNSHARE:
return mcexec_sys_unshare((struct sys_unshare_desc *)arg);
case MCEXEC_UP_DEBUG_LOG:
return mcexec_debug_log(os, arg);
}
return -EINVAL;
}

View File

@ -25,11 +25,13 @@
#include <linux/fs.h>
#include <linux/miscdevice.h>
#include <linux/slab.h>
#include <linux/device.h>
#include "mcctrl.h"
#define OS_MAX_MINOR 64
extern long __mcctrl_control(ihk_os_t, unsigned int, unsigned long);
extern long __mcctrl_control(ihk_os_t, unsigned int, unsigned long,
struct file *);
extern int prepare_ikc_channels(ihk_os_t os);
extern void destroy_ikc_channels(ihk_os_t os);
#ifndef DO_USER_MODE
@ -38,11 +40,15 @@ extern void mcctrl_syscall_init(void);
extern void procfs_init(int);
extern void procfs_exit(int);
extern void rus_page_hash_init(void);
extern void rus_page_hash_put_pages(void);
extern void binfmt_mcexec_init(void);
extern void binfmt_mcexec_exit(void);
static long mcctrl_ioctl(ihk_os_t os, unsigned int request, void *priv,
unsigned long arg)
unsigned long arg, struct file *file)
{
return __mcctrl_control(os, request, arg);
return __mcctrl_control(os, request, arg, file);
}
static struct ihk_os_user_call_handler mcctrl_uchs[] = {
@ -55,10 +61,16 @@ static struct ihk_os_user_call_handler mcctrl_uchs[] = {
{ .request = MCEXEC_UP_SEND_SIGNAL, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_GET_CPU, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_STRNCPY_FROM_USER, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_NEW_PROCESS, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_PREPARE_DMA, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_FREE_DMA, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_OPEN_EXEC, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_CLOSE_EXEC, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_GET_CRED, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_GET_CREDV, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_SYS_MOUNT, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_SYS_UNSHARE, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_DEBUG_LOG, .func = mcctrl_ioctl },
};
static struct ihk_os_user_call mcctrl_uc_proto = {
@ -70,6 +82,12 @@ static struct ihk_os_user_call mcctrl_uc[OS_MAX_MINOR];
static ihk_os_t os[OS_MAX_MINOR];
/*
 * osnum_to_os() - map an OS index (device minor) to its IHK OS handle.
 * Returns NULL when no OS is registered at index @n.
 * NOTE(review): @n is not range-checked against OS_MAX_MINOR — callers
 * must pass a valid index.
 */
ihk_os_t
osnum_to_os(int n)
{
return os[n];
}
static int __init mcctrl_init(void)
{
int i;
@ -101,6 +119,8 @@ static int __init mcctrl_init(void)
mcctrl_syscall_init();
#endif
rus_page_hash_init();
for(i = 0; i < OS_MAX_MINOR; i++){
if (os[i]) {
memcpy(mcctrl_uc + i, &mcctrl_uc_proto, sizeof mcctrl_uc_proto);
@ -113,6 +133,8 @@ static int __init mcctrl_init(void)
}
}
binfmt_mcexec_init();
return 0;
}
@ -120,14 +142,19 @@ static void __exit mcctrl_exit(void)
{
int i;
binfmt_mcexec_exit();
printk("mcctrl: unregistered.\n");
for(i = 0; i < OS_MAX_MINOR; i++){
if(os[i]){
sysfsm_cleanup(os[i]);
free_topology_info(os[i]);
ihk_os_unregister_user_call_handlers(os[i], mcctrl_uc + i);
destroy_ikc_channels(os[i]);
procfs_exit(i);
}
}
rus_page_hash_put_pages();
}
MODULE_LICENSE("GPL v2");

View File

@ -41,9 +41,6 @@
void mcexec_prepare_ack(ihk_os_t os, unsigned long arg, int err);
static void mcctrl_ikc_init(ihk_os_t os, int cpu, unsigned long rphys, struct ihk_ikc_channel_desc *c);
int mcexec_syscall(struct mcctrl_channel *c, int pid, unsigned long arg);
void procfs_create(void *__os, int ref, int osnum, int pid, unsigned long arg);
void procfs_delete(void *__os, int osnum, unsigned long arg);
void procfs_answer(unsigned long arg, int err);
void sig_done(unsigned long arg, int err);
static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
@ -69,14 +66,6 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
mcexec_syscall(usrdata->channels + pisp->ref, pisp->pid, pisp->arg);
break;
case SCD_MSG_PROCFS_CREATE:
procfs_create(__os, pisp->ref, pisp->osnum, pisp->pid, pisp->arg);
break;
case SCD_MSG_PROCFS_DELETE:
procfs_delete(__os, pisp->osnum, pisp->arg);
break;
case SCD_MSG_PROCFS_ANSWER:
procfs_answer(pisp->arg, pisp->err);
break;
@ -84,6 +73,42 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
case SCD_MSG_SEND_SIGNAL:
sig_done(pisp->arg, pisp->err);
break;
case SCD_MSG_SYSFS_REQ_CREATE:
case SCD_MSG_SYSFS_REQ_MKDIR:
case SCD_MSG_SYSFS_REQ_SYMLINK:
case SCD_MSG_SYSFS_REQ_LOOKUP:
case SCD_MSG_SYSFS_REQ_UNLINK:
case SCD_MSG_SYSFS_REQ_SETUP:
case SCD_MSG_SYSFS_RESP_SHOW:
case SCD_MSG_SYSFS_RESP_STORE:
case SCD_MSG_SYSFS_RESP_RELEASE:
sysfsm_packet_handler(__os, pisp->msg, pisp->err,
pisp->sysfs_arg1, pisp->sysfs_arg2);
break;
case SCD_MSG_PROCFS_TID_CREATE:
add_tid_entry(ihk_host_os_get_index(__os), pisp->pid, pisp->arg);
break;
case SCD_MSG_PROCFS_TID_DELETE:
delete_tid_entry(ihk_host_os_get_index(__os), pisp->pid, pisp->arg);
break;
case SCD_MSG_GET_VDSO_INFO:
get_vdso_info(__os, pisp->arg);
break;
case SCD_MSG_REPLY_GET_CPU_MAPPING:
reply_get_cpu_mapping(pisp->arg);
break;
default:
printk(KERN_ERR "mcctrl:syscall_packet_handler:"
"unknown message (%d.%d.%d.%d.%d.%#lx)\n",
pisp->msg, pisp->ref, pisp->osnum, pisp->pid,
pisp->err, pisp->arg);
break;
}
return 0;
}
@ -325,6 +350,9 @@ int prepare_ikc_channels(ihk_os_t os)
INIT_LIST_HEAD(&usrdata->per_proc_list);
spin_lock_init(&usrdata->per_proc_list_lock);
INIT_LIST_HEAD(&usrdata->cpu_topology_list);
INIT_LIST_HEAD(&usrdata->node_topology_list);
error = init_peer_channel_registry(usrdata);
if (error) {
return error;
@ -368,6 +396,7 @@ void destroy_ikc_channels(ihk_os_t os)
}
free_page((unsigned long)usrdata->mcctrl_doorbell_va);
destroy_peer_channel_registry(usrdata);
kfree(usrdata->channels);
kfree(usrdata);
}

View File

@ -32,11 +32,17 @@
#ifndef HEADER_MCCTRL_H
#define HEADER_MCCTRL_H
#include <linux/fs.h>
#include <ihk/ihk_host_driver.h>
#include <linux/resource.h>
#include <uprotocol.h>
#include <linux/wait.h>
#include <ihk/ikc.h>
#include <ikc/master.h>
#include <ihk/msr.h>
#include <linux/semaphore.h>
#include <linux/threads.h>
#include "sysfs.h"
#define SCD_MSG_PREPARE_PROCESS 0x1
#define SCD_MSG_PREPARE_PROCESS_ACKED 0x2
@ -48,12 +54,42 @@
#define SCD_MSG_SYSCALL_ONESIDE 0x4
#define SCD_MSG_SEND_SIGNAL 0x8
#define SCD_MSG_CLEANUP_PROCESS 0x9
#define SCD_MSG_GET_VDSO_INFO 0xa
#define SCD_MSG_GET_CPU_MAPPING 0xc
#define SCD_MSG_REPLY_GET_CPU_MAPPING 0xd
#define SCD_MSG_PROCFS_CREATE 0x10
#define SCD_MSG_PROCFS_DELETE 0x11
#define SCD_MSG_PROCFS_REQUEST 0x12
#define SCD_MSG_PROCFS_ANSWER 0x13
#define SCD_MSG_DEBUG_LOG 0x20
#define SCD_MSG_SYSFS_REQ_CREATE 0x30
/* #define SCD_MSG_SYSFS_RESP_CREATE 0x31 */
#define SCD_MSG_SYSFS_REQ_MKDIR 0x32
/* #define SCD_MSG_SYSFS_RESP_MKDIR 0x33 */
#define SCD_MSG_SYSFS_REQ_SYMLINK 0x34
/* #define SCD_MSG_SYSFS_RESP_SYMLINK 0x35 */
#define SCD_MSG_SYSFS_REQ_LOOKUP 0x36
/* #define SCD_MSG_SYSFS_RESP_LOOKUP 0x37 */
#define SCD_MSG_SYSFS_REQ_UNLINK 0x38
/* #define SCD_MSG_SYSFS_RESP_UNLINK 0x39 */
#define SCD_MSG_SYSFS_REQ_SHOW 0x3a
#define SCD_MSG_SYSFS_RESP_SHOW 0x3b
#define SCD_MSG_SYSFS_REQ_STORE 0x3c
#define SCD_MSG_SYSFS_RESP_STORE 0x3d
#define SCD_MSG_SYSFS_REQ_RELEASE 0x3e
#define SCD_MSG_SYSFS_RESP_RELEASE 0x3f
#define SCD_MSG_SYSFS_REQ_SETUP 0x40
#define SCD_MSG_SYSFS_RESP_SETUP 0x41
/* #define SCD_MSG_SYSFS_REQ_CLEANUP 0x42 */
/* #define SCD_MSG_SYSFS_RESP_CLEANUP 0x43 */
#define SCD_MSG_PROCFS_TID_CREATE 0x44
#define SCD_MSG_PROCFS_TID_DELETE 0x45
#define DMA_PIN_SHIFT 21
#define DO_USER_MODE
@ -67,11 +103,24 @@ struct coretable {
struct ikc_scd_packet {
int msg;
int ref;
int osnum;
int pid;
int err;
unsigned long arg;
union {
/* for traditional SCD_MSG_* */
struct {
int ref;
int osnum;
int pid;
int padding;
unsigned long arg;
};
/* for SCD_MSG_SYSFS_* */
struct {
long sysfs_arg1;
long sysfs_arg2;
long sysfs_arg3;
};
};
};
struct mcctrl_priv {
@ -125,6 +174,62 @@ struct mcctrl_per_proc_data {
unsigned long rpgtable; /* per process, not per OS */
};
struct sysfsm_req {
int busy;
int padding;
long lresult;
wait_queue_head_t wq;
};
struct sysfsm_data {
size_t sysfs_bufsize;
void *sysfs_buf;
long sysfs_buf_rpa;
long sysfs_buf_pa;
struct kobject *sysfs_kobj;
struct sysfsm_node *sysfs_root;
struct semaphore sysfs_tree_sem;
struct semaphore sysfs_io_sem;
struct sysfsm_req sysfs_req;
ihk_os_t sysfs_os;
};
static inline int sysfs_inited(struct sysfsm_data *sdp)
{
return !!(sdp->sysfs_buf);
} /* sysfs_inited() */
struct cpu_mapping {
int cpu_number;
int hw_id;
};
struct cache_topology {
struct ihk_cache_topology *saved;
cpumask_t shared_cpu_map;
struct list_head chain;
};
struct cpu_topology {
struct cpu_mapping *cpu_mapping;
struct ihk_cpu_topology *saved;
cpumask_t core_siblings;
cpumask_t thread_siblings;
struct list_head chain;
struct list_head cache_list;
};
struct node_topology {
struct ihk_node_topology *saved;
cpumask_t cpumap;
struct list_head chain;
};
#define CPU_LONGS (((NR_CPUS) + (BITS_PER_LONG) - 1) / (BITS_PER_LONG))
struct mcctrl_usrdata {
struct ihk_ikc_listen_param listen_param;
struct ihk_ikc_listen_param listen_param2;
@ -143,6 +248,14 @@ struct mcctrl_usrdata {
struct list_head per_proc_list;
ihk_spinlock_t per_proc_list_lock;
void **keys;
struct sysfsm_data sysfsm_data;
unsigned long cpu_online[CPU_LONGS];
int cpu_mapping_elems;
int padding;
struct cpu_mapping *cpu_mapping;
long cpu_mapping_pa;
struct list_head cpu_topology_list;
struct list_head node_topology_list;
};
struct mcctrl_signal {
@ -156,11 +269,12 @@ struct mcctrl_signal {
int mcctrl_ikc_send(ihk_os_t os, int cpu, struct ikc_scd_packet *pisp);
int mcctrl_ikc_send_msg(ihk_os_t os, int cpu, int msg, int ref, unsigned long arg);
int mcctrl_ikc_is_valid_thread(ihk_os_t os, int cpu);
int reserve_user_space(struct mcctrl_usrdata *usrdata, unsigned long *startp,
unsigned long *endp);
ihk_os_t osnum_to_os(int n);
/* syscall.c */
int init_peer_channel_registry(struct mcctrl_usrdata *ud);
void destroy_peer_channel_registry(struct mcctrl_usrdata *ud);
int register_peer_channel(struct mcctrl_usrdata *ud, void *key, struct mcctrl_channel *ch);
int deregister_peer_channel(struct mcctrl_usrdata *ud, void *key, struct mcctrl_channel *ch);
struct mcctrl_channel *get_peer_channel(struct mcctrl_usrdata *ud, void *key);
@ -176,6 +290,7 @@ struct procfs_read {
int ret; /* read bytes (answer) */
int status; /* non-zero if done (answer) */
int newcpu; /* migrated new cpu (answer) */
int readwrite; /* 0:read, 1:write */
char fname[PROCFS_NAME_MAX]; /* procfs filename (request) */
};
@ -185,4 +300,51 @@ struct procfs_file {
char fname[PROCFS_NAME_MAX]; /* procfs filename (request) */
};
void procfs_answer(unsigned int arg, int err);
void add_tid_entry(int osnum, int pid, int tid);
void add_pid_entry(int osnum, int pid);
void delete_tid_entry(int osnum, int pid, int tid);
void delete_pid_entry(int osnum, int pid);
void proc_exe_link(int osnum, int pid, const char *path);
void procfs_init(int osnum);
void procfs_exit(int osnum);
/* sysfs_files.c */
void setup_sysfs_files(ihk_os_t os);
void reply_get_cpu_mapping(long req_pa);
void free_topology_info(ihk_os_t os);
/* archdep.c */
#define VDSO_MAXPAGES 2
/* Host VDSO/VVAR mapping layout handed to McKernel; populated by
 * get_vdso_info() (archdep.c). */
struct vdso {
	long busy;			/* NOTE(review): presumably an in-use/serialization flag — confirm in archdeps.c */
	int vdso_npages;		/* number of valid entries in vdso_physlist[] */
	char vvar_is_global;
	char hpet_is_global;
	char pvti_is_global;
	char padding;
	long vdso_physlist[VDSO_MAXPAGES];	/* physical addresses of the VDSO pages */
	void *vvar_virt;
	long vvar_phys;
	void *hpet_virt;
	long hpet_phys;
	void *pvti_virt;		/* pvclock time info (KVM paravirt) page */
	long pvti_phys;
};
int reserve_user_space(struct mcctrl_usrdata *usrdata, unsigned long *startp,
unsigned long *endp);
void get_vdso_info(ihk_os_t os, long vdso_pa);
struct get_cpu_mapping_req {
int busy; /* INOUT: */
int error; /* OUT: */
long buf_rpa; /* OUT: physical address of struct cpu_mapping */
int buf_elems; /* OUT: # of elements of buf */
int padding;
/* work for mcctrl */
wait_queue_head_t wq;
};
#endif

View File

@ -0,0 +1,791 @@
/**
* \file procfs.c
* License details are found in the file LICENSE.
* \brief
* mcctrl procfs
* \author Naoki Hamada <nao@axe.bz> \par
* Copyright (C) 2014 AXE, Inc.
*/
/*
* HISTORY:
*/
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/proc_fs.h>
#include <linux/list.h>
#include <linux/uaccess.h>
#include <linux/fs.h>
#include <linux/resource.h>
#include "mcctrl.h"
#include <linux/version.h>
//#define PROCFS_DEBUG
#ifdef PROCFS_DEBUG
#define dprintk(...) printk(__VA_ARGS__)
#else
#define dprintk(...)
#endif
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,5,0)
typedef uid_t kuid_t;
typedef gid_t kgid_t;
#endif
/* Static description of one procfs node to create; the *_entry_stuff
 * tables below are arrays of these, terminated by PROC_TERM. */
struct procfs_entry {
	char *name;				/* entry name; NULL terminates a table */
	mode_t mode;				/* S_IF* type bits ORed with permission bits */
	const struct file_operations *fops;	/* NULL: use the McKernel-forwarding fops */
};
#define NOD(NAME, MODE, FOP) { \
.name = (NAME), \
.mode = MODE, \
.fops = FOP, \
}
#define PROC_DIR(NAME, MODE) \
NOD(NAME, (S_IFDIR|(MODE)), NULL)
#define PROC_REG(NAME, MODE, fops) \
NOD(NAME, (S_IFREG|(MODE)), fops)
#define PROC_TERM \
NOD(NULL, 0, NULL)
static const struct procfs_entry tid_entry_stuff[];
static const struct procfs_entry pid_entry_stuff[];
static const struct procfs_entry base_entry_stuff[];
static const struct file_operations mckernel_forward_ro;
static const struct file_operations mckernel_forward;
static DECLARE_WAIT_QUEUE_HEAD(procfsq);
static ssize_t mckernel_procfs_read(struct file *file, char __user *buf,
size_t nbytes, loff_t *ppos);
/* A private data for the procfs driver. */
struct procfs_list_entry;
/* One node of mcctrl's shadow procfs tree; mirrors a proc_dir_entry. */
struct procfs_list_entry {
	struct list_head list;		/* sibling link in parent->children (or procfs_file_list for roots) */
	struct proc_dir_entry *entry;	/* the real /proc entry */
	struct procfs_list_entry *parent;	/* NULL for the mcos%d root */
	struct list_head children;
	int osnum;			/* owning McKernel OS instance */
	char *data;			/* private copy of symlink target (see proc_exe_link()) */
	char name[0];			/* flexible tail holding the entry name */
};
/*
* In the procfs_file_list, mckenrel procfs files are
* listed in the manner that the leaf file is located
* always nearer to the list top than its parent node
* file.
*/
LIST_HEAD(procfs_file_list);
static ihk_spinlock_t procfs_file_list_lock;
/*
 * Build the slash-separated path of entry "e" (relative to the procfs
 * root) by walking the parent chain, writing backwards into the tail of
 * "buf". Returns a pointer *into* buf where the path starts.
 * NOTE(review): assumes the full path fits in bufsize — no bounds check.
 */
static char *
getpath(struct procfs_list_entry *e, char *buf, int bufsize)
{
	char *p = buf + bufsize - 1;

	*p = '\0';
	while (e) {
		size_t len = strlen(e->name);

		p -= len;
		memcpy(p, e->name, len);
		e = e->parent;
		if (e)
			*--p = '/';
	}
	return p;
}
/**
* \brief Process SCD_MSG_PROCFS_ANSWER message.
*
* \param arg sent argument
* \param err error info (redundant)
*/
void
procfs_answer(unsigned int arg, int err)
{
	dprintk("procfs: received SCD_MSG_PROCFS_ANSWER message(err = %d).\n", err);
	/* Wake everyone blocked in mckernel_procfs_read()/_write(); each
	 * waiter re-checks its own r->status flag, so a broadcast is safe. */
	wake_up_interruptible(&procfsq);
}
static struct procfs_list_entry *
find_procfs_entry(struct procfs_list_entry *parent, const char *name)
{
struct list_head *list;
struct procfs_list_entry *e;
if(parent == NULL)
list = &procfs_file_list;
else
list = &parent->children;
list_for_each_entry(e, list, list) {
if(!strcmp(e->name, name))
return e;
}
return NULL;
}
/*
 * Recursively remove the procfs entry "top" and every descendant,
 * unregistering the /proc nodes and freeing the shadow-tree nodes.
 * Caller must hold procfs_file_list_lock.
 */
static void
delete_procfs_entries(struct procfs_list_entry *top)
{
	struct procfs_list_entry *e;
	struct procfs_list_entry *n;

	/* Unlink from the parent's children list (or procfs_file_list). */
	list_del(&top->list);
	/* _safe variant: each recursive call list_del()s the child. */
	list_for_each_entry_safe(e, n, &top->children, list) {
		delete_procfs_entries(e);
	}
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
	/*
	 * Fix: clear the callbacks of the entry being removed ("top"),
	 * not of the loop cursor "e" — once the loop above finishes, "e"
	 * points at the list-head container (or is uninitialized when
	 * "top" has no children), so dereferencing it was invalid.
	 */
	top->entry->read_proc = NULL;
	top->entry->data = NULL;
#endif
	remove_proc_entry(top->name, top->parent ? top->parent->entry : NULL);
	if (top->data)
		kfree(top->data);
	kfree(top);
}
/*
 * Create one procfs node (dir, symlink, or regular file) under "parent"
 * and record it in the shadow tree. An existing entry of the same name
 * is replaced. "opaque" is the symlink target for S_IFLNK, or optional
 * file_operations for regular files (default: McKernel-forwarding fops).
 * Returns the new shadow-tree node, or NULL on failure.
 * Caller must hold procfs_file_list_lock.
 */
static struct procfs_list_entry *
add_procfs_entry(struct procfs_list_entry *parent, const char *name, int mode,
	kuid_t uid, kgid_t gid, const void *opaque)
{
	struct procfs_list_entry *e = find_procfs_entry(parent, name);
	struct proc_dir_entry *pde;
	struct proc_dir_entry *parent_pde = NULL;
	int f_mode = mode & 0777;	/* permission bits only */

	/* Replace a pre-existing entry of the same name. */
	if(e)
		delete_procfs_entries(e);

	/* name is stored in the flexible name[0] tail of the node. */
	e = kmalloc(sizeof(struct procfs_list_entry) + strlen(name) + 1,
			GFP_KERNEL);
	if(!e){
		kprintf("ERROR: not enough memory to create PROCFS entry.\n");
		return NULL;
	}
	memset(e, '\0', sizeof(struct procfs_list_entry));
	INIT_LIST_HEAD(&e->children);
	strcpy(e->name, name);
	if(parent)
		parent_pde = parent->entry;

	/* The proc API differs per type and per kernel version. */
	if (mode & S_IFDIR) {
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
		pde = proc_mkdir(name, parent_pde);
#else
		pde = proc_mkdir_data(name, f_mode, parent_pde, e);
#endif
	}
	else if ((mode & S_IFLNK) == S_IFLNK) {
		/* opaque is the symlink target path. */
		pde = proc_symlink(name, parent_pde, (char *)opaque);
	}
	else {
		const struct file_operations *fop;

		/* Regular file: explicit fops, else forward to McKernel
		 * (writable variant when S_IWUSR is requested). */
		if(opaque)
			fop = (const struct file_operations *)opaque;
		else if(mode & S_IWUSR)
			fop = &mckernel_forward;
		else
			fop = &mckernel_forward_ro;
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
		pde = create_proc_entry(name, f_mode, parent_pde);
		if(pde)
			pde->proc_fops = fop;
#else
		pde = proc_create_data(name, f_mode, parent_pde, fop, e);
		if(pde)
			proc_set_user(pde, uid, gid);
#endif
	}
	if(!pde){
		kprintf("ERROR: cannot create a PROCFS entry for %s.\n", name);
		kfree(e);
		return NULL;
	}
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
	/* Pre-3.10 kernels expose these fields directly. */
	pde->uid = uid;
	pde->gid = gid;
	pde->data = e;
#endif
	/* Inherit the owning OS number from the parent (roots set it
	 * themselves, see get_base_entry()). */
	if(parent)
		e->osnum = parent->osnum;
	e->entry = pde;
	e->parent = parent;
	list_add(&(e->list), parent? &(parent->children): &procfs_file_list);
	return e;
}
/*
 * Create every node described in the NULL-terminated table "entries"
 * under "parent", owned by uid/gid.
 */
static void
add_procfs_entries(struct procfs_list_entry *parent,
	const struct procfs_entry *entries, kuid_t uid, kgid_t gid)
{
	const struct procfs_entry *ent;

	for (ent = entries; ent->name != NULL; ent++)
		add_procfs_entry(parent, ent->name, ent->mode, uid, gid,
				ent->fops);
}
/*
 * Return the credentials of the task with the given pid, or NULL when
 * the pid is invalid or no such task exists.
 * NOTE(review): __task_cred() is normally called under RCU protection —
 * confirm the callers' locking satisfies that.
 */
static const struct cred *
get_pid_cred(int pid)
{
	struct task_struct *task;

	if (pid <= 0)
		return NULL;
	task = pid_task(find_vpid(pid), PIDTYPE_PID);
	if (!task)
		return NULL;
	return __task_cred(task);
}
/* Find the top-level "mcos%d" directory for an OS instance, if any. */
static struct procfs_list_entry *
find_base_entry(int osnum)
{
	char dirname[12];

	sprintf(dirname, "mcos%d", osnum);
	return find_procfs_entry(NULL, dirname);
}
/* Find the /proc/mcos%d/%d directory for a process, if it exists. */
static struct procfs_list_entry *
find_pid_entry(int osnum, int pid)
{
	struct procfs_list_entry *base = find_base_entry(osnum);
	char name[12];

	if (base == NULL)
		return NULL;
	sprintf(name, "%d", pid);
	return find_procfs_entry(base, name);
}
/* Find the /proc/mcos%d/%d/task/%d directory for a thread, if it exists. */
static struct procfs_list_entry *
find_tid_entry(int osnum, int pid, int tid)
{
	struct procfs_list_entry *dir;
	char name[12];

	dir = find_pid_entry(osnum, pid);
	if (dir == NULL)
		return NULL;
	dir = find_procfs_entry(dir, "task");
	if (dir == NULL)
		return NULL;
	sprintf(name, "%d", tid);
	return find_procfs_entry(dir, name);
}
/*
 * Find or create the top-level "mcos%d" directory for an OS instance.
 * Returns the shadow-tree node, or NULL when creation fails.
 * Caller must hold procfs_file_list_lock.
 */
static struct procfs_list_entry *
get_base_entry(int osnum)
{
	struct procfs_list_entry *e;
	char name[12];
	kuid_t uid = KUIDT_INIT(0);	/* root-owned */
	kgid_t gid = KGIDT_INIT(0);

	sprintf(name, "mcos%d", osnum);
	e = find_procfs_entry(NULL, name);
	if(!e){
		e = add_procfs_entry(NULL, name, S_IFDIR | 0555,
				uid, gid, NULL);
		/* Fix: add_procfs_entry() returns NULL on allocation or
		 * proc-registration failure; the old code dereferenced
		 * the result unconditionally. */
		if (!e)
			return NULL;
		/* Roots carry the OS number; children inherit it. */
		e->osnum = osnum;
	}
	return e;
}
/*
 * Find or create the /proc/mcos%d/%d directory for a process.
 * Returns NULL when the mcos%d base directory does not exist (or when
 * creation fails). Caller must hold procfs_file_list_lock.
 */
static struct procfs_list_entry *
get_pid_entry(int osnum, int pid)
{
	struct procfs_list_entry *base;
	struct procfs_list_entry *entry;
	char name[12];
	kuid_t uid = KUIDT_INIT(0);
	kgid_t gid = KGIDT_INIT(0);

	sprintf(name, "mcos%d", osnum);
	base = find_procfs_entry(NULL, name);
	if (base == NULL)
		return NULL;
	sprintf(name, "%d", pid);
	entry = find_procfs_entry(base, name);
	if (entry == NULL)
		entry = add_procfs_entry(base, name, S_IFDIR | 0555,
				uid, gid, NULL);
	return entry;
}
/*
 * Find or create the /proc/mcos%d/%d/task/%d directory for a thread.
 * Returns NULL when any ancestor directory is missing (or creation
 * fails). Caller must hold procfs_file_list_lock.
 */
static struct procfs_list_entry *
get_tid_entry(int osnum, int pid, int tid)
{
	struct procfs_list_entry *dir;
	struct procfs_list_entry *entry;
	char name[12];
	kuid_t uid = KUIDT_INIT(0);
	kgid_t gid = KGIDT_INIT(0);

	sprintf(name, "mcos%d", osnum);
	dir = find_procfs_entry(NULL, name);
	if (dir == NULL)
		return NULL;
	sprintf(name, "%d", pid);
	dir = find_procfs_entry(dir, name);
	if (dir == NULL)
		return NULL;
	dir = find_procfs_entry(dir, "task");
	if (dir == NULL)
		return NULL;
	sprintf(name, "%d", tid);
	entry = find_procfs_entry(dir, name);
	if (entry == NULL)
		entry = add_procfs_entry(dir, name, S_IFDIR | 0555,
				uid, gid, NULL);
	return entry;
}
/*
 * Populate the task/%d directory for one thread with the per-thread
 * files, duplicating the process-level "exe" symlink if one exists.
 * Caller must hold procfs_file_list_lock.
 */
static void
_add_tid_entry(int osnum, int pid, int tid, const struct cred *cred)
{
	struct procfs_list_entry *tid_dir = get_tid_entry(osnum, pid, tid);
	struct procfs_list_entry *exe;

	if (!tid_dir)
		return;
	add_procfs_entries(tid_dir, tid_entry_stuff, cred->uid, cred->gid);
	/* tid_dir->parent is "task"; its parent is the pid directory,
	 * which may already carry an "exe" symlink. */
	exe = find_procfs_entry(tid_dir->parent->parent, "exe");
	if (exe)
		add_procfs_entry(tid_dir, "exe", S_IFLNK | 0777,
				cred->uid, cred->gid, exe->data);
}
void
add_tid_entry(int osnum, int pid, int tid)
{
unsigned long irqflag;
const struct cred *cred = get_pid_cred(pid);
if(!cred)
return;
irqflag = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
_add_tid_entry(osnum, pid, tid, cred);
ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflag);
}
/*
 * Create the procfs entries for a new process: the per-process files
 * plus the main thread's task/%d subtree.
 */
void
add_pid_entry(int osnum, int pid)
{
	struct procfs_list_entry *parent;
	unsigned long irqflag;
	const struct cred *cred = get_pid_cred(pid);

	if(!cred)
		return;

	irqflag = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
	parent = get_pid_entry(osnum, pid);
	/* Fix: get_pid_entry() returns NULL when the mcos%d base
	 * directory is missing or creation failed; passing NULL on made
	 * add_procfs_entries() attach the files at the procfs top level
	 * (a NULL parent means the root list). */
	if (parent) {
		add_procfs_entries(parent, pid_entry_stuff, cred->uid,
				cred->gid);
		_add_tid_entry(osnum, pid, pid, cred);
	}
	ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflag);
}
void
delete_tid_entry(int osnum, int pid, int tid)
{
unsigned long irqflag;
struct procfs_list_entry *e;
irqflag = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
e = find_tid_entry(osnum, pid, tid);
if(e)
delete_procfs_entries(e);
ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflag);
}
void
delete_pid_entry(int osnum, int pid)
{
unsigned long irqflag;
struct procfs_list_entry *e;
irqflag = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
e = find_pid_entry(osnum, pid);
if(e)
delete_procfs_entries(e);
ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflag);
}
/*
 * Publish the executable path of a process as an "exe" symlink in its
 * pid directory and in every existing task/%d subdirectory.
 */
void
proc_exe_link(int osnum, int pid, const char *path)
{
	struct procfs_list_entry *parent;
	unsigned long irqflag;
	kuid_t uid = KUIDT_INIT(0);
	kgid_t gid = KGIDT_INIT(0);

	irqflag = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
	parent = find_pid_entry(osnum, pid);
	if(parent){
		struct procfs_list_entry *task;
		struct procfs_list_entry *e;

		e = add_procfs_entry(parent, "exe", S_IFLNK | 0777, uid, gid,
				path);
		/* Fix: add_procfs_entry() and kmalloc() can fail; the old
		 * code dereferenced both results unconditionally. */
		if (e) {
			/* Keep a private copy so later-created threads can
			 * duplicate the link (see _add_tid_entry()). */
			e->data = kmalloc(strlen(path) + 1, GFP_KERNEL);
			if (e->data)
				strcpy(e->data, path);
		}
		/* Fix: the "task" directory may not exist yet; guard the
		 * list walk against a NULL lookup. */
		task = find_procfs_entry(parent, "task");
		if (task) {
			list_for_each_entry(parent, &task->children, list) {
				add_procfs_entry(parent, "exe",
						S_IFLNK | 0777,
						uid, gid, path);
			}
		}
	}
	ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflag);
}
/**
* \brief Initialization for procfs
*
* \param osnum os number
*/
/* Create the /proc/mcos%d base directory and its top-level files. */
void
procfs_init(int osnum)
{
	struct procfs_list_entry *base;
	unsigned long flags;
	kuid_t uid = KUIDT_INIT(0);
	kgid_t gid = KGIDT_INIT(0);

	flags = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
	base = get_base_entry(osnum);
	add_procfs_entries(base, base_entry_stuff, uid, gid);
	ihk_ikc_spinlock_unlock(&procfs_file_list_lock, flags);
}
/**
* \brief Finalization for procfs
*
* \param osnum os number
*/
void
procfs_exit(int osnum)
{
unsigned long irqflag;
struct procfs_list_entry *e;
irqflag = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
e = find_base_entry(osnum);
if(e)
delete_procfs_entries(e);
ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflag);
}
/**
* \brief The callback funciton for McKernel procfs
*
* This function conforms to the 2) way of fs/proc/generic.c
* from linux-2.6.39.4.
*/
/*
 * Read handler for forwarded procfs files: ships the request to the
 * McKernel side over IKC (SCD_MSG_PROCFS_REQUEST) and blocks (1s
 * timeout) until procfs_answer() wakes us, then copies the reply out.
 * Conforms to the 2) way of fs/proc/generic.c from linux-2.6.39.4.
 */
static ssize_t
mckernel_procfs_read(struct file *file, char __user *buf, size_t nbytes,
	loff_t *ppos)
{
	struct inode * inode = file->f_path.dentry->d_inode;
	char *kern_buffer = NULL;
	int order = 0;
	volatile struct procfs_read *r = NULL;
	struct ikc_scd_packet isp;
	int ret;
	unsigned long pbuf;
	unsigned long count = nbytes;
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
	struct proc_dir_entry *dp = PDE(inode);
	struct procfs_list_entry *e = dp->data;
#else
	struct procfs_list_entry *e = PDE_DATA(inode);
#endif
	loff_t offset = *ppos;
	char pathbuf[PROCFS_NAME_MAX];
	char *path;

	/* Fix: pass the real buffer size; it was hard-coded to 256,
	 * silently diverging from PROCFS_NAME_MAX. */
	path = getpath(e, pathbuf, sizeof(pathbuf));

	/* Fix: format specifiers now match loff_t / unsigned long. */
	dprintk("mckernel_procfs_read: invoked for %s, offset: %lld, count: %lu\n",
			path, (long long)offset, count);

	if (count <= 0 || offset < 0) {
		return 0;
	}

	/* order = max(1, ceil(log2(count)) - 12): smallest page order
	 * whose allocation (PAGE_SIZE << order) covers count bytes.
	 * Fix: shift an unsigned long so large counts don't overflow. */
	while ((1UL << order) < count) ++order;
	if (order > 12) {
		order -= 12;
	}
	else {
		order = 1;
	}

	/* NOTE: we need physically contigous memory to pass through IKC */
	kern_buffer = (char *)__get_free_pages(GFP_KERNEL, order);
	if (!kern_buffer) {
		printk("mckernel_procfs_read(): ERROR: allocating kernel buffer\n");
		return -ENOMEM;
	}
	pbuf = virt_to_phys(kern_buffer);

	r = kmalloc(sizeof(struct procfs_read), GFP_KERNEL);
	if (r == NULL) {
		ret = -ENOMEM;
		goto out;
	}
	r->pbuf = pbuf;
	r->eof = 0;
	r->ret = -EIO; /* default */
	r->status = 0;
	r->offset = offset;
	r->count = count;
	r->readwrite = 0;	/* 0 = read request */
	strncpy((char *)r->fname, path, PROCFS_NAME_MAX);
	/* Fix: strncpy() leaves the buffer unterminated when the source
	 * is PROCFS_NAME_MAX bytes or longer. */
	r->fname[PROCFS_NAME_MAX - 1] = '\0';
	isp.msg = SCD_MSG_PROCFS_REQUEST;
	isp.ref = 0;
	isp.arg = virt_to_phys(r);

	ret = mcctrl_ikc_send(osnum_to_os(e->osnum), 0, &isp);
	if (ret < 0) {
		goto out; /* error */
	}

	/* Wait for a reply. */
	ret = -EIO; /* default exit code */
	dprintk("now wait for a reply\n");

	/* Wait for the status field of the procfs_read structure set ready. */
	if (wait_event_interruptible_timeout(procfsq, r->status != 0, HZ) == 0) {
		kprintf("ERROR: mckernel_procfs_read: timeout (1 sec).\n");
		goto out;
	}

	/* Wake up and check the result. */
	dprintk("mckernel_procfs_read: woke up. ret: %d, eof: %d\n", r->ret, r->eof);
	if (r->ret > 0) {
		if (copy_to_user(buf, kern_buffer, r->ret)) {
			kprintf("ERROR: mckernel_procfs_read: copy_to_user failed.\n");
			ret = -EFAULT;
			goto out;
		}
		*ppos += r->ret;
	}
	ret = r->ret;
out:
	if (kern_buffer)
		free_pages((uintptr_t)kern_buffer, order);
	if (r)
		kfree((void *)r);
	return ret;
}
/*
 * Write handler for forwarded procfs files: copies the user data into a
 * physically contiguous buffer, forwards the request to McKernel over
 * IKC and waits (1s timeout) for procfs_answer() to signal completion.
 */
static ssize_t
mckernel_procfs_write(struct file *file, const char __user *buf, size_t nbytes,
	loff_t *ppos)
{
	struct inode * inode = file->f_path.dentry->d_inode;
	char *kern_buffer = NULL;
	int order = 0;
	volatile struct procfs_read *r = NULL;
	struct ikc_scd_packet isp;
	int ret;
	unsigned long pbuf;
	unsigned long count = nbytes;
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
	struct proc_dir_entry *dp = PDE(inode);
	struct procfs_list_entry *e = dp->data;
#else
	struct procfs_list_entry *e = PDE_DATA(inode);
#endif
	loff_t offset = *ppos;
	char pathbuf[PROCFS_NAME_MAX];
	char *path;

	/* Fix: pass the real buffer size instead of hard-coded 256. */
	path = getpath(e, pathbuf, sizeof(pathbuf));
	/* Fix: log under this function's name, with matching specifiers
	 * (the messages below all said "mckernel_procfs_read"). */
	dprintk("mckernel_procfs_write: invoked for %s, offset: %lld, count: %lu\n",
			path, (long long)offset, count);

	if (count <= 0 || offset < 0) {
		return 0;
	}

	/* order = max(1, ceil(log2(count)) - 12): smallest page order
	 * whose allocation (PAGE_SIZE << order) covers count bytes, so
	 * the copy_from_user() below always fits. */
	while ((1UL << order) < count) ++order;
	if (order > 12) {
		order -= 12;
	}
	else {
		order = 1;
	}

	/* NOTE: we need physically contigous memory to pass through IKC */
	kern_buffer = (char *)__get_free_pages(GFP_KERNEL, order);
	if (!kern_buffer) {
		printk("mckernel_procfs_write(): ERROR: allocating kernel buffer\n");
		return -ENOMEM;
	}

	if (copy_from_user(kern_buffer, buf, nbytes)) {
		ret = -EFAULT;
		goto out;
	}

	pbuf = virt_to_phys(kern_buffer);

	r = kmalloc(sizeof(struct procfs_read), GFP_KERNEL);
	if (r == NULL) {
		ret = -ENOMEM;
		goto out;
	}
	/* Fix: dropped "e->cpu" from the debug line — struct
	 * procfs_list_entry has no "cpu" member, so the old statement
	 * broke the build whenever PROCFS_DEBUG was defined. */
	dprintk("offset: %llx, count: %lu\n", (long long)offset, count);
	r->pbuf = pbuf;
	r->eof = 0;
	r->ret = -EIO; /* default */
	r->status = 0;
	r->offset = offset;
	r->count = count;
	r->readwrite = 1;	/* 1 = write request */
	strncpy((char *)r->fname, path, PROCFS_NAME_MAX);
	/* Fix: guarantee NUL-termination (strncpy does not). */
	r->fname[PROCFS_NAME_MAX - 1] = '\0';
	isp.msg = SCD_MSG_PROCFS_REQUEST;
	isp.ref = 0;
	isp.arg = virt_to_phys(r);

	ret = mcctrl_ikc_send(osnum_to_os(e->osnum), 0, &isp);
	if (ret < 0) {
		goto out; /* error */
	}

	/* Wait for a reply. */
	ret = -EIO; /* default exit code */
	dprintk("now wait for a reply\n");

	/* Wait for the status field of the procfs_read structure set ready. */
	if (wait_event_interruptible_timeout(procfsq, r->status != 0, HZ) == 0) {
		kprintf("ERROR: mckernel_procfs_write: timeout (1 sec).\n");
		goto out;
	}

	/* Wake up and check the result. */
	dprintk("mckernel_procfs_write: woke up. ret: %d, eof: %d\n", r->ret, r->eof);
	if (r->ret > 0) {
		*ppos += r->ret;
	}
	ret = r->ret;
out:
	if (kern_buffer)
		free_pages((uintptr_t)kern_buffer, order);
	if (r)
		kfree((void *)r);
	return ret;
}
/*
 * llseek for forwarded procfs files: supports SEEK_SET (0) and
 * SEEK_CUR (1) only; anything else yields -EINVAL.
 */
static loff_t
mckernel_procfs_lseek(struct file *file, loff_t offset, int orig)
{
	if (orig == 0) {
		/* SEEK_SET */
		file->f_pos = offset;
	} else if (orig == 1) {
		/* SEEK_CUR */
		file->f_pos += offset;
	} else {
		return -EINVAL;
	}
	return file->f_pos;
}
/* fops for read-only forwarded procfs files (writes rejected by VFS). */
static const struct file_operations mckernel_forward_ro = {
	.llseek = mckernel_procfs_lseek,
	.read = mckernel_procfs_read,
	.write = NULL,
};
/* fops for read-write forwarded procfs files. */
static const struct file_operations mckernel_forward = {
	.llseek = mckernel_procfs_lseek,
	.read = mckernel_procfs_read,
	.write = mckernel_procfs_write,
};
/* Per-thread (/proc/mcosN/PID/task/TID/) files. Commented-out entries
 * mirror Linux's per-task files not (yet) forwarded to McKernel. */
static const struct procfs_entry tid_entry_stuff[] = {
//	PROC_REG("auxv",	S_IRUSR, NULL),
//	PROC_REG("clear_refs",	S_IWUSR, NULL),
//	PROC_REG("cmdline",	S_IRUGO, NULL),
//	PROC_REG("comm",	S_IRUGO|S_IWUSR, NULL),
//	PROC_REG("environ",	S_IRUSR, NULL),
//	PROC_LNK("exe",		mckernel_readlink),
//	PROC_REG("limits",	S_IRUSR|S_IWUSR, NULL),
//	PROC_REG("maps",	S_IRUGO, NULL),
	PROC_REG("mem",		S_IRUSR|S_IWUSR, NULL),
//	PROC_REG("pagemap",	S_IRUGO, NULL),
//	PROC_REG("smaps",	S_IRUGO, NULL),
	PROC_REG("stat",	S_IRUGO, NULL),
//	PROC_REG("statm",	S_IRUGO, NULL),
//	PROC_REG("status",	S_IRUGO, NULL),
//	PROC_REG("syscall",	S_IRUGO, NULL),
//	PROC_REG("wchan",	S_IRUGO, NULL),
	PROC_TERM
};
/* Per-process (/proc/mcosN/PID/) files; "task" holds per-thread dirs. */
static const struct procfs_entry pid_entry_stuff[] = {
	PROC_REG("auxv",	S_IRUSR, NULL),
	PROC_REG("cgroup",	S_IXUSR, NULL),
//	PROC_REG("clear_refs",	S_IWUSR, NULL),
	PROC_REG("cmdline",	S_IRUGO, NULL),
//	PROC_REG("comm",	S_IRUGO|S_IWUSR, NULL),
//	PROC_REG("coredump_filter",	S_IRUGO|S_IWUSR, NULL),
	PROC_REG("cpuset",	S_IXUSR, NULL),
//	PROC_REG("environ",	S_IRUSR, NULL),
//	PROC_LNK("exe",		mckernel_readlink),
//	PROC_REG("limits",	S_IRUSR|S_IWUSR, NULL),
	PROC_REG("maps",	S_IRUGO, NULL),
	PROC_REG("mem",		S_IRUSR|S_IWUSR, NULL),
	PROC_REG("pagemap",	S_IRUGO, NULL),
	PROC_REG("smaps",	S_IRUGO, NULL),
//	PROC_REG("stat",	S_IRUGO, NULL),
//	PROC_REG("statm",	S_IRUGO, NULL),
	PROC_REG("status",	S_IRUGO, NULL),
//	PROC_REG("syscall",	S_IRUGO, NULL),
	PROC_DIR("task",	S_IRUGO|S_IXUGO),
//	PROC_REG("wchan",	S_IRUGO, NULL),
	PROC_TERM
};
/* OS-level (/proc/mcosN/) files created by procfs_init(). */
static const struct procfs_entry base_entry_stuff[] = {
//	PROC_REG("cmdline",	S_IRUGO, NULL),
//	PROC_REG("cpuinfo",	S_IRUGO, NULL),
//	PROC_REG("meminfo",	S_IRUGO, NULL),
//	PROC_REG("pagetypeinfo",S_IRUGO, NULL),
//	PROC_REG("softirq",	S_IRUGO, NULL),
	PROC_REG("stat",	S_IRUGO, NULL),
//	PROC_REG("uptime",	S_IRUGO, NULL),
//	PROC_REG("version",	S_IRUGO, NULL),
//	PROC_REG("vmallocinfo",S_IRUSR, NULL),
//	PROC_REG("vmstat",	S_IRUGO, NULL),
//	PROC_REG("zoneinfo",	S_IRUGO, NULL),
	PROC_TERM
};

View File

@ -13,6 +13,8 @@
* Copyright (C) 2012 - 2013 Hitachi, Ltd.
* \author Balazs Gerofi <bgerofi@is.s.u-tokyo.ac.jp> \par
* Copyright (C) 2013 The University of Tokyo
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com> \par
* Copyright (C) 2014 RIKEN AICS
*/
/*
* HISTORY:
@ -42,7 +44,9 @@
#include <asm/uaccess.h>
#include <asm/delay.h>
#include <asm/io.h>
#include "../../config.h"
#include "mcctrl.h"
#include <linux/version.h>
#define ALIGN_WAIT_BUF(z) (((z + 63) >> 6) << 6)
@ -54,6 +58,17 @@
#define dprintk(...)
#endif
/*
 * Function pointer for zap_page_range(): resolved at build time either
 * from a kallsyms address (MCCTRL_KSYM_zap_page_range non-zero, for
 * kernels that do not export the symbol) or from the exported symbol
 * directly.
 */
#ifdef MCCTRL_KSYM_zap_page_range
static void
(*mcctrl_zap_page_range)(struct vm_area_struct *vma, unsigned long start,
	unsigned long size, struct zap_details *details)
#if MCCTRL_KSYM_zap_page_range
	= (void *)MCCTRL_KSYM_zap_page_range;
#else
	= &zap_page_range;
#endif
#endif
static long pager_call(ihk_os_t os, struct syscall_request *req);
#ifdef SC_DEBUG
@ -80,6 +95,13 @@ int init_peer_channel_registry(struct mcctrl_usrdata *ud)
return 0;
}
/* Release the per-OS peer-channel key table created by
 * init_peer_channel_registry(). Safe to call with keys == NULL. */
void destroy_peer_channel_registry(struct mcctrl_usrdata *ud)
{
	kfree(ud->keys);
	ud->keys = NULL;
}
int register_peer_channel(struct mcctrl_usrdata *ud, void *key, struct mcctrl_channel *ch)
{
int cpu;
@ -254,14 +276,14 @@ retry_alloc:
}
/* Prepare per-process wait queue head */
wqhln->pid = current->tgid;
wqhln->pid = task_tgid_vnr(current);
wqhln->req = 0;
init_waitqueue_head(&wqhln->wq_syscall);
irqflags = ihk_ikc_spinlock_lock(&channel->wq_list_lock);
/* First see if there is a wait queue already */
list_for_each_entry(wqhln_iter, &channel->wq_list, list) {
if (wqhln_iter->pid == current->tgid) {
if (wqhln_iter->pid == task_tgid_vnr(current)) {
kfree(wqhln);
wqhln = wqhln_iter;
list_del(&wqhln->list);
@ -319,6 +341,109 @@ out:
return error;
}
#define RUS_PAGE_HASH_SHIFT 8
#define RUS_PAGE_HASH_SIZE (1UL << RUS_PAGE_HASH_SHIFT)
#define RUS_PAGE_HASH_MASK (RUS_PAGE_HASH_SIZE - 1)
struct list_head rus_page_hash[RUS_PAGE_HASH_SIZE];
spinlock_t rus_page_hash_lock;
/* One tracked user-space page: pages referenced with get_page() in
 * rus_vm_fault() are recorded here so rus_page_hash_put_pages() can
 * drop all references in one sweep. */
struct rus_page {
	struct list_head hash;	/* link in a rus_page_hash[] bucket */
	struct page *page;
	int refcount;		/* number of hash insertions for this page */
	int put_page;		/* NOTE(review): initialized to 0 but never read in this code */
};
/* Initialize the rus_page hash: its lock and every bucket list head. */
void rus_page_hash_init(void)
{
	int bucket;

	spin_lock_init(&rus_page_hash_lock);
	for (bucket = 0; bucket < RUS_PAGE_HASH_SIZE; bucket++)
		INIT_LIST_HEAD(&rus_page_hash[bucket]);
}
/* rus_page_hash_lock must be held */
struct rus_page *_rus_page_hash_lookup(struct page *page)
{
	struct list_head *bucket =
		&rus_page_hash[page_to_pfn(page) & RUS_PAGE_HASH_MASK];
	struct rus_page *rp;

	list_for_each_entry(rp, bucket, hash) {
		if (rp->page == page)
			return rp;
	}
	return NULL;
}
static int rus_page_hash_insert(struct page *page)
{
int ret = 0;
struct rus_page *rp;
spin_lock(&rus_page_hash_lock);
rp = _rus_page_hash_lookup(page);
if (!rp) {
rp = kmalloc(sizeof(*rp), GFP_ATOMIC);
if (!rp) {
printk("rus_page_add_hash(): error allocating rp\n");
ret = -ENOMEM;
goto out;
}
rp->page = page;
rp->put_page = 0;
get_page(page);
rp->refcount = 0; /* Will be increased below */
list_add_tail(&rp->hash,
&rus_page_hash[page_to_pfn(page) & RUS_PAGE_HASH_MASK]);
}
++rp->refcount;
out:
spin_unlock(&rus_page_hash_lock);
return ret;
}
/* Drop every page reference recorded in the hash and free the hash
 * nodes (teardown counterpart of rus_page_hash_insert()). */
void rus_page_hash_put_pages(void)
{
	int bucket;
	struct rus_page *rp;
	struct rus_page *next;

	spin_lock(&rus_page_hash_lock);
	for (bucket = 0; bucket < RUS_PAGE_HASH_SIZE; bucket++) {
		list_for_each_entry_safe(rp, next,
				&rus_page_hash[bucket], hash) {
			list_del(&rp->hash);
			put_page(rp->page);
			kfree(rp);
		}
	}
	spin_unlock(&rus_page_hash_lock);
}
/*
* By remap_pfn_range(), VM_PFN_AT_MMAP may be raised.
* VM_PFN_AT_MMAP cause the following problems.
@ -329,6 +454,7 @@ out:
* These problems may be solved in linux-3.7.
* It uses vm_insert_pfn() until it is fixed.
*/
#define USE_VM_INSERT_PFN 1
static int rus_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
@ -356,7 +482,7 @@ static int rus_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
flags = ihk_ikc_spinlock_lock(&usrdata->per_proc_list_lock);
list_for_each_entry(ppd_iter, &usrdata->per_proc_list, list) {
if (ppd_iter->pid == current->tgid) {
if (ppd_iter->pid == task_tgid_vnr(current)) {
ppd = ppd_iter;
break;
}
@ -364,7 +490,7 @@ static int rus_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
ihk_ikc_spinlock_unlock(&usrdata->per_proc_list_lock, flags);
if (!ppd) {
printk("ERROR: no per process data for pid %d\n", current->tgid);
printk("ERROR: no per process data for pid %d\n", task_tgid_vnr(current));
return VM_FAULT_SIGBUS;
}
@ -409,15 +535,11 @@ static int rus_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
if (pfn_valid(pfn+pix)) {
page = pfn_to_page(pfn+pix);
if (!page_count(page)) {
get_page(page);
/*
* TODO:
* The pages which get_page() has been called with
* should be recorded. Because these pages have to
* be passed to put_page() before they are freed.
*/
if ((error = rus_page_hash_insert(page)) < 0) {
printk("rus_vm_fault: error hashing page??\n");
}
error = vm_insert_page(vma, rva+(pix*PAGE_SIZE), page);
if (error) {
printk("vm_insert_page: %d\n", error);
@ -448,7 +570,11 @@ static struct vm_operations_struct rus_vmops = {
/* mmap handler for the anonymous "remote user space" file: faults are
 * served from McKernel memory by rus_vm_fault() via rus_vmops. */
static int rus_mmap(struct file *file, struct vm_area_struct *vma)
{
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,7,0)
	/* VM_RESERVED was split into VM_DONTDUMP/VM_DONTEXPAND in 3.7. */
	vma->vm_flags |= VM_RESERVED | VM_DONTEXPAND | VM_MIXEDMAP;
#else
	vma->vm_flags |= VM_DONTDUMP | VM_DONTEXPAND | VM_MIXEDMAP;
#endif
	vma->vm_ops = &rus_vmops;
	return 0;
}
@ -457,12 +583,10 @@ static struct file_operations rus_fops = {
.mmap = &rus_mmap,
};
int reserve_user_space(struct mcctrl_usrdata *usrdata, unsigned long *startp, unsigned long *endp)
unsigned long
reserve_user_space_common(struct mcctrl_usrdata *usrdata, unsigned long start, unsigned long end)
{
struct file *file;
struct vm_area_struct *vma;
unsigned long start;
unsigned long end;
struct cred *promoted;
const struct cred *original;
@ -483,29 +607,22 @@ int reserve_user_space(struct mcctrl_usrdata *usrdata, unsigned long *startp, un
cap_raise(promoted->cap_effective, CAP_SYS_RAWIO);
original = override_creds(promoted);
#define DESIRED_USER_END 0x800000000000
#define GAP_FOR_MCEXEC 0x008000000000UL
end = DESIRED_USER_END;
down_write(&current->mm->mmap_sem);
vma = find_vma(current->mm, 0);
if (vma) {
end = (vma->vm_start - GAP_FOR_MCEXEC) & ~(GAP_FOR_MCEXEC - 1);
}
start = do_mmap_pgoff(file, 0, end,
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,5,0)
start = vm_mmap_pgoff(file, start, end,
PROT_READ|PROT_WRITE, MAP_FIXED|MAP_SHARED, 0);
up_write(&current->mm->mmap_sem);
#else
start = vm_mmap(file, start, end,
PROT_READ|PROT_WRITE, MAP_FIXED|MAP_SHARED, 0);
#endif
revert_creds(original);
put_cred(promoted);
fput(file);
if (IS_ERR_VALUE(start)) {
printk("mcctrl:user space reservation failed.\n");
return start;
}
*startp = start;
*endp = end;
return 0;
return start;
}
//unsigned long last_thread_exec = 0;
@ -782,19 +899,22 @@ static int pager_req_create(ihk_os_t os, int fd, uintptr_t result_pa)
error = vfs_fstat(fd, &st);
if (error) {
printk("pager_req_create(%d,%lx):vfs_stat failed. %d\n", fd, (long)result_pa, error);
dprintk("pager_req_create(%d,%lx):vfs_stat failed. %d\n", fd, (long)result_pa, error);
goto out;
}
if (!S_ISREG(st.mode)) {
if (S_ISCHR(st.mode) && (MAJOR(st.rdev) == 1)) {
/* treat memory devices as regular files */
}
else if (!S_ISREG(st.mode)) {
error = -ESRCH;
printk("pager_req_create(%d,%lx):not VREG. %x\n", fd, (long)result_pa, st.mode);
dprintk("pager_req_create(%d,%lx):not VREG. %x\n", fd, (long)result_pa, st.mode);
goto out;
}
file = fget(fd);
if (!file) {
error = -EBADF;
printk("pager_req_create(%d,%lx):file not found. %d\n", fd, (long)result_pa, error);
dprintk("pager_req_create(%d,%lx):file not found. %d\n", fd, (long)result_pa, error);
goto out;
}
@ -817,7 +937,7 @@ static int pager_req_create(ihk_os_t os, int fd, uintptr_t result_pa)
}
if (!(maxprot & PROT_READ)) {
error = -EACCES;
printk("pager_req_create(%d,%lx):cannot read file. %d\n", fd, (long)result_pa, error);
dprintk("pager_req_create(%d,%lx):cannot read file. %d\n", fd, (long)result_pa, error);
goto out;
}
@ -1100,7 +1220,7 @@ static int pager_req_map(ihk_os_t os, int fd, size_t len, off_t off, uintptr_t r
struct pager_map_result *resp;
uintptr_t phys;
printk("pager_req_map(%p,%d,%lx,%lx,%lx)\n", os, fd, len, off, result_rpa);
dprintk("pager_req_map(%p,%d,%lx,%lx,%lx)\n", os, fd, len, off, result_rpa);
pager = kzalloc(sizeof(*pager), GFP_KERNEL);
if (!pager) {
error = -ENOMEM;
@ -1128,8 +1248,17 @@ static int pager_req_map(ihk_os_t os, int fd, size_t len, off_t off, uintptr_t r
down_write(&current->mm->mmap_sem);
#define ANY_WHERE 0
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,5,0)
va = do_mmap_pgoff(file, ANY_WHERE, len, maxprot, MAP_SHARED, pgoff);
#endif
up_write(&current->mm->mmap_sem);
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,5,0)
va = vm_mmap(file, ANY_WHERE, len, maxprot, MAP_SHARED, pgoff << PAGE_SHIFT);
#endif
if (IS_ERR_VALUE(va)) {
printk("pager_req_map(%p,%d,%lx,%lx,%lx):do_mmap_pgoff failed. %d\n", os, fd, len, off, result_rpa, (int)va);
error = va;
@ -1140,6 +1269,9 @@ static int pager_req_map(ihk_os_t os, int fd, size_t len, off_t off, uintptr_t r
pager->map_uaddr = va;
pager->map_len = len;
pager->map_off = off;
dprintk("pager_req_map(%s): 0x%lx - 0x%lx (len: %lu)\n",
file->f_dentry->d_name.name, va, va + len, len);
phys = ihk_device_map_memory(dev, result_rpa, sizeof(*resp));
resp = ihk_device_map_virtual(dev, phys, sizeof(*resp), NULL, 0);
@ -1158,10 +1290,11 @@ out:
if (pager) {
kfree(pager);
}
printk("pager_req_map(%p,%d,%lx,%lx,%lx): %d\n", os, fd, len, off, result_rpa, error);
dprintk("pager_req_map(%p,%d,%lx,%lx,%lx): %d\n", os, fd, len, off, result_rpa, error);
return error;
}
static int pager_req_pfn(ihk_os_t os, uintptr_t handle, off_t off, uintptr_t ppfn_rpa)
{
const ihk_device_t dev = ihk_os_to_dev(os);
@ -1176,7 +1309,7 @@ static int pager_req_pfn(ihk_os_t os, uintptr_t handle, off_t off, uintptr_t ppf
uintptr_t phys;
uintptr_t *ppfn;
printk("pager_req_pfn(%p,%lx,%lx)\n", os, handle, off);
dprintk("pager_req_pfn(%p,%lx,%lx)\n", os, handle, off);
if ((off < pager->map_off) || ((pager->map_off+pager->map_len) < (off + PAGE_SIZE))) {
error = -ERANGE;
@ -1201,6 +1334,12 @@ static int pager_req_pfn(ihk_os_t os, uintptr_t handle, off_t off, uintptr_t ppf
pfn = (uintptr_t)pte_pfn(*pte) << PAGE_SHIFT;
#define PFN_PRESENT ((uintptr_t)1 << 0)
pfn |= PFN_VALID | PFN_PRESENT;
/* Check if mapping is write-combined */
if ((pte_flags(*pte) & _PAGE_PWT) &&
!(pte_flags(*pte) & _PAGE_PCD)) {
pfn |= _PAGE_PWT;
}
}
pte_unmap(pte);
}
@ -1216,7 +1355,7 @@ static int pager_req_pfn(ihk_os_t os, uintptr_t handle, off_t off, uintptr_t ppf
error = 0;
out:
printk("pager_req_pfn(%p,%lx,%lx): %d %lx\n", os, handle, off, error, pfn);
dprintk("pager_req_pfn(%p,%lx,%lx): %d %lx\n", os, handle, off, error, pfn);
return error;
}
@ -1225,11 +1364,15 @@ static int pager_req_unmap(ihk_os_t os, uintptr_t handle)
struct pager * const pager = (void *)handle;
int error;
printk("pager_req_unmap(%p,%lx)\n", os, handle);
dprintk("pager_req_unmap(%p,%lx)\n", os, handle);
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,5,0)
down_write(&current->mm->mmap_sem);
error = do_munmap(current->mm, pager->map_uaddr, pager->map_len);
up_write(&current->mm->mmap_sem);
#else
error = vm_munmap(pager->map_uaddr, pager->map_len);
#endif
if (error) {
printk("pager_req_unmap(%p,%lx):do_munmap failed. %d\n", os, handle, error);
@ -1237,7 +1380,7 @@ static int pager_req_unmap(ihk_os_t os, uintptr_t handle)
}
kfree(pager);
printk("pager_req_unmap(%p,%lx): %d\n", os, handle, error);
dprintk("pager_req_unmap(%p,%lx): %d\n", os, handle, error);
return error;
}
@ -1325,9 +1468,18 @@ static int remap_user_space(uintptr_t rva, size_t len, int prot)
start = rva;
pgoff = vma->vm_pgoff + ((rva - vma->vm_start) >> PAGE_SHIFT);
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,5,0)
map = do_mmap_pgoff(file, start, len,
prot, MAP_FIXED|MAP_SHARED, pgoff);
#endif
up_write(&mm->mmap_sem);
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,5,0)
map = vm_mmap(file, start, len,
prot, MAP_FIXED|MAP_SHARED, pgoff << PAGE_SHIFT);
#endif
out:
dprintk("remap_user_space(%lx,%lx,%x): %lx (%ld)\n",
rva, len, prot, (long)map, (long)map);
@ -1361,6 +1513,10 @@ static int clear_pte_range(uintptr_t start, uintptr_t len)
}
if (addr < end) {
error = zap_vma_ptes(vma, addr, end-addr);
if (error) {
mcctrl_zap_page_range(vma, addr, end-addr, NULL);
error = 0;
}
if (ret == 0) {
ret = error;
}
@ -1469,6 +1625,8 @@ fail:
return error;
}
#define SCHED_CHECK_SAME_OWNER 0x01
#define SCHED_CHECK_ROOT 0x02
int __do_in_kernel_syscall(ihk_os_t os, struct mcctrl_channel *c, struct syscall_request *sc)
{
@ -1495,7 +1653,7 @@ int __do_in_kernel_syscall(ihk_os_t os, struct mcctrl_channel *c, struct syscall
goto out;
}
ppd->pid = current->tgid;
ppd->pid = task_tgid_vnr(current);
ppd->rpgtable = sc->args[2];
flags = ihk_ikc_spinlock_lock(&usrdata->per_proc_list_lock);
@ -1506,12 +1664,7 @@ int __do_in_kernel_syscall(ihk_os_t os, struct mcctrl_channel *c, struct syscall
ppd->pid, ppd->rpgtable);
}
error = clear_pte_range(sc->args[0], sc->args[1]);
if (error) {
error = -ENOSYS;
goto out;
}
ret = 0;
ret = clear_pte_range(sc->args[0], sc->args[1]);
break;
case __NR_mprotect:
@ -1526,7 +1679,7 @@ int __do_in_kernel_syscall(ihk_os_t os, struct mcctrl_channel *c, struct syscall
flags = ihk_ikc_spinlock_lock(&usrdata->per_proc_list_lock);
list_for_each_entry(ppd_iter, &usrdata->per_proc_list, list) {
if (ppd_iter->pid == current->tgid) {
if (ppd_iter->pid == task_tgid_vnr(current)) {
ppd = ppd_iter;
break;
}
@ -1536,13 +1689,13 @@ int __do_in_kernel_syscall(ihk_os_t os, struct mcctrl_channel *c, struct syscall
list_del(&ppd->list);
dprintk("pid: %d, tid: %d: rpgtable for %d (0x%lx) removed\n",
current->tgid, current->pid, ppd->pid, ppd->rpgtable);
task_tgid_vnr(current), current->pid, ppd->pid, ppd->rpgtable);
kfree(ppd);
}
else {
printk("WARNING: no per process data for pid %d ?\n",
current->tgid);
task_tgid_vnr(current));
}
ihk_ikc_spinlock_unlock(&usrdata->per_proc_list_lock, flags);
@ -1556,6 +1709,71 @@ int __do_in_kernel_syscall(ihk_os_t os, struct mcctrl_channel *c, struct syscall
error = writecore(os, sc->args[1], sc->args[0]);
ret = 0;
break;
case __NR_sched_setparam: {
switch (sc->args[0]) {
case SCHED_CHECK_SAME_OWNER: {
const struct cred *cred = current_cred();
const struct cred *pcred;
bool match;
struct task_struct *p;
int pid = sc->args[1];
rcu_read_lock();
p = pid_task(find_get_pid(pid), PIDTYPE_PID);
if (!p) {
rcu_read_unlock();
ret = -ESRCH;
goto sched_setparam_out;
}
rcu_read_unlock();
rcu_read_lock();
pcred = __task_cred(p);
#if LINUX_VERSION_CODE > KERNEL_VERSION(3,4,0)
match = (uid_eq(cred->euid, pcred->euid) ||
uid_eq(cred->euid, pcred->uid));
#else
match = ((cred->euid == pcred->euid) ||
(cred->euid == pcred->uid));
#endif
rcu_read_unlock();
if (match) {
ret = 0;
}
else {
ret = -EPERM;
}
break;
}
case SCHED_CHECK_ROOT: {
const struct cred *cred = current_cred();
bool match;
#if LINUX_VERSION_CODE > KERNEL_VERSION(3,4,0)
match = uid_eq(cred->euid, GLOBAL_ROOT_UID);
#else
match = (cred->euid == 0);
#endif
if (match) {
ret = 0;
}
else {
ret = -EPERM;
}
break;
}
}
sched_setparam_out:
break;
}
default:
error = -ENOSYS;

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,73 @@
/**
* \file sysfs.h
* License details are found in the file LICENSE.
* \brief
* sysfs framework API definitions
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com> \par
* Copyright (C) 2016 RIKEN AICS
*/
/*
* HISTORY:
*/
#ifndef MCCTRL_SYSFS_H
#define MCCTRL_SYSFS_H

/* Maximum length of a sysfs path handled by this framework. */
#define SYSFS_PATH_MAX 1024

/* for sysfs_unlinkf(): keep parent directories when removing an entry */
#define SYSFS_UNLINK_KEEP_ANCESTOR 0x01

/*
 * Callbacks backing one sysfs file.  show/store mirror the kernel's
 * attribute read/write hooks; release is called when the entry goes away.
 * NOTE(review): ownership of 'instance' appears to rest with the caller
 * until release() fires -- confirm against the sysfs.c implementation.
 */
struct sysfsm_ops {
	ssize_t (*show)(struct sysfsm_ops *ops, void *instance, void *buf,
			size_t bufsize);
	ssize_t (*store)(struct sysfsm_ops *ops, void *instance,
			const void *buf, size_t bufsize);
	void (*release)(struct sysfsm_ops *ops, void *instance);
};

/* Opaque handle identifying a created sysfs object (directory or file). */
struct sysfs_handle {
	long handle;
};
typedef struct sysfs_handle sysfs_handle_t;

/* Parameter block for bitmap-type snooping entries (pb/pbl formats). */
struct sysfsm_bitmap_param {
	int nbits;
	int padding;
	void *ptr;
};
#define SYSFS_SPECIAL_OPS_MIN ((void *)1)
#define SYSFS_SPECIAL_OPS_MAX ((void *)1000)
#define SYSFS_SNOOPING_OPS_d32 ((void *)1)
#define SYSFS_SNOOPING_OPS_d64 ((void *)2)
#define SYSFS_SNOOPING_OPS_u32 ((void *)3)
#define SYSFS_SNOOPING_OPS_u64 ((void *)4)
#define SYSFS_SNOOPING_OPS_s ((void *)5)
#define SYSFS_SNOOPING_OPS_pbl ((void *)6)
#define SYSFS_SNOOPING_OPS_pb ((void *)7)
#define SYSFS_SNOOPING_OPS_u32K ((void *)8)

/*
 * Return non-zero when 'ops' is not a real struct sysfsm_ops pointer but
 * one of the small-integer sentinel tags above (the SYSFS_SNOOPING_OPS_*
 * values live inside [SYSFS_SPECIAL_OPS_MIN, SYSFS_SPECIAL_OPS_MAX]).
 */
static inline int is_special_sysfs_ops(void *ops)
{
	long tag = (long)ops;

	return (tag >= (long)SYSFS_SPECIAL_OPS_MIN) &&
	       (tag <= (long)SYSFS_SPECIAL_OPS_MAX);
}
/* Create a sysfs file; fmt/... build the path. Returns 0 or -errno. */
extern int sysfsm_createf(ihk_os_t os, struct sysfsm_ops *ops, void *instance,
		int mode, const char *fmt, ...);
/* Create a directory; resulting handle is stored through dirhp. */
extern int sysfsm_mkdirf(ihk_os_t os, sysfs_handle_t *dirhp,
		const char *fmt, ...);
/* Create a symlink pointing at the object identified by targeth. */
extern int sysfsm_symlinkf(ihk_os_t os, sysfs_handle_t targeth,
		const char *fmt, ...);
/* Look up an existing object by path; handle returned through objhp. */
extern int sysfsm_lookupf(ihk_os_t os, sysfs_handle_t *objhp,
		const char *fmt, ...);
/* Remove an entry; flags may include SYSFS_UNLINK_KEEP_ANCESTOR. */
extern int sysfsm_unlinkf(ihk_os_t os, int flags, const char *fmt, ...);
/* Tear down all sysfs state associated with the given OS instance. */
extern void sysfsm_cleanup(ihk_os_t os);
/* IKC packet handler entry point for sysfs requests from McKernel. */
extern void sysfsm_packet_handler(void *os, int msg, int err, long arg1,
		long arg2);

#endif /* MCCTRL_SYSFS_H */

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,88 @@
/**
* \file sysfs_msg.h
* License details are found in the file LICENSE.
* \brief
* message declarations for sysfs framework
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com> \par
* Copyright (C) 2015 RIKEN AICS
*/
/*
* HISTORY:
*/
#ifndef MCKERNEL_SYSFS_MSG_H
#define MCKERNEL_SYSFS_MSG_H

/*
 * Request/response parameter blocks exchanged between the Linux side
 * (mcctrl) and McKernel for the sysfs framework.  Layouts must match on
 * both sides.  NOTE(review): SYSFS_PATH_MAX and the SYSFS_SPECIAL_OPS_* /
 * SYSFS_SNOOPING_OPS_* constants are duplicated in mcctrl's sysfs.h --
 * keep the two headers in sync.
 */
#define SYSFS_PATH_MAX 1024

struct sysfs_req_create_param {
	int mode;			/* file mode bits for the new entry */
	int error;			/* out: 0 or -errno */
	long client_ops;		/* ops pointer or special-ops tag */
	long client_instance;		/* opaque per-entry cookie */
	char path[SYSFS_PATH_MAX];
	int padding;
	int busy;			/* presumably a completion flag polled
					 * by the requester -- TODO confirm */
}; /* struct sysfs_req_create_param */

/* Sentinel range: client_ops values in [MIN,MAX] are tags, not pointers. */
#define SYSFS_SPECIAL_OPS_MIN ((void *)1)
#define SYSFS_SPECIAL_OPS_MAX ((void *)1000)
#define SYSFS_SNOOPING_OPS_d32 ((void *)1)
#define SYSFS_SNOOPING_OPS_d64 ((void *)2)
#define SYSFS_SNOOPING_OPS_u32 ((void *)3)
#define SYSFS_SNOOPING_OPS_u64 ((void *)4)
#define SYSFS_SNOOPING_OPS_s ((void *)5)
#define SYSFS_SNOOPING_OPS_pbl ((void *)6)
#define SYSFS_SNOOPING_OPS_pb ((void *)7)
#define SYSFS_SNOOPING_OPS_u32K ((void *)8)

struct sysfs_req_mkdir_param {
	int error;			/* out: 0 or -errno */
	int padding;
	long handle;			/* out: handle of created directory */
	char path[SYSFS_PATH_MAX];
	int padding2;
	int busy;
}; /* struct sysfs_req_mkdir_param */

struct sysfs_req_symlink_param {
	int error;			/* out: 0 or -errno */
	int padding;
	long target;			/* handle of the link target */
	char path[SYSFS_PATH_MAX];
	int padding2;
	int busy;
}; /* struct sysfs_req_symlink_param */

struct sysfs_req_lookup_param {
	int error;			/* out: 0 or -errno */
	int padding;
	long handle;			/* out: handle of found object */
	char path[SYSFS_PATH_MAX];
	int padding2;
	int busy;
}; /* struct sysfs_req_lookup_param */

/* for sysfs_req_unlink_param.flags */
#define SYSFS_UNLINK_KEEP_ANCESTOR 0x01

struct sysfs_req_unlink_param {
	int flags;			/* SYSFS_UNLINK_* flags */
	int error;			/* out: 0 or -errno */
	char path[SYSFS_PATH_MAX];
	int padding;
	int busy;
}; /* struct sysfs_req_unlink_param */

struct sysfs_req_setup_param {
	int error;			/* out: 0 or -errno */
	int padding;
	long buf_rpa;			/* remote physical addr of shared buf */
	long bufsize;
	char padding3[SYSFS_PATH_MAX];	/* keeps layout parallel to others */
	int padding2;
	int busy;
}; /* struct sysfs_req_setup_param */

#endif /* MCKERNEL_SYSFS_MSG_H */

View File

@ -0,0 +1,39 @@
# Kbuild wrapper for the mcoverlay (mcoverlayfs) kernel module.
# @...@ tokens are substituted by autoconf.
KDIR ?= @KDIR@
ARCH ?= @ARCH@
KMODDIR=@KMODDIR@
src = @abs_srcdir@
ENABLE_MCOVERLAYFS=@ENABLE_MCOVERLAYFS@

# Decompose `uname -r` into numeric version parts.
RELEASE=$(shell uname -r)
MAJOR=$(shell echo ${RELEASE} | sed -e 's/^\([0-9]*\).*/\1/')
MINOR=$(shell echo ${RELEASE} | sed -e 's/^[0-9]*.\([0-9]*\).*/\1/')
PATCH=$(shell echo ${RELEASE} | sed -e 's/^[0-9]*.[0-9]*.\([0-9]*\).*/\1/')
LINUX_VERSION_CODE=$(shell expr \( ${MAJOR} \* 65536 \) + \( ${MINOR} \* 256 \) + ${PATCH})
# RHEL_RELEASE becomes empty when the kernel release has no "-NNN" suffix.
# NOTE(review): `==` inside [ ] is a bashism; POSIX sh wants `=`.
RHEL_RELEASE=$(shell echo ${RELEASE} | sed -e 's/^[0-9]*.[0-9]*.[0-9]*-\([0-9]*\).*/\1/')
RHEL_RELEASE=$(shell if [ "${RELEASE}" == "${RHEL_RELEASE}" ]; then echo ""; else echo ${RHEL_RELEASE}; fi)

# Build only on Linux 4.0.x: 262144 = KERNEL_VERSION(4,0,0),
# 262400 = KERNEL_VERSION(4,1,0).
ifeq ($(ENABLE_MCOVERLAYFS),yes)
ENABLE_BUILD=$(shell if ( [ ${LINUX_VERSION_CODE} -ge 262144 ] && [ ${LINUX_VERSION_CODE} -lt 262400 ] ); then echo "yes"; else echo "no"; fi)
else
ENABLE_BUILD=no
endif

obj-m += mcoverlay.o
mcoverlay-y := copy_up.o dir.o inode.o readdir.o super.o

.PHONY: clean install modules
modules:
ifeq ($(ENABLE_BUILD),yes)
	$(MAKE) -C $(KDIR) M=$(PWD) SUBDIRS=$(PWD) ARCH=$(ARCH) modules
endif
clean:
	$(RM) .*.cmd *.mod.c *.o *.ko* Module.symvers modules.order -r .tmp*
install:
ifeq ($(ENABLE_BUILD),yes)
	mkdir -p -m 755 $(KMODDIR)
	install -m 644 mcoverlay.ko $(KMODDIR)
endif

View File

@ -0,0 +1,416 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/splice.h>
#include <linux/xattr.h>
#include <linux/security.h>
#include <linux/uaccess.h>
#include <linux/sched.h>
#include <linux/namei.h>
#include "overlayfs.h"
#define OVL_COPY_UP_CHUNK_SIZE (1 << 20)
/*
 * Copy every extended attribute from 'old' to 'new'.
 * Returns 0 on success (also when either fs lacks xattr support, or the
 * lower fs reports -EOPNOTSUPP), otherwise a negative errno.
 */
int ovl_copy_xattr(struct dentry *old, struct dentry *new)
{
	ssize_t list_size, size;
	char *buf, *name, *value;
	int error;

	/* No xattr support on either side: nothing to copy. */
	if (!old->d_inode->i_op->getxattr ||
	    !new->d_inode->i_op->getxattr)
		return 0;

	/* First call sizes the name list. */
	list_size = vfs_listxattr(old, NULL, 0);
	if (list_size <= 0) {
		if (list_size == -EOPNOTSUPP)
			return 0;
		return list_size;
	}

	buf = kzalloc(list_size, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	error = -ENOMEM;
	value = kmalloc(XATTR_SIZE_MAX, GFP_KERNEL);
	if (!value)
		goto out;

	list_size = vfs_listxattr(old, buf, list_size);
	if (list_size <= 0) {
		error = list_size;
		goto out_free_value;
	}

	/* buf holds NUL-separated names; walk them one by one. */
	for (name = buf; name < (buf + list_size); name += strlen(name) + 1) {
		size = vfs_getxattr(old, name, value, XATTR_SIZE_MAX);
		if (size <= 0) {
			error = size;
			goto out_free_value;
		}
		error = vfs_setxattr(new, name, value, size, 0);
		if (error)
			goto out_free_value;
	}

out_free_value:
	kfree(value);
out:
	kfree(buf);
	return error;
}
/*
 * Copy 'len' bytes of file data from the lower file at 'old' to the newly
 * created upper file at 'new', splicing in 1 MiB chunks.  Interruptible
 * by fatal signals.  Returns 0 or a negative errno.
 */
static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
{
	struct file *old_file;
	struct file *new_file;
	loff_t old_pos = 0;
	loff_t new_pos = 0;
	int error = 0;

	if (len == 0)
		return 0;

	old_file = ovl_path_open(old, O_RDONLY);
	if (IS_ERR(old_file))
		return PTR_ERR(old_file);

	new_file = ovl_path_open(new, O_WRONLY);
	if (IS_ERR(new_file)) {
		error = PTR_ERR(new_file);
		goto out_fput;
	}

	/* FIXME: copy up sparse files efficiently */
	while (len) {
		size_t this_len = OVL_COPY_UP_CHUNK_SIZE;
		long bytes;

		if (len < this_len)
			this_len = len;

		/* Only a fatal (SIGKILL-class) signal aborts the copy. */
		if (signal_pending_state(TASK_KILLABLE, current)) {
			error = -EINTR;
			break;
		}

		bytes = do_splice_direct(old_file, &old_pos,
				new_file, &new_pos,
				this_len, SPLICE_F_MOVE);
		if (bytes <= 0) {
			error = bytes;
			break;
		}
		WARN_ON(old_pos != new_pos);

		len -= bytes;
	}

	fput(new_file);
out_fput:
	fput(old_file);
	return error;
}
/*
 * Read the target of the symlink at 'realdentry' into a freshly allocated
 * page.  Returns the NUL-terminated target string (caller frees with
 * free_page()) or ERR_PTR(-errno).
 */
static char *ovl_read_symlink(struct dentry *realdentry)
{
	int res;
	char *buf;
	struct inode *inode = realdentry->d_inode;
	mm_segment_t old_fs;

	res = -EINVAL;
	if (!inode->i_op->readlink)
		goto err;

	res = -ENOMEM;
	buf = (char *) __get_free_page(GFP_KERNEL);
	if (!buf)
		goto err;

	/* ->readlink() wants a __user pointer; widen the address limit so
	 * a kernel buffer is acceptable for the duration of the call. */
	old_fs = get_fs();
	set_fs(get_ds());
	/* The cast to a user pointer is valid due to the set_fs() */
	res = inode->i_op->readlink(realdentry,
			(char __user *)buf, PAGE_SIZE - 1);
	set_fs(old_fs);
	if (res < 0) {
		free_page((unsigned long) buf);
		goto err;
	}
	buf[res] = '\0';

	return buf;

err:
	return ERR_PTR(res);
}
/* Copy atime/mtime from 'stat' onto the upper dentry. */
static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
{
	struct iattr attr = {
		.ia_valid =
		     ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET,
		.ia_atime = stat->atime,
		.ia_mtime = stat->mtime,
	};

	return notify_change(upperdentry, &attr, NULL);
}

/*
 * Apply mode, ownership and timestamps from 'stat' to the upper dentry.
 * Mode is skipped for symlinks (they have no meaningful mode).  A failure
 * from the timestamp update is deliberately not propagated.
 */
int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
{
	int err = 0;

	if (!S_ISLNK(stat->mode)) {
		struct iattr attr = {
			.ia_valid = ATTR_MODE,
			.ia_mode = stat->mode,
		};
		err = notify_change(upperdentry, &attr, NULL);
	}
	if (!err) {
		struct iattr attr = {
			.ia_valid = ATTR_UID | ATTR_GID,
			.ia_uid = stat->uid,
			.ia_gid = stat->gid,
		};
		err = notify_change(upperdentry, &attr, NULL);
	}
	if (!err)
		ovl_set_timestamps(upperdentry, stat);

	return err;
}
/*
 * Perform one copy-up with workdir+upperdir rename lock already held:
 * create a temp entry in workdir, fill it (data, xattrs, attrs), then
 * rename it atomically into place under upperdir.  On any failure after
 * creation the temp entry is cleaned up.  Returns 0 or -errno.
 */
static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
			      struct dentry *dentry, struct path *lowerpath,
			      struct kstat *stat, struct iattr *attr,
			      const char *link)
{
	struct inode *wdir = workdir->d_inode;
	struct inode *udir = upperdir->d_inode;
	struct dentry *newdentry = NULL;
	struct dentry *upper = NULL;
	umode_t mode = stat->mode;
	int err;

	newdentry = ovl_lookup_temp(workdir, dentry);
	err = PTR_ERR(newdentry);
	if (IS_ERR(newdentry))
		goto out;

	upper = lookup_one_len(dentry->d_name.name, upperdir,
			       dentry->d_name.len);
	err = PTR_ERR(upper);
	if (IS_ERR(upper))
		goto out1;

	/* Can't properly set mode on creation because of the umask */
	stat->mode &= S_IFMT;
	err = ovl_create_real(wdir, newdentry, stat, link, NULL, true);
	stat->mode = mode;
	if (err)
		goto out2;

	if (S_ISREG(stat->mode)) {
		struct path upperpath;
		ovl_path_upper(dentry, &upperpath);
		/* Caller guarantees there is no upper dentry yet. */
		BUG_ON(upperpath.dentry != NULL);
		upperpath.dentry = newdentry;

		err = ovl_copy_up_data(lowerpath, &upperpath, stat->size);
		if (err)
			goto out_cleanup;
	}

	err = ovl_copy_xattr(lowerpath->dentry, newdentry);
	if (err)
		goto out_cleanup;

	mutex_lock(&newdentry->d_inode->i_mutex);
	err = ovl_set_attr(newdentry, stat);
	/* Also apply a pending setattr (attr) from the caller, if any. */
	if (!err && attr)
		err = notify_change(newdentry, attr, NULL);
	mutex_unlock(&newdentry->d_inode->i_mutex);
	if (err)
		goto out_cleanup;

	/* Atomically move the fully prepared temp entry into upperdir. */
	err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
	if (err)
		goto out_cleanup;

	ovl_dentry_update(dentry, newdentry);
	newdentry = NULL;

	/*
	 * Non-directores become opaque when copied up.
	 */
	if (!S_ISDIR(stat->mode))
		ovl_dentry_set_opaque(dentry, true);

out2:
	dput(upper);
out1:
	dput(newdentry);
out:
	return err;

out_cleanup:
	ovl_cleanup(wdir, newdentry);
	goto out;
}
/*
* Copy up a single dentry
*
* Directory renames only allowed on "pure upper" (already created on
* upper filesystem, never copied up). Directories which are on lower or
* are merged may not be renamed. For these -EXDEV is returned and
* userspace has to deal with it. This means, when copying up a
* directory we can rely on it and ancestors being stable.
*
* Non-directory renames start with copy up of source if necessary. The
* actual rename will only proceed once the copy up was successful. Copy
* up uses upper parent i_mutex for exclusion. Since rename can change
* d_parent it is possible that the copy up will lock the old parent. At
* that point the file will have already been copied up anyway.
*/
/*
 * Copy up a single dentry whose parent is already on the upper layer.
 * Runs with elevated (overridden) credentials so xattr/chown/chmod on the
 * upper fs succeed regardless of the caller.  Safe against races with a
 * concurrent copy-up of the same dentry.  Returns 0 or -errno.
 */
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
		    struct path *lowerpath, struct kstat *stat,
		    struct iattr *attr)
{
	struct dentry *workdir = ovl_workdir(dentry);
	int err;
	struct kstat pstat;
	struct path parentpath;
	struct dentry *upperdir;
	struct dentry *upperdentry;
	const struct cred *old_cred;
	struct cred *override_cred;
	char *link = NULL;

	if (WARN_ON(!workdir))
		return -EROFS;

	ovl_path_upper(parent, &parentpath);
	upperdir = parentpath.dentry;

	/* Parent timestamps are restored after the copy-up (see below). */
	err = vfs_getattr(&parentpath, &pstat);
	if (err)
		return err;

	if (S_ISLNK(stat->mode)) {
		link = ovl_read_symlink(lowerpath->dentry);
		if (IS_ERR(link))
			return PTR_ERR(link);
	}

	err = -ENOMEM;
	override_cred = prepare_creds();
	if (!override_cred)
		goto out_free_link;

	override_cred->fsuid = stat->uid;
	override_cred->fsgid = stat->gid;
	/*
	 * CAP_SYS_ADMIN for copying up extended attributes
	 * CAP_DAC_OVERRIDE for create
	 * CAP_FOWNER for chmod, timestamp update
	 * CAP_FSETID for chmod
	 * CAP_CHOWN for chown
	 * CAP_MKNOD for mknod
	 */
	cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
	cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
	cap_raise(override_cred->cap_effective, CAP_FOWNER);
	cap_raise(override_cred->cap_effective, CAP_FSETID);
	cap_raise(override_cred->cap_effective, CAP_CHOWN);
	cap_raise(override_cred->cap_effective, CAP_MKNOD);
	old_cred = override_creds(override_cred);

	err = -EIO;
	if (lock_rename(workdir, upperdir) != NULL) {
		pr_err("overlayfs: failed to lock workdir+upperdir\n");
		goto out_unlock;
	}
	upperdentry = ovl_dentry_upper(dentry);
	if (upperdentry) {
		unlock_rename(workdir, upperdir);
		err = 0;
		/* Raced with another copy-up?  Do the setattr here */
		if (attr) {
			mutex_lock(&upperdentry->d_inode->i_mutex);
			err = notify_change(upperdentry, attr, NULL);
			mutex_unlock(&upperdentry->d_inode->i_mutex);
		}
		goto out_put_cred;
	}

	err = ovl_copy_up_locked(workdir, upperdir, dentry, lowerpath,
				 stat, attr, link);
	if (!err) {
		/* Restore timestamps on parent (best effort) */
		ovl_set_timestamps(upperdir, &pstat);
	}
out_unlock:
	unlock_rename(workdir, upperdir);
out_put_cred:
	revert_creds(old_cred);
	put_cred(override_cred);

out_free_link:
	if (link)
		free_page((unsigned long) link);

	return err;
}
/*
 * Copy up 'dentry' and any ancestors that are not yet on the upper layer.
 * Each loop iteration copies up the topmost not-yet-copied ancestor, so
 * ancestors are materialized top-down until dentry itself is upper.
 * Returns 0 or the first error encountered.
 */
int ovl_copy_up(struct dentry *dentry)
{
	int err;

	err = 0;
	while (!err) {
		struct dentry *next;
		struct dentry *parent;
		struct path lowerpath;
		struct kstat stat;
		enum ovl_path_type type = ovl_path_type(dentry);

		/* Done once dentry itself has an upper counterpart. */
		if (OVL_TYPE_UPPER(type))
			break;

		next = dget(dentry);
		/* find the topmost dentry not yet copied up */
		for (;;) {
			parent = dget_parent(next);

			type = ovl_path_type(parent);
			if (OVL_TYPE_UPPER(type))
				break;

			dput(next);
			next = parent;
		}

		ovl_path_lower(next, &lowerpath);
		err = vfs_getattr(&lowerpath, &stat);
		if (!err)
			err = ovl_copy_up_one(parent, next, &lowerpath, &stat, NULL);

		dput(parent);
		dput(next);
	}

	return err;
}

View File

@ -0,0 +1,951 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/xattr.h>
#include <linux/security.h>
#include <linux/cred.h>
#include "overlayfs.h"
/*
 * Remove a work-dir entry (rmdir for dirs, unlink otherwise).  Failure is
 * only logged: a leftover entry in workdir is harmless but noteworthy.
 */
void ovl_cleanup(struct inode *wdir, struct dentry *wdentry)
{
	int err;

	dget(wdentry);
	if (d_is_dir(wdentry))
		err = ovl_do_rmdir(wdir, wdentry);
	else
		err = ovl_do_unlink(wdir, wdentry);
	dput(wdentry);

	if (err) {
		pr_err("overlayfs: cleanup of '%pd2' failed (%i)\n",
		       wdentry, err);
	}
}

/*
 * Allocate a unique temporary name in workdir, derived from the dentry's
 * kernel address.  Fails with -EIO if the name unexpectedly exists.
 */
struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry)
{
	struct dentry *temp;
	char name[20];

	snprintf(name, sizeof(name), "#%lx", (unsigned long) dentry);

	temp = lookup_one_len(name, workdir, strlen(name));
	if (!IS_ERR(temp) && temp->d_inode) {
		pr_err("overlayfs: workdir/%s already exists\n", name);
		dput(temp);
		temp = ERR_PTR(-EIO);
	}

	return temp;
}
/* caller holds i_mutex on workdir */
/*
 * Create a whiteout entry under a temporary name in workdir; the caller
 * later renames it over the entry being deleted.  Returns the whiteout
 * dentry or ERR_PTR(-errno).
 */
static struct dentry *ovl_whiteout(struct dentry *workdir,
				   struct dentry *dentry)
{
	int err;
	struct dentry *whiteout;
	struct inode *wdir = workdir->d_inode;

	whiteout = ovl_lookup_temp(workdir, dentry);
	if (IS_ERR(whiteout))
		return whiteout;

	err = ovl_do_whiteout(wdir, whiteout);
	if (err) {
		dput(whiteout);
		whiteout = ERR_PTR(err);
	}

	return whiteout;
}
/*
 * Create a real filesystem object at 'newdentry' in directory 'dir':
 * either a hard link to 'hardlink', or a new object whose type is taken
 * from stat->mode (regular/dir/device/fifo/socket/symlink).  'debug'
 * selects the ovl_do_* debug-logging wrappers.  Returns 0 or -errno.
 */
int ovl_create_real(struct inode *dir, struct dentry *newdentry,
		    struct kstat *stat, const char *link,
		    struct dentry *hardlink, bool debug)
{
	int err;

	/* The dentry must be negative; an existing inode means we raced. */
	if (newdentry->d_inode)
		return -ESTALE;

	if (hardlink) {
		err = ovl_do_link(hardlink, dir, newdentry, debug);
	} else {
		switch (stat->mode & S_IFMT) {
		case S_IFREG:
			err = ovl_do_create(dir, newdentry, stat->mode, debug);
			break;

		case S_IFDIR:
			err = ovl_do_mkdir(dir, newdentry, stat->mode, debug);
			break;

		case S_IFCHR:
		case S_IFBLK:
		case S_IFIFO:
		case S_IFSOCK:
			err = ovl_do_mknod(dir, newdentry,
					   stat->mode, stat->rdev, debug);
			break;

		case S_IFLNK:
			err = ovl_do_symlink(dir, newdentry, link, debug);
			break;

		default:
			err = -EPERM;
		}
	}
	if (!err && WARN_ON(!newdentry->d_inode)) {
		/*
		 * Not quite sure if non-instantiated dentry is legal or not.
		 * VFS doesn't seem to care so check and warn here.
		 */
		err = -ENOENT;
	}
	return err;
}
/* Mark a directory opaque: lower-layer contents are no longer visible. */
static int ovl_set_opaque(struct dentry *upperdentry)
{
	return ovl_do_setxattr(upperdentry, OVL_XATTR_OPAQUE, "y", 1, 0);
}

/* Drop the opaque marker; a failure is logged but not propagated. */
static void ovl_remove_opaque(struct dentry *upperdentry)
{
	int err;

	err = ovl_do_removexattr(upperdentry, OVL_XATTR_OPAQUE);
	if (err) {
		pr_warn("overlayfs: failed to remove opaque from '%s' (%i)\n",
			upperdentry->d_name.name, err);
	}
}
/*
 * getattr for overlay directories: take attributes from the real (upper
 * or lower) dir, but report the overlay's own device and inode numbers.
 */
static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
			   struct kstat *stat)
{
	int err;
	enum ovl_path_type type;
	struct path realpath;

	type = ovl_path_real(dentry, &realpath);
	err = vfs_getattr(&realpath, stat);
	if (err)
		return err;

	stat->dev = dentry->d_sb->s_dev;
	stat->ino = dentry->d_inode->i_ino;

	/*
	 * It's probably not worth it to count subdirs to get the
	 * correct link count.  nlink=1 seems to pacify 'find' and
	 * other utilities.
	 */
	if (OVL_TYPE_MERGE(type))
		stat->nlink = 1;

	return 0;
}
/*
 * Fast-path create: the target is not whited out, so the object can be
 * made directly in the upper directory (no workdir dance needed).
 * On success the overlay dentry is instantiated with 'inode'.
 */
static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
			    struct kstat *stat, const char *link,
			    struct dentry *hardlink)
{
	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
	struct inode *udir = upperdir->d_inode;
	struct dentry *newdentry;
	int err;

	mutex_lock_nested(&udir->i_mutex, I_MUTEX_PARENT);
	newdentry = lookup_one_len(dentry->d_name.name, upperdir,
				   dentry->d_name.len);
	err = PTR_ERR(newdentry);
	if (IS_ERR(newdentry))
		goto out_unlock;
	err = ovl_create_real(udir, newdentry, stat, link, hardlink, false);
	if (err)
		goto out_dput;

	/* Invalidate parent's readdir cache and publish the new upper. */
	ovl_dentry_version_inc(dentry->d_parent);
	ovl_dentry_update(dentry, newdentry);
	ovl_copyattr(newdentry->d_inode, inode);
	d_instantiate(dentry, inode);
	newdentry = NULL;
out_dput:
	dput(newdentry);
out_unlock:
	mutex_unlock(&udir->i_mutex);
	return err;
}
/*
 * Take the cross-directory rename lock on workdir+upperdir, rejecting the
 * degenerate configurations (same dir, or one a descendant of the other,
 * which lock_rename() signals by returning a non-NULL ancestor).
 * Returns 0 with the lock held, or -EIO.
 */
static int ovl_lock_rename_workdir(struct dentry *workdir,
				   struct dentry *upperdir)
{
	/* Workdir should not be the same as upperdir */
	if (workdir == upperdir)
		goto err;

	/* Workdir should not be subdir of upperdir and vice versa */
	if (lock_rename(workdir, upperdir) != NULL)
		goto err_unlock;

	return 0;

err_unlock:
	unlock_rename(workdir, upperdir);
err:
	pr_err("overlayfs: failed to lock workdir+upperdir\n");
	return -EIO;
}
/*
 * Replace the upper directory of 'dentry' with a freshly created opaque
 * directory that carries the same attributes/xattrs, atomically via
 * RENAME_EXCHANGE.  'list' holds the whiteouts found while checking
 * emptiness; they are cleaned out of the displaced dir afterwards.
 * Returns the new opaque dir dentry or ERR_PTR(-errno).
 */
static struct dentry *ovl_clear_empty(struct dentry *dentry,
				      struct list_head *list)
{
	struct dentry *workdir = ovl_workdir(dentry);
	struct inode *wdir = workdir->d_inode;
	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
	struct inode *udir = upperdir->d_inode;
	struct path upperpath;
	struct dentry *upper;
	struct dentry *opaquedir;
	struct kstat stat;
	int err;

	if (WARN_ON(!workdir))
		return ERR_PTR(-EROFS);

	err = ovl_lock_rename_workdir(workdir, upperdir);
	if (err)
		goto out;

	ovl_path_upper(dentry, &upperpath);
	err = vfs_getattr(&upperpath, &stat);
	if (err)
		goto out_unlock;

	err = -ESTALE;
	if (!S_ISDIR(stat.mode))
		goto out_unlock;
	upper = upperpath.dentry;
	/* Bail if the upper entry moved out from under us. */
	if (upper->d_parent->d_inode != udir)
		goto out_unlock;

	opaquedir = ovl_lookup_temp(workdir, dentry);
	err = PTR_ERR(opaquedir);
	if (IS_ERR(opaquedir))
		goto out_unlock;

	err = ovl_create_real(wdir, opaquedir, &stat, NULL, NULL, true);
	if (err)
		goto out_dput;

	err = ovl_copy_xattr(upper, opaquedir);
	if (err)
		goto out_cleanup;

	err = ovl_set_opaque(opaquedir);
	if (err)
		goto out_cleanup;

	mutex_lock(&opaquedir->d_inode->i_mutex);
	err = ovl_set_attr(opaquedir, &stat);
	mutex_unlock(&opaquedir->d_inode->i_mutex);
	if (err)
		goto out_cleanup;

	/* Swap old upper dir and the prepared opaque dir in one step. */
	err = ovl_do_rename(wdir, opaquedir, udir, upper, RENAME_EXCHANGE);
	if (err)
		goto out_cleanup;

	ovl_cleanup_whiteouts(upper, list);
	ovl_cleanup(wdir, upper);
	unlock_rename(workdir, upperdir);

	/* dentry's upper doesn't match now, get rid of it */
	d_drop(dentry);

	return opaquedir;

out_cleanup:
	ovl_cleanup(wdir, opaquedir);
out_dput:
	dput(opaquedir);
out_unlock:
	unlock_rename(workdir, upperdir);
out:
	return ERR_PTR(err);
}
/*
 * Verify the merged directory is empty (modulo whiteouts) and, when it
 * has an upper component, replace that with a clean opaque directory.
 * Returns the opaque dir, NULL (empty, no upper), or ERR_PTR(-errno).
 */
static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry)
{
	int err;
	struct dentry *ret = NULL;
	LIST_HEAD(list);

	err = ovl_check_empty_dir(dentry, &list);
	if (err)
		ret = ERR_PTR(err);
	else {
		/*
		 * If no upperdentry then skip clearing whiteouts.
		 *
		 * Can race with copy-up, since we don't hold the upperdir
		 * mutex.  Doesn't matter, since copy-up can't create a
		 * non-empty directory from an empty one.
		 */
		if (ovl_dentry_upper(dentry))
			ret = ovl_clear_empty(dentry, &list);
	}
	ovl_cache_free(&list);

	return ret;
}
/*
 * Create over an existing whiteout: build the object in workdir first,
 * then rename it over the whiteout.  New directories are made opaque and
 * use RENAME_EXCHANGE so the displaced whiteout can be cleaned up.
 * On success the overlay dentry is instantiated with 'inode'.
 */
static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
				    struct kstat *stat, const char *link,
				    struct dentry *hardlink)
{
	struct dentry *workdir = ovl_workdir(dentry);
	struct inode *wdir = workdir->d_inode;
	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
	struct inode *udir = upperdir->d_inode;
	struct dentry *upper;
	struct dentry *newdentry;
	int err;

	if (WARN_ON(!workdir))
		return -EROFS;

	err = ovl_lock_rename_workdir(workdir, upperdir);
	if (err)
		goto out;

	newdentry = ovl_lookup_temp(workdir, dentry);
	err = PTR_ERR(newdentry);
	if (IS_ERR(newdentry))
		goto out_unlock;

	upper = lookup_one_len(dentry->d_name.name, upperdir,
			       dentry->d_name.len);
	err = PTR_ERR(upper);
	if (IS_ERR(upper))
		goto out_dput;

	err = ovl_create_real(wdir, newdentry, stat, link, hardlink, true);
	if (err)
		goto out_dput2;

	if (S_ISDIR(stat->mode)) {
		err = ovl_set_opaque(newdentry);
		if (err)
			goto out_cleanup;

		err = ovl_do_rename(wdir, newdentry, udir, upper,
				    RENAME_EXCHANGE);
		if (err)
			goto out_cleanup;

		/* The whiteout is now in workdir; remove it. */
		ovl_cleanup(wdir, upper);
	} else {
		err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
		if (err)
			goto out_cleanup;
	}
	ovl_dentry_version_inc(dentry->d_parent);
	ovl_dentry_update(dentry, newdentry);
	ovl_copyattr(newdentry->d_inode, inode);
	d_instantiate(dentry, inode);
	newdentry = NULL;
out_dput2:
	dput(upper);
out_dput:
	dput(newdentry);
out_unlock:
	unlock_rename(workdir, upperdir);
out:
	return err;

out_cleanup:
	ovl_cleanup(wdir, newdentry);
	goto out_dput2;
}
/*
 * Common path for create/mkdir/mknod/symlink/link: copy up the parent,
 * then create either directly in upperdir or (if the target is whited
 * out) via the workdir, with raised capabilities for the latter.
 */
static int ovl_create_or_link(struct dentry *dentry, int mode, dev_t rdev,
			      const char *link, struct dentry *hardlink)
{
	int err;
	struct inode *inode;
	struct kstat stat = {
		.mode = mode,
		.rdev = rdev,
	};

	err = -ENOMEM;
	inode = ovl_new_inode(dentry->d_sb, mode, dentry->d_fsdata);
	if (!inode)
		goto out;

	err = ovl_copy_up(dentry->d_parent);
	if (err)
		goto out_iput;

	if (!ovl_dentry_is_opaque(dentry)) {
		err = ovl_create_upper(dentry, inode, &stat, link, hardlink);
	} else {
		const struct cred *old_cred;
		struct cred *override_cred;

		err = -ENOMEM;
		override_cred = prepare_creds();
		if (!override_cred)
			goto out_iput;

		/*
		 * CAP_SYS_ADMIN for setting opaque xattr
		 * CAP_DAC_OVERRIDE for create in workdir, rename
		 * CAP_FOWNER for removing whiteout from sticky dir
		 */
		cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
		cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
		cap_raise(override_cred->cap_effective, CAP_FOWNER);
		old_cred = override_creds(override_cred);

		err = ovl_create_over_whiteout(dentry, inode, &stat, link,
					       hardlink);

		revert_creds(old_cred);
		put_cred(override_cred);
	}

	/* On success the inode now belongs to the instantiated dentry. */
	if (!err)
		inode = NULL;
out_iput:
	iput(inode);
out:
	return err;
}
/* Take write access to the overlay, create the object, drop write. */
static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
			     const char *link)
{
	int err;

	err = ovl_want_write(dentry);
	if (!err) {
		err = ovl_create_or_link(dentry, mode, rdev, link, NULL);
		ovl_drop_write(dentry);
	}

	return err;
}

/* inode_operations ->create for the overlay. */
static int ovl_create(struct inode *dir, struct dentry *dentry, umode_t mode,
		      bool excl)
{
	return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL);
}

/* inode_operations ->mkdir for the overlay. */
static int ovl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
	return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL);
}

/* inode_operations ->mknod; whiteout device nodes are forbidden. */
static int ovl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
		     dev_t rdev)
{
	/* Don't allow creation of "whiteout" on overlay */
	if (S_ISCHR(mode) && rdev == WHITEOUT_DEV)
		return -EPERM;

	return ovl_create_object(dentry, mode, rdev, NULL);
}

/* inode_operations ->symlink for the overlay. */
static int ovl_symlink(struct inode *dir, struct dentry *dentry,
		       const char *link)
{
	return ovl_create_object(dentry, S_IFLNK, 0, link);
}
/*
 * inode_operations ->link: copy up the link source first so the hard
 * link can be created between two upper-layer inodes.
 */
static int ovl_link(struct dentry *old, struct inode *newdir,
		    struct dentry *new)
{
	int err;
	struct dentry *upper;

	err = ovl_want_write(old);
	if (err)
		goto out;

	err = ovl_copy_up(old);
	if (err)
		goto out_drop_write;

	upper = ovl_dentry_upper(old);
	err = ovl_create_or_link(new, upper->d_inode->i_mode, 0, NULL, upper);

out_drop_write:
	ovl_drop_write(old);
out:
	return err;
}
/*
 * Remove an entry that has (or may have) lower-layer presence: create a
 * whiteout in workdir and rename it over the victim.  For merged dirs the
 * upper dir is first replaced by a clean opaque dir (ovl_clear_empty);
 * for dirs, RENAME_EXCHANGE moves the victim into workdir for cleanup.
 */
static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir)
{
	struct dentry *workdir = ovl_workdir(dentry);
	struct inode *wdir = workdir->d_inode;
	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
	struct inode *udir = upperdir->d_inode;
	struct dentry *whiteout;
	struct dentry *upper;
	struct dentry *opaquedir = NULL;
	int err;

	if (WARN_ON(!workdir))
		return -EROFS;

	if (is_dir) {
		if (OVL_TYPE_MERGE_OR_LOWER(ovl_path_type(dentry))) {
			opaquedir = ovl_check_empty_and_clear(dentry);
			err = PTR_ERR(opaquedir);
			if (IS_ERR(opaquedir))
				goto out;
		} else {
			LIST_HEAD(list);

			/*
			 * When removing an empty opaque directory, then it
			 * makes no sense to replace it with an exact replica of
			 * itself.  But emptiness still needs to be checked.
			 */
			err = ovl_check_empty_dir(dentry, &list);
			ovl_cache_free(&list);
			if (err)
				goto out;
		}
	}

	err = ovl_lock_rename_workdir(workdir, upperdir);
	if (err)
		goto out_dput;

	whiteout = ovl_whiteout(workdir, dentry);
	err = PTR_ERR(whiteout);
	if (IS_ERR(whiteout))
		goto out_unlock;

	upper = ovl_dentry_upper(dentry);
	if (!upper) {
		/* Lower-only entry: just move the whiteout into place. */
		upper = lookup_one_len(dentry->d_name.name, upperdir,
				       dentry->d_name.len);
		err = PTR_ERR(upper);
		if (IS_ERR(upper))
			goto kill_whiteout;

		err = ovl_do_rename(wdir, whiteout, udir, upper, 0);
		dput(upper);
		if (err)
			goto kill_whiteout;
	} else {
		int flags = 0;

		if (opaquedir)
			upper = opaquedir;
		err = -ESTALE;
		/* The victim must still be a child of upperdir. */
		if (upper->d_parent != upperdir)
			goto kill_whiteout;

		if (is_dir)
			flags |= RENAME_EXCHANGE;
		err = ovl_do_rename(wdir, whiteout, udir, upper, flags);
		if (err)
			goto kill_whiteout;

		/* After the exchange the old dir sits in workdir. */
		if (is_dir)
			ovl_cleanup(wdir, upper);
	}
	ovl_dentry_version_inc(dentry->d_parent);
out_d_drop:
	d_drop(dentry);
	dput(whiteout);
out_unlock:
	unlock_rename(workdir, upperdir);
out_dput:
	dput(opaquedir);
out:
	return err;

kill_whiteout:
	ovl_cleanup(wdir, whiteout);
	goto out_d_drop;
}
/*
 * Remove a pure-upper object: no whiteout needed, simply rmdir/unlink
 * the upper dentry under the upper parent's i_mutex.
 */
static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
{
	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
	struct inode *dir = upperdir->d_inode;
	struct dentry *upper = ovl_dentry_upper(dentry);
	int err;

	mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
	err = -ESTALE;
	/* only proceed if the upper dentry is still at the expected place */
	if (upper->d_parent == upperdir) {
		/* Don't let d_delete() think it can reset d_inode */
		dget(upper);
		if (is_dir)
			err = vfs_rmdir(dir, upper);
		else
			err = vfs_unlink(dir, upper, NULL);
		dput(upper);
		ovl_dentry_version_inc(dentry->d_parent);
	}

	/*
	 * Keeping this dentry hashed would mean having to release
	 * upperpath/lowerpath, which could only be done if we are the
	 * sole user of this dentry.  Too tricky...  Just unhash for
	 * now.
	 */
	d_drop(dentry);
	mutex_unlock(&dir->i_mutex);

	return err;
}
/*
 * Enforce sticky-bit semantics against the real (upper or lower) inodes:
 * -EPERM if the caller may not delete this entry from a sticky directory.
 */
static inline int ovl_check_sticky(struct dentry *dentry)
{
	struct inode *dir = ovl_dentry_real(dentry->d_parent)->d_inode;
	struct inode *inode = ovl_dentry_real(dentry)->d_inode;

	return check_sticky(dir, inode) ? -EPERM : 0;
}
/*
 * Common unlink/rmdir implementation.  Pure upper objects are removed
 * directly; anything with lower backing is replaced by a whiteout under
 * temporarily raised capabilities (whiteout creation needs privileges
 * the caller may lack).
 */
static int ovl_do_remove(struct dentry *dentry, bool is_dir)
{
	enum ovl_path_type type;
	int err;

	err = ovl_check_sticky(dentry);
	if (err)
		goto out;

	err = ovl_want_write(dentry);
	if (err)
		goto out;

	/* parent must exist in the upper layer before we can modify it */
	err = ovl_copy_up(dentry->d_parent);
	if (err)
		goto out_drop_write;

	type = ovl_path_type(dentry);
	if (OVL_TYPE_PURE_UPPER(type)) {
		err = ovl_remove_upper(dentry, is_dir);
	} else {
		const struct cred *old_cred;
		struct cred *override_cred;

		err = -ENOMEM;
		override_cred = prepare_creds();
		if (!override_cred)
			goto out_drop_write;

		/*
		 * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
		 * CAP_DAC_OVERRIDE for create in workdir, rename
		 * CAP_FOWNER for removing whiteout from sticky dir
		 * CAP_FSETID for chmod of opaque dir
		 * CAP_CHOWN for chown of opaque dir
		 */
		cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
		cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
		cap_raise(override_cred->cap_effective, CAP_FOWNER);
		cap_raise(override_cred->cap_effective, CAP_FSETID);
		cap_raise(override_cred->cap_effective, CAP_CHOWN);
		old_cred = override_creds(override_cred);

		err = ovl_remove_and_whiteout(dentry, is_dir);

		revert_creds(old_cred);
		put_cred(override_cred);
	}
out_drop_write:
	ovl_drop_write(dentry);
out:
	return err;
}
/* .unlink: non-directory removal */
static int ovl_unlink(struct inode *dir, struct dentry *dentry)
{
	return ovl_do_remove(dentry, false);
}
/* .rmdir: directory removal */
static int ovl_rmdir(struct inode *dir, struct dentry *dentry)
{
	return ovl_do_remove(dentry, true);
}
/*
 * Overlay rename.  Supports plain rename and RENAME_EXCHANGE; directory
 * trees with lower backing cannot be renamed (-EXDEV, so userspace falls
 * back to copy).  The tricky part is keeping opaque flags and whiteouts
 * consistent: a non-pure source must leave a whiteout behind, and an
 * opaque target directory must keep its opaqueness across the exchange.
 */
static int ovl_rename2(struct inode *olddir, struct dentry *old,
		       struct inode *newdir, struct dentry *new,
		       unsigned int flags)
{
	int err;
	enum ovl_path_type old_type;
	enum ovl_path_type new_type;
	struct dentry *old_upperdir;
	struct dentry *new_upperdir;
	struct dentry *olddentry;
	struct dentry *newdentry;
	struct dentry *trap;
	bool old_opaque;
	bool new_opaque;
	bool new_create = false;
	bool cleanup_whiteout = false;
	bool overwrite = !(flags & RENAME_EXCHANGE);
	bool is_dir = d_is_dir(old);
	bool new_is_dir = false;
	struct dentry *opaquedir = NULL;
	const struct cred *old_cred = NULL;
	struct cred *override_cred = NULL;

	err = -EINVAL;
	if (flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE))
		goto out;

	/* NOREPLACE was already handled by the VFS lookup; drop it here */
	flags &= ~RENAME_NOREPLACE;

	err = ovl_check_sticky(old);
	if (err)
		goto out;

	/* Don't copy up directory trees */
	old_type = ovl_path_type(old);
	err = -EXDEV;
	if (OVL_TYPE_MERGE_OR_LOWER(old_type) && is_dir)
		goto out;

	if (new->d_inode) {
		err = ovl_check_sticky(new);
		if (err)
			goto out;

		if (d_is_dir(new))
			new_is_dir = true;

		new_type = ovl_path_type(new);
		err = -EXDEV;
		if (!overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir)
			goto out;

		err = 0;
		/* renaming an object onto itself is a no-op */
		if (!OVL_TYPE_UPPER(new_type) && !OVL_TYPE_UPPER(old_type)) {
			if (ovl_dentry_lower(old)->d_inode ==
			    ovl_dentry_lower(new)->d_inode)
				goto out;
		}
		if (OVL_TYPE_UPPER(new_type) && OVL_TYPE_UPPER(old_type)) {
			if (ovl_dentry_upper(old)->d_inode ==
			    ovl_dentry_upper(new)->d_inode)
				goto out;
		}
	} else {
		/* negative target: its "type" is derived from opaqueness */
		if (ovl_dentry_is_opaque(new))
			new_type = __OVL_PATH_UPPER;
		else
			new_type = __OVL_PATH_UPPER | __OVL_PATH_PURE;
	}

	err = ovl_want_write(old);
	if (err)
		goto out;

	err = ovl_copy_up(old);
	if (err)
		goto out_drop_write;

	err = ovl_copy_up(new->d_parent);
	if (err)
		goto out_drop_write;
	if (!overwrite) {
		/* exchange: the target survives, so it must be copied up too */
		err = ovl_copy_up(new);
		if (err)
			goto out_drop_write;
	}

	old_opaque = !OVL_TYPE_PURE_UPPER(old_type);
	new_opaque = !OVL_TYPE_PURE_UPPER(new_type);

	if (old_opaque || new_opaque) {
		err = -ENOMEM;
		override_cred = prepare_creds();
		if (!override_cred)
			goto out_drop_write;

		/*
		 * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
		 * CAP_DAC_OVERRIDE for create in workdir
		 * CAP_FOWNER for removing whiteout from sticky dir
		 * CAP_FSETID for chmod of opaque dir
		 * CAP_CHOWN for chown of opaque dir
		 */
		cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
		cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
		cap_raise(override_cred->cap_effective, CAP_FOWNER);
		cap_raise(override_cred->cap_effective, CAP_FSETID);
		cap_raise(override_cred->cap_effective, CAP_CHOWN);
		old_cred = override_creds(override_cred);
	}

	if (overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir) {
		/* overwriting a merged dir: verify empty, get replacement */
		opaquedir = ovl_check_empty_and_clear(new);
		err = PTR_ERR(opaquedir);
		if (IS_ERR(opaquedir)) {
			opaquedir = NULL;
			goto out_revert_creds;
		}
	}

	if (overwrite) {
		if (old_opaque) {
			if (new->d_inode || !new_opaque) {
				/* Whiteout source */
				flags |= RENAME_WHITEOUT;
			} else {
				/* Switch whiteouts */
				flags |= RENAME_EXCHANGE;
			}
		} else if (is_dir && !new->d_inode && new_opaque) {
			flags |= RENAME_EXCHANGE;
			cleanup_whiteout = true;
		}
	}

	old_upperdir = ovl_dentry_upper(old->d_parent);
	new_upperdir = ovl_dentry_upper(new->d_parent);

	trap = lock_rename(new_upperdir, old_upperdir);

	olddentry = ovl_dentry_upper(old);
	newdentry = ovl_dentry_upper(new);
	if (newdentry) {
		if (opaquedir) {
			newdentry = opaquedir;
			opaquedir = NULL;
		} else {
			dget(newdentry);
		}
	} else {
		new_create = true;
		newdentry = lookup_one_len(new->d_name.name, new_upperdir,
					   new->d_name.len);
		err = PTR_ERR(newdentry);
		if (IS_ERR(newdentry))
			goto out_unlock;
	}

	err = -ESTALE;
	/* re-validate both dentries now that the rename locks are held */
	if (olddentry->d_parent != old_upperdir)
		goto out_dput;
	if (newdentry->d_parent != new_upperdir)
		goto out_dput;
	if (olddentry == trap)
		goto out_dput;
	if (newdentry == trap)
		goto out_dput;

	if (is_dir && !old_opaque && new_opaque) {
		err = ovl_set_opaque(olddentry);
		if (err)
			goto out_dput;
	}
	if (!overwrite && new_is_dir && old_opaque && !new_opaque) {
		err = ovl_set_opaque(newdentry);
		if (err)
			goto out_dput;
	}

	if (old_opaque || new_opaque) {
		err = ovl_do_rename(old_upperdir->d_inode, olddentry,
				    new_upperdir->d_inode, newdentry,
				    flags);
	} else {
		/* No debug for the plain case */
		BUG_ON(flags & ~RENAME_EXCHANGE);
		err = vfs_rename(old_upperdir->d_inode, olddentry,
				 new_upperdir->d_inode, newdentry,
				 NULL, flags);
	}

	if (err) {
		/* undo the opaque flags set speculatively above */
		if (is_dir && !old_opaque && new_opaque)
			ovl_remove_opaque(olddentry);
		if (!overwrite && new_is_dir && old_opaque && !new_opaque)
			ovl_remove_opaque(newdentry);
		goto out_dput;
	}

	if (is_dir && old_opaque && !new_opaque)
		ovl_remove_opaque(olddentry);
	if (!overwrite && new_is_dir && !old_opaque && new_opaque)
		ovl_remove_opaque(newdentry);

	if (old_opaque != new_opaque) {
		ovl_dentry_set_opaque(old, new_opaque);
		if (!overwrite)
			ovl_dentry_set_opaque(new, old_opaque);
	}

	if (cleanup_whiteout)
		ovl_cleanup(old_upperdir->d_inode, newdentry);

	ovl_dentry_version_inc(old->d_parent);
	ovl_dentry_version_inc(new->d_parent);

out_dput:
	dput(newdentry);
out_unlock:
	unlock_rename(new_upperdir, old_upperdir);
out_revert_creds:
	if (old_opaque || new_opaque) {
		revert_creds(old_cred);
		put_cred(override_cred);
	}
out_drop_write:
	ovl_drop_write(old);
out:
	dput(opaquedir);
	return err;
}
/* inode operations for overlay directories */
const struct inode_operations ovl_dir_inode_operations = {
	.lookup		= ovl_lookup,
	.mkdir		= ovl_mkdir,
	.symlink	= ovl_symlink,
	.unlink		= ovl_unlink,
	.rmdir		= ovl_rmdir,
	.rename2	= ovl_rename2,
	.link		= ovl_link,
	.setattr	= ovl_setattr,
	.create		= ovl_create,
	.mknod		= ovl_mknod,
	.permission	= ovl_permission,
	.getattr	= ovl_dir_getattr,
	.setxattr	= ovl_setxattr,
	.getxattr	= ovl_getxattr,
	.listxattr	= ovl_listxattr,
	.removexattr	= ovl_removexattr,
};

View File

@ -0,0 +1,438 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/xattr.h>
#include "overlayfs.h"
/*
 * Copy up the last path component: ensure the parent is copied up first,
 * then copy this dentry from its lower path.  @no_data truncates the
 * copy to zero length (used for O_TRUNC opens); @attr is applied to the
 * new upper inode.
 */
static int ovl_copy_up_last(struct dentry *dentry, struct iattr *attr,
			    bool no_data)
{
	int err;
	struct dentry *parent;
	struct kstat stat;
	struct path lowerpath;

	parent = dget_parent(dentry);
	err = ovl_copy_up(parent);
	if (err)
		goto out_dput_parent;

	ovl_path_lower(dentry, &lowerpath);
	err = vfs_getattr(&lowerpath, &stat);
	if (err)
		goto out_dput_parent;

	if (no_data)
		stat.size = 0;

	err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat, attr);

out_dput_parent:
	dput(parent);
	return err;
}
/*
 * .setattr: apply attribute changes to the upper inode.  If the object
 * only exists in the lower layer, copy it up with the new attributes
 * applied in one step instead of changing the lower inode.
 */
int ovl_setattr(struct dentry *dentry, struct iattr *attr)
{
	int err;
	struct dentry *upperdentry;

	err = ovl_want_write(dentry);
	if (err)
		goto out;

	upperdentry = ovl_dentry_upper(dentry);
	if (upperdentry) {
		mutex_lock(&upperdentry->d_inode->i_mutex);
		err = notify_change(upperdentry, attr, NULL);
		mutex_unlock(&upperdentry->d_inode->i_mutex);
	} else {
		err = ovl_copy_up_last(dentry, attr, false);
	}
	ovl_drop_write(dentry);
out:
	return err;
}
static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
struct path realpath;
ovl_path_real(dentry, &realpath);
return vfs_getattr(&realpath, stat);
}
/*
 * .permission: check access against the real backing inode.  Directories
 * store their ovl_entry in i_private; non-directories must find it via a
 * dentry alias, which may block, so RCU-walk (MAY_NOT_BLOCK) bails out
 * with -ECHILD for them.
 */
int ovl_permission(struct inode *inode, int mask)
{
	struct ovl_entry *oe;
	struct dentry *alias = NULL;
	struct inode *realinode;
	struct dentry *realdentry;
	bool is_upper;
	int err;

	if (S_ISDIR(inode->i_mode)) {
		oe = inode->i_private;
	} else if (mask & MAY_NOT_BLOCK) {
		return -ECHILD;
	} else {
		/*
		 * For non-directories find an alias and get the info
		 * from there.
		 */
		alias = d_find_any_alias(inode);
		if (WARN_ON(!alias))
			return -ENOENT;

		oe = alias->d_fsdata;
	}

	realdentry = ovl_entry_real(oe, &is_upper);

	/* Careful in RCU walk mode */
	realinode = ACCESS_ONCE(realdentry->d_inode);
	if (!realinode) {
		WARN_ON(!(mask & MAY_NOT_BLOCK));
		err = -ENOENT;
		goto out_dput;
	}

	if (mask & MAY_WRITE) {
		umode_t mode = realinode->i_mode;

		/*
		 * Writes will always be redirected to upper layer, so
		 * ignore lower layer being read-only.
		 *
		 * If the overlay itself is read-only then proceed
		 * with the permission check, don't return EROFS.
		 * This will only happen if this is the lower layer of
		 * another overlayfs.
		 *
		 * If upper fs becomes read-only after the overlay was
		 * constructed return EROFS to prevent modification of
		 * upper layer.
		 */
		err = -EROFS;
		if (is_upper && !IS_RDONLY(inode) && IS_RDONLY(realinode) &&
		    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
			goto out_dput;
	}

	err = __inode_permission(realinode, mask);
out_dput:
	dput(alias);
	return err;
}
/* Cookie passed from ovl_follow_link() to ovl_put_link() */
struct ovl_link_data {
	struct dentry *realdentry;	/* real (backing) symlink dentry */
	void *cookie;			/* cookie from the real follow_link */
};
/*
 * .follow_link: delegate to the real inode's follow_link.  If the real
 * filesystem needs a matching put_link, wrap its cookie together with the
 * real dentry so ovl_put_link() can forward the call; otherwise return
 * NULL (no put_link needed).
 */
static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd)
{
	void *ret;
	struct dentry *realdentry;
	struct inode *realinode;

	realdentry = ovl_dentry_real(dentry);
	realinode = realdentry->d_inode;

	if (WARN_ON(!realinode->i_op->follow_link))
		return ERR_PTR(-EPERM);

	ret = realinode->i_op->follow_link(realdentry, nd);
	if (IS_ERR(ret))
		return ret;

	if (realinode->i_op->put_link) {
		struct ovl_link_data *data;

		data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL);
		if (!data) {
			/* allocation failed: release the real cookie now */
			realinode->i_op->put_link(realdentry, nd, ret);
			return ERR_PTR(-ENOMEM);
		}
		data->realdentry = realdentry;
		data->cookie = ret;

		return data;
	} else {
		return NULL;
	}
}
/*
 * .put_link: forward to the real filesystem's put_link using the data
 * stashed by ovl_follow_link(), then free the wrapper.  A NULL cookie
 * means the real fs had no put_link.
 */
static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
{
	struct ovl_link_data *data = c;

	if (!data)
		return;

	data->realdentry->d_inode->i_op->put_link(data->realdentry, nd,
						  data->cookie);
	kfree(data);
}
/* .readlink: read the target from the real backing symlink. */
static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
{
	struct path real;
	const struct inode_operations *iop;

	ovl_path_real(dentry, &real);
	iop = real.dentry->d_inode->i_op;
	if (!iop->readlink)
		return -EINVAL;

	touch_atime(&real);
	return iop->readlink(real.dentry, buf, bufsiz);
}
/* True if @name carries the overlay-private "trusted.overlay." prefix. */
static bool ovl_is_private_xattr(const char *name)
{
	return !strncmp(name, OVL_XATTR_PRE_NAME, OVL_XATTR_PRE_LEN);
}
/*
 * .setxattr: reject overlay-private attribute names, copy up if needed,
 * then set the xattr on the upper dentry.
 */
int ovl_setxattr(struct dentry *dentry, const char *name,
		 const void *value, size_t size, int flags)
{
	int err;
	struct dentry *upperdentry;

	err = ovl_want_write(dentry);
	if (err)
		goto out;

	err = -EPERM;
	if (ovl_is_private_xattr(name))
		goto out_drop_write;

	err = ovl_copy_up(dentry);
	if (err)
		goto out_drop_write;

	upperdentry = ovl_dentry_upper(dentry);
	err = vfs_setxattr(upperdentry, name, value, size, flags);

out_drop_write:
	ovl_drop_write(dentry);
out:
	return err;
}
/*
 * Private overlay xattrs only appear on non-pure upper directories
 * (merge dirs, opaque dirs); only those need the private-name filter.
 */
static bool ovl_need_xattr_filter(struct dentry *dentry,
				  enum ovl_path_type type)
{
	if ((type & (__OVL_PATH_PURE | __OVL_PATH_UPPER)) != __OVL_PATH_UPPER)
		return false;

	return S_ISDIR(dentry->d_inode->i_mode);
}
/*
 * .getxattr: read from the real inode, hiding overlay-private attributes
 * where they might exist.
 */
ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
		     void *value, size_t size)
{
	struct path realpath;
	enum ovl_path_type type = ovl_path_real(dentry, &realpath);

	if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
		return -ENODATA;

	return vfs_getxattr(realpath.dentry, name, value, size);
}
/*
 * .listxattr: list xattrs of the real inode, then compact out any
 * overlay-private names in place.  The list is a sequence of
 * NUL-terminated strings; removal shifts the tail down with memmove.
 */
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
{
	struct path realpath;
	enum ovl_path_type type = ovl_path_real(dentry, &realpath);
	ssize_t res;
	int off;

	res = vfs_listxattr(realpath.dentry, list, size);
	/* size == 0 is the "query required length" mode: nothing to filter */
	if (res <= 0 || size == 0)
		return res;

	if (!ovl_need_xattr_filter(dentry, type))
		return res;

	/* filter out private xattrs */
	for (off = 0; off < res;) {
		char *s = list + off;
		size_t slen = strlen(s) + 1;

		BUG_ON(off + slen > res);

		if (ovl_is_private_xattr(s)) {
			res -= slen;
			memmove(s, s + slen, res - off);
		} else {
			off += slen;
		}
	}

	return res;
}
/*
 * .removexattr: remove an xattr from the real inode.  If the object is
 * still lower-only, first verify the attribute exists on the lower inode
 * (avoids copying up just to fail with -ENODATA), then copy up and
 * remove from the upper inode.
 */
int ovl_removexattr(struct dentry *dentry, const char *name)
{
	int err;
	struct path realpath;
	enum ovl_path_type type = ovl_path_real(dentry, &realpath);

	err = ovl_want_write(dentry);
	if (err)
		goto out;

	err = -ENODATA;
	if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
		goto out_drop_write;

	if (!OVL_TYPE_UPPER(type)) {
		err = vfs_getxattr(realpath.dentry, name, NULL, 0);
		if (err < 0)
			goto out_drop_write;

		err = ovl_copy_up(dentry);
		if (err)
			goto out_drop_write;

		ovl_path_upper(dentry, &realpath);
	}

	err = vfs_removexattr(realpath.dentry, name);
out_drop_write:
	ovl_drop_write(dentry);
out:
	return err;
}
/*
 * Does opening with @flags require a copy-up?  Only lower, non-special
 * files opened for write (or with O_TRUNC) need one.
 */
static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type,
				  struct dentry *realdentry)
{
	bool for_write = (OPEN_FMODE(flags) & FMODE_WRITE) ||
			 (flags & O_TRUNC);

	if (OVL_TYPE_UPPER(type) ||
	    special_file(realdentry->d_inode->i_mode))
		return false;

	return for_write;
}
/*
 * .dentry_open: open the real backing file, copying up first when the
 * open would write to a lower file.  An O_TRUNC open copies up without
 * data since the contents are discarded anyway.  The nocopyupw flag
 * (local extension) suppresses copy-up entirely.
 */
static int ovl_dentry_open(struct dentry *dentry, struct file *file,
		    const struct cred *cred)
{
	int err;
	struct path realpath;
	enum ovl_path_type type;
	bool want_write = false;

	type = ovl_path_real(dentry, &realpath);
	if (!ovl_is_nocopyupw(dentry)) {
		if (ovl_open_need_copy_up(file->f_flags, type, realpath.dentry)) {
			want_write = true;
			err = ovl_want_write(dentry);
			if (err)
				goto out;

			if (file->f_flags & O_TRUNC)
				err = ovl_copy_up_last(dentry, NULL, true);
			else
				err = ovl_copy_up(dentry);
			if (err)
				goto out_drop_write;

			/* re-resolve: the object now lives in the upper layer */
			ovl_path_upper(dentry, &realpath);
		}
	}

	err = vfs_open(&realpath, file, cred);
out_drop_write:
	if (want_write)
		ovl_drop_write(dentry);
out:
	return err;
}
/* inode operations for overlay regular files and special files */
static const struct inode_operations ovl_file_inode_operations = {
	.setattr	= ovl_setattr,
	.permission	= ovl_permission,
	.getattr	= ovl_getattr,
	.setxattr	= ovl_setxattr,
	.getxattr	= ovl_getxattr,
	.listxattr	= ovl_listxattr,
	.removexattr	= ovl_removexattr,
	.dentry_open	= ovl_dentry_open,
};
/* inode operations for overlay symlinks */
static const struct inode_operations ovl_symlink_inode_operations = {
	.setattr	= ovl_setattr,
	.follow_link	= ovl_follow_link,
	.put_link	= ovl_put_link,
	.readlink	= ovl_readlink,
	.getattr	= ovl_getattr,
	.setxattr	= ovl_setxattr,
	.getxattr	= ovl_getxattr,
	.listxattr	= ovl_listxattr,
	.removexattr	= ovl_removexattr,
};
/*
 * Allocate a new overlay inode of the given mode.  Only the file-type
 * bits of @mode are used; the operation tables are chosen by type.
 * Directories keep a pointer to their ovl_entry in i_private (used by
 * ovl_permission()).  Returns NULL on allocation failure or an illegal
 * file type.
 */
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
			    struct ovl_entry *oe)
{
	struct inode *inode;

	inode = new_inode(sb);
	if (!inode)
		return NULL;

	mode &= S_IFMT;

	inode->i_ino = get_next_ino();
	inode->i_mode = mode;
	/* overlay inodes carry no timestamps of their own */
	inode->i_flags |= S_NOATIME | S_NOCMTIME;

	switch (mode) {
	case S_IFDIR:
		inode->i_private = oe;
		inode->i_op = &ovl_dir_inode_operations;
		inode->i_fop = &ovl_dir_operations;
		break;

	case S_IFLNK:
		inode->i_op = &ovl_symlink_inode_operations;
		break;

	case S_IFREG:
	case S_IFSOCK:
	case S_IFBLK:
	case S_IFCHR:
	case S_IFIFO:
		inode->i_op = &ovl_file_inode_operations;
		break;

	default:
		WARN(1, "illegal file type: %i\n", mode);
		iput(inode);
		inode = NULL;
	}

	return inode;
}

View File

@ -0,0 +1,200 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/kernel.h>
struct ovl_entry;

/*
 * Classification of an overlay object: bits may be combined.
 * UPPER means the object has an upper-layer dentry; PURE means it has
 * no lower backing; MERGE marks a merged directory.
 */
enum ovl_path_type {
	__OVL_PATH_PURE		= (1 << 0),
	__OVL_PATH_UPPER	= (1 << 1),
	__OVL_PATH_MERGE	= (1 << 2),
};

#define OVL_TYPE_UPPER(type)	((type) & __OVL_PATH_UPPER)
#define OVL_TYPE_MERGE(type)	((type) & __OVL_PATH_MERGE)
#define OVL_TYPE_PURE_UPPER(type) ((type) & __OVL_PATH_PURE)
#define OVL_TYPE_MERGE_OR_LOWER(type) \
	(OVL_TYPE_MERGE(type) || !OVL_TYPE_UPPER(type))

/* prefix for overlay-private xattrs; PRE_LEN == strlen(PRE_NAME) */
#define OVL_XATTR_PRE_NAME "trusted.overlay."
#define OVL_XATTR_PRE_LEN  16
#define OVL_XATTR_OPAQUE   OVL_XATTR_PRE_NAME"opaque"
/*
 * Thin wrappers around the vfs_* primitives that log each operation and
 * its result via pr_debug.  The @debug parameter (where present)
 * suppresses logging for hot paths.
 */
static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry)
{
	int err = vfs_rmdir(dir, dentry);

	pr_debug("rmdir(%pd2) = %i\n", dentry, err);
	return err;
}

static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry)
{
	int err = vfs_unlink(dir, dentry, NULL);

	pr_debug("unlink(%pd2) = %i\n", dentry, err);
	return err;
}

static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir,
			      struct dentry *new_dentry, bool debug)
{
	int err = vfs_link(old_dentry, dir, new_dentry, NULL);

	if (debug) {
		pr_debug("link(%pd2, %pd2) = %i\n",
			 old_dentry, new_dentry, err);
	}
	return err;
}

static inline int ovl_do_create(struct inode *dir, struct dentry *dentry,
				umode_t mode, bool debug)
{
	int err = vfs_create(dir, dentry, mode, true);

	if (debug)
		pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err);
	return err;
}

static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry,
			       umode_t mode, bool debug)
{
	int err = vfs_mkdir(dir, dentry, mode);

	if (debug)
		pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err);
	return err;
}

static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry,
			       umode_t mode, dev_t dev, bool debug)
{
	int err = vfs_mknod(dir, dentry, mode, dev);

	if (debug) {
		pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n",
			 dentry, mode, dev, err);
	}
	return err;
}

static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry,
				 const char *oldname, bool debug)
{
	int err = vfs_symlink(dir, dentry, oldname);

	if (debug)
		pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err);
	return err;
}

static inline int ovl_do_setxattr(struct dentry *dentry, const char *name,
				  const void *value, size_t size, int flags)
{
	int err = vfs_setxattr(dentry, name, value, size, flags);

	pr_debug("setxattr(%pd2, \"%s\", \"%*s\", 0x%x) = %i\n",
		 dentry, name, (int) size, (char *) value, flags, err);
	return err;
}

static inline int ovl_do_removexattr(struct dentry *dentry, const char *name)
{
	int err = vfs_removexattr(dentry, name);

	pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, err);
	return err;
}

static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry,
				struct inode *newdir, struct dentry *newdentry,
				unsigned int flags)
{
	int err;

	/* logged both before and (on failure) after, as rename can block */
	pr_debug("rename2(%pd2, %pd2, 0x%x)\n",
		 olddentry, newdentry, flags);

	err = vfs_rename(olddir, olddentry, newdir, newdentry, NULL, flags);

	if (err) {
		pr_debug("...rename2(%pd2, %pd2, ...) = %i\n",
			 olddentry, newdentry, err);
	}
	return err;
}

static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry)
{
	int err = vfs_whiteout(dir, dentry);

	pr_debug("whiteout(%pd2) = %i\n", dentry, err);
	return err;
}
/* entry/path helpers (implemented in the main overlayfs module) */
bool ovl_is_nocopyupw(struct dentry *dentry);
enum ovl_path_type ovl_path_type(struct dentry *dentry);
u64 ovl_dentry_version_get(struct dentry *dentry);
void ovl_dentry_version_inc(struct dentry *dentry);
void ovl_path_upper(struct dentry *dentry, struct path *path);
void ovl_path_lower(struct dentry *dentry, struct path *path);
enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path);
int ovl_path_next(int idx, struct dentry *dentry, struct path *path);
struct dentry *ovl_dentry_upper(struct dentry *dentry);
struct dentry *ovl_dentry_lower(struct dentry *dentry);
struct dentry *ovl_dentry_real(struct dentry *dentry);
struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper);
struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry);
void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache);
struct dentry *ovl_workdir(struct dentry *dentry);
int ovl_want_write(struct dentry *dentry);
void ovl_drop_write(struct dentry *dentry);
bool ovl_dentry_is_opaque(struct dentry *dentry);
void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque);
bool ovl_is_whiteout(struct dentry *dentry);
void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry);
struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
			  unsigned int flags);
struct file *ovl_path_open(struct path *path, int flags);
struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry,
				struct kstat *stat, const char *link);

/* readdir.c */
extern const struct file_operations ovl_dir_operations;
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list);
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list);
void ovl_cache_free(struct list_head *list);

/* inode.c */
int ovl_setattr(struct dentry *dentry, struct iattr *attr);
int ovl_permission(struct inode *inode, int mask);
int ovl_setxattr(struct dentry *dentry, const char *name,
		 const void *value, size_t size, int flags);
ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
		     void *value, size_t size);
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
int ovl_removexattr(struct dentry *dentry, const char *name);
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
			    struct ovl_entry *oe);

/* Copy ownership from a real inode to an overlay inode */
static inline void ovl_copyattr(struct inode *from, struct inode *to)
{
	to->i_uid = from->i_uid;
	to->i_gid = from->i_gid;
}

/* dir.c */
extern const struct inode_operations ovl_dir_inode_operations;
struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry);
int ovl_create_real(struct inode *dir, struct dentry *newdentry,
		    struct kstat *stat, const char *link,
		    struct dentry *hardlink, bool debug);
void ovl_cleanup(struct inode *dir, struct dentry *dentry);

/* copy_up.c */
int ovl_copy_up(struct dentry *dentry);
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
		    struct path *lowerpath, struct kstat *stat,
		    struct iattr *attr);
int ovl_copy_xattr(struct dentry *old, struct dentry *new);
int ovl_set_attr(struct dentry *upper, struct kstat *stat);

View File

@ -0,0 +1,557 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/file.h>
#include <linux/xattr.h>
#include <linux/rbtree.h>
#include <linux/security.h>
#include <linux/cred.h>
#include "overlayfs.h"
/* One merged-directory entry, kept both in an rb-tree and a list */
struct ovl_cache_entry {
	unsigned int len;		/* name length, excluding NUL */
	unsigned int type;		/* d_type */
	u64 ino;
	struct list_head l_node;	/* position in emission order */
	struct rb_node node;		/* dedup lookup by name */
	bool is_whiteout;		/* hidden from dir_emit() */
	char name[];			/* flexible array, NUL-terminated */
};

/* Reference-counted, versioned snapshot of a merged directory */
struct ovl_dir_cache {
	long refcount;
	u64 version;			/* ovl_dentry_version at fill time */
	struct list_head entries;
};

/* State threaded through iterate_dir() callbacks while filling a cache */
struct ovl_readdir_data {
	struct dir_context ctx;
	bool is_merge;			/* false: build rb-tree; true: merge lower */
	struct rb_root root;
	struct list_head *list;
	struct list_head middle;	/* insertion point for lower entries */
	struct dentry *dir;
	int count;
	int err;
};

/* Per-open-file state for an overlay directory */
struct ovl_dir_file {
	bool is_real;			/* non-merged: pass through to realfile */
	bool is_upper;			/* realfile is from the upper layer */
	struct ovl_dir_cache *cache;
	struct list_head *cursor;	/* current position in cache->entries */
	struct file *realfile;
	struct file *upperfile;		/* lazily opened for fsync after copy-up */
};
/* Map an rb-tree node back to its containing cache entry. */
static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n)
{
	return container_of(n, struct ovl_cache_entry, node);
}
/*
 * Look up an entry by name in the rb-tree.  Comparison is strncmp over
 * @len bytes with a length tie-break: an equal prefix with a longer
 * stored name sorts left, so names are only equal when the lengths match.
 */
static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root,
						    const char *name, int len)
{
	struct rb_node *node = root->rb_node;
	int cmp;

	while (node) {
		struct ovl_cache_entry *p = ovl_cache_entry_from_node(node);

		cmp = strncmp(name, p->name, len);
		if (cmp > 0)
			node = p->node.rb_right;
		else if (cmp < 0 || len < p->len)
			node = p->node.rb_left;
		else
			return p;
	}

	return NULL;
}
/*
 * Allocate and fill a cache entry (name stored inline via the flexible
 * array member).  A DT_CHR entry may be a whiteout: look the dentry up
 * (with CAP_DAC_OVERRIDE temporarily raised, since the caller's creds
 * may not permit the lookup) to set is_whiteout.  Returns NULL on
 * allocation failure.
 */
static struct ovl_cache_entry *ovl_cache_entry_new(struct dentry *dir,
						   const char *name, int len,
						   u64 ino, unsigned int d_type)
{
	struct ovl_cache_entry *p;
	size_t size = offsetof(struct ovl_cache_entry, name[len + 1]);

	p = kmalloc(size, GFP_KERNEL);
	if (!p)
		return NULL;

	memcpy(p->name, name, len);
	p->name[len] = '\0';
	p->len = len;
	p->type = d_type;
	p->ino = ino;
	p->is_whiteout = false;

	if (d_type == DT_CHR) {
		struct dentry *dentry;
		const struct cred *old_cred;
		struct cred *override_cred;

		override_cred = prepare_creds();
		if (!override_cred) {
			kfree(p);
			return NULL;
		}

		/*
		 * CAP_DAC_OVERRIDE for lookup
		 */
		cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
		old_cred = override_creds(override_cred);

		dentry = lookup_one_len(name, dir, len);
		if (!IS_ERR(dentry)) {
			p->is_whiteout = ovl_is_whiteout(dentry);
			dput(dentry);
		}
		revert_creds(old_cred);
		put_cred(override_cred);
	}

	return p;
}
/*
 * Insert a new entry into both the rb-tree (for dedup) and the tail of
 * the list (for emission order).  Silently skips names already present.
 */
static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
				  const char *name, int len, u64 ino,
				  unsigned int d_type)
{
	struct rb_node **newp = &rdd->root.rb_node;
	struct rb_node *parent = NULL;
	struct ovl_cache_entry *p;

	while (*newp) {
		int cmp;
		struct ovl_cache_entry *tmp;

		parent = *newp;
		tmp = ovl_cache_entry_from_node(*newp);
		/* same comparison rule as ovl_cache_entry_find() */
		cmp = strncmp(name, tmp->name, len);
		if (cmp > 0)
			newp = &tmp->node.rb_right;
		else if (cmp < 0 || len < tmp->len)
			newp = &tmp->node.rb_left;
		else
			return 0;
	}

	p = ovl_cache_entry_new(rdd->dir, name, len, ino, d_type);
	if (p == NULL)
		return -ENOMEM;

	list_add_tail(&p->l_node, rdd->list);
	rb_link_node(&p->node, parent, newp);
	rb_insert_color(&p->node, &rdd->root);

	return 0;
}
/*
 * Merge pass for a lower layer: an entry already seen in an upper layer
 * is moved onto the "middle" list (keeping upper position), otherwise a
 * new entry is appended to "middle".
 */
static int ovl_fill_lower(struct ovl_readdir_data *rdd,
			  const char *name, int namelen,
			  loff_t offset, u64 ino, unsigned int d_type)
{
	struct ovl_cache_entry *p;

	p = ovl_cache_entry_find(&rdd->root, name, namelen);
	if (p) {
		list_move_tail(&p->l_node, &rdd->middle);
	} else {
		p = ovl_cache_entry_new(rdd->dir, name, namelen, ino, d_type);
		if (p == NULL)
			rdd->err = -ENOMEM;
		else
			list_add_tail(&p->l_node, &rdd->middle);
	}

	return rdd->err;
}
/* Free every cache entry on @list and reset the list to empty. */
void ovl_cache_free(struct list_head *list)
{
	struct ovl_cache_entry *entry, *next;

	list_for_each_entry_safe(entry, next, list, l_node)
		kfree(entry);
	INIT_LIST_HEAD(list);
}
/*
 * Drop one reference on the directory cache; free it (and detach it from
 * the dentry, if still attached) when the last reference goes away.
 */
static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry)
{
	struct ovl_dir_cache *cache = od->cache;

	WARN_ON(cache->refcount <= 0);
	cache->refcount--;
	if (!cache->refcount) {
		if (ovl_dir_cache(dentry) == cache)
			ovl_set_dir_cache(dentry, NULL);

		ovl_cache_free(&cache->entries);
		kfree(cache);
	}
}
/*
 * iterate_dir() actor: dispatch to the rb-tree builder (upper layers)
 * or the lower-layer merge pass, counting entries either way.
 */
static int ovl_fill_merge(struct dir_context *ctx, const char *name,
			  int namelen, loff_t offset, u64 ino,
			  unsigned int d_type)
{
	struct ovl_readdir_data *rdd =
		container_of(ctx, struct ovl_readdir_data, ctx);

	rdd->count++;
	if (rdd->is_merge)
		return ovl_fill_lower(rdd, name, namelen, offset, ino, d_type);

	return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type);
}
/*
 * Read one real directory into the readdir data, re-calling iterate_dir()
 * until a pass produces no entries (some filesystems return partial
 * batches).
 */
static inline int ovl_dir_read(struct path *realpath,
			       struct ovl_readdir_data *rdd)
{
	struct file *realfile;
	int err;

	realfile = ovl_path_open(realpath, O_RDONLY | O_DIRECTORY);
	if (IS_ERR(realfile))
		return PTR_ERR(realfile);

	rdd->dir = realpath->dentry;
	rdd->ctx.pos = 0;
	do {
		rdd->count = 0;
		rdd->err = 0;
		err = iterate_dir(realfile, &rdd->ctx);
		if (err >= 0)
			err = rdd->err;
	} while (!err && rdd->count);
	fput(realfile);

	return err;
}
/*
 * Called on rewind (pos 0): drop a stale cache, and demote a pass-through
 * directory to merged mode if it has become a merge dir (e.g. after
 * copy-up of the parent).
 */
static void ovl_dir_reset(struct file *file)
{
	struct ovl_dir_file *od = file->private_data;
	struct ovl_dir_cache *cache = od->cache;
	struct dentry *dentry = file->f_path.dentry;
	enum ovl_path_type type = ovl_path_type(dentry);

	if (cache && ovl_dentry_version_get(dentry) != cache->version) {
		ovl_cache_put(od, dentry);
		od->cache = NULL;
		od->cursor = NULL;
	}
	WARN_ON(!od->is_real && !OVL_TYPE_MERGE(type));
	if (od->is_real && OVL_TYPE_MERGE(type))
		od->is_real = false;
}
/*
 * Build the merged entry list: read each layer from top down via
 * ovl_path_next(); the last (lowest) layer is read in merge mode with
 * its entries spliced in via the "middle" insertion point.
 */
static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list)
{
	int err;
	struct path realpath;
	struct ovl_readdir_data rdd = {
		.ctx.actor = ovl_fill_merge,
		.list = list,
		.root = RB_ROOT,
		.is_merge = false,
	};
	int idx, next;

	for (idx = 0; idx != -1; idx = next) {
		next = ovl_path_next(idx, dentry, &realpath);

		if (next != -1) {
			err = ovl_dir_read(&realpath, &rdd);
			if (err)
				break;
		} else {
			/*
			 * Insert lowest layer entries before upper ones, this
			 * allows offsets to be reasonably constant
			 */
			list_add(&rdd.middle, rdd.list);
			rdd.is_merge = true;
			err = ovl_dir_read(&realpath, &rdd);
			list_del(&rdd.middle);
		}
	}
	return err;
}
/*
 * Position the cursor @pos entries into the cache list; a pos beyond the
 * end leaves the cursor at the list head (end-of-directory).
 */
static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos)
{
	struct list_head *head = &od->cache->entries;
	struct list_head *p = head->next;
	loff_t off = 0;

	while (p != head && off < pos) {
		p = p->next;
		off++;
	}
	/* Cursor is safe since the cache is stable */
	od->cursor = p;
}
/*
 * Get (or build) the merged-directory cache for @dentry.  A cached copy
 * is reused only if its version still matches the dentry's; otherwise a
 * fresh one is read, attached to the dentry, and returned with one
 * reference held.
 */
static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
{
	int res;
	struct ovl_dir_cache *cache;

	cache = ovl_dir_cache(dentry);
	if (cache && ovl_dentry_version_get(dentry) == cache->version) {
		cache->refcount++;
		return cache;
	}
	ovl_set_dir_cache(dentry, NULL);

	cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL);
	if (!cache)
		return ERR_PTR(-ENOMEM);

	cache->refcount = 1;
	INIT_LIST_HEAD(&cache->entries);

	res = ovl_dir_read_merged(dentry, &cache->entries);
	if (res) {
		ovl_cache_free(&cache->entries);
		kfree(cache);
		return ERR_PTR(res);
	}

	cache->version = ovl_dentry_version_get(dentry);
	ovl_set_dir_cache(dentry, cache);

	return cache;
}
/*
 * .iterate: pass through for non-merged dirs; otherwise walk the cached
 * merged list from the cursor, emitting everything except whiteouts.
 */
static int ovl_iterate(struct file *file, struct dir_context *ctx)
{
	struct ovl_dir_file *od = file->private_data;
	struct dentry *dentry = file->f_path.dentry;
	struct ovl_cache_entry *p;

	if (!ctx->pos)
		ovl_dir_reset(file);

	if (od->is_real)
		return iterate_dir(od->realfile, ctx);

	if (!od->cache) {
		struct ovl_dir_cache *cache;

		cache = ovl_cache_get(dentry);
		if (IS_ERR(cache))
			return PTR_ERR(cache);

		od->cache = cache;
		ovl_seek_cursor(od, ctx->pos);
	}

	while (od->cursor != &od->cache->entries) {
		p = list_entry(od->cursor, struct ovl_cache_entry, l_node);
		if (!p->is_whiteout)
			if (!dir_emit(ctx, p->name, p->len, p->ino, p->type))
				break;
		od->cursor = p->l_node.next;
		ctx->pos++;
	}
	return 0;
}
/*
 * .llseek: for pass-through dirs delegate to the real file; for merged
 * dirs the position is a simple entry index into the cache, so only
 * SEEK_SET/SEEK_CUR make sense.
 */
static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin)
{
	loff_t res;
	struct ovl_dir_file *od = file->private_data;

	mutex_lock(&file_inode(file)->i_mutex);
	if (!file->f_pos)
		ovl_dir_reset(file);

	if (od->is_real) {
		res = vfs_llseek(od->realfile, offset, origin);
		file->f_pos = od->realfile->f_pos;
	} else {
		res = -EINVAL;

		switch (origin) {
		case SEEK_CUR:
			offset += file->f_pos;
			break;
		case SEEK_SET:
			break;
		default:
			goto out_unlock;
		}
		if (offset < 0)
			goto out_unlock;

		if (offset != file->f_pos) {
			file->f_pos = offset;
			if (od->cache)
				ovl_seek_cursor(od, offset);
		}
		res = offset;
	}
out_unlock:
	mutex_unlock(&file_inode(file)->i_mutex);

	return res;
}
/*
 * ->fsync for overlay directories.
 *
 * If the directory started out lower but has since been copied up, the
 * fsync must go to the upper directory.  The upper file is opened
 * lazily and published in od->upperfile; a lockless read is tried
 * first, and i_mutex arbitrates racing openers so exactly one open file
 * is kept.
 */
static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
int datasync)
{
struct ovl_dir_file *od = file->private_data;
struct dentry *dentry = file->f_path.dentry;
struct file *realfile = od->realfile;
/*
 * Need to check if we started out being a lower dir, but got copied up
 */
if (!od->is_upper && OVL_TYPE_UPPER(ovl_path_type(dentry))) {
struct inode *inode = file_inode(file);
/* Lockless fast path: upper file may already be published. */
realfile = lockless_dereference(od->upperfile);
if (!realfile) {
struct path upperpath;
ovl_path_upper(dentry, &upperpath);
realfile = ovl_path_open(&upperpath, O_RDONLY);
/* NOTE(review): barrier before taking i_mutex — presumably
 * ordering the open above against the od->upperfile check
 * below; confirm against the memory-model rationale. */
smp_mb__before_spinlock();
mutex_lock(&inode->i_mutex);
if (!od->upperfile) {
if (IS_ERR(realfile)) {
mutex_unlock(&inode->i_mutex);
return PTR_ERR(realfile);
}
od->upperfile = realfile;
} else {
/* somebody has beaten us to it */
if (!IS_ERR(realfile))
fput(realfile);
realfile = od->upperfile;
}
mutex_unlock(&inode->i_mutex);
}
}
return vfs_fsync_range(realfile, start, end, datasync);
}
/*
 * ->release for overlay directories: drop our cache reference under
 * i_mutex, then release the underlying file(s) and the private data.
 */
static int ovl_dir_release(struct inode *inode, struct file *file)
{
	struct ovl_dir_file *dirfile = file->private_data;

	if (dirfile->cache) {
		mutex_lock(&inode->i_mutex);
		ovl_cache_put(dirfile, file->f_path.dentry);
		mutex_unlock(&inode->i_mutex);
	}

	fput(dirfile->realfile);
	if (dirfile->upperfile)
		fput(dirfile->upperfile);
	kfree(dirfile);

	return 0;
}
/*
 * ->open for overlay directories: open the real (topmost) directory and
 * record whether this overlay dir is merged and whether it is upper.
 */
static int ovl_dir_open(struct inode *inode, struct file *file)
{
	struct path realpath;
	struct file *real;
	struct ovl_dir_file *dirfile;
	enum ovl_path_type type;

	dirfile = kzalloc(sizeof(*dirfile), GFP_KERNEL);
	if (!dirfile)
		return -ENOMEM;

	type = ovl_path_real(file->f_path.dentry, &realpath);
	real = ovl_path_open(&realpath, file->f_flags);
	if (IS_ERR(real)) {
		int err = PTR_ERR(real);

		kfree(dirfile);
		return err;
	}

	dirfile->realfile = real;
	dirfile->is_real = !OVL_TYPE_MERGE(type);
	dirfile->is_upper = OVL_TYPE_UPPER(type);
	file->private_data = dirfile;

	return 0;
}
/* File operations for overlay directories; wired up at inode init. */
const struct file_operations ovl_dir_operations = {
.read = generic_read_dir,
.open = ovl_dir_open,
.iterate = ovl_iterate,
.llseek = ovl_dir_llseek,
.fsync = ovl_dir_fsync,
.release = ovl_dir_release,
};
/*
 * Check whether the merged view of @dentry is empty.
 *
 * Fills @list with the merged listing (the caller owns and frees it)
 * and returns 0 if only whiteouts, "." and ".." were found, -ENOTEMPTY
 * if any real entry exists, or the error from ovl_dir_read_merged().
 */
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list)
{
	struct ovl_cache_entry *ent;
	int err = ovl_dir_read_merged(dentry, list);

	if (err)
		return err;

	list_for_each_entry(ent, list, l_node) {
		/* Whiteouts do not count as content. */
		if (ent->is_whiteout)
			continue;
		/* Neither do the "." and ".." entries. */
		if (ent->name[0] == '.' &&
		    (ent->len == 1 ||
		     (ent->len == 2 && ent->name[1] == '.')))
			continue;
		return -ENOTEMPTY;
	}
	return 0;
}
/*
 * Remove every whiteout named in @list from the @upper directory.
 *
 * Lookup failures are logged and skipped so the remaining whiteouts are
 * still cleaned.  @upper's i_mutex is taken with I_MUTEX_CHILD nesting.
 */
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
{
struct ovl_cache_entry *p;
mutex_lock_nested(&upper->d_inode->i_mutex, I_MUTEX_CHILD);
list_for_each_entry(p, list, l_node) {
struct dentry *dentry;
/* Only whiteout entries are of interest here. */
if (!p->is_whiteout)
continue;
dentry = lookup_one_len(p->name, upper, p->len);
if (IS_ERR(dentry)) {
pr_err("overlayfs: lookup '%s/%.*s' failed (%i)\n",
upper->d_name.name, p->len, p->name,
(int) PTR_ERR(dentry));
continue;
}
ovl_cleanup(upper->d_inode, dentry);
dput(dentry);
}
mutex_unlock(&upper->d_inode->i_mutex);
}

File diff suppressed because it is too large Load Diff

View File

@ -1,411 +0,0 @@
/**
* \file procfs.c
* License details are found in the file LICENSE.
* \brief
* mcctrl procfs
* \author Naoki Hamada <nao@axe.bz> \par
* Copyright (C) 2014 AXE, Inc.
*/
/*
* HISTORY:
*/
#include <linux/string.h>
#include <linux/proc_fs.h>
#include <linux/list.h>
#include <linux/uaccess.h>
#include <linux/fs.h>
#include "mcctrl.h"
//#define PROCFS_DEBUG
#ifdef PROCFS_DEBUG
#define dprintk(...) printk(__VA_ARGS__)
#else
#define dprintk(...)
#endif
static DECLARE_WAIT_QUEUE_HEAD(procfsq);
int mckernel_procfs_read(char *buffer, char **start, off_t offset,
int count, int *peof, void *dat);
/* A private data for the procfs driver. */
/* A private data for the procfs driver. */
struct procfs_list_entry {
struct list_head list;    /* link in procfs_file_list */
struct proc_dir_entry *entry;   /* the Linux procfs entry we created */
struct proc_dir_entry *parent;  /* its parent directory entry (or NULL) */
ihk_os_t os;              /* owning McKernel OS instance */
int osnum;                /* OS number the entry belongs to */
int pid;                  /* requesting McKernel process id */
int cpu;                  /* CPU to route read requests to */
char fname[PROCFS_NAME_MAX];  /* full path, e.g. "mcosN/PID/..." */
};
/*
* In the procfs_file_list, mckenrel procfs files are
* listed in the manner that the leaf file is located
* always nearer to the list top than its parent node
* file.
*/
LIST_HEAD(procfs_file_list);
static ihk_spinlock_t procfs_file_list_lock;
/**
* \brief Return specified procfs entry.
*
* \param p a name of the procfs file
* \param osnum os number
* \param mode if zero create a directory otherwise a file
*
* return value: NULL: Something wrong has occurred.
* otherwise: address of the proc_dir_entry structure of the procfs file
*
* p should not be NULL nor terminated by "/".
*
* We create a procfs entry if there is not already one.
* This process is recursive to the root of the procfs tree.
*/
/*
* XXX: Two or more entries which have same name can be created.
*
* get_procfs_entry() avoids creating an entry which has already been created.
* But, it allows creating an entry which is being created by another thread.
*
* This problem occurred when two requests which created files with a common
* ancestor directory which was not explicitly created were racing.
*/
/*
 * Return the procfs entry named by 'p' (relative path, no trailing '/'),
 * creating it — and, recursively, any missing ancestor directory — when
 * it does not exist yet.  mode == 0 creates a directory, otherwise a
 * file with that mode.  Returns NULL on any failure.
 *
 * XXX: as noted in the original header comment, two racing creators may
 * still produce duplicate entries; this function only avoids re-creating
 * entries that are already fully registered.
 */
static struct proc_dir_entry *get_procfs_entry(char *p, int osnum, int mode)
{
	char *r;
	struct proc_dir_entry *ret = NULL, *parent = NULL;
	struct procfs_list_entry *e;
	char name[PROCFS_NAME_MAX];
	unsigned long irqflags;

	dprintk("get_procfs_entry: %s for osnum %d mode %o\n", p, osnum, mode);

	/* Fast path: reuse an already-registered entry. */
	irqflags = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
	list_for_each_entry(e, &procfs_file_list, list) {
		/*
		 * BUG FIX: the old code tested 'e == NULL' here and returned
		 * while still holding procfs_file_list_lock.  'e' can never
		 * be NULL inside list_for_each_entry(), and returning with
		 * the lock held would deadlock the next caller, so that
		 * check has been removed.
		 */
		if (strncmp(e->fname, p, PROCFS_NAME_MAX) == 0) {
			/* We found the entry */
			ret = e->entry;
		}
	}
	ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflags);
	if (ret != NULL) {
		return ret;
	}

	r = strrchr(p, '/');
	if (r != NULL) {
		/* Non-root path: make sure the parent directory exists. */
		strncpy(name, p, r - p);
		name[r - p] = '\0';
		parent = get_procfs_entry(name, osnum, 0);
		if (parent == NULL) {
			/* We could not get a parent procfs entry. Give up. */
			return NULL;
		}
	}
	e = kmalloc(sizeof(struct procfs_list_entry), GFP_KERNEL);
	if (e == NULL) {
		kprintf("ERROR: not enough memory to create PROCFS entry.\n");
		return NULL;
	}
	/*
	 * Fill the fname field of the entry.  strncpy() does not guarantee
	 * NUL termination, so terminate explicitly: fname is later printed
	 * with %s and compared with strncmp().
	 */
	strncpy(e->fname, p, PROCFS_NAME_MAX);
	e->fname[PROCFS_NAME_MAX - 1] = '\0';
	/* 'name' holds the final path component. */
	if (r != NULL) {
		strncpy(name, r + 1, p + PROCFS_NAME_MAX - r - 1);
	} else {
		strncpy(name, p, PROCFS_NAME_MAX);
	}
	if (mode == 0) {
		ret = proc_mkdir(name, parent);
	} else {
		ret = create_proc_entry(name, mode, parent);
	}
	if (ret == NULL) {
		kprintf("ERROR: cannot create a PROCFS entry for %s.\n", p);
		kfree(e);
		return NULL;
	}
	ret->data = e;
	e->osnum = osnum;
	e->entry = ret;
	e->parent = parent;
	irqflags = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
	list_add(&(e->list), &procfs_file_list);
	ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflags);
	dprintk("get_procfs_entry: %s done\n", p);
	return ret;
}
/**
* \brief Create a procfs entry.
*
* \param __os (opeque) os variable
* \param ref cpuid of the requesting mckernel process
* \param osnum osnum of the requesting mckernel process
* \param pid pid of the requesting mckernel process
* \param arg sent argument
*/
void procfs_create(void *__os, int ref, int osnum, int pid, unsigned long arg)
{
struct proc_dir_entry *entry;
struct procfs_list_entry *e;
ihk_device_t dev = ihk_os_to_dev(__os);
unsigned long parg;
struct procfs_file *f;
int mode;
char name[PROCFS_NAME_MAX];
dprintk("procfs_create: osnum: %d, cpu: %d, pid: %d\n", osnum, ref, pid);
/* Map the request descriptor sent from the McKernel side. */
parg = ihk_device_map_memory(dev, arg, sizeof(struct procfs_file));
f = ihk_device_map_virtual(dev, parg, sizeof(struct procfs_file), NULL, 0);
dprintk("name: %s mode: %o\n", f->fname, f->mode);
/* Copy out name/mode before validating; reject unterminated names. */
strncpy(name, f->fname, PROCFS_NAME_MAX);
mode = f->mode;
if (name[PROCFS_NAME_MAX - 1] != '\0') {
printk("ERROR: procfs_creat: file name not properly terminated.\n");
goto quit;
}
entry = get_procfs_entry(name, osnum, mode);
if (entry == NULL) {
printk("ERROR: could not create a procfs entry for %s.\n", name);
goto quit;
}
/* Remember which OS/CPU/process serves reads of this file. */
e = entry->data;
e->os = __os;
e->cpu = ref;
e->pid = pid;
entry->read_proc = mckernel_procfs_read;
quit:
/* Always signal completion so the peer can free the request. */
f->status = 1; /* Now the peer can free the data. */
ihk_device_unmap_virtual(dev, f, sizeof(struct procfs_file));
ihk_device_unmap_memory(dev, parg, sizeof(struct procfs_file));
dprintk("procfs_create: done\n");
}
/**
* \brief Delete a procfs entry.
*
* \param __os (opaque) os variable
* \param osnum os number
* \param arg sent argument
*/
void procfs_delete(void *__os, int osnum, unsigned long arg)
{
ihk_device_t dev = ihk_os_to_dev(__os);
unsigned long parg;
struct procfs_file *f;
struct procfs_list_entry *e;
struct proc_dir_entry *parent = NULL;
char name[PROCFS_NAME_MAX];
char *r;
unsigned long irqflags;
dprintk("procfs_delete: \n");
/* Map the request descriptor sent from the McKernel side. */
parg = ihk_device_map_memory(dev, arg, sizeof(struct procfs_file));
f = ihk_device_map_virtual(dev, parg, sizeof(struct procfs_file), NULL, 0);
dprintk("fname: %s.\n", f->fname);
irqflags = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
list_for_each_entry(e, &procfs_file_list, list) {
/* Match on full path AND os number. */
if ((strncmp(e->fname, f->fname, PROCFS_NAME_MAX) == 0) &&
(e->osnum == osnum)) {
list_del(&e->list);
/* Disconnect callbacks before tearing the entry down. */
e->entry->read_proc = NULL;
e->entry->data = NULL;
parent = e->parent;
kfree(e);
/* remove_proc_entry() wants the leaf component only. */
r = strrchr(f->fname, '/');
if (r == NULL) {
strncpy(name, f->fname, PROCFS_NAME_MAX);
} else {
strncpy(name, r + 1, PROCFS_NAME_MAX);
}
dprintk("found and remove %s from the list.\n", name);
/* NOTE(review): remove_proc_entry() may sleep and is called
 * here under procfs_file_list_lock — verify this is safe on
 * the targeted kernels. */
remove_proc_entry(name, parent);
break;
}
}
ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflags);
/* Always signal completion so the peer can free the request. */
f->status = 1; /* Now the peer can free the data. */
ihk_device_unmap_virtual(dev, f, sizeof(struct procfs_file));
ihk_device_unmap_memory(dev, parg, sizeof(struct procfs_file));
dprintk("procfs_delete: done\n");
}
/**
* \brief Process SCD_MSG_PROCFS_ANSWER message.
*
* \param arg sent argument
* \param err error info (redundant)
*/
/* Wake any reader sleeping in mckernel_procfs_read(); 'arg' and 'err'
 * are informational only here (err is logged when PROCFS_DEBUG is on). */
void procfs_answer(unsigned int arg, int err)
{
dprintk("procfs: received SCD_MSG_PROCFS_ANSWER message(err = %d).\n", err);
wake_up_interruptible(&procfsq);
}
/**
* \brief The callback funciton for McKernel procfs
*
* This function conforms to the 2) way of fs/proc/generic.c
* from linux-2.6.39.4.
*/
int mckernel_procfs_read(char *buffer, char **start, off_t offset,
int count, int *peof, void *dat)
{
struct procfs_list_entry *e = dat;
volatile struct procfs_read *r;
struct ikc_scd_packet isp;
int ret, retrycount = 0;
unsigned long pbuf;
dprintk("mckernel_procfs_read: invoked for %s\n", e->fname);
if (count <= 0 || dat == NULL || offset < 0) {
return 0;
}
/* The peer writes into 'buffer' by physical address, so the read
 * must not straddle a page boundary. */
pbuf = virt_to_phys(buffer);
if (pbuf / PAGE_SIZE != (pbuf + count - 1) / PAGE_SIZE) {
/* Truncate the read count upto the nearest page boundary */
count = ((pbuf + count - 1) / PAGE_SIZE) * PAGE_SIZE - pbuf;
}
r = kmalloc(sizeof(struct procfs_read), GFP_KERNEL);
if (r == NULL) {
return -ENOMEM;
}
retry:
dprintk("offset: %lx, count: %d, cpu: %d\n", offset, count, e->cpu);
/* Build the request; 'status' is the completion flag the peer sets. */
r->pbuf = pbuf;
r->eof = 0;
r->ret = -EIO; /* default */
r->status = 0;
r->offset = offset;
r->count = count;
strncpy((char *)r->fname, e->fname, PROCFS_NAME_MAX);
isp.msg = SCD_MSG_PROCFS_REQUEST;
isp.ref = e->cpu;
isp.arg = virt_to_phys(r);
ret = mcctrl_ikc_send(e->os, e->cpu, &isp);
if (ret < 0) {
goto out; /* error */
}
/* Wait for a reply. */
ret = -EIO; /* default exit code */
dprintk("now wait for a relpy\n");
/* Wait for the status field of the procfs_read structure set ready. */
if (wait_event_interruptible_timeout(procfsq, r->status != 0, HZ) == 0) {
kprintf("ERROR: mckernel_procfs_read: timeout (1 sec).\n");
goto out;
}
/* Wake up and check the result. */
dprintk("mckernel_procfs_read: woke up. ret: %d, eof: %d\n", r->ret, r->eof);
if ((r->ret == 0) && (r->eof != 1)) {
/* A miss-hit caused by migration has occurred.
 * We simply retry the query with a new CPU.
 */
if (retrycount++ > 10) {
kprintf("ERROR: mckernel_procfs_read: excessive retry.\n");
goto out;
}
/* The peer told us which CPU the process moved to. */
e->cpu = r->newcpu;
dprintk("retry\n");
goto retry;
}
if (r->eof == 1) {
dprintk("reached end of file.\n");
*peof = 1;
}
*start = buffer;
ret = r->ret;
out:
kfree((void *)r);
return ret;
}
/**
* \brief Initialization for procfs
*
* \param osnum os number
*/
/* Nothing to set up per-OS; entries are created on demand. */
void procfs_init(int osnum) {
}
/**
* \brief Finalization for procfs
*
* \param osnum os number
*/
void procfs_exit(int osnum) {
char buf[20], *r;
int error;
mm_segment_t old_fs = get_fs();
struct kstat stat;
struct proc_dir_entry *parent;
struct procfs_list_entry *e, *temp = NULL;
unsigned long irqflags;
dprintk("remove remaining mckernel procfs files.\n");
/* Tear down every list entry belonging to this OS instance. */
irqflags = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
list_for_each_entry_safe(e, temp, &procfs_file_list, list) {
if (e->osnum == osnum) {
dprintk("found entry for %s.\n", e->fname);
list_del(&e->list);
/* Disconnect callbacks before removal. */
e->entry->read_proc = NULL;
e->entry->data = NULL;
parent = e->parent;
/* remove_proc_entry() wants the leaf component only. */
r = strrchr(e->fname, '/');
if (r == NULL) {
r = e->fname;
} else {
r += 1;
}
/* NOTE(review): remove_proc_entry() may sleep and is called
 * here under procfs_file_list_lock — verify on target kernels. */
remove_proc_entry(r, parent);
dprintk("free the entry\n");
kfree(e);
}
dprintk("iterate it.\n");
}
ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflags);
/* If /proc/mcosN still exists (leftover from a crash), remove it. */
sprintf(buf, "/proc/mcos%d", osnum);
set_fs(KERNEL_DS);
error = vfs_stat (buf, &stat);
set_fs(old_fs);
if (error != 0) {
return;
}
printk("procfs_exit: We have to remove unexpectedly remaining %s.\n", buf);
/* remove remnant of previous mcos%d */
/* buf + 6 skips the "/proc/" prefix. */
remove_proc_entry(buf + 6, NULL);
}

View File

@ -1,13 +1,19 @@
CC=@CC@
BINDIR=@BINDIR@
CFLAGS=-Wall -O -fPIE -pie
KDIR ?= @KDIR@
CFLAGS=-Wall -O -I.
VPATH=@abs_srcdir@
TARGET=mcexec
@uncomment_if_ENABLE_MEMDUMP@TARGET+=eclair
LIBS=@LIBS@
all: $(TARGET)
mcexec: mcexec.c
$(CC) $(CFLAGS) $(EXTRA_CFLAGS) -pthread -o $@ $^ $(EXTRA_OBJS)
$(CC) -I${KDIR} $(CFLAGS) $(EXTRA_CFLAGS) -fPIE -pie -lrt -pthread -o $@ $^ $(EXTRA_OBJS)
eclair: eclair.c
$(CC) $(CFLAGS) -o $@ $^ $(LIBS)
clean:
$(RM) $(TARGET) *.o
@ -17,4 +23,5 @@ clean:
install:
mkdir -p -m 755 $(BINDIR)
install -m 755 mcexec $(BINDIR)
@uncomment_if_ENABLE_MEMDUMP@install -m 755 eclair $(BINDIR)

966
executer/user/eclair.c Normal file
View File

@ -0,0 +1,966 @@
/**
* \file eclair.c
* License details are found in the file LICENSE.
* \brief
* IHK os memory dump analyzer for McKernel
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com> \par
* Copyright (C) 2015 RIKEN AICS
*/
#include <bfd.h>
#include <fcntl.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#define CPU_TID_BASE 1000000
/* Parsed command-line options (see options() / print_usage()). */
struct options {
uint8_t cpu;        /* -c: per-CPU pseudo-thread mode */
uint8_t help;       /* -h/-? or bad usage */
char *kernel_path;  /* -k: McKernel image with symbols */
char *dump_path;    /* -d: IHK memory dump file */
char *log_path;     /* unused in this version */
}; /* struct options */
/* One McKernel thread (or per-CPU pseudo-thread) found in the dump. */
struct thread_info {
struct thread_info *next;   /* singly-linked list (tihead) */
int status;                 /* PS_* thread state or CS_* cpu state << 16 */
#define PS_RUNNING 0x01
#define PS_INTERRUPTIBLE 0x02
#define PS_UNINTERRUPTIBLE 0x04
#define PS_STOPPED 0x20
#define PS_TRACED 0x40
#define CS_IDLE 0x010000
#define CS_RUNNING 0x020000
#define CS_RESERVED 0x030000
int pid;                    /* process id (CPU_TID_BASE+cpu in cpu mode) */
int tid;                    /* thread id exposed to gdb */
int cpu;                    /* cpu it is running on, -1 if not current */
int lcpu;                   /* cpu whose runq listed it */
int padding;
uintptr_t process;          /* kernel VA of the thread structure */
uintptr_t clv;              /* kernel VA of its cpu_local_var */
uintptr_t x86_clv;          /* kernel VA of the x86 per-cpu variables */
}; /* struct thread_info */
static struct options opt;
static volatile int f_done = 0;
static bfd *symbfd = NULL;
static bfd *dumpbfd = NULL;
static asection *dumpscn = NULL;
static int num_processors = -1;
static asymbol **symtab = NULL;
static ssize_t nsyms;
static uintptr_t kernel_base;
static struct thread_info *tihead = NULL;
static struct thread_info **titailp = &tihead;
static struct thread_info *curr_thread = NULL;
static uintptr_t ihk_mc_switch_context = -1;
static uintptr_t lookup_symbol(char *name) {
int i;
for (i = 0; i < nsyms; ++i) {
if (!strcmp(symtab[i]->name, name)) {
return (symtab[i]->section->vma + symtab[i]->value);
}
}
#define NOSYMBOL ((uintptr_t)-1)
return NOSYMBOL;
} /* lookup_symbol() */
/* Translate a McKernel virtual address to a physical address in the
 * dump: kernel text map first, then the straight map; NOPHYS if the
 * address falls in neither window. */
static uintptr_t virt_to_phys(uintptr_t va) {
#define MAP_KERNEL 0xFFFFFFFF80000000
#define MAP_ST 0xFFFF800000000000
#define NOPHYS ((uintptr_t)-1)
	if (va >= MAP_KERNEL)
		return va - MAP_KERNEL + kernel_base;
	if (va >= MAP_ST)
		return va - MAP_ST;
	if (0) printf("virt_to_phys(%lx): -1\n", va);
	return NOPHYS;
} /* virt_to_phys() */
/* Copy 'size' bytes at physical address 'pa' out of the dump's
 * "physmem" section into 'buf'.  Returns 0 on success, 1 on any
 * range/BFD error (diagnostics go to stdout/stderr). */
static int read_physmem(uintptr_t pa, void *buf, size_t size) {
off_t off;
bfd_boolean ok;
/* 'pa' must lie wholly within [vma, vma + size) of the section. */
if (pa < dumpscn->vma) {
printf("read_physmem(%lx,%p,%lx):too small pa. vma %lx\n", pa, buf, size, dumpscn->vma);
return 1;
}
off = pa - dumpscn->vma;
if (off >= dumpscn->size) {
printf("read_physmem(%lx,%p,%lx):too large pa. vma %lx size %lx\n", pa, buf, size, dumpscn->vma, dumpscn->size);
return 1;
}
if ((dumpscn->size - off) < size) {
printf("read_physmem(%lx,%p,%lx):too large size. vma %lx size %lx\n", pa, buf, size, dumpscn->vma, dumpscn->size);
return 1;
}
ok = bfd_get_section_contents(dumpbfd, dumpscn, buf, off, size);
if (!ok) {
bfd_perror("read_physmem:bfd_get_section_contents");
return 1;
}
return 0;
} /* read_physmem() */
/* Read 'size' bytes of dump memory at virtual address 'va'.
 * Returns 0 on success, 1 when the address does not translate or the
 * physical read fails. */
static int read_mem(uintptr_t va, void *buf, size_t size) {
	uintptr_t pa = virt_to_phys(va);

	if (pa == NOPHYS) {
		/* NOPHYS is usual for 'bt' command — stay quiet. */
		if (0)
			perror("read_mem:virt_to_phys");
		return 1;
	}
	if (read_physmem(pa, buf, size)) {
		perror("read_mem:read_physmem");
		return 1;
	}
	return 0;
} /* read_mem() */
/* Read one 64-bit word of dump memory at virtual address 'va'. */
static int read_64(uintptr_t va, void *buf) {
return read_mem(va, buf, sizeof(uint64_t));
} /* read_64() */
/* Read one 32-bit word of dump memory at virtual address 'va'. */
static int read_32(uintptr_t va, void *buf) {
return read_mem(va, buf, sizeof(uint32_t));
} /* read_32() */
static int read_symbol_64(char *name, void *buf) {
uintptr_t va;
int error;
va = lookup_symbol(name);
if (va == NOSYMBOL) {
printf("read_symbol_64(%s):lookup_symbol failed\n", name);
return 1;
}
error = read_64(va, buf);
if (error) {
printf("read_symbol_64(%s):read_64(%#lx) failed", name, va);
return 1;
}
return 0;
} /* read_symbol_64() */
/* Indices into the 'debug_constants' array exported by the kernel
 * image: structure sizes/field offsets the analyzer needs, read at
 * startup by setup_constants() and accessed through K(). */
enum {
/* cpu_local_var */
CPU_LOCAL_VAR_SIZE = 0,
CURRENT_OFFSET,
RUNQ_OFFSET,
CPU_STATUS_OFFSET,
/* process */
CTX_OFFSET,
SCHED_LIST_OFFSET,
PROC_OFFSET,
/* fork_tree_node */
STATUS_OFFSET,
PID_OFFSET,
TID_OFFSET,
END_MARK,
}; /* enum */
static uintptr_t debug_constants[END_MARK+1];
#define K(name) (debug_constants[name])
/* Load the 'debug_constants' table from the kernel image in the dump;
 * must run after setup_symbols()/setup_dump().  Returns 0 on success. */
static int setup_constants(void) {
int error;
uintptr_t va;
va = lookup_symbol("debug_constants");
if (va == NOSYMBOL) {
perror("debug_constants");
return 1;
}
error = read_mem(va, debug_constants, sizeof(debug_constants));
if (error) {
perror("debug_constants");
return 1;
}
/* Flip to if(1) for debugging the constant table. */
if (0) {
printf("CPU_LOCAL_VAR_SIZE: %ld\n", K(CPU_LOCAL_VAR_SIZE));
printf("CURRENT_OFFSET: %ld\n", K(CURRENT_OFFSET));
printf("RUNQ_OFFSET: %ld\n", K(RUNQ_OFFSET));
printf("CPU_STATUS_OFFSET: %ld\n", K(CPU_STATUS_OFFSET));
printf("CTX_OFFSET: %ld\n", K(CTX_OFFSET));
printf("SCHED_LIST_OFFSET: %ld\n", K(SCHED_LIST_OFFSET));
printf("PROC_OFFSET: %ld\n", K(PROC_OFFSET));
printf("STATUS_OFFSET: %ld\n", K(STATUS_OFFSET));
printf("PID_OFFSET: %ld\n", K(PID_OFFSET));
printf("TID_OFFSET: %ld\n", K(TID_OFFSET));
printf("END_MARK: %ld\n", K(END_MARK));
}
return 0;
} /* setup_constants() */
static int setup_threads(void) {
int error;
uintptr_t clv;
int cpu;
uintptr_t current;
uintptr_t locals;
size_t locals_span;
error = read_symbol_64("num_processors", &num_processors);
if (error) {
perror("num_processors");
return 1;
}
error = read_symbol_64("locals", &locals);
if (error) {
perror("locals");
return 1;
}
error = read_symbol_64("x86_cpu_local_variables_span", &locals_span);
if (error) {
locals_span = 4096;
}
if (0) printf("locals 0x%lx span 0x%lx\n", locals, locals_span);
error = read_symbol_64("clv", &clv);
if (error) {
perror("clv");
return 1;
}
ihk_mc_switch_context = lookup_symbol("ihk_mc_switch_context");
if (0) printf("ihk_mc_switch_context: %lx\n", ihk_mc_switch_context);
for (cpu = 0; cpu < num_processors; ++cpu) {
uintptr_t v;
uintptr_t head;
uintptr_t entry;
v = clv + (cpu * K(CPU_LOCAL_VAR_SIZE));
error = read_64(v+K(CURRENT_OFFSET), &current);
if (error) {
perror("current");
return 1;
}
head = v + K(RUNQ_OFFSET);
error = read_64(head, &entry);
if (error) {
perror("runq head");
return 1;
}
while (entry != head) {
uintptr_t thread;
uintptr_t proc;
int pid;
int tid;
struct thread_info *ti;
int status;
ti = malloc(sizeof(*ti));
if (!ti) {
perror("malloc");
return 1;
}
thread = entry - K(SCHED_LIST_OFFSET);
error = read_64(thread+K(PROC_OFFSET), &proc);
if (error) {
perror("proc");
return 1;
}
error = read_32(thread+K(STATUS_OFFSET), &status);
if (error) {
perror("status");
return 1;
}
error = read_32(proc+K(PID_OFFSET), &pid);
if (error) {
perror("pid");
return 1;
}
error = read_32(thread+K(TID_OFFSET), &tid);
if (error) {
perror("tid");
return 1;
}
ti->next = NULL;
ti->status = status;
ti->pid = pid;
ti->tid = tid;
ti->cpu = (thread == current)? cpu: -1;
ti->lcpu = cpu;
ti->process = thread;
ti->clv = v;
ti->x86_clv = locals + locals_span*cpu;
*titailp = ti;
titailp = &ti->next;
error = read_64(entry, &entry);
if (error) {
perror("process2");
return 1;
}
}
}
if (!tihead) {
printf("thread not found. cpu mode forcibly\n");
opt.cpu = 1;
}
if (opt.cpu) {
for (cpu = 0; cpu < num_processors; ++cpu) {
uintptr_t v;
struct thread_info *ti;
int status;
uintptr_t current;
v = clv + K(CPU_LOCAL_VAR_SIZE)*cpu;
error = read_32(v+K(CPU_STATUS_OFFSET), &status);
if (error) {
perror("cpu.status");
return 1;
}
if (!status) {
continue;
}
error = read_64(v+K(CURRENT_OFFSET), &current);
if (error) {
perror("current");
return 1;
}
ti = malloc(sizeof(*ti));
if (!ti) {
perror("malloc");
return 1;
}
ti->next = NULL;
ti->status = status << 16;
ti->pid = CPU_TID_BASE + cpu;
ti->tid = CPU_TID_BASE + cpu;
ti->cpu = cpu;
ti->process = current;
ti->clv = v;
ti->x86_clv = locals + locals_span*cpu;
*titailp = ti;
titailp = &ti->next;
}
}
if (!tihead) {
printf("thread not found\n");
return 1;
}
curr_thread = tihead;
return 0;
} /* setup_threads() */
/*
 * Open the kernel image 'fname' and canonicalize its symbol table into
 * the global 'symtab'/'nsyms'.  Returns 0 on success.
 *
 * On success the bfd stays open for the lifetime of the process (the
 * canonicalized symbols reference its storage).  LEAK FIX: the original
 * left the bfd open — and 'symtab' allocated — on every error path.
 */
static int setup_symbols(char *fname) {
	ssize_t needs;
	bfd_boolean ok;

	symbfd = bfd_openr(fname, "elf64-x86-64");
	if (!symbfd) {
		bfd_perror("bfd_openr");
		return 1;
	}
	ok = bfd_check_format(symbfd, bfd_object);
	if (!ok) {
		bfd_perror("bfd_check_format");
		goto fail_close;
	}
	needs = bfd_get_symtab_upper_bound(symbfd);
	if (needs < 0) {
		bfd_perror("bfd_get_symtab_upper_bound");
		goto fail_close;
	}
	if (!needs) {
		printf("no symbols\n");
		goto fail_close;
	}
	symtab = malloc(needs);
	if (!symtab) {
		perror("malloc");
		goto fail_close;
	}
	nsyms = bfd_canonicalize_symtab(symbfd, symtab);
	if (nsyms < 0) {
		bfd_perror("bfd_canonicalize_symtab");
		free(symtab);
		symtab = NULL;
		goto fail_close;
	}
	return 0;

fail_close:
	bfd_close(symbfd);
	symbfd = NULL;
	return 1;
} /* setup_symbols() */
/*
 * Open the IHK memory dump 'fname' and locate its "physmem" section.
 * Also derives kernel_base (kernel load address = dump base + 2MiB).
 * Returns 0 on success.
 *
 * FIX: the 'fname' parameter was previously ignored in favor of the
 * global opt.dump_path; the sole caller passes opt.dump_path, so using
 * the parameter is behavior-identical but makes the function honest.
 */
static int setup_dump(char *fname) {
	bfd_boolean ok;

	dumpbfd = bfd_fopen(fname, "elf64-x86-64", "r", -1);
	if (!dumpbfd) {
		bfd_perror("bfd_fopen");
		return 1;
	}
	ok = bfd_check_format(dumpbfd, bfd_object);
	if (!ok) {
		bfd_perror("bfd_check_format");
		return 1;
	}
	dumpscn = bfd_get_section_by_name(dumpbfd, "physmem");
	if (!dumpscn) {
		bfd_perror("bfd_get_section_by_name");
		return 1;
	}
	/* Kernel text is loaded 2MiB above the start of physical memory. */
	kernel_base = dumpscn->vma + 0x200000;
	return 0;
} /* setup_dump() */
/*
 * Hex-encode the NUL-terminated string 'str' into 'buf' (two lowercase
 * hex digits per input byte, NUL-terminated).  Returns the number of
 * characters written, excluding the terminator.
 *
 * BUG FIX: '*p' was passed to sprintf as a (possibly signed) char, so
 * bytes >= 0x80 were sign-extended and printed as "ffffffxx".  Cast to
 * unsigned char so every byte yields exactly two digits.
 */
static ssize_t print_hex(char *buf, char *str) {
	char *p;
	char *q;

	q = buf;
	for (p = str; *p != '\0'; ++p) {
		q += sprintf(q, "%02x", (unsigned char)*p);
	}
	*q = '\0';
	return (q - buf);
} /* print_hex() */
/*
 * Hex-encode 'size' raw bytes at 'data' into 'buf' (two lowercase hex
 * digits per byte, NUL-terminated).  Returns the number of characters
 * written, excluding the terminator.
 */
static ssize_t print_bin(char *buf, void *data, size_t size) {
	const uint8_t *src = data;
	char *out = buf;
	size_t i;

	for (i = 0; i < size; ++i)
		out += sprintf(out, "%02x", src[i]);
	*out = '\0';
	return out - buf;
} /* print_bin() */
/*
 * Handle one GDB Remote Serial Protocol packet body in 'cmd' and write
 * the reply payload (without framing/checksum) into 'res'.
 *
 * Unrecognized packets fall through the do/while(0) and yield an empty
 * reply, which gdb treats as "unsupported".  Registers are rendered as
 * hex with "xx..." placeholders for values not saved in the dump.
 */
static void command(char *cmd, char *res) {
char *p;
char *rbp;
p = cmd;
rbp = res;
/* do { } while (0): each branch may 'break' to abort with what has
 * been written so far. */
do {
if (!strncmp(p, "qSupported", 10)) {
rbp += sprintf(rbp, "PacketSize=1024");
rbp += sprintf(rbp, ";qXfer:features:read+");
}
/* Hg<tid>: select the thread subsequent register reads refer to. */
else if (!strncmp(p, "Hg", 2)) {
int n;
int tid;
struct thread_info *ti;
p += 2;
n = sscanf(p, "%x", &tid);
if (n != 1) {
printf("cannot parse 'Hg' cmd: \"%s\"\n", p);
break;
}
if (tid) {
for (ti = tihead; ti; ti = ti->next) {
if (ti->tid == tid) {
break;
}
}
if (!ti) {
printf("invalid tid %#x\n", tid);
break;
}
curr_thread = ti;
}
rbp += sprintf(rbp, "OK");
}
else if (!strcmp(p, "Hc-1")) {
rbp += sprintf(rbp, "OK");
}
/* '?': report stop reason; pretend SIGINT-like stop. */
else if (!strcmp(p, "?")) {
rbp += sprintf(rbp, "S02");
}
else if (!strcmp(p, "qC")) {
rbp += sprintf(rbp, "QC%x", curr_thread->tid);
}
else if (!strcmp(p, "qAttached")) {
rbp += sprintf(rbp, "1");
}
/* Target description: fixed x86-64. */
else if (!strncmp(p, "qXfer:features:read:target.xml:", 31)) {
char *str =
"<target version=\"1.0\">"
"<architecture>i386:x86-64</architecture>"
"</target>";
rbp += sprintf(rbp, "l");
if (0)
rbp += print_hex(rbp, str);
rbp += sprintf(rbp, "%s", str);
}
/* 'D': detach — terminates the main loop. */
else if (!strcmp(p, "D")) {
rbp += sprintf(rbp, "OK");
f_done = 1;
}
/* 'g': read all registers of the current thread. */
else if (!strcmp(p, "g")) {
if (curr_thread->cpu < 0) {
/* Switched-out thread: only the callee-saved registers kept
 * by ihk_mc_switch_context are available. */
struct x86_kregs {
uintptr_t rsp, rbp, rbx, rsi;
uintptr_t rdi, r12, r13, r14;
uintptr_t r15, rflags, rsp0;
};
int error;
struct x86_kregs kregs;
error = read_mem(curr_thread->process+K(CTX_OFFSET),
&kregs, sizeof(kregs));
if (error) {
perror("read_mem");
break;
}
rbp += sprintf(rbp, "xxxxxxxxxxxxxxxx"); /* rax */
rbp += print_bin(rbp, &kregs.rbx, sizeof(uint64_t));
rbp += sprintf(rbp, "xxxxxxxxxxxxxxxx"); /* rcx */
rbp += sprintf(rbp, "xxxxxxxxxxxxxxxx"); /* rdx */
rbp += print_bin(rbp, &kregs.rsi, sizeof(uint64_t));
rbp += print_bin(rbp, &kregs.rdi, sizeof(uint64_t));
rbp += print_bin(rbp, &kregs.rbp, sizeof(uint64_t));
rbp += print_bin(rbp, &kregs.rsp, sizeof(uint64_t));
rbp += sprintf(rbp, "xxxxxxxxxxxxxxxx"); /* r8 */
rbp += sprintf(rbp, "xxxxxxxxxxxxxxxx"); /* r9 */
rbp += sprintf(rbp, "xxxxxxxxxxxxxxxx"); /* r10 */
rbp += sprintf(rbp, "xxxxxxxxxxxxxxxx"); /* r11 */
rbp += print_bin(rbp, &kregs.r12, sizeof(uint64_t));
rbp += print_bin(rbp, &kregs.r13, sizeof(uint64_t));
rbp += print_bin(rbp, &kregs.r14, sizeof(uint64_t));
rbp += print_bin(rbp, &kregs.r15, sizeof(uint64_t));
rbp += print_bin(rbp, &ihk_mc_switch_context,
sizeof(uint64_t)); /* rip */
rbp += print_bin(rbp, &kregs.rflags, sizeof(uint32_t));
rbp += sprintf(rbp, "xxxxxxxx"); /* cs */
rbp += sprintf(rbp, "xxxxxxxx"); /* ss */
rbp += sprintf(rbp, "xxxxxxxx"); /* ds */
rbp += sprintf(rbp, "xxxxxxxx"); /* es */
rbp += sprintf(rbp, "xxxxxxxx"); /* fs */
rbp += sprintf(rbp, "xxxxxxxx"); /* gs */
}
else {
/* On-CPU thread: dump the per-cpu saved register block.
 * NOTE(review): offset 240 and the 21-word layout come from
 * the x86 per-cpu save area — confirm against the kernel. */
int error;
uintptr_t regs[21];
uint8_t *pu8;
int i;
error = read_mem(curr_thread->x86_clv+240,
&regs, sizeof(regs));
if (error) {
perror("read_mem");
break;
}
pu8 = (void *)&regs;
for (i = 0; i < sizeof(regs)-4; ++i) {
rbp += sprintf(rbp, "%02x", pu8[i]);
}
}
}
/* Hard-coded replies for two specific text addresses gdb probes. */
else if (!strcmp(p, "mffffffff80018a82,1")) {
rbp += sprintf(rbp, "b8");
}
else if (!strcmp(p, "mffffffff80018a82,9")) {
rbp += sprintf(rbp, "b8f2ffffff41564155");
}
/* m<addr>,<len>: read memory; unreadable bytes become 0xE5 filler. */
else if (!strncmp(p, "m", 1)) {
int n;
uintptr_t start;
size_t size;
uintptr_t addr;
int error;
uint8_t u8;
++p;
n = sscanf(p, "%lx,%lx", &start, &size);
if (n != 2) {
break;
}
for (addr = start; addr < (start + size); ++addr) {
error = read_mem(addr, &u8, sizeof(u8));
if (error) {
u8 = 0xE5;
}
rbp += sprintf(rbp, "%02x", u8);
}
}
else if (!strcmp(p, "qTStatus")) {
rbp += sprintf(rbp, "T0;tnotrun:0");
}
/* Memory map: advertise the kernel text range as ROM. */
else if (!strncmp(p, "qXfer:memory-map:read::", 23)) {
char *str =
"<memory-map>"
"<memory type=\"rom\" start=\"0xffffffff80001000\" length=\"0x27000\"/>"
"</memory-map>";
rbp += sprintf(rbp, "l");
if (0)
rbp += print_hex(rbp, str);
rbp += sprintf(rbp, "%s", str);
}
/* T<tid>: is this thread alive? */
else if (!strncmp(p, "T", 1)) {
int n;
int tid;
struct thread_info *ti;
p += 1;
n = sscanf(p, "%x", &tid);
if (n != 1) {
printf("cannot parse 'T' cmd: \"%s\"\n", p);
break;
}
for (ti = tihead; ti; ti = ti->next) {
if (ti->tid == tid) {
break;
}
}
if (!ti) {
printf("invalid tid %#x\n", tid);
break;
}
rbp += sprintf(rbp, "OK");
}
/* Thread list: all tids in one 'm' reply, then 'l' terminates. */
else if (!strcmp(p, "qfThreadInfo")) {
struct thread_info *ti;
for (ti = tihead; ti; ti = ti->next) {
if (ti == tihead) {
rbp += sprintf(rbp, "m%x", ti->tid);
}
else {
rbp += sprintf(rbp, ",%x", ti->tid);
}
}
}
else if (!strcmp(p, "qsThreadInfo")) {
rbp += sprintf(rbp, "l");
}
/* Human-readable per-thread status string, hex-encoded. */
else if (!strncmp(p, "qThreadExtraInfo,", 17)) {
int n;
int tid;
struct thread_info *ti;
char buf[64];
char *q;
p += 17;
n = sscanf(p, "%x", &tid);
if (n != 1) {
printf("cannot parse 'qThreadExtraInfo' cmd: \"%s\"\n", p);
break;
}
for (ti = tihead; ti; ti = ti->next) {
if (ti->tid == tid) {
break;
}
}
if (!ti) {
printf("invalid tid %#x\n", tid);
break;
}
q = buf;
if (ti->status & PS_RUNNING) {
q += sprintf(q, "running on cpu%d", ti->cpu);
}
else if (ti->status & (PS_INTERRUPTIBLE | PS_UNINTERRUPTIBLE)) {
q += sprintf(q, "waiting on cpu%d", ti->lcpu);
}
else if (ti->status & PS_STOPPED) {
q += sprintf(q, "stopped on cpu%d", ti->lcpu);
}
else if (ti->status & PS_TRACED) {
q += sprintf(q, "traced on cpu%d", ti->lcpu);
}
else if (ti->status == CS_IDLE) {
q += sprintf(q, "cpu%d idle", ti->cpu);
}
else if (ti->status == CS_RUNNING) {
q += sprintf(q, "cpu%d running", ti->cpu);
}
else if (ti->status == CS_RESERVED) {
q += sprintf(q, "cpu%d reserved", ti->cpu);
}
else {
q += sprintf(q, "status=%#x", ti->status);
}
if (ti->tid != ti->pid) {
q += sprintf(q, ",pid=%d", ti->pid);
}
rbp += print_hex(rbp, buf);
}
} while (0);
*rbp = '\0';
return;
} /* command() */
/* Parse command-line flags into the global 'opt' (see print_usage()). */
static void options(int argc, char *argv[]) {
	int c;

	memset(&opt, 0, sizeof(opt));
	opt.kernel_path = "./mckernel.img";
	opt.dump_path = "./mcdump";

	while ((c = getopt(argc, argv, "cd:hk:")) >= 0) {
		switch (c) {
		case 'h':
		case '?':
			opt.help = 1;
			break;
		case 'c':
			opt.cpu = 1;
			break;
		case 'k':
			opt.kernel_path = optarg;
			break;
		case 'd':
			opt.dump_path = optarg;
			break;
		}
	}

	/* Stray non-option arguments also trigger the usage message. */
	if (optind < argc)
		opt.help = 1;
} /* options() */
static int sock = -1;
static FILE *ifp = NULL;
static FILE *ofp = NULL;
/*
 * Fork a gdb that connects back to us over a loopback TCP socket, so
 * this process can serve the GDB remote protocol to it.  On success,
 * 'ifp'/'ofp' wrap the accepted connection.  Returns 0 on success.
 */
static int start_gdb(void) {
struct sockaddr_in sin;
socklen_t slen;
int error;
pid_t pid;
int ss;
sock = socket(PF_INET, SOCK_STREAM, 0);
if (sock < 0) {
perror("socket");
return 1;
}
/* NOTE(review): no bind() before listen(); relies on the OS
 * auto-assigning an ephemeral port — confirm portability. */
error = listen(sock, SOMAXCONN);
if (error) {
perror("listen");
return 1;
}
/* Recover the auto-assigned port to hand to gdb. */
slen = sizeof(sin);
error = getsockname(sock, (struct sockaddr *)&sin, &slen);
if (error) {
perror("getsockname");
return 1;
}
pid = fork();
if (pid == (pid_t)-1) {
perror("fork");
return 1;
}
if (!pid) {
/* Child: exec gdb pointed at our listening port. */
char buf[32];
sprintf(buf, "target remote :%d", ntohs(sin.sin_port));
execlp("gdb", "eclair", "-q", "-ex", "set prompt (eclair) ",
"-ex", buf, opt.kernel_path, NULL);
perror("execlp");
return 3;
}
ss = accept(sock, NULL, NULL);
if (ss < 0) {
perror("accept");
return 1;
}
ifp = fdopen(ss, "r");
if (!ifp) {
perror("fdopen(r)");
return 1;
}
/* NOTE(review): both streams share fd 'ss'; ofp is opened "r+" for
 * writing — looks intentional but worth confirming. */
ofp = fdopen(ss, "r+");
if (!ofp) {
perror("fdopen(r+)");
return 1;
}
return 0;
} /* start_gdb() */
/* Print the one-line usage summary; keep in sync with options(). */
static void print_usage(void) {
	fputs("usage: eclair [-ch] [-d <mcdump>] [-k <kernel.img>]\n", stderr);
} /* print_usage() */
/*
 * Entry point: load symbols, dump, constants and threads, spawn gdb,
 * then run the GDB remote protocol framing loop.
 *
 * Packet framing: '$' starts a packet (mode 1), '#' ends the payload
 * (mode 2/3 read the two checksum digits).  A bad checksum is NAKed
 * with '-', a good one ACKed with '+' and answered via command().
 */
int main(int argc, char *argv[]) {
int c;
int error;
int mode;
uint8_t sum;
uint8_t check;
static char lbuf[1024];
static char rbuf[1024];
static char cbuf[3];
char *lbp;
char *p;
printf("eclair 0.20160314\n");
options(argc, argv);
if (opt.help) {
print_usage();
return 2;
}
error = setup_symbols(opt.kernel_path);
if (error) {
perror("setup_symbols");
print_usage();
return 1;
}
error = setup_dump(opt.dump_path);
if (error) {
perror("setup_dump");
print_usage();
return 1;
}
error = setup_constants();
if (error) {
perror("setup_constants");
return 1;
}
error = setup_threads();
if (error) {
perror("setup_threads");
return 1;
}
error = start_gdb();
if (error) {
perror("start_gdb");
return 1;
}
/* mode: 0 = waiting for '$', 1 = payload, 2/3 = checksum digits. */
mode = 0;
sum = 0;
lbp = NULL;
while (!f_done) {
c = fgetc(ifp);
if (c < 0) {
break;
}
if (mode == 0) {
if (c == '$') {
mode = 1;
sum = 0;
lbp = lbuf;
continue;
}
}
if (mode == 1) {
if (c == '#') {
mode = 2;
*lbp = '\0';
continue;
}
/* Checksum is the modulo-256 sum of payload bytes. */
sum += c;
*lbp++ = c;
}
if (mode == 2) {
cbuf[0] = c;
mode = 3;
continue;
}
if (mode == 3) {
cbuf[1] = c;
cbuf[2] = '\0';
check = strtol(cbuf, NULL, 16);
if (check != sum) {
/* NAK: ask gdb to retransmit. */
mode = 0;
fputc('-', ofp);
continue;
}
mode = 0;
fputc('+', ofp);
command(lbuf, rbuf);
/* Frame the reply with its own checksum. */
sum = 0;
for (p = rbuf; *p != '\0'; ++p) {
sum += *p;
}
fprintf(ofp, "$%s#%02x", rbuf, sum);
fflush(ofp);
continue;
}
}
return 0;
} /* main() */

File diff suppressed because it is too large Load Diff

View File

@ -3,7 +3,7 @@ OBJS = init.o mem.o debug.o mikc.o listeners.o ap.o syscall.o cls.o host.o
OBJS += process.o copy.o waitq.o futex.o timer.o plist.o fileobj.o
DEPSRCS=$(wildcard $(SRC)/*.c)
CFLAGS += -I$(SRC)/include -mcmodel=kernel -D__KERNEL__
CFLAGS += -I$(SRC)/include -D__KERNEL__
CFLAGS += -DKNC_MAP_MICPA $(EXTRA_CFLAGS)
ifeq ("$(DCFA_MODE)", "kmod")

View File

@ -3,10 +3,10 @@ SRC=$(VPATH)
IHKDIR=$(IHKBASE)/$(TARGETDIR)
OBJS = init.o mem.o debug.o mikc.o listeners.o ap.o syscall.o cls.o host.o
OBJS += process.o copy.o waitq.o futex.o timer.o plist.o fileobj.o shmobj.o
OBJS += zeroobj.o procfs.o devobj.o
OBJS += zeroobj.o procfs.o devobj.o sysfs.o
DEPSRCS=$(wildcard $(SRC)/*.c)
CFLAGS += -I$(SRC)/include -mcmodel=kernel -D__KERNEL__
CFLAGS += -I$(SRC)/include -D__KERNEL__ -g
LDFLAGS += -e arch_start
IHKOBJ = ihk/ihk.o

View File

@ -24,20 +24,23 @@
#include <process.h>
#include <init.h>
#include <march.h>
#include <cls.h>
int num_processors = 1;
static volatile int ap_stop = 1;
static void ap_wait(void)
{
wrmsr(MSR_IA32_TIME_STAMP_COUNTER, 0);
init_tick();
while (ap_stop) {
barrier();
cpu_pause();
}
sync_tick();
kmalloc_init();
sched_init();
arch_start_pvclock();
if (find_command_line("hidos")) {
init_host_syscall_channel();
@ -53,7 +56,9 @@ static void ap_wait(void)
void ap_start(void)
{
init_tick();
ap_stop = 0;
sync_tick();
}
void ap_init(void)
@ -63,9 +68,8 @@ void ap_init(void)
int bsp_hw_id;
ihk_mc_init_ap();
init_delay();
wrmsr(MSR_IA32_TIME_STAMP_COUNTER, 0);
cpu_info = ihk_mc_get_cpu_info();
bsp_hw_id = ihk_mc_get_hardware_processor_id();
@ -74,18 +78,154 @@ void ap_init(void)
return;
}
kprintf("BSP HW ID = %d, ", bsp_hw_id);
kprintf("AP Booting :");
kprintf("BSP HW ID = %d\n", bsp_hw_id);
for (i = 0; i < cpu_info->ncpus; i++) {
if (cpu_info->hw_ids[i] == bsp_hw_id) {
continue;
}
kprintf("AP Booting: %d (HW ID: %d)\n", i, cpu_info->hw_ids[i]);
ihk_mc_boot_cpu(cpu_info->hw_ids[i], (unsigned long)ap_wait);
kprintf(" %d", cpu_info->hw_ids[i]);
num_processors++;
}
kprintf(" .. Done\n");
kprintf("AP Booting: Done\n");
}
#include <sysfs.h>
#include <kmalloc.h>
#include <string.h>
#include <vsprintf.h>
/* sysfs show handler: render the int pointed to by `instance` as
 * "<value>\n" into `buf` (at most `size` bytes). */
static ssize_t
show_int(struct sysfs_ops *ops, void *instance, void *buf, size_t size)
{
	const int *value = instance;

	return snprintf(buf, size, "%d\n", *value);
}/* show_int() */
struct sysfs_ops show_int_ops = {
.show = &show_int,
};
struct fake_cpu_info {
int online;
};
static struct fake_cpu_info *fake_cpu_infos = NULL;
enum fake_cpu_info_member {
ONLINE,
};
struct fake_cpu_info_ops {
enum fake_cpu_info_member member;
struct sysfs_ops ops;
};
/* sysfs show handler for a fake_cpu_info attribute.  The member to
 * render is identified by the enclosing fake_cpu_info_ops (recovered
 * from the embedded sysfs_ops via container_of).  Returns the number
 * of bytes formatted, -EINVAL for an unknown member, or -ENOSPC when
 * the output was truncated. */
static ssize_t
show_fake_cpu_info(struct sysfs_ops *ops0, void *instance, void *buf,
		size_t size)
{
	struct fake_cpu_info_ops *ops
		= container_of(ops0, struct fake_cpu_info_ops, ops);
	struct fake_cpu_info *info = instance;
	ssize_t n;

	switch (ops->member) {
	case ONLINE:
		n = snprintf(buf, size, "%d\n", info->online);
		break;
	default:
		n = -EINVAL;
		break;
	}

	/* Report -ENOSPC only for a genuinely truncated snprintf.  The
	 * n >= 0 guard is the fix: comparing a negative ssize_t against a
	 * size_t promotes it to a huge unsigned value, which made the
	 * original code overwrite -EINVAL with -ENOSPC. */
	if (n >= 0 && (size_t)n >= size) {
		n = -ENOSPC;
	}
	return n;
} /* show_fake_cpu_info() */
/* sysfs store handler for a fake_cpu_info attribute.  Writing is not
 * implemented yet: the ONLINE member just logs what would have been
 * stored and claims the whole write; any other member yields -EIO. */
static ssize_t
store_fake_cpu_info(struct sysfs_ops *ops0, void *instance, void *buf,
		size_t size)
{
	struct fake_cpu_info_ops *fci_ops
		= container_of(ops0, struct fake_cpu_info_ops, ops);
	struct fake_cpu_info *fci = instance;

	if (fci_ops->member != ONLINE) {
		return -EIO;
	}
	kprintf("NYI:store_fake_cpu_info(%p,%p,%p,%ld): "
			"online %d --> \"%.*s\"\n",
			ops0, instance, buf, size, fci->online,
			(int)size, buf);
	return size;
} /* store_fake_cpu_info() */
static struct fake_cpu_info_ops show_fci_online = {
.member = ONLINE,
.ops.show = &show_fake_cpu_info,
.ops.store = &store_fake_cpu_info,
};
/*
 * Populate the fake /sys/devices/system/cpu tree: one global
 * num_processors attribute, plus a per-cpu online attribute and a
 * /sys/bus/cpu/devices/cpu%d symlink for each processor.
 * Panics on any setup failure (boot-time code, no recovery path).
 */
void
cpu_sysfs_setup(void)
{
	int error;
	int cpu;
	sysfs_handle_t targeth;
	struct fake_cpu_info *info;

	/* sample of simple variable **********************************/
	error = sysfs_createf(&show_int_ops, &num_processors, 0444,
			"/sys/devices/system/cpu/num_processors");
	if (error) {
		panic("cpu_sysfs_setup:sysfs_createf(num_processors) failed\n");
	}

	/* sample of more complex variable ****************************/
	/* setup table */
	info = kmalloc(sizeof(*info) * num_processors, IHK_MC_AP_CRITICAL);
	if (!info) {
		/* Fix: the original dereferenced info without a NULL
		 * check; fail loudly instead of faulting later. */
		panic("cpu_sysfs_setup:kmalloc(fake_cpu_infos) failed\n");
	}
	for (cpu = 0; cpu < num_processors; ++cpu) {
		/* dummy value: cpu%d reads back as 10+%d */
		info[cpu].online = 10+cpu;
	}
	fake_cpu_infos = info;

	/* setup sysfs tree */
	for (cpu = 0; cpu < num_processors; ++cpu) {
		/* online */
		error = sysfs_createf(&show_fci_online.ops,
				&fake_cpu_infos[cpu], 0644,
				"/sys/devices/system/cpu/cpu%d/online", cpu);
		if (error) {
			panic("cpu_sysfs_setup:sysfs_createf failed\n");
		}

		/* link to cpu%d */
		error = sysfs_lookupf(&targeth,
				"/sys/devices/system/cpu/cpu%d", cpu);
		if (error) {
			panic("cpu_sysfs_setup:sysfs_lookupf failed\n");
		}
		error = sysfs_symlinkf(targeth, "/sys/bus/cpu/devices/cpu%d",
				cpu);
		if (error) {
			panic("cpu_sysfs_setup:sysfs_symlinkf failed\n");
		}
	}
	return;
} /* cpu_sysfs_setup() */

View File

@ -23,6 +23,7 @@
extern int num_processors;
struct cpu_local_var *clv;
static int cpu_local_var_initialized = 0;
void cpu_local_var_init(void)
{
@ -33,9 +34,22 @@ void cpu_local_var_init(void)
clv = allocate_pages(z, IHK_MC_AP_CRITICAL);
memset(clv, 0, z * PAGE_SIZE);
cpu_local_var_initialized = 1;
}
struct cpu_local_var *get_cpu_local_var(int id)
{
return clv + id;
}
void preempt_enable(void)
{
if (cpu_local_var_initialized)
--cpu_local_var(no_preempt);
}
void preempt_disable(void)
{
if (cpu_local_var_initialized)
++cpu_local_var(no_preempt);
}

View File

@ -26,10 +26,14 @@ SECTIONS
. = vsyscall_page + 0x000;
*(.vsyscall.gettimeofday)
*(.vsyscall.gettimeofday.*)
. = vsyscall_page + 0x400;
*(.vsyscall.time)
. = vsyscall_page + 0x800;
*(.vsyscall.getcpu)
. = ALIGN(4096);
} : data = 0xf4

View File

@ -26,10 +26,14 @@ SECTIONS
. = vsyscall_page + 0x000;
*(.vsyscall.gettimeofday)
*(.vsyscall.gettimeofday.*)
. = vsyscall_page + 0x400;
*(.vsyscall.time)
. = vsyscall_page + 0x800;
*(.vsyscall.getcpu)
. = ALIGN(4096);
} : data = 0xf4

View File

@ -26,10 +26,14 @@ SECTIONS
. = vsyscall_page + 0x000;
*(.vsyscall.gettimeofday)
*(.vsyscall.gettimeofday.*)
. = vsyscall_page + 0x400;
*(.vsyscall.time)
. = vsyscall_page + 0x800;
*(.vsyscall.getcpu)
. = ALIGN(4096);
} : data = 0xf4
@ -39,8 +43,4 @@ SECTIONS
. = ALIGN(4096);
_end = .;
/DISCARD/ : {
*(.eh_frame)
*(.note.gnu.build-id)
}
}

View File

@ -1,6 +1,5 @@
CC = /usr/linux-k1om-4.7/bin/x86_64-k1om-linux-gcc
LD = /usr/linux-k1om-4.7/bin/x86_64-k1om-linux-ld
CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow
LDFLAGS += -m elf_k1om -T $(SRC)/config/attached-mic.lds
LDFLAGS_MKIMAGE = -m elf_k1om

View File

@ -3,6 +3,5 @@ LD = /usr/linux-k1om-4.7/bin/x86_64-k1om-linux-ld
OBJDUMP = /usr/linux-k1om-4.7/bin/x86_64-k1om-linux-objdump
OBJCOPY = /usr/linux-k1om-4.7/bin/x86_64-k1om-linux-objcopy
CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow
LDFLAGS += -m elf_k1om -T $(SRC)/config/builtin-mic.lds
LDFLAGS_MKIMAGE = -m elf_k1om

View File

@ -1,2 +1 @@
CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow
LDFLAGS += -T $(SRC)/config/builtin-x86.lds

View File

@ -0,0 +1 @@
LDFLAGS += -T $(SRC)/config/smp-x86.lds

45
kernel/config/smp-x86.lds Normal file
View File

@ -0,0 +1,45 @@
PHDRS
{
text PT_LOAD FLAGS(5);
data PT_LOAD FLAGS(7);
}
SECTIONS
{
. = 0xffffffff80001000;
_head = .;
.text : {
*(.text);
} : text
. = ALIGN(4096);
.data : {
*(.data)
*(.data.*)
} :data
.rodata : {
*(.rodata .rodata.*)
} :data
.vsyscall : ALIGN(0x1000) {
vsyscall_page = .;
. = vsyscall_page + 0x000;
*(.vsyscall.gettimeofday)
*(.vsyscall.gettimeofday.*)
. = vsyscall_page + 0x400;
*(.vsyscall.time)
. = vsyscall_page + 0x800;
*(.vsyscall.getcpu)
. = ALIGN(4096);
} : data = 0xf4
.bss : {
*(.bss .bss.*)
}
. = ALIGN(4096);
_end = .;
}

View File

@ -22,13 +22,44 @@ extern int vsnprintf(char *buf, size_t size, const char *fmt, va_list args);
extern int sprintf(char * buf, const char *fmt, ...);
static ihk_spinlock_t kmsg_lock;
static unsigned long kprintf_lock_head(void);
static void kprintf_unlock_head(unsigned long irqflags);
/*
 * Wait until the kmsg ring buffer has room for `len` more bytes after
 * the current tail, treating the write as wrapping to 0 when it would
 * run past the end (`adj` accounts for the skipped slack).
 *
 * When mode == 1 the writer blocks: it drops and reacquires the head
 * lock (via *flags_head) each iteration so a reader can advance
 * kmsg_buf.head, then re-checks.  For any other mode it does not wait;
 * it sets *slide = 1 to tell the caller to push head forward past the
 * new data (overwriting the oldest messages).
 * NOTE(review): mode 1 appears to mean "block until the reader catches
 * up" -- confirm against the kmsg reader side.
 *
 * Caller must hold both the kmsg tail lock (kprintf_lock) and the head
 * lock whose IRQ flags are passed in *flags_head.
 */
static void kprintf_wait(int len, unsigned long *flags_head, int *slide) {
	int head, tail, buf_len, mode, adj;
	mode = kmsg_buf.mode;
	while (1) {
		adj = 0;
		tail = kmsg_buf.tail;
		buf_len = kmsg_buf.len;
		head = kmsg_buf.head;
		/* unwrap head so head >= tail for the overlap test below */
		if (head < tail) head += buf_len;
		/* write would wrap: the bytes from tail to end are skipped */
		if (tail + len > buf_len) adj = buf_len - tail;
		if (head > tail && head <= tail + len + adj) {
			/* unread data would be overwritten */
			if (mode != 1) {
				*slide = 1;
				break;
			} else {
				/* let the reader in, then re-check */
				kprintf_unlock_head(*flags_head);
				*flags_head = kprintf_lock_head();
			}
		} else {
			break;
		}
	}
}
/* TODO: lock */
void kputs(char *buf)
{
int len = strlen(buf);
unsigned long flags;
int slide = 0;
unsigned long flags_tail, flags_head;
flags = ihk_mc_spinlock_lock(&kmsg_lock);
flags_tail = kprintf_lock();
flags_head = kprintf_lock_head();
kprintf_wait(len, &flags_head, &slide);
if (len + kmsg_buf.tail > kmsg_buf.len) {
kmsg_buf.tail = 0;
@ -39,27 +70,43 @@ void kputs(char *buf)
memcpy(kmsg_buf.str + kmsg_buf.tail, buf, len);
kmsg_buf.tail += len;
ihk_mc_spinlock_unlock(&kmsg_lock, flags);
if (slide == 1) {
kmsg_buf.head = kmsg_buf.tail + 1;
if (kmsg_buf.head >= kmsg_buf.len) kmsg_buf.head = 0;
}
kprintf_unlock_head(flags_head);
kprintf_unlock(flags_tail);
}
#define KPRINTF_LOCAL_BUF_LEN 1024
int kprintf_lock()
unsigned long kprintf_lock(void)
{
return ihk_mc_spinlock_lock(&kmsg_lock);
return __ihk_mc_spinlock_lock(&kmsg_lock);
}
void kprintf_unlock(int irqflags)
void kprintf_unlock(unsigned long irqflags)
{
ihk_mc_spinlock_unlock(&kmsg_lock, irqflags);
__ihk_mc_spinlock_unlock(&kmsg_lock, irqflags);
}
static unsigned long kprintf_lock_head(void)
{
return __ihk_mc_spinlock_lock(&kmsg_buf.lock);
}
static void kprintf_unlock_head(unsigned long irqflags)
{
__ihk_mc_spinlock_unlock(&kmsg_buf.lock, irqflags);
}
/* Caller must hold kmsg_lock! */
int __kprintf(const char *format, ...)
{
int len = 0;
int slide = 0;
va_list va;
unsigned long flags_head;
char buf[KPRINTF_LOCAL_BUF_LEN];
/* Copy into the local buf */
@ -67,6 +114,9 @@ int __kprintf(const char *format, ...)
len += vsnprintf(buf + len, KPRINTF_LOCAL_BUF_LEN - len - 2, format, va);
va_end(va);
flags_head = kprintf_lock_head();
kprintf_wait(len, &flags_head, &slide);
/* Append to kmsg buffer */
if (kmsg_buf.tail + len > kmsg_buf.len) {
kmsg_buf.tail = 0;
@ -74,25 +124,33 @@ int __kprintf(const char *format, ...)
memcpy(kmsg_buf.str + kmsg_buf.tail, buf, len);
kmsg_buf.tail += len;
if (slide == 1) {
kmsg_buf.head = kmsg_buf.tail + 1;
if (kmsg_buf.head >= kmsg_buf.len) kmsg_buf.head = 0;
}
kprintf_unlock_head(flags_head);
return len;
}
int kprintf(const char *format, ...)
{
int len = 0;
int slide = 0;
va_list va;
unsigned long flags;
unsigned long flags_tail, flags_head;
char buf[KPRINTF_LOCAL_BUF_LEN];
flags = ihk_mc_spinlock_lock(&kmsg_lock);
/* Copy into the local buf */
len = sprintf(buf, "[%3d]: ", ihk_mc_get_processor_id());
va_start(va, format);
len += vsnprintf(buf + len, KPRINTF_LOCAL_BUF_LEN - len - 2, format, va);
va_end(va);
flags_tail = kprintf_lock();
flags_head = kprintf_lock_head();
kprintf_wait(len, &flags_head, &slide);
/* Append to kmsg buffer */
if (kmsg_buf.tail + len > kmsg_buf.len) {
kmsg_buf.tail = 0;
@ -100,16 +158,24 @@ int kprintf(const char *format, ...)
memcpy(kmsg_buf.str + kmsg_buf.tail, buf, len);
kmsg_buf.tail += len;
if (slide == 1) {
kmsg_buf.head = kmsg_buf.tail + 1;
if (kmsg_buf.head >= kmsg_buf.len) kmsg_buf.head = 0;
}
ihk_mc_spinlock_unlock(&kmsg_lock, flags);
kprintf_unlock_head(flags_head);
kprintf_unlock(flags_tail);
return len;
}
void kmsg_init(void)
void kmsg_init(int mode)
{
ihk_mc_spinlock_init(&kmsg_lock);
kmsg_buf.tail = 0;
kmsg_buf.len = sizeof(kmsg_buf.str);
kmsg_buf.head = 0;
kmsg_buf.mode = mode;
ihk_mc_spinlock_init(&kmsg_buf.lock);
memset(kmsg_buf.str, 0, kmsg_buf.len);
}

View File

@ -3,7 +3,8 @@
* License details are found in the file LICENSE.
* \brief
* memory mapped device pager client
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com>
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com> \par
* Copyright (C) 2014 RIKEN AICS
*/
/*
* HISTORY:
@ -32,9 +33,18 @@
#include <pager.h>
#include <string.h>
#include <syscall.h>
#include <process.h>
//#define DEBUG_PRINT_DEVOBJ
#ifdef DEBUG_PRINT_DEVOBJ
#define dkprintf(...) kprintf(__VA_ARGS__)
#define ekprintf(...) kprintf(__VA_ARGS__)
#else
#define dkprintf(...) do { if (0) kprintf(__VA_ARGS__); } while (0)
#define ekprintf(...) kprintf(__VA_ARGS__)
#endif
#define dkprintf(...)
#define ekprintf(...) kprintf(__VA_ARGS__)
struct devobj {
struct memobj memobj; /* must be first */
@ -76,7 +86,7 @@ int devobj_create(int fd, size_t len, off_t off, struct memobj **objp, int *maxp
struct devobj *obj = NULL;
const size_t npages = (len + PAGE_SIZE - 1) / PAGE_SIZE;
kprintf("devobj_create(%d,%lx,%lx)\n", fd, len, off);
dkprintf("devobj_create(%d,%lx,%lx)\n", fd, len, off);
#define MAX_PAGES_IN_DEVOBJ (PAGE_SIZE / sizeof(uintptr_t))
if (npages > MAX_PAGES_IN_DEVOBJ) {
error = -EFBIG;
@ -111,8 +121,8 @@ int devobj_create(int fd, size_t len, off_t off, struct memobj **objp, int *maxp
kprintf("devobj_create(%d,%lx,%lx):map failed. %d\n", fd, len, off, error);
goto out;
}
kprintf("devobj_create:handle: %lx\n", result.handle);
kprintf("devobj_create:maxprot: %x\n", result.maxprot);
dkprintf("devobj_create:handle: %lx\n", result.handle);
dkprintf("devobj_create:maxprot: %x\n", result.maxprot);
obj->memobj.ops = &devobj_ops;
obj->memobj.flags = MF_HAS_PAGER;
@ -134,7 +144,7 @@ out:
}
kfree(obj);
}
kprintf("devobj_create(%d,%lx,%lx): %d %p %x%d\n", fd, len, off, error, *objp, *maxprotp);
dkprintf("devobj_create(%d,%lx,%lx): %d %p %x%d\n", fd, len, off, error, *objp, *maxprotp);
return error;
}
@ -142,7 +152,7 @@ static void devobj_ref(struct memobj *memobj)
{
struct devobj *obj = to_devobj(memobj);
kprintf("devobj_ref(%p %lx):\n", obj, obj->handle);
dkprintf("devobj_ref(%p %lx):\n", obj, obj->handle);
memobj_lock(&obj->memobj);
++obj->ref;
memobj_unlock(&obj->memobj);
@ -155,7 +165,7 @@ static void devobj_release(struct memobj *memobj)
struct devobj *free_obj = NULL;
uintptr_t handle;
kprintf("devobj_release(%p %lx)\n", obj, obj->handle);
dkprintf("devobj_release(%p %lx)\n", obj, obj->handle);
memobj_lock(&obj->memobj);
--obj->ref;
@ -187,12 +197,12 @@ static void devobj_release(struct memobj *memobj)
kfree(free_obj);
}
kprintf("devobj_release(%p %lx):free %p\n",
dkprintf("devobj_release(%p %lx):free %p\n",
obj, handle, free_obj);
return;
}
static int devobj_get_page(struct memobj *memobj, off_t off, int p2align, uintptr_t *physp)
static int devobj_get_page(struct memobj *memobj, off_t off, int p2align, uintptr_t *physp, unsigned long *flag)
{
const off_t pgoff = off >> PAGE_SHIFT;
struct devobj *obj = to_devobj(memobj);
@ -202,7 +212,7 @@ static int devobj_get_page(struct memobj *memobj, off_t off, int p2align, uintpt
ihk_mc_user_context_t ctx;
int ix;
kprintf("devobj_get_page(%p %lx,%lx,%d)\n", memobj, obj->handle, off, p2align);
dkprintf("devobj_get_page(%p %lx,%lx,%d)\n", memobj, obj->handle, off, p2align);
if ((pgoff < obj->pfn_pgoff) || ((obj->pfn_pgoff + obj->npages) <= pgoff)) {
error = -EFBIG;
@ -210,7 +220,7 @@ static int devobj_get_page(struct memobj *memobj, off_t off, int p2align, uintpt
goto out;
}
ix = pgoff - obj->pfn_pgoff;
kprintf("ix: %ld\n", ix);
dkprintf("ix: %ld\n", ix);
memobj_lock(&obj->memobj);
pfn = obj->pfn_table[ix];
@ -230,12 +240,20 @@ kprintf("ix: %ld\n", ix);
if (pfn & PFN_PRESENT) {
/* convert remote physical into local physical */
kprintf("devobj_get_page(%p %lx,%lx,%d):PFN_PRESENT before %#lx\n", memobj, obj->handle, off, p2align, pfn);
dkprintf("devobj_get_page(%p %lx,%lx,%d):PFN_PRESENT before %#lx\n", memobj, obj->handle, off, p2align, pfn);
attr = pfn & ~PFN_PFN;
/* TODO: do an arch dependent PTE to mapping flag conversion
* instead of this inline check, also, we rely on having the
* same PAT config as Linux here.. */
if ((pfn & PFL1_PWT) && !(pfn & PFL1_PCD)) {
*flag |= VR_WRITE_COMBINED;
}
pfn = ihk_mc_map_memory(NULL, (pfn & PFN_PFN), PAGE_SIZE);
pfn &= PFN_PFN;
pfn |= attr;
kprintf("devobj_get_page(%p %lx,%lx,%d):PFN_PRESENT after %#lx\n", memobj, obj->handle, off, p2align, pfn);
dkprintf("devobj_get_page(%p %lx,%lx,%d):PFN_PRESENT after %#lx\n", memobj, obj->handle, off, p2align, pfn);
}
memobj_lock(&obj->memobj);
@ -253,6 +271,6 @@ kprintf("devobj_get_page(%p %lx,%lx,%d):PFN_PRESENT after %#lx\n", memobj, obj->
*physp = pfn & PFN_PFN;
out:
kprintf("devobj_get_page(%p %lx,%lx,%d): %d %lx\n", memobj, obj->handle, off, p2align, error, *physp);
dkprintf("devobj_get_page(%p %lx,%lx,%d): %d %lx\n", memobj, obj->handle, off, p2align, error, *physp);
return error;
}

View File

@ -26,7 +26,7 @@
#include <string.h>
#include <syscall.h>
#define dkprintf(...)
#define dkprintf(...) do { if (0) kprintf(__VA_ARGS__); } while (0)
#define ekprintf(...) kprintf(__VA_ARGS__)
static ihk_spinlock_t fileobj_list_lock = SPIN_LOCK_UNLOCKED;
@ -46,6 +46,8 @@ static memobj_ref_func_t fileobj_ref;
static memobj_get_page_func_t fileobj_get_page;
static memobj_copy_page_func_t fileobj_copy_page;
static memobj_flush_page_func_t fileobj_flush_page;
static memobj_invalidate_page_func_t fileobj_invalidate_page;
static memobj_lookup_page_func_t fileobj_lookup_page;
static struct memobj_ops fileobj_ops = {
.release = &fileobj_release,
@ -53,6 +55,8 @@ static struct memobj_ops fileobj_ops = {
.get_page = &fileobj_get_page,
.copy_page = &fileobj_copy_page,
.flush_page = &fileobj_flush_page,
.invalidate_page = &fileobj_invalidate_page,
.lookup_page = &fileobj_lookup_page,
};
static struct fileobj *to_fileobj(struct memobj *memobj)
@ -383,9 +387,9 @@ out:
return;
}
static int fileobj_get_page(struct memobj *memobj, off_t off, int p2align, uintptr_t *physp)
static int fileobj_get_page(struct memobj *memobj, off_t off, int p2align, uintptr_t *physp, unsigned long *pflag)
{
struct process *proc = cpu_local_var(current);
struct thread *proc = cpu_local_var(current);
struct fileobj *obj = to_fileobj(memobj);
int error;
void *virt = NULL;
@ -577,3 +581,67 @@ static int fileobj_flush_page(struct memobj *memobj, uintptr_t phys,
memobj_lock(&obj->memobj);
return 0;
}
/*
 * memobj op: drop the cached page backing physical address `phys` from
 * this fileobj, if it is present in the object's page list.
 *
 * If phys does not map to a tracked page the call is a no-op returning
 * 0.  When the page's refcount is exactly 1 (the page list itself holds
 * the last reference -- no user mapping left), the page is unmapped and,
 * if page_unmap() says it is fully gone, its memory is freed.
 * NOTE(review): presumably invoked on file truncation/invalidation from
 * the pager -- confirm with the callers of memobj->invalidate_page.
 *
 * Always returns 0.
 */
static int fileobj_invalidate_page(struct memobj *memobj, uintptr_t phys,
		size_t pgsize)
{
	struct fileobj *obj = to_fileobj(memobj);
	int error;
	struct page *page;

	dkprintf("fileobj_invalidate_page(%p,%#lx,%#lx)\n",
			memobj, phys, pgsize);

	/* not one of ours, or not cached: nothing to invalidate */
	if (!(page = phys_to_page(phys))
			|| !(page = page_list_lookup(obj, page->offset))) {
		error = 0;
		goto out;
	}

	if (ihk_atomic_read(&page->count) == 1) {
		if (page_unmap(page)) {
			ihk_mc_free_pages(phys_to_virt(phys),
					pgsize/PAGE_SIZE);
		}
	}

	error = 0;
out:
	dkprintf("fileobj_invalidate_page(%p,%#lx,%#lx):%d\n",
			memobj, phys, pgsize, error);
	return error;
}
/*
 * memobj op: look up the physical page caching file offset `off` in
 * this fileobj's page list WITHOUT faulting one in.
 *
 * Only base-size pages (p2align == PAGE_P2ALIGN) are kept in the list;
 * any other alignment request fails with -ENOMEM.  Returns -ENOENT when
 * no page caches `off`.  On success returns 0 and, if `physp` is
 * non-NULL, stores the page's physical address there.  `pflag` is
 * accepted for interface symmetry with get_page but never written.
 *
 * The object lock is held across the lookup so the page cannot be
 * removed from the list mid-lookup.
 */
static int fileobj_lookup_page(struct memobj *memobj, off_t off, int p2align, uintptr_t *physp, unsigned long *pflag)
{
	struct fileobj *obj = to_fileobj(memobj);
	int error;
	uintptr_t phys = -1;
	struct page *page;

	dkprintf("fileobj_lookup_page(%p,%lx,%x,%p)\n", obj, off, p2align, physp);
	memobj_lock(&obj->memobj);

	/* page list only holds base-size pages */
	if (p2align != PAGE_P2ALIGN) {
		error = -ENOMEM;
		goto out;
	}

	page = page_list_lookup(obj, off);
	if (!page) {
		error = -ENOENT;
		dkprintf("fileobj_lookup_page(%p,%lx,%x,%p): page not found. %d\n", obj, off, p2align, physp, error);
		goto out;
	}
	phys = page_to_phys(page);

	error = 0;
	if (physp) {
		*physp = phys;
	}
out:
	memobj_unlock(&obj->memobj);
	dkprintf("fileobj_lookup_page(%p,%lx,%x,%p): %d %lx\n",
			obj, off, p2align, physp, error, phys);
	return error;
}

View File

@ -76,9 +76,11 @@
#ifdef DEBUG_PRINT_FUTEX
#define dkprintf kprintf
#else
#define dkprintf(...)
#define dkprintf(...) do { if (0) kprintf(__VA_ARGS__); } while (0)
#endif
extern struct sigpending *hassigpending(struct thread *thread);
int futex_cmpxchg_enabled;
/**
@ -103,7 +105,7 @@ int futex_cmpxchg_enabled;
struct futex_q {
struct plist_node list;
struct process *task;
struct thread *task;
ihk_spinlock_t *lock_ptr;
union futex_key key;
union futex_key *requeue_pi_key;
@ -151,7 +153,7 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
*/
static void get_futex_key_refs(union futex_key *key)
{
/* RIKEN: only !fshared futexes... */
/* RIKEN: no swapping in McKernel */
return;
}
@ -161,7 +163,7 @@ static void get_futex_key_refs(union futex_key *key)
*/
static void drop_futex_key_refs(union futex_key *key)
{
/* RIKEN: only !fshared futexes... */
/* RIKEN: no swapping in McKernel */
return;
}
/**
@ -183,6 +185,7 @@ static int
get_futex_key(uint32_t *uaddr, int fshared, union futex_key *key)
{
unsigned long address = (unsigned long)uaddr;
unsigned long phys;
struct process_vm *mm = cpu_local_var(current)->vm;
/*
@ -201,15 +204,31 @@ get_futex_key(uint32_t *uaddr, int fshared, union futex_key *key)
* but access_ok() should be faster than find_vma()
*/
if (!fshared) {
key->private.mm = mm;
key->private.address = address;
get_futex_key_refs(key);
return 0;
}
/* RIKEN: No shared futex support... */
return -EFAULT;
key->both.offset |= FUT_OFF_MMSHARED;
retry_v2p:
/* Just use physical address of page, McKernel does not do swapping */
if (ihk_mc_pt_virt_to_phys(mm->address_space->page_table,
(void *)uaddr, &phys)) {
/* Check if we can fault in page */
if (page_fault_process_vm(mm, uaddr, PF_POPULATE | PF_WRITE | PF_USER)) {
kprintf("error: get_futex_key() virt to phys translation failed\n");
return -EFAULT;
}
goto retry_v2p;
}
key->shared.phys = (void *)phys;
key->shared.pgoff = 0;
return 0;
}
@ -232,7 +251,7 @@ static int cmpxchg_futex_value_locked(uint32_t __user *uaddr, uint32_t uval, uin
static int get_futex_value_locked(uint32_t *dest, uint32_t *from)
{
/* RIKEN: futexes are always on not swappable pages */
*dest = *from;
*dest = getint_user((int *)from);
return 0;
}
@ -243,7 +262,7 @@ static int get_futex_value_locked(uint32_t *dest, uint32_t *from)
*/
static void wake_futex(struct futex_q *q)
{
struct process *p = q->task;
struct thread *p = q->task;
/*
* We set q->lock_ptr = NULL _before_ we wake up the task. If
@ -263,7 +282,8 @@ static void wake_futex(struct futex_q *q)
barrier();
q->lock_ptr = NULL;
sched_wakeup_process(p, PS_NORMAL);
dkprintf("wake_futex(): waking up tid %d\n", p->tid);
sched_wakeup_thread(p, PS_NORMAL);
}
/*
@ -658,23 +678,27 @@ static uint64_t futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q
* queue_me() calls spin_unlock() upon completion, both serializing
* access to the hash list and forcing another memory barrier.
*/
xchg4(&(cpu_local_var(current)->ftn->status), PS_INTERRUPTIBLE);
xchg4(&(cpu_local_var(current)->status), PS_INTERRUPTIBLE);
queue_me(q, hb);
if (!plist_node_empty(&q->list)) {
/* RIKEN: use mcos timers */
if (timeout) {
dkprintf("futex_wait_queue_me(): tid: %d schedule_timeout()\n", cpu_local_var(current)->tid);
time_remain = schedule_timeout(timeout);
}
else {
dkprintf("futex_wait_queue_me(): tid: %d schedule()\n", cpu_local_var(current)->tid);
schedule();
time_remain = 0;
}
dkprintf("futex_wait_queue_me(): tid: %d woken up\n", cpu_local_var(current)->tid);
}
/* This does not need to be serialized */
cpu_local_var(current)->ftn->status = PS_RUNNING;
cpu_local_var(current)->status = PS_RUNNING;
return time_remain;
}
@ -775,6 +799,11 @@ retry:
if (timeout && !time_remain)
goto out_put_key;
if (hassigpending(cpu_local_var(current))) {
ret = -EINTR;
goto out_put_key;
}
/* RIKEN: no signals */
put_futex_key(fshared, &q.key);
goto retry;
@ -786,17 +815,10 @@ out:
}
int futex(uint32_t *uaddr, int op, uint32_t val, uint64_t timeout,
uint32_t *uaddr2, uint32_t val2, uint32_t val3)
uint32_t *uaddr2, uint32_t val2, uint32_t val3, int fshared)
{
int clockrt, ret = -ENOSYS;
int cmd = op & FUTEX_CMD_MASK;
int fshared = 0;
/* RIKEN: Assume address space private futexes.
if (!(op & FUTEX_PRIVATE_FLAG)) {
fshared = 1;
}
*/
clockrt = op & FUTEX_CLOCK_REALTIME;
if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
@ -817,8 +839,7 @@ int futex(uint32_t *uaddr, int op, uint32_t val, uint64_t timeout,
ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0);
break;
case FUTEX_CMP_REQUEUE:
ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3,
0);
ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, 0);
break;
case FUTEX_WAKE_OP:
ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3);

View File

@ -28,20 +28,23 @@
#include <process.h>
#include <page.h>
#include <mman.h>
#include <init.h>
#include <kmalloc.h>
#include <sysfs.h>
//#define DEBUG_PRINT_HOST
#ifdef DEBUG_PRINT_HOST
#define dkprintf kprintf
#else
#define dkprintf(...)
#define dkprintf(...) do { if (0) kprintf(__VA_ARGS__); } while (0)
#endif
void check_mapping_for_proc(struct process *proc, unsigned long addr)
void check_mapping_for_proc(struct thread *thread, unsigned long addr)
{
unsigned long __phys;
if (ihk_mc_pt_virt_to_phys(proc->vm->page_table, (void*)addr, &__phys)) {
if (ihk_mc_pt_virt_to_phys(thread->vm->address_space->page_table, (void*)addr, &__phys)) {
kprintf("check_map: no mapping for 0x%lX\n", addr);
}
else {
@ -58,7 +61,7 @@ void check_mapping_for_proc(struct process *proc, unsigned long addr)
* NOTE: if args, args_len, envs, envs_len are zero,
* the function constructs them based on the descriptor
*/
int prepare_process_ranges_args_envs(struct process *proc,
int prepare_process_ranges_args_envs(struct thread *thread,
struct program_load_desc *pn,
struct program_load_desc *p,
enum ihk_mc_pt_attribute attr,
@ -69,7 +72,8 @@ int prepare_process_ranges_args_envs(struct process *proc,
unsigned long args_envs_p, args_envs_rp;
unsigned long s, e, up;
char **argv;
int i, n, argc, envc, args_envs_npages;
char **a;
int i, n, argc, envc, args_envs_npages, l;
char **env;
int range_npages;
void *up_v;
@ -77,15 +81,21 @@ int prepare_process_ranges_args_envs(struct process *proc,
unsigned long flags;
uintptr_t interp_obase = -1;
uintptr_t interp_nbase = -1;
size_t map_size;
struct process *proc = thread->proc;
struct process_vm *vm = proc->vm;
struct address_space *as = vm->address_space;
long aout_base;
int error;
n = p->num_sections;
aout_base = (pn->reloc)? vm->region.map_end: 0;
for (i = 0; i < n; i++) {
if (pn->sections[i].interp && (interp_nbase == (uintptr_t)-1)) {
interp_obase = pn->sections[i].vaddr;
interp_obase -= (interp_obase % pn->interp_align);
interp_nbase = proc->vm->region.map_start;
interp_nbase = vm->region.map_end;
interp_nbase = (interp_nbase + pn->interp_align - 1)
& ~(pn->interp_align - 1);
}
@ -95,6 +105,10 @@ int prepare_process_ranges_args_envs(struct process *proc,
pn->sections[i].vaddr += interp_nbase;
p->sections[i].vaddr = pn->sections[i].vaddr;
}
else{
pn->sections[i].vaddr += aout_base;
p->sections[i].vaddr = pn->sections[i].vaddr;
}
s = (pn->sections[i].vaddr) & PAGE_MASK;
e = (pn->sections[i].vaddr + pn->sections[i].len
+ PAGE_SIZE - 1) & PAGE_MASK;
@ -110,7 +124,8 @@ int prepare_process_ranges_args_envs(struct process *proc,
}
up = virt_to_phys(up_v);
if (add_process_memory_range(proc, s, e, up, flags, NULL, 0) != 0) {
if (add_process_memory_range(vm, s, e, up, flags, NULL, 0,
PAGE_SHIFT) != 0) {
ihk_mc_free_pages(up_v, range_npages);
kprintf("ERROR: adding memory range for ELF section %i\n", i);
goto err;
@ -119,14 +134,14 @@ int prepare_process_ranges_args_envs(struct process *proc,
{
void *_virt = (void *)s;
unsigned long _phys;
if (ihk_mc_pt_virt_to_phys(proc->vm->page_table,
if (ihk_mc_pt_virt_to_phys(as->page_table,
_virt, &_phys)) {
kprintf("ERROR: no mapping for 0x%lX\n", _virt);
}
for (_virt = (void *)s + PAGE_SIZE;
(unsigned long)_virt < e; _virt += PAGE_SIZE) {
unsigned long __phys;
if (ihk_mc_pt_virt_to_phys(proc->vm->page_table,
if (ihk_mc_pt_virt_to_phys(as->page_table,
_virt, &__phys)) {
kprintf("ERROR: no mapping for 0x%lX\n", _virt);
panic("mapping");
@ -145,23 +160,27 @@ int prepare_process_ranges_args_envs(struct process *proc,
/* TODO: Maybe we need flag */
if (pn->sections[i].interp) {
proc->vm->region.map_end = e;
vm->region.map_end = e;
}
else if (i == 0) {
proc->vm->region.text_start = s;
proc->vm->region.text_end = e;
vm->region.text_start = s;
vm->region.text_end = e;
}
else if (i == 1) {
proc->vm->region.data_start = s;
proc->vm->region.data_end = e;
vm->region.data_start = s;
vm->region.data_end = e;
}
else {
proc->vm->region.data_start =
(s < proc->vm->region.data_start ?
s : proc->vm->region.data_start);
proc->vm->region.data_end =
(e > proc->vm->region.data_end ?
e : proc->vm->region.data_end);
vm->region.data_start =
(s < vm->region.data_start ?
s : vm->region.data_start);
vm->region.data_end =
(e > vm->region.data_end ?
e : vm->region.data_end);
}
if (aout_base) {
vm->region.map_end = e;
}
}
@ -169,32 +188,22 @@ int prepare_process_ranges_args_envs(struct process *proc,
pn->entry -= interp_obase;
pn->entry += interp_nbase;
p->entry = pn->entry;
ihk_mc_modify_user_context(proc->uctx, IHK_UCR_PROGRAM_COUNTER,
pn->entry);
ihk_mc_modify_user_context(thread->uctx,
IHK_UCR_PROGRAM_COUNTER,
pn->entry);
}
#if 1
/*
Fix for the problem where brk grows to hit .bss section
when using dynamically linked executables.
Test code resides in /home/takagi/project/mpich/src/brk_icc_mic.
This is because when using
ld.so (i.e. using shared objects), mckernel/kernel/host.c sets "brk" to
the end of .bss of ld.so (e.g. 0x21f000), and then ld.so places a
main-program after this (e.g. 0x400000), so "brk" will hit .bss
eventually.
*/
proc->vm->region.brk_start = proc->vm->region.brk_end =
(USER_END / 4) & LARGE_PAGE_MASK;
#else
proc->vm->region.brk_start = proc->vm->region.brk_end =
proc->vm->region.data_end;
#endif
if (aout_base) {
pn->at_phdr += aout_base;
pn->at_entry += aout_base;
}
vm->region.brk_start = vm->region.brk_end = vm->region.data_end;
/* Map, copy and update args and envs */
flags = VR_PROT_READ | VR_PROT_WRITE;
flags |= VRFLAG_PROT_TO_MAXPROT(flags);
addr = proc->vm->region.map_start - PAGE_SIZE * SCD_RESERVED_COUNT;
addr = vm->region.map_start - PAGE_SIZE * SCD_RESERVED_COUNT;
e = addr + PAGE_SIZE * ARGENV_PAGE_COUNT;
if((args_envs = ihk_mc_alloc_pages(ARGENV_PAGE_COUNT, IHK_MC_AP_NOWAIT)) == NULL){
@ -203,8 +212,8 @@ int prepare_process_ranges_args_envs(struct process *proc,
}
args_envs_p = virt_to_phys(args_envs);
if(add_process_memory_range(proc, addr, e, args_envs_p,
flags, NULL, 0) != 0){
if(add_process_memory_range(vm, addr, e, args_envs_p,
flags, NULL, 0, PAGE_SHIFT) != 0){
ihk_mc_free_pages(args_envs, ARGENV_PAGE_COUNT);
kprintf("ERROR: adding memory range for args/envs\n");
goto err;
@ -217,7 +226,8 @@ int prepare_process_ranges_args_envs(struct process *proc,
/* Only map remote address if it wasn't specified as an argument */
if (!args) {
// Map in remote physical addr of args and copy it
args_envs_npages = (p->args_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
map_size = ((uintptr_t)p->args & (PAGE_SIZE - 1)) + p->args_len;
args_envs_npages = (map_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
dkprintf("args_envs_npages: %d\n", args_envs_npages);
args_envs_rp = ihk_mc_map_memory(NULL,
(unsigned long)p->args, p->args_len);
@ -234,9 +244,9 @@ int prepare_process_ranges_args_envs(struct process *proc,
p->args_len = args_len;
}
dkprintf("args copy, nr: %d\n", *((int*)args_envs_r));
dkprintf("args copy, nr: %d\n", *((long *)args_envs_r));
memcpy_long(args_envs, args_envs_r, p->args_len + 8);
memcpy_long(args_envs, args_envs_r, p->args_len + sizeof(long) - 1);
/* Only unmap remote address if it wasn't specified as an argument */
if (!args) {
@ -250,7 +260,8 @@ int prepare_process_ranges_args_envs(struct process *proc,
/* Only map remote address if it wasn't specified as an argument */
if (!envs) {
// Map in remote physical addr of envs and copy it after args
args_envs_npages = (p->envs_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
map_size = ((uintptr_t)p->envs & (PAGE_SIZE - 1)) + p->envs_len;
args_envs_npages = (map_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
dkprintf("args_envs_npages: %d\n", args_envs_npages);
args_envs_rp = ihk_mc_map_memory(NULL, (unsigned long)p->envs,
p->envs_len);
@ -268,9 +279,9 @@ int prepare_process_ranges_args_envs(struct process *proc,
p->envs_len = envs_len;
}
dkprintf("envs copy, nr: %d\n", *((int*)args_envs_r));
dkprintf("envs copy, nr: %d\n", *((long *)args_envs_r));
memcpy_long(args_envs + p->args_len, args_envs_r, p->envs_len + 8);
memcpy_long(args_envs + p->args_len, args_envs_r, p->envs_len + sizeof(long) - 1);
/* Only map remote address if it wasn't specified as an argument */
if (!envs) {
@ -280,36 +291,52 @@ int prepare_process_ranges_args_envs(struct process *proc,
flush_tlb();
// Update variables
argc = *((int*)(args_envs));
argc = *((long *)(args_envs));
dkprintf("argc: %d\n", argc);
argv = (char **)(args_envs + (sizeof(int)));
while (*argv) {
char **_argv = argv;
dkprintf("%s\n", args_envs + (unsigned long)*argv);
*argv = (char *)addr + (unsigned long)*argv; // Process' address space!
argv = ++_argv;
argv = (char **)(args_envs + (sizeof(long)));
if(proc->saved_cmdline){
kfree(proc->saved_cmdline);
proc->saved_cmdline_len = 0;
}
for(a = argv, l = 0; *a; a++)
l += strlen(args_envs + (unsigned long)*a) + 1;
proc->saved_cmdline = kmalloc(p->args_len, IHK_MC_AP_NOWAIT);
if(!proc->saved_cmdline)
goto err;
proc->saved_cmdline_len = l;
for(a = argv, l = 0; *a; a++){
strcpy(proc->saved_cmdline + l, args_envs + (unsigned long)*a);
l += strlen(args_envs + (unsigned long)*a) + 1;
*a = (char *)addr + (unsigned long)*a; // Process' address space!
}
argv = (char **)(args_envs + (sizeof(int)));
envc = *((int*)(args_envs + p->args_len));
envc = *((long *)(args_envs + p->args_len));
dkprintf("envc: %d\n", envc);
env = (char **)(args_envs + p->args_len + sizeof(int));
env = (char **)(args_envs + p->args_len + sizeof(long));
while (*env) {
char **_env = env;
//dkprintf("%s\n", args_envs + p->args_len + (unsigned long)*env);
*env = (char *)addr + p->args_len + (unsigned long)*env;
env = ++_env;
}
env = (char **)(args_envs + p->args_len + sizeof(int));
env = (char **)(args_envs + p->args_len + sizeof(long));
dkprintf("env OK\n");
p->rprocess = (unsigned long)proc;
p->rpgtable = virt_to_phys(proc->vm->page_table);
if (init_process_stack(proc, pn, argc, argv, envc, env) != 0) {
if (pn->enable_vdso) {
error = arch_map_vdso(vm);
if (error) {
kprintf("ERROR: mapping vdso pages. %d\n", error);
goto err;
}
}
p->rprocess = (unsigned long)thread;
p->rpgtable = virt_to_phys(as->page_table);
if (init_process_stack(thread, pn, argc, argv, envc, env) != 0) {
goto err;
}
@ -328,7 +355,9 @@ static int process_msg_prepare_process(unsigned long rphys)
unsigned long phys, sz;
struct program_load_desc *p, *pn;
int npages, n;
struct thread *thread;
struct process *proc;
struct process_vm *vm;
enum ihk_mc_pt_attribute attr;
attr = PTATTR_NO_EXECUTE | PTATTR_WRITABLE | PTATTR_FOR_USER;
@ -355,45 +384,69 @@ static int process_msg_prepare_process(unsigned long rphys)
memcpy_long(pn, p, sizeof(struct program_load_desc)
+ sizeof(struct program_image_section) * n);
if((proc = create_process(p->entry)) == NULL){
if((thread = create_thread(p->entry)) == NULL){
ihk_mc_free(pn);
ihk_mc_unmap_virtual(p, npages, 1);
ihk_mc_unmap_memory(NULL, phys, sz);
return -ENOMEM;
}
proc->ftn->pid = pn->pid;
proc->ftn->pgid = pn->pgid;
proc->vm->region.user_start = pn->user_start;
proc->vm->region.user_end = pn->user_end;
proc->vm->region.map_start = (USER_END / 3) & LARGE_PAGE_MASK;
proc->vm->region.map_end = proc->vm->region.map_start;
proc->rlimit_stack.rlim_cur = pn->rlimit_stack_cur;
proc->rlimit_stack.rlim_max = pn->rlimit_stack_max;
proc = thread->proc;
vm = thread->vm;
proc->pid = pn->pid;
proc->vm->address_space->pids[0] = pn->pid;
proc->pgid = pn->pgid;
proc->ruid = pn->cred[0];
proc->euid = pn->cred[1];
proc->suid = pn->cred[2];
proc->fsuid = pn->cred[3];
proc->rgid = pn->cred[4];
proc->egid = pn->cred[5];
proc->sgid = pn->cred[6];
proc->fsgid = pn->cred[7];
proc->termsig = SIGCHLD;
vm->region.user_start = pn->user_start;
vm->region.user_end = pn->user_end;
if(vm->region.user_end > USER_END)
vm->region.user_end = USER_END;
if(vm->region.user_start != 0UL ||
vm->region.user_end < TASK_UNMAPPED_BASE){
vm->region.map_start =
(vm->region.user_start +
(vm->region.user_end - vm->region.user_start) / 3) &
LARGE_PAGE_MASK;
}
else{
vm->region.map_start = TASK_UNMAPPED_BASE;
}
vm->region.map_end = vm->region.map_start;
memcpy(proc->rlimit, pn->rlimit, sizeof(struct rlimit) * MCK_RLIM_MAX);
/* TODO: Clear it at the proper timing */
cpu_local_var(scp).post_idx = 0;
if (prepare_process_ranges_args_envs(proc, pn, p, attr,
if (prepare_process_ranges_args_envs(thread, pn, p, attr,
NULL, 0, NULL, 0) != 0) {
kprintf("error: preparing process ranges, args, envs, stack\n");
goto err;
}
dkprintf("new process : %p [%d] / table : %p\n", proc, proc->pid,
proc->vm->page_table);
vm->address_space->page_table);
ihk_mc_free(pn);
ihk_mc_unmap_virtual(p, npages, 1);
ihk_mc_unmap_memory(NULL, phys, sz);
flush_tlb();
return 0;
err:
ihk_mc_free(pn);
ihk_mc_unmap_virtual(p, npages, 1);
ihk_mc_unmap_memory(NULL, phys, sz);
free_process_memory(proc);
destroy_process(proc);
destroy_thread(thread);
return -ENOMEM;
}
@ -409,7 +462,7 @@ static void process_msg_init(struct ikc_scd_init_param *pcp, struct syscall_para
static void process_msg_init_acked(struct ihk_ikc_channel_desc *c, unsigned long pphys)
{
struct ikc_scd_init_param *param = (void *)pphys;
struct ikc_scd_init_param *param = phys_to_virt(pphys);
struct syscall_params *lparam;
enum ihk_mc_pt_attribute attr;
@ -467,13 +520,40 @@ static void syscall_channel_send(struct ihk_ikc_channel_desc *c,
ihk_ikc_send(c, packet, 0);
}
extern unsigned long do_kill(int, int, int, struct siginfo *);
extern void settid(struct process *proc, int mode, int newcpuid, int oldcpuid);
extern unsigned long do_kill(struct thread *, int, int, int, struct siginfo *, int ptracecont);
extern void settid(struct thread *proc, int mode, int newcpuid, int oldcpuid);
extern void process_procfs_request(unsigned long rarg);
extern int memcheckall();
extern int freecheck(int runcount);
extern int runcount;
extern void terminate_host(int pid);
extern void debug_log(long);
static void req_get_cpu_mapping(long req_rpa)
{
size_t mapsize;
size_t size;
int npages;
long phys;
struct get_cpu_mapping_req *req;
struct cpu_mapping *buf;
size = sizeof(*req);
mapsize = size + (req_rpa & (PAGE_SIZE - 1));
npages = (mapsize + PAGE_SIZE - 1) >> PAGE_SHIFT;
phys = ihk_mc_map_memory(NULL, req_rpa, size);
req = ihk_mc_map_virtual(phys, npages, PTATTR_WRITABLE);
req->error = arch_get_cpu_mapping(&buf, &req->buf_elems);
if (!req->error) {
req->buf_rpa = virt_to_phys(buf);
}
ihk_mc_unmap_virtual(req, npages, 0);
ihk_mc_unmap_memory(NULL, phys, size);
return;
} /* req_get_cpu_mapping() */
static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
void *__packet, void *ihk_os)
@ -481,6 +561,7 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
struct ikc_scd_packet *packet = __packet;
struct ikc_scd_packet pckt;
int rc;
struct thread *thread;
struct process *proc;
struct mcctrl_signal {
int cond;
@ -490,6 +571,7 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
struct siginfo info;
} *sp, info;
unsigned long pp;
int cpuid;
switch (packet->msg) {
case SCD_MSG_INIT_CHANNEL_ACKED:
@ -521,13 +603,23 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
return 0;
case SCD_MSG_SCHEDULE_PROCESS:
cpuid = obtain_clone_cpuid();
if(cpuid == -1){
kprintf("No CPU available\n");
return -1;
}
dkprintf("SCD_MSG_SCHEDULE_PROCESS: %lx\n", packet->arg);
proc = (struct process *)packet->arg;
thread = (struct thread *)packet->arg;
proc = thread->proc;
settid(proc, 0, ihk_mc_get_processor_id(), -1);
runq_add_proc(proc, ihk_mc_get_processor_id());
settid(thread, 0, cpuid, -1);
proc->status = PS_RUNNING;
thread->status = PS_RUNNING;
chain_thread(thread);
chain_process(proc);
runq_add_thread(thread, cpuid);
//cpu_local_var(next) = (struct process *)packet->arg;
//cpu_local_var(next) = (struct thread *)packet->arg;
return 0;
case SCD_MSG_SEND_SIGNAL:
pp = ihk_mc_map_memory(NULL, packet->arg, sizeof(struct mcctrl_signal));
@ -541,12 +633,44 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
pckt.arg = packet->arg;
syscall_channel_send(c, &pckt);
rc = do_kill(info.pid, info.tid, info.sig, &info.info);
rc = do_kill(NULL, info.pid, info.tid, info.sig, &info.info, 0);
kprintf("SCD_MSG_SEND_SIGNAL: do_kill(pid=%d, tid=%d, sig=%d)=%d\n", info.pid, info.tid, info.sig, rc);
return 0;
case SCD_MSG_PROCFS_REQUEST:
process_procfs_request(packet->arg);
return 0;
case SCD_MSG_CLEANUP_PROCESS:
dkprintf("SCD_MSG_CLEANUP_PROCESS pid=%d\n", packet->pid);
terminate_host(packet->pid);
return 0;
case SCD_MSG_DEBUG_LOG:
dkprintf("SCD_MSG_DEBUG_LOG code=%lx\n", packet->arg);
debug_log(packet->arg);
return 0;
case SCD_MSG_SYSFS_REQ_SHOW:
case SCD_MSG_SYSFS_REQ_STORE:
case SCD_MSG_SYSFS_REQ_RELEASE:
sysfss_packet_handler(c, packet->msg, packet->err,
packet->sysfs_arg1, packet->sysfs_arg2,
packet->sysfs_arg3);
return 0;
case SCD_MSG_GET_CPU_MAPPING:
req_get_cpu_mapping(packet->arg);
pckt.msg = SCD_MSG_REPLY_GET_CPU_MAPPING;
pckt.arg = packet->arg;
syscall_channel_send(c, &pckt);
return 0;
default:
kprintf("syscall_pakcet_handler:unknown message "
"(%d.%d.%d.%d.%d.%#lx)\n",
packet->msg, packet->ref, packet->osnum,
packet->pid, packet->err, packet->arg);
return 0;
}
return 0;
}

View File

@ -1,6 +1,8 @@
#ifndef _LINUX_AUXVEC_H
#define _LINUX_AUXVEC_H
#include <arch/auxvec.h>
/* Symbolic values for the entries in the auxiliary table
put on the initial stack */
#define AT_NULL 0 /* end of vector */

View File

@ -30,6 +30,7 @@ struct malloc_header {
#define CPU_STATUS_DISABLE (0)
#define CPU_STATUS_IDLE (1)
#define CPU_STATUS_RUNNING (2)
#define CPU_STATUS_RESERVED (3)
extern ihk_spinlock_t cpu_status_lock;
#define CPU_FLAG_NEED_RESCHED 0x1U
@ -38,14 +39,16 @@ extern ihk_spinlock_t cpu_status_lock;
struct cpu_local_var {
/* malloc */
struct malloc_header free_list;
ihk_spinlock_t free_list_lock;
struct malloc_header *remote_free_list;
struct process idle;
struct fork_tree_node idle_ftn;
struct thread idle;
struct process idle_proc;
struct process_vm idle_vm;
struct address_space idle_asp;
ihk_spinlock_t runq_lock;
struct process *current;
unsigned long runq_irqstate;
struct thread *current;
struct list_head runq;
size_t runq_len;
@ -56,6 +59,7 @@ struct cpu_local_var {
struct ihk_ikc_channel_desc *syscall_channel2;
struct syscall_params scp2;
struct ikc_scd_init_param iip2;
struct resource_set *resource_set;
int status;
int fs;
@ -66,6 +70,9 @@ struct cpu_local_var {
ihk_spinlock_t migq_lock;
struct list_head migq;
int in_interrupt;
int no_preempt;
int timer_enabled;
} __attribute__((aligned(64)));

View File

@ -99,6 +99,8 @@
#ifdef __KERNEL__
#define __user
/* We don't deal with uaccess at the moment, because x86 can access
* userspace directly, we rely on glibc and the app developers.
*/
@ -106,42 +108,14 @@
#include <arch/uaccess.h>
#endif
#include <asm.h>
#include <errno.h>
#define __user
#include <arch-futex.h>
#if 0
#include <arch/processor.h>
#include <arch/system.h>
#endif
#define __futex_atomic_op1(insn, ret, oldval, uaddr, oparg) \
asm volatile("1:\t" insn "\n" \
"2:\t.section .fixup,\"ax\"\n" \
"3:\tmov\t%3, %1\n" \
"\tjmp\t2b\n" \
"\t.previous\n" \
_ASM_EXTABLE(1b, 3b) \
: "=r" (oldval), "=r" (ret), "+m" (*uaddr) \
: "i" (-EFAULT), "0" (oparg), "1" (0))
#define __futex_atomic_op2(insn, ret, oldval, uaddr, oparg) \
asm volatile("1:\tmovl %2, %0\n" \
"\tmovl\t%0, %3\n" \
"\t" insn "\n" \
"2:\tlock; cmpxchgl %3, %2\n" \
"\tjnz\t1b\n" \
"3:\t.section .fixup,\"ax\"\n" \
"4:\tmov\t%5, %1\n" \
"\tjmp\t3b\n" \
"\t.previous\n" \
_ASM_EXTABLE(1b, 4b) \
_ASM_EXTABLE(2b, 4b) \
: "=&a" (oldval), "=&r" (ret), \
"+m" (*uaddr), "=&r" (tem) \
: "r" (oparg), "i" (-EFAULT), "1" (0))
static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
{
int op = (encoded_op >> 28) & 7;
@ -206,28 +180,6 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
return ret;
}
static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval,
int newval)
{
#ifdef __UACCESS__
if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
return -EFAULT;
#endif
asm volatile("1:\tlock; cmpxchgl %3, %1\n"
"2:\t.section .fixup, \"ax\"\n"
"3:\tmov %2, %0\n"
"\tjmp 2b\n"
"\t.previous\n"
_ASM_EXTABLE(1b, 3b)
: "=a" (oldval), "+m" (*uaddr)
: "i" (-EFAULT), "r" (newval), "0" (oldval)
: "memory"
);
return oldval;
}
#endif // __KERNEL__
#endif // _ASM_X86_FUTEX_H
@ -241,13 +193,11 @@ static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval,
struct process_vm;
union futex_key {
#if 0
struct {
unsigned long pgoff;
struct inode *inode;
void *phys;
int offset;
} shared;
#endif
struct {
unsigned long address;
struct process_vm *mm;
@ -261,6 +211,7 @@ union futex_key {
};
#define FUTEX_KEY_INIT (union futex_key) { .both = { .ptr = NULL } }
#define FUT_OFF_MMSHARED 2
extern int futex_init(void);
@ -272,7 +223,8 @@ futex(
uint64_t timeout,
uint32_t __user * uaddr2,
uint32_t val2,
uint32_t val3
uint32_t val3,
int fshared
);

View File

@ -0,0 +1,23 @@
/**
* \file rlimit.h
* License details are found in the file LICENSE.
* \brief
* Kinds of resource limit
* \author Taku Shimosawa <shimosawa@is.s.u-tokyo.ac.jp> \par
* Copyright (C) 2011 - 2012 Taku Shimosawa
*/
/*
* HISTORY
*/
#ifndef __GENERIC_RLIMIT_H
#define __GENERIC_RLIMIT_H
typedef uint64_t rlim_t;
struct rlimit {
rlim_t rlim_cur; /* Soft limit */
rlim_t rlim_max; /* Hard limit (ceiling for rlim_cur) */
};
#endif

View File

@ -14,7 +14,7 @@
#define INIT_H
extern void arch_init(void);
extern void kmsg_init(void);
extern void kmsg_init(int);
extern void mem_init(void);
extern void ikc_master_init(void);
extern void ap_init(void);
@ -28,6 +28,7 @@ extern void init_host_syscall_channel(void);
extern void init_host_syscall_channel2(void);
extern void sched_init(void);
extern void pc_ap_init(void);
extern void cpu_sysfs_setup(void);
extern char *find_command_line(char *name);

View File

@ -14,8 +14,18 @@
#define __HEADER_KMALLOC_H
#include <ihk/mm.h>
#include <cls.h>
#define kmalloc(size, flag) _kmalloc(size, flag, __FILE__, __LINE__)
void panic(const char *);
int kprintf(const char *format, ...);
#define kmalloc(size, flag) ({\
void *r = _kmalloc(size, flag, __FILE__, __LINE__);\
if(r == NULL){\
kprintf("kmalloc: out of memory %s:%d no_preempt=%d\n", __FILE__, __LINE__, cpu_local_var(no_preempt)); \
}\
r;\
})
#define kfree(ptr) _kfree(ptr, __FILE__, __LINE__)
#define memcheck(ptr, msg) _memcheck(ptr, msg, __FILE__, __LINE__, 0)
void *_kmalloc(int size, enum ihk_mc_ap_flag flag, char *file, int line);

View File

@ -16,6 +16,6 @@
void kputs(char *buf);
int kprintf(const char *format, ...);
void kmsg_init(void);
void kmsg_init(int);
#endif

View File

@ -92,7 +92,8 @@ futex(
uint64_t timeout,
uint32_t __user * uaddr2,
uint32_t val2,
uint32_t val3
uint32_t val3,
int fshared
);
extern long

View File

@ -18,11 +18,20 @@
#include <ihk/lock.h>
#include <errno.h>
#include <list.h>
#include <shm.h>
/* begin types.h */
typedef int32_t key_t;
typedef uint32_t uid_t;
typedef uint32_t gid_t;
typedef int64_t time_t;
typedef int32_t pid_t;
/* end types.h */
enum {
/* for memobj.flags */
MF_HAS_PAGER = 0x0001,
MF_SHMDT_OK = 0x0002,
MF_IS_REMOVABLE = 0x0004,
};
struct memobj {
@ -34,9 +43,11 @@ struct memobj {
typedef void memobj_release_func_t(struct memobj *obj);
typedef void memobj_ref_func_t(struct memobj *obj);
typedef int memobj_get_page_func_t(struct memobj *obj, off_t off, int p2align, uintptr_t *physp);
typedef int memobj_get_page_func_t(struct memobj *obj, off_t off, int p2align, uintptr_t *physp, unsigned long *flag);
typedef uintptr_t memobj_copy_page_func_t(struct memobj *obj, uintptr_t orgphys, int p2align);
typedef int memobj_flush_page_func_t(struct memobj *obj, uintptr_t phys, size_t pgsize);
typedef int memobj_invalidate_page_func_t(struct memobj *obj, uintptr_t phys, size_t pgsize);
typedef int memobj_lookup_page_func_t(struct memobj *obj, off_t off, int p2align, uintptr_t *physp, unsigned long *flag);
struct memobj_ops {
memobj_release_func_t * release;
@ -44,6 +55,8 @@ struct memobj_ops {
memobj_get_page_func_t * get_page;
memobj_copy_page_func_t * copy_page;
memobj_flush_page_func_t * flush_page;
memobj_invalidate_page_func_t * invalidate_page;
memobj_lookup_page_func_t * lookup_page;
};
static inline void memobj_release(struct memobj *obj)
@ -61,10 +74,10 @@ static inline void memobj_ref(struct memobj *obj)
}
static inline int memobj_get_page(struct memobj *obj, off_t off,
int p2align, uintptr_t *physp)
int p2align, uintptr_t *physp, unsigned long *pflag)
{
if (obj->ops->get_page) {
return (*obj->ops->get_page)(obj, off, p2align, physp);
return (*obj->ops->get_page)(obj, off, p2align, physp, pflag);
}
return -ENXIO;
}
@ -86,6 +99,24 @@ static inline int memobj_flush_page(struct memobj *obj, uintptr_t phys, size_t p
return 0;
}
static inline int memobj_invalidate_page(struct memobj *obj, uintptr_t phys,
size_t pgsize)
{
if (obj->ops->invalidate_page) {
return (*obj->ops->invalidate_page)(obj, phys, pgsize);
}
return 0;
}
static inline int memobj_lookup_page(struct memobj *obj, off_t off,
int p2align, uintptr_t *physp, unsigned long *pflag)
{
if (obj->ops->lookup_page) {
return (*obj->ops->lookup_page)(obj, off, p2align, physp, pflag);
}
return -ENXIO;
}
static inline void memobj_lock(struct memobj *obj)
{
ihk_mc_spinlock_lock_noirq(&obj->lock);
@ -101,7 +132,13 @@ static inline int memobj_has_pager(struct memobj *obj)
return !!(obj->flags & MF_HAS_PAGER);
}
static inline int memobj_is_removable(struct memobj *obj)
{
return !!(obj->flags & MF_IS_REMOVABLE);
}
int fileobj_create(int fd, struct memobj **objp, int *maxprotp);
struct shmid_ds;
int shmobj_create(struct shmid_ds *ds, struct memobj **objp);
int zeroobj_create(struct memobj **objp);
int devobj_create(int fd, size_t len, off_t off, struct memobj **objp, int *maxprotp);

View File

@ -5,6 +5,8 @@
* memory management declarations
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com> \par
* Copyright (C) 2013 Hitachi, Ltd.
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com> \par
* Copyright (C) 2015 RIKEN AICS
*/
/*
* HISTORY:
@ -13,6 +15,8 @@
#ifndef HEADER_MMAN_H
#define HEADER_MMAN_H
#include <arch/mman.h>
/*
* memory protection
*/
@ -32,16 +36,6 @@
#define MAP_PRIVATE 0x02
#define MAP_FIXED 0x10
#define MAP_ANONYMOUS 0x20
#define MAP_32BIT 0x40
#define MAP_GROWSDOWN 0x0100
#define MAP_DENYWRITE 0x0800
#define MAP_EXECUTABLE 0x1000
#define MAP_LOCKED 0x2000
#define MAP_NORESERVE 0x4000
#define MAP_POPULATE 0x8000
#define MAP_NONBLOCK 0x00010000
#define MAP_STACK 0x00020000
#define MAP_HUGETLB 0x00040000
/*
* memory advice
@ -69,4 +63,11 @@
#define MREMAP_MAYMOVE 0x01
#define MREMAP_FIXED 0x02
/*
* for msync()
*/
#define MS_ASYNC 0x01
#define MS_INVALIDATE 0x02
#define MS_SYNC 0x04
#endif /* HEADER_MMAN_H */

View File

@ -66,4 +66,6 @@ static inline int page_is_multi_mapped(struct page *page)
return (ihk_atomic_read(&page->count) > 1);
}
/* Should we take page faults on ANONYMOUS mappings? */
extern int anon_on_demand;
#endif

60
kernel/include/prio.h Normal file
View File

@ -0,0 +1,60 @@
#ifndef _SCHED_PRIO_H
#define _SCHED_PRIO_H
#define MAX_NICE 19
#define MIN_NICE -20
#define NICE_WIDTH (MAX_NICE - MIN_NICE + 1)
/*
* Priority of a process goes from 0..MAX_PRIO-1, valid RT
* priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
* tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority
* values are inverted: lower p->prio value means higher priority.
*
* The MAX_USER_RT_PRIO value allows the actual maximum
* RT priority to be separate from the value exported to
* user-space. This allows kernel threads to set their
* priority to a value higher than any user task. Note:
* MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
*/
#define MAX_USER_RT_PRIO 100
#define MAX_RT_PRIO MAX_USER_RT_PRIO
#define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH)
#define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2)
/*
* Convert user-nice values [ -20 ... 0 ... 19 ]
* to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
* and back.
*/
#define NICE_TO_PRIO(nice) ((nice) + DEFAULT_PRIO)
#define PRIO_TO_NICE(prio) ((prio) - DEFAULT_PRIO)
/*
* 'User priority' is the nice value converted to something we
* can work with better when scaling various scheduler parameters,
* it's a [ 0 ... 39 ] range.
*/
#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
/*
* Convert nice value [19,-20] to rlimit style value [1,40].
*/
static inline long nice_to_rlimit(long nice)
{
return (MAX_NICE - nice + 1);
}
/*
* Convert rlimit style value [1,40] to nice value [-20, 19].
*/
static inline long rlimit_to_nice(long prio)
{
return (MAX_NICE - prio + 1);
}
#endif /* _SCHED_PRIO_H */

View File

@ -21,12 +21,14 @@
#include <signal.h>
#include <memobj.h>
#include <affinity.h>
#include <syscall.h>
#define VR_NONE 0x0
#define VR_STACK 0x1
#define VR_RESERVED 0x2
#define VR_IO_NOCACHE 0x100
#define VR_REMOTE 0x200
#define VR_WRITE_COMBINED 0x400
#define VR_DEMAND_PAGING 0x1000
#define VR_PRIVATE 0x2000
#define VR_LOCKED 0x4000
@ -49,6 +51,7 @@
#define VRFLAG_PROT_TO_MAXPROT(vrflag) (((vrflag) & VR_PROT_MASK) << 4)
#define VRFLAG_MAXPROT_TO_PROT(vrflag) (((vrflag) & VR_MAXPROT_MASK) >> 4)
// struct process.status, struct thread.status
#define PS_RUNNING 0x1
#define PS_INTERRUPTIBLE 0x2
#define PS_UNINTERRUPTIBLE 0x4
@ -56,12 +59,19 @@
#define PS_EXITED 0x10
#define PS_STOPPED 0x20
#define PS_TRACED 0x40 /* Set to "not running" by a ptrace related event */
#define PS_STOPPING 0x80
#define PS_TRACING 0x100
#define PS_NORMAL (PS_INTERRUPTIBLE | PS_UNINTERRUPTIBLE)
// struct process.ptrace
#define PT_TRACED 0x80 /* The process is ptraced */
#define PT_TRACE_EXEC 0x100 /* Trace execve(2) */
#define PT_TRACE_SYSCALL_ENTER 0x200 /* Trace syscall enter */
#define PT_TRACE_SYSCALL_EXIT 0x400 /* Trace syscall exit */
#define PT_TRACE_SYSCALL_MASK (PT_TRACE_SYSCALL_ENTER | PT_TRACE_SYSCALL_EXIT)
// ptrace(2) request
#define PTRACE_TRACEME 0
#define PTRACE_PEEKTEXT 1
#define PTRACE_PEEKDATA 2
@ -90,6 +100,7 @@
#define PTRACE_GETREGSET 0x4204
#define PTRACE_SETREGSET 0x4205
// ptrace(2) options
#define PTRACE_O_TRACESYSGOOD 1
#define PTRACE_O_TRACEFORK 2
#define PTRACE_O_TRACEVFORK 4
@ -99,6 +110,7 @@
#define PTRACE_O_TRACEEXIT 0x40
#define PTRACE_O_MASK 0x7f
// ptrace(2) events
#define PTRACE_EVENT_FORK 1
#define PTRACE_EVENT_VFORK 2
#define PTRACE_EVENT_CLONE 3
@ -106,6 +118,8 @@
#define PTRACE_EVENT_VFORK_DONE 5
#define PTRACE_EVENT_EXIT 6
#define NT_X86_XSTATE 0x202 /* x86 XSAVE extended state */
#define SIGNAL_STOP_STOPPED 0x1 /* The process has been stopped by SIGSTOP */
#define SIGNAL_STOP_CONTINUED 0x2 /* The process has been resumed by SIGCONT */
@ -118,6 +132,11 @@
#define WNOWAIT 0x01000000 /* Don't reap, just poll status. */
#define __WCLONE 0x80000000
/* idtype */
#define P_ALL 0
#define P_PID 1
#define P_PGID 2
/* If WIFEXITED(STATUS), the low-order 8 bits of the status. */
#define __WEXITSTATUS(status) (((status) & 0xff00) >> 8)
@ -143,9 +162,70 @@
#define USER_STACK_NR_PAGES 8192
#define KERNEL_STACK_NR_PAGES 25
#define NOPHYS ((uintptr_t)-1)
#include <waitq.h>
#include <futex.h>
#include <rlimit.h>
struct resource_set;
struct process_hash;
struct thread_hash;
struct address_space;
struct process;
struct thread;
struct process_vm;
struct vm_regions;
struct vm_range;
#define HASH_SIZE 73
struct resource_set {
struct list_head list;
char *path;
struct process_hash *process_hash;
struct thread_hash *thread_hash;
struct list_head phys_mem_list;
mcs_rwlock_lock_t phys_mem_lock;
cpu_set_t cpu_set;
mcs_rwlock_lock_t cpu_set_lock;
struct process *pid1;
};
extern struct list_head resource_set_list;
extern mcs_rwlock_lock_t resource_set_lock;
struct process_hash {
struct list_head list[HASH_SIZE];
mcs_rwlock_lock_t lock[HASH_SIZE];
};
static inline int
process_hash(int pid)
{
return pid % HASH_SIZE;
}
static inline int
thread_hash(int tid)
{
return tid % HASH_SIZE;
}
struct thread_hash {
struct list_head list[HASH_SIZE];
mcs_rwlock_lock_t lock[HASH_SIZE];
};
struct address_space {
struct page_table *page_table;
void *opt;
void (*free_cb)(struct address_space *, void *);
ihk_atomic_t refcount;
cpu_set_t cpu_set;
ihk_spinlock_t cpu_set_lock;
int nslots;
int pids[];
};
struct user_fpregs_struct
{
@ -212,7 +292,7 @@ struct user
unsigned long int u_debugreg [8];
};
#define AUXV_LEN 14
#define AUXV_LEN 18
struct vm_range {
struct list_head list;
@ -220,9 +300,12 @@ struct vm_range {
unsigned long flag;
struct memobj *memobj;
off_t objoff;
int pgshift; /* page size. 0 means THP */
int padding;
};
struct vm_regions {
unsigned long vm_start, vm_end;
unsigned long text_start, text_end;
unsigned long data_start, data_end;
unsigned long brk_start, brk_end;
@ -233,22 +316,32 @@ struct vm_regions {
struct process_vm;
struct sig_handler {
struct mckfd {
struct mckfd *next;
int fd;
long data;
void *opt;
long (*read_cb)(struct mckfd *, ihk_mc_user_context_t *);
int (*ioctl_cb)(struct mckfd *, ihk_mc_user_context_t *);
long (*mmap_cb)(struct mckfd *, ihk_mc_user_context_t *);
int (*close_cb)(struct mckfd *, ihk_mc_user_context_t *);
};
#define SFD_CLOEXEC 02000000
#define SFD_NONBLOCK 04000
struct sig_common {
ihk_spinlock_t lock;
ihk_atomic_t use;
ihk_atomic_t use;
struct k_sigaction action[_NSIG];
struct list_head sigpending;
};
struct sig_pending {
struct list_head list;
sigset_t sigmask;
siginfo_t info;
};
struct sig_shared {
ihk_spinlock_t lock;
ihk_atomic_t use;
struct list_head sigpending;
int ptracecont;
};
typedef void pgio_func_t(void *arg);
@ -257,142 +350,285 @@ typedef void pgio_func_t(void *arg);
* corresponding process exited due to references from the parent and/or
* children and is used for implementing wait/waitpid without having a
* special "init" process */
struct fork_tree_node {
ihk_spinlock_t lock;
ihk_atomic_t refcount;
int exit_status;
int status;
struct process *owner;
int pid;
int tid;
int pgid;
struct fork_tree_node *parent;
struct list_head children;
struct list_head siblings_list;
/* The ptracing process behave as the parent of the ptraced process
after using PTRACE_ATTACH except getppid. So we save it here. */
struct fork_tree_node *ppid_parent;
/* Manage ptraced processes in the separate list to make it easy to
restore the orginal parent child relationship when
performing PTRACE_DETACH */
struct list_head ptrace_children;
struct list_head ptrace_siblings_list;
struct waitq waitpid_q;
/* Store exit_status for a group of threads when stopped by SIGSTOP.
exit_status can't be used because values of exit_status of threads
might divert while the threads are exiting by group_exit(). */
int group_exit_status;
/* Store ptrace flags.
* The lower 8 bits are PTRACE_O_xxx of the PTRACE_SETOPTIONS request.
* Other bits are for inner use of the McKernel.
*/
int ptrace;
/* Store event related to signal. For example,
it represents that the proceess has been resumed by SIGCONT. */
int signal_flags;
/* Store signal sent to parent when the process terminates. */
int termsig;
};
void hold_fork_tree_node(struct fork_tree_node *ftn);
void release_fork_tree_node(struct fork_tree_node *ftn);
struct process {
int cpu_id;
struct list_head hash_list;
mcs_rwlock_lock_t update_lock; // lock for parent, status, cpu time...
ihk_atomic_t refcount;
// process vm
struct process_vm *vm;
// threads and children
struct list_head threads_list;
mcs_rwlock_lock_t threads_lock; // lock for threads_list
/* The ptracing process behave as the parent of the ptraced process
after using PTRACE_ATTACH except getppid. So we save it here. */
struct process *parent;
struct process *ppid_parent;
struct list_head children_list;
struct list_head ptraced_children_list;
mcs_rwlock_lock_t children_lock; // lock for children_list and ptraced_children_list
struct list_head siblings_list; // lock parent
struct list_head ptraced_siblings_list; // lock ppid_parent
ihk_atomic_t refcount;
// process status and exit status
int status; // PS_RUNNING -> PS_EXITED -> PS_ZOMBIE
// | ^ ^
// | |---+ |
// V | |
// PS_STOPPING-)---+
// (PS_TRACING)| |
// | | |
// V +---- |
// PS_STOPPED -----+
// (PS_TRACED)
int exit_status;
/* Store exit_status for a group of threads when stopped by SIGSTOP.
exit_status can't be used because values of exit_status of threads
might divert while the threads are exiting by group_exit(). */
int group_exit_status;
/* Manage ptraced processes in the separate list to make it easy to
restore the orginal parent child relationship when
performing PTRACE_DETACH */
struct waitq waitpid_q;
// process info and credentials etc.
int pid;
int pgid;
int ruid;
int euid;
int suid;
int fsuid;
int rgid;
int egid;
int sgid;
int fsgid;
int execed;
int nohost;
int nowait;
struct rlimit rlimit[MCK_RLIM_MAX];
unsigned long saved_auxv[AUXV_LEN];
char *saved_cmdline;
long saved_cmdline_len;
/* Store ptrace flags.
* The lower 8 bits are PTRACE_O_xxx of the PTRACE_SETOPTIONS request.
* Other bits are for inner use of the McKernel.
*/
int ptrace;
/* Store ptrace event message.
* PTRACE_O_xxx will store event message here.
* PTRACE_GETEVENTMSG will get from here.
*/
unsigned long ptrace_eventmsg;
/* Store event related to signal. For example,
it represents that the proceess has been resumed by SIGCONT. */
int signal_flags;
/* Store signal sent to parent when the process terminates. */
int termsig;
ihk_spinlock_t mckfd_lock;
struct mckfd *mckfd;
// cpu time (summary)
struct timespec stime;
struct timespec utime;
// cpu time (children)
struct timespec stime_children;
struct timespec utime_children;
long maxrss;
long maxrss_children;
// perf_event
int perf_status;
#define PP_NONE 0
#define PP_RESET 1
#define PP_COUNT 2
#define PP_STOP 3
struct mc_perf_event *monitoring_event;
};
void hold_thread(struct thread *ftn);
void release_thread(struct thread *ftn);
/*
* Scheduling policies
*/
#define SCHED_NORMAL 0
#define SCHED_FIFO 1
#define SCHED_RR 2
#define SCHED_BATCH 3
/* SCHED_ISO: reserved but not implemented yet */
#define SCHED_IDLE 5
#define SCHED_DEADLINE 6
/* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */
#define SCHED_RESET_ON_FORK 0x40000000
/*
* For the sched_{set,get}attr() calls
*/
#define SCHED_FLAG_RESET_ON_FORK 0x01
struct sched_param {
int sched_priority;
};
struct thread {
struct list_head hash_list;
// thread info
int cpu_id;
int tid;
int status; // PS_RUNNING -> PS_EXITED
// | ^ ^
// | | |
// V | |
// PS_STOPPED------+
// PS_TRACED
// PS_INTERRPUTIBLE
// PS_UNINTERRUPTIBLE
// process vm
struct process_vm *vm;
// context
ihk_mc_kernel_context_t ctx;
ihk_mc_user_context_t *uctx;
// sibling
struct process *proc;
struct list_head siblings_list; // lock process
// Runqueue list entry
struct list_head sched_list;
struct list_head sched_list; // lock cls
int sched_policy;
struct sched_param sched_param;
ihk_spinlock_t spin_sleep_lock;
int spin_sleep;
struct thread {
int *clear_child_tid;
unsigned long tlsblock_base, tlsblock_limit;
} thread;
ihk_atomic_t refcount;
volatile int sigevent;
int *clear_child_tid;
unsigned long tlsblock_base, tlsblock_limit;
// thread info
cpu_set_t cpu_set;
fp_regs_struct *fp_regs;
int in_syscall_offload;
// signal
struct sig_common *sigcommon;
sigset_t sigmask;
stack_t sigstack;
ihk_spinlock_t sigpendinglock;
struct list_head sigpending;
struct sig_shared *sigshared;
struct sig_handler *sighandler;
ihk_spinlock_t sigpendinglock;
volatile int sigevent;
struct rlimit rlimit_stack;
// gpio
pgio_func_t *pgio_fp;
void *pgio_arg;
struct fork_tree_node *ftn;
// for ptrace
unsigned long *ptrace_debugreg; /* debug registers for ptrace */
struct sig_pending *ptrace_recvsig;
struct sig_pending *ptrace_sendsig;
cpu_set_t cpu_set;
unsigned long saved_auxv[AUXV_LEN];
// cpu time
struct timespec stime;
struct timespec utime;
struct timespec btime;
int times_update;
int in_kernel;
struct user *userp;
// interval timers
int itimer_enabled;
struct itimerval itimer_virtual;
struct itimerval itimer_prof;
struct timespec itimer_virtual_value;
struct timespec itimer_prof_value;
};
/* NOTE(review): this struct appears to contain diff/merge residue —
 * refcount, page_table_lock, memory_range_lock and the
 * owner_process/proc pair each occur twice below.  A C compiler
 * rejects duplicate member names; reconcile against the upstream
 * header and keep exactly one copy of each before building. */
/* Per-process virtual address space: page table, list of mapped
 * VM ranges, and the locks that guard them. */
struct process_vm {
ihk_atomic_t refcount;
struct page_table *page_table;
struct address_space *address_space;
struct list_head vm_range_list; /* list of mapped ranges (struct vm_range) */
struct vm_regions region;
struct process *owner_process; /* process that resides on the same page */
struct process *proc; /* process that resides on the same page */
void *opt; /* opaque cookie passed to free_cb */
void (*free_cb)(struct process_vm *, void *); /* teardown hook invoked with opt — TODO confirm call site */
void *vdso_addr;
void *vvar_addr;
ihk_spinlock_t page_table_lock;
ihk_spinlock_t memory_range_lock;
ihk_spinlock_t page_table_lock;
ihk_spinlock_t memory_range_lock;
// to protect the following:
// 1. addition of process "memory range" (extend_process_region, add_process_memory_range)
// 2. addition of process page table (allocate_pages, update_process_page_table)
// note that physical memory allocator (ihk_mc_alloc_pages, ihk_pagealloc_alloc)
// is protected by its own lock (see ihk/manycore/generic/page_alloc.c)
cpu_set_t cpu_set; /* CPUs using this address space — presumably guarded by cpu_set_lock; confirm */
ihk_spinlock_t cpu_set_lock;
ihk_atomic_t refcount;
int exiting; /* nonzero once teardown has begun — TODO confirm writer */
long currss; /* current resident set size — presumably bytes; confirm unit */
};
/* Report whether thread th may exercise CAP_IPC_LOCK (capability 14).
 * No real capability sets are consulted: the capability is treated as
 * granted exactly when the effective uid is 0 (root). */
static inline int has_cap_ipc_lock(struct thread *th)
{
	return th->proc->euid == 0;
}
struct process *create_process(unsigned long user_pc);
struct process *clone_process(struct process *org, unsigned long pc,
/* Report whether thread th may exercise CAP_SYS_ADMIN (capability 21).
 * As with has_cap_ipc_lock(), only an effective uid of 0 (root)
 * grants the capability. */
static inline int has_cap_sys_admin(struct thread *th)
{
	return th->proc->euid == 0;
}
void hold_address_space(struct address_space *);
void release_address_space(struct address_space *);
struct thread *create_thread(unsigned long user_pc);
struct thread *clone_thread(struct thread *org, unsigned long pc,
unsigned long sp, int clone_flags);
void destroy_process(struct process *proc);
void hold_process(struct process *proc);
void release_process(struct process *proc);
void flush_process_memory(struct process *proc);
void free_process_memory(struct process *proc);
void free_process_memory_ranges(struct process *proc);
int populate_process_memory(struct process *proc, void *start, size_t len);
void destroy_thread(struct thread *thread);
void hold_thread(struct thread *thread);
void release_thread(struct thread *thread);
void flush_process_memory(struct process_vm *vm);
void hold_process_vm(struct process_vm *vm);
void release_process_vm(struct process_vm *vm);
void hold_process(struct process *);
void release_process(struct process *);
void free_process_memory_ranges(struct process_vm *vm);
int populate_process_memory(struct process_vm *vm, void *start, size_t len);
int add_process_memory_range(struct process *process,
int add_process_memory_range(struct process_vm *vm,
unsigned long start, unsigned long end,
unsigned long phys, unsigned long flag,
struct memobj *memobj, off_t objoff);
int remove_process_memory_range(struct process *process, unsigned long start,
struct memobj *memobj, off_t objoff, int pgshift);
int remove_process_memory_range(struct process_vm *vm, unsigned long start,
unsigned long end, int *ro_freedp);
int split_process_memory_range(struct process *process,
int split_process_memory_range(struct process_vm *vm,
struct vm_range *range, uintptr_t addr, struct vm_range **splitp);
int join_process_memory_range(struct process *process, struct vm_range *surviving,
int join_process_memory_range(struct process_vm *vm, struct vm_range *surviving,
struct vm_range *merging);
int change_prot_process_memory_range(
struct process *process, struct vm_range *range,
struct process_vm *vm, struct vm_range *range,
unsigned long newflag);
int remap_process_memory_range(struct process_vm *vm, struct vm_range *range,
uintptr_t start, uintptr_t end, off_t off);
int sync_process_memory_range(struct process_vm *vm, struct vm_range *range,
uintptr_t start, uintptr_t end);
int invalidate_process_memory_range(struct process_vm *vm,
struct vm_range *range, uintptr_t start, uintptr_t end);
struct vm_range *lookup_process_memory_range(
struct process_vm *vm, uintptr_t start, uintptr_t end);
struct vm_range *next_process_memory_range(
@ -402,31 +638,42 @@ struct vm_range *previous_process_memory_range(
int extend_up_process_memory_range(struct process_vm *vm,
struct vm_range *range, uintptr_t newend);
int page_fault_process(struct process *proc, void *fault_addr, uint64_t reason);
int remove_process_region(struct process *proc,
int page_fault_process_vm(struct process_vm *fault_vm, void *fault_addr,
uint64_t reason);
int remove_process_region(struct process_vm *vm,
unsigned long start, unsigned long end);
struct program_load_desc;
int init_process_stack(struct process *process, struct program_load_desc *pn,
int init_process_stack(struct thread *thread, struct program_load_desc *pn,
int argc, char **argv,
int envc, char **env);
unsigned long extend_process_region(struct process *proc,
unsigned long extend_process_region(struct process_vm *vm,
unsigned long start, unsigned long end,
unsigned long address, unsigned long flag);
extern enum ihk_mc_pt_attribute arch_vrflag_to_ptattr(unsigned long flag, uint64_t fault, pte_t *ptep);
enum ihk_mc_pt_attribute common_vrflag_to_ptattr(unsigned long flag, uint64_t fault, pte_t *ptep);
void schedule(void);
void runq_add_proc(struct process *proc, int cpu_id);
void runq_del_proc(struct process *proc, int cpu_id);
int sched_wakeup_process(struct process *proc, int valid_states);
void runq_add_thread(struct thread *thread, int cpu_id);
void runq_del_thread(struct thread *thread, int cpu_id);
int sched_wakeup_thread(struct thread *thread, int valid_states);
void sched_request_migrate(int cpu_id, struct process *proc);
void sched_request_migrate(int cpu_id, struct thread *thread);
void check_need_resched(void);
void cpu_set(int cpu, cpu_set_t *cpu_set, ihk_spinlock_t *lock);
void cpu_clear(int cpu, cpu_set_t *cpu_set, ihk_spinlock_t *lock);
void cpu_clear_and_set(int c_cpu, int s_cpu,
cpu_set_t *cpu_set, ihk_spinlock_t *lock);
struct process *findthread_and_lock(int pid, int tid, ihk_spinlock_t **savelock, unsigned long *irqstate);
void process_unlock(void *savelock, unsigned long irqstate);
void release_cpuid(int cpuid);
struct thread *find_thread(int pid, int tid, struct mcs_rwlock_node_irqsave *lock);
void thread_unlock(struct thread *thread, struct mcs_rwlock_node_irqsave *lock);
struct process *find_process(int pid, struct mcs_rwlock_node_irqsave *lock);
void process_unlock(struct process *proc, struct mcs_rwlock_node_irqsave *lock);
void chain_process(struct process *);
void chain_thread(struct thread *);
void proc_init();
void set_timer();
#endif

View File

@ -3,7 +3,8 @@
* License details are found in the file LICENSE.
* \brief
* header file for System V shared memory
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com>
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com> \par
* Copyright (C) 2014 - 2015 RIKEN AICS
*/
/*
* HISTORY:
@ -12,38 +13,99 @@
#ifndef HEADER_SHM_H
#define HEADER_SHM_H
/* begin types.h */
typedef int32_t key_t;
typedef uint32_t uid_t;
typedef uint32_t gid_t;
typedef int64_t time_t;
typedef int32_t pid_t;
/* end types.h */
#include <list.h>
#include <memobj.h>
#include <arch/shm.h>
typedef uint64_t shmatt_t;
enum {
/* for key_t */
IPC_PRIVATE = 0,
struct ipc_perm {
key_t key;
uid_t uid;
gid_t gid;
uid_t cuid;
gid_t cgid;
uint16_t mode;
uint8_t padding[2];
uint16_t seq;
uint8_t padding2[22];
/* for shmflg */
IPC_CREAT = 01000,
IPC_EXCL = 02000,
SHM_HUGETLB = 04000,
SHM_RDONLY = 010000,
SHM_RND = 020000,
SHM_REMAP = 040000,
SHM_EXEC = 0100000,
/* for shm_mode */
SHM_DEST = 01000,
SHM_LOCKED = 02000,
/* for cmd of shmctl() */
IPC_RMID = 0,
IPC_SET = 1,
IPC_STAT = 2,
IPC_INFO = 3,
SHM_LOCK = 11,
SHM_UNLOCK = 12,
SHM_STAT = 13,
SHM_INFO = 14,
};
struct shmid_ds {
struct ipc_perm shm_perm;
size_t shm_segsz;
time_t shm_atime;
time_t shm_dtime;
time_t shm_ctime;
pid_t shm_cpid;
pid_t shm_lpid;
shmatt_t shm_nattch;
uint8_t padding[16];
struct shmlock_user; /* forward declaration — defined later in this header */
/* In-kernel object backing one System V shared memory segment. */
struct shmobj {
struct memobj memobj; /* must be first */
int index; /* presumably the segment's id slot used by shmobj_create_indexed(); confirm */
int pgshift; /* page-size shift of the segment's pages — TODO confirm */
size_t real_segsz; /* allocated size — presumably ds.shm_segsz rounded up to a page; confirm */
struct shmlock_user * user; /* lock-accounting record of the locking user, if any — TODO confirm */
struct shmid_ds ds; /* bookkeeping reported via shmctl(IPC_STAT) */
struct list_head page_list;
struct list_head chain; /* shmobj_list */
};
struct shminfo {
uint64_t shmmax;
uint64_t shmmin;
uint64_t shmmni;
uint64_t shmseg;
uint64_t shmall;
uint8_t padding[32];
};
struct shm_info {
int32_t used_ids;
uint8_t padding[4];
uint64_t shm_tot;
uint64_t shm_rss;
uint64_t shm_swp;
uint64_t swap_attempts;
uint64_t swap_successes;
};
struct shmlock_user {
uid_t ruid;
int padding;
size_t locked;
struct list_head chain;
};
extern ihk_spinlock_t shmlock_users_lock_body;
/* Acquire the lock serializing access to the shmlock users list
 * (non-IRQ-disabling spinlock variant). */
static inline void shmlock_users_lock(void)
{
	ihk_mc_spinlock_lock_noirq(&shmlock_users_lock_body);
}
/* Release the lock taken by shmlock_users_lock(). */
static inline void shmlock_users_unlock(void)
{
	ihk_mc_spinlock_unlock_noirq(&shmlock_users_lock_body);
}
void shmobj_list_lock(void);
void shmobj_list_unlock(void);
int shmobj_create_indexed(struct shmid_ds *ds, struct shmobj **objp);
void shmobj_destroy(struct shmobj *obj);
void shmlock_user_free(struct shmlock_user *user);
int shmlock_user_get(uid_t ruid, struct shmlock_user **userp);
#endif /* HEADER_SHM_H */

View File

@ -13,8 +13,11 @@
#ifndef __HEADER_SYSCALL_H
#define __HEADER_SYSCALL_H
#include <ihk/atomic.h>
#include <ihk/context.h>
#include <ihk/memconst.h>
#include <rlimit.h>
#include <time.h>
#define NUM_SYSCALLS 255
@ -34,16 +37,41 @@
#define SCD_MSG_SYSCALL_ONESIDE 0x4
#define SCD_MSG_SEND_SIGNAL 0x8
#define SCD_MSG_CLEANUP_PROCESS 0x9
#define SCD_MSG_GET_VDSO_INFO 0xa
#define SCD_MSG_GET_CPU_MAPPING 0xc
#define SCD_MSG_REPLY_GET_CPU_MAPPING 0xd
#define SCD_MSG_PROCFS_CREATE 0x10
#define SCD_MSG_PROCFS_DELETE 0x11
#define SCD_MSG_PROCFS_REQUEST 0x12
#define SCD_MSG_PROCFS_ANSWER 0x13
#define ARCH_SET_GS 0x1001
#define ARCH_SET_FS 0x1002
#define ARCH_GET_FS 0x1003
#define ARCH_GET_GS 0x1004
#define SCD_MSG_DEBUG_LOG 0x20
#define SCD_MSG_SYSFS_REQ_CREATE 0x30
/* #define SCD_MSG_SYSFS_RESP_CREATE 0x31 */
#define SCD_MSG_SYSFS_REQ_MKDIR 0x32
/* #define SCD_MSG_SYSFS_RESP_MKDIR 0x33 */
#define SCD_MSG_SYSFS_REQ_SYMLINK 0x34
/* #define SCD_MSG_SYSFS_RESP_SYMLINK 0x35 */
#define SCD_MSG_SYSFS_REQ_LOOKUP 0x36
/* #define SCD_MSG_SYSFS_RESP_LOOKUP 0x37 */
#define SCD_MSG_SYSFS_REQ_UNLINK 0x38
/* #define SCD_MSG_SYSFS_RESP_UNLINK 0x39 */
#define SCD_MSG_SYSFS_REQ_SHOW 0x3a
#define SCD_MSG_SYSFS_RESP_SHOW 0x3b
#define SCD_MSG_SYSFS_REQ_STORE 0x3c
#define SCD_MSG_SYSFS_RESP_STORE 0x3d
#define SCD_MSG_SYSFS_REQ_RELEASE 0x3e
#define SCD_MSG_SYSFS_RESP_RELEASE 0x3f
#define SCD_MSG_SYSFS_REQ_SETUP 0x40
#define SCD_MSG_SYSFS_RESP_SETUP 0x41
/* #define SCD_MSG_SYSFS_REQ_CLEANUP 0x42 */
/* #define SCD_MSG_SYSFS_RESP_CLEANUP 0x43 */
#define SCD_MSG_PROCFS_TID_CREATE 0x44
#define SCD_MSG_PROCFS_TID_DELETE 0x45
/* Cloning flags. */
# define CSIGNAL 0x000000ff /* Signal mask to be sent at exit. */
@ -88,13 +116,27 @@ struct user_desc {
unsigned int useable:1;
unsigned int lm:1;
};
struct ikc_scd_packet {
int msg;
int ref;
int osnum;
int pid;
int err;
unsigned long arg;
union {
/* for traditional SCD_MSG_* */
struct {
int ref;
int osnum;
int pid;
int padding;
unsigned long arg;
};
/* for SCD_MSG_SYSFS_* */
struct {
long sysfs_arg1;
long sysfs_arg2;
long sysfs_arg3;
};
};
};
struct program_image_section {
@ -109,6 +151,24 @@ struct program_image_section {
};
#define SHELL_PATH_MAX_LEN 1024
#define MCK_RLIM_MAX 20
#define MCK_RLIMIT_AS 0
#define MCK_RLIMIT_CORE 1
#define MCK_RLIMIT_CPU 2
#define MCK_RLIMIT_DATA 3
#define MCK_RLIMIT_FSIZE 4
#define MCK_RLIMIT_LOCKS 5
#define MCK_RLIMIT_MEMLOCK 6
#define MCK_RLIMIT_MSGQUEUE 7
#define MCK_RLIMIT_NICE 8
#define MCK_RLIMIT_NOFILE 9
#define MCK_RLIMIT_NPROC 10
#define MCK_RLIMIT_RSS 11
#define MCK_RLIMIT_RTPRIO 12
#define MCK_RLIMIT_RTTIME 13
#define MCK_RLIMIT_SIGPENDING 14
#define MCK_RLIMIT_STACK 15
struct program_load_desc {
int num_sections;
@ -118,6 +178,10 @@ struct program_load_desc {
int err;
int stack_prot;
int pgid;
int cred[8];
int reloc;
char enable_vdso;
char padding[7];
unsigned long entry;
unsigned long user_start;
unsigned long user_end;
@ -132,8 +196,7 @@ struct program_load_desc {
unsigned long args_len;
char *envs;
unsigned long envs_len;
unsigned long rlimit_stack_cur;
unsigned long rlimit_stack_max;
struct rlimit rlimit[MCK_RLIM_MAX];
unsigned long interp_align;
char shell_path[SHELL_PATH_MAX_LEN];
struct program_image_section sections[0];
@ -217,9 +280,9 @@ struct syscall_params {
SYSCALL_ARG_##a2(2); SYSCALL_ARG_##a3(3); \
SYSCALL_ARG_##a4(4); SYSCALL_ARG_##a5(5);
#define SYSCALL_FOOTER return do_syscall(&request, ctx, ihk_mc_get_processor_id(), 0)
#define SYSCALL_FOOTER return do_syscall(&request, ihk_mc_get_processor_id(), 0)
extern long do_syscall(struct syscall_request *req, ihk_mc_user_context_t *ctx, int cpu, int pid);
extern long do_syscall(struct syscall_request *req, int cpu, int pid);
extern int obtain_clone_cpuid();
extern long syscall_generic_forwarding(int n, ihk_mc_user_context_t *ctx);
@ -254,6 +317,7 @@ struct procfs_read {
int ret; /* read bytes (answer) */
int status; /* non-zero if done (answer) */
int newcpu; /* migrated new cpu (answer) */
int readwrite; /* 0:read, 1:write */
char fname[PROCFS_NAME_MAX]; /* procfs filename (request) */
};
@ -263,4 +327,84 @@ struct procfs_file {
char fname[PROCFS_NAME_MAX]; /* procfs filename (request) */
};
/* getrusage() "who" selectors, matching the Linux ABI values. */
#define RUSAGE_SELF 0
#define RUSAGE_CHILDREN -1
#define RUSAGE_THREAD 1
/* Resource-usage accounting block; field names and order mirror the
 * Linux/glibc struct rusage — TODO confirm the layout matches the
 * host ABI byte-for-byte, since it is presumably exchanged with the
 * Linux side. */
struct rusage {
struct timeval ru_utime; /* user CPU time consumed */
struct timeval ru_stime; /* system CPU time consumed */
long ru_maxrss;
long ru_ixrss;
long ru_idrss;
long ru_isrss;
long ru_minflt;
long ru_majflt;
long ru_nswap;
long ru_inblock;
long ru_oublock;
long ru_msgsnd;
long ru_msgrcv;
long ru_nsignals;
long ru_nvcsw;
long ru_nivcsw;
};
extern void terminate(int, int);
/* Time-of-day conversion state shared with the timekeeping code. */
struct tod_data_s {
int8_t do_local; /* nonzero: compute time locally rather than offloading — TODO confirm */
int8_t padding[7]; /* explicit pad to an 8-byte boundary */
ihk_atomic64_t version; /* bumped around updates so readers can retry — presumably seqlock-style; confirm */
unsigned long clocks_per_sec;
struct timespec origin; /* realtime when tsc=0 */
};
extern struct tod_data_s tod_data; /* residing in arch-dependent file */
void reset_cputime();
void set_cputime(int mode);
intptr_t do_mmap(intptr_t addr0, size_t len0, int prot, int flags, int fd,
off_t off0);
void clear_host_pte(uintptr_t addr, size_t len);
typedef int32_t key_t;
int do_shmget(key_t key, size_t size, int shmflg);
struct process_vm;
int arch_map_vdso(struct process_vm *vm); /* arch dependent */
int arch_setup_vdso(void);
#define VDSO_MAXPAGES 2
struct vdso {
long busy;
int vdso_npages;
char vvar_is_global;
char hpet_is_global;
char pvti_is_global;
char padding;
long vdso_physlist[VDSO_MAXPAGES];
void *vvar_virt;
long vvar_phys;
void *hpet_virt;
long hpet_phys;
void *pvti_virt;
long pvti_phys;
};
struct cpu_mapping {
int cpu_number;
int hw_id;
};
struct get_cpu_mapping_req {
int busy; /* INOUT: */
int error; /* OUT: */
long buf_rpa; /* OUT: physical address of struct cpu_mapping */
int buf_elems; /* OUT: # of elements of buf */
int padding;
/* work for mcctrl */
#if 0
wait_queue_head_t wq;
#endif
};
#endif

71
kernel/include/sysfs.h Normal file
View File

@ -0,0 +1,71 @@
/**
* \file sysfs.h
* License details are found in the file LICENSE.
* \brief
* sysfs framework API definitions
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com> \par
* Copyright (C) 2015 RIKEN AICS
*/
/*
* HISTORY:
*/
#ifndef MCKERNEL_SYSFS_H
#define MCKERNEL_SYSFS_H
#define SYSFS_PATH_MAX 1024
/* for sysfs_unlinkf() */
#define SYSFS_UNLINK_KEEP_ANCESTOR 0x01
struct sysfs_ops {
ssize_t (*show)(struct sysfs_ops *ops, void *instance, void *buf,
size_t bufsize);
ssize_t (*store)(struct sysfs_ops *ops, void *instance, void *buf,
size_t bufsize);
void (*release)(struct sysfs_ops *ops, void *instance);
};
struct sysfs_handle {
long handle;
};
typedef struct sysfs_handle sysfs_handle_t;
struct sysfs_bitmap_param {
int nbits;
int padding;
void *ptr;
};
/* Sentinel "ops" values: instead of pointing at a real struct
 * sysfs_ops, a small integer cast to void * selects a built-in
 * snooping/formatting routine. */
#define SYSFS_SPECIAL_OPS_MIN ((void *)1)
#define SYSFS_SPECIAL_OPS_MAX ((void *)1000)
#define SYSFS_SNOOPING_OPS_d32 ((void *)1)
#define SYSFS_SNOOPING_OPS_d64 ((void *)2)
#define SYSFS_SNOOPING_OPS_u32 ((void *)3)
#define SYSFS_SNOOPING_OPS_u64 ((void *)4)
#define SYSFS_SNOOPING_OPS_s ((void *)5)
#define SYSFS_SNOOPING_OPS_pbl ((void *)6)
#define SYSFS_SNOOPING_OPS_pb ((void *)7)
#define SYSFS_SNOOPING_OPS_u32K ((void *)8)

/* Return nonzero when ops is one of the sentinel values above rather
 * than a pointer to a real struct sysfs_ops. */
static inline int is_special_sysfs_ops(void *ops)
{
	long v = (long)ops;

	return (v >= (long)SYSFS_SPECIAL_OPS_MIN)
	       && (v <= (long)SYSFS_SPECIAL_OPS_MAX);
}
extern int sysfs_createf(struct sysfs_ops *ops, void *instance, int mode,
const char *fmt, ...);
extern int sysfs_mkdirf(sysfs_handle_t *dirhp, const char *fmt, ...);
extern int sysfs_symlinkf(sysfs_handle_t targeth, const char *fmt, ...);
extern int sysfs_lookupf(sysfs_handle_t *objhp, const char *fmt, ...);
extern int sysfs_unlinkf(int flags, const char *fmt, ...);
extern void sysfs_init(void);
struct ihk_ikc_channel_desc;
extern void sysfss_packet_handler(struct ihk_ikc_channel_desc *ch, int msg,
int error, long arg1, long arg2, long arg3);
#endif /* MCKERNEL_SYSFS_H */

View File

@ -0,0 +1,88 @@
/**
* \file sysfs_msg.h
* License details are found in the file LICENSE.
* \brief
* message declarations for sysfs framework
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com> \par
* Copyright (C) 2015 RIKEN AICS
*/
/*
* HISTORY:
*/
#ifndef MCKERNEL_SYSFS_MSG_H
#define MCKERNEL_SYSFS_MSG_H
#define SYSFS_PATH_MAX 1024
struct sysfs_req_create_param {
int mode;
int error;
long client_ops;
long client_instance;
char path[SYSFS_PATH_MAX];
int padding;
int busy;
}; /* struct sysfs_req_create_param */
#define SYSFS_SPECIAL_OPS_MIN ((void *)1)
#define SYSFS_SPECIAL_OPS_MAX ((void *)1000)
#define SYSFS_SNOOPING_OPS_d32 ((void *)1)
#define SYSFS_SNOOPING_OPS_d64 ((void *)2)
#define SYSFS_SNOOPING_OPS_u32 ((void *)3)
#define SYSFS_SNOOPING_OPS_u64 ((void *)4)
#define SYSFS_SNOOPING_OPS_s ((void *)5)
#define SYSFS_SNOOPING_OPS_pbl ((void *)6)
#define SYSFS_SNOOPING_OPS_pb ((void *)7)
#define SYSFS_SNOOPING_OPS_u32K ((void *)8)
struct sysfs_req_mkdir_param {
int error;
int padding;
long handle;
char path[SYSFS_PATH_MAX];
int padding2;
int busy;
}; /* struct sysfs_req_mkdir_param */
struct sysfs_req_symlink_param {
int error;
int padding;
long target;
char path[SYSFS_PATH_MAX];
int padding2;
int busy;
}; /* struct sysfs_req_symlink_param */
struct sysfs_req_lookup_param {
int error;
int padding;
long handle;
char path[SYSFS_PATH_MAX];
int padding2;
int busy;
}; /* struct sysfs_req_lookup_param */
/* for sysfs_req_unlink_param.flags */
#define SYSFS_UNLINK_KEEP_ANCESTOR 0x01
struct sysfs_req_unlink_param {
int flags;
int error;
char path[SYSFS_PATH_MAX];
int padding;
int busy;
}; /* struct sysfs_req_unlink_param */
struct sysfs_req_setup_param {
int error;
int padding;
long buf_rpa;
long bufsize;
char padding3[SYSFS_PATH_MAX];
int padding2;
int busy;
}; /* struct sysfs_req_setup_param */
#endif /* MCKERNEL_SYSFS_MSG_H */

View File

@ -19,6 +19,12 @@
#ifndef __TIME_H
#define __TIME_H
#define NS_PER_SEC 1000000000UL
#define CLOCK_REALTIME 0
#define CLOCK_MONOTONIC 1
#define CLOCK_PROCESS_CPUTIME_ID 2
#define CLOCK_THREAD_CPUTIME_ID 3
typedef long int __time_t;
/* POSIX.1b structure for a time value. This is like a `struct timeval' but
@ -47,5 +53,72 @@ struct timezone
int tz_dsttime; /* Nonzero if DST is ever in effect. */
};
#define ITIMER_REAL 0
#define ITIMER_VIRTUAL 1
#define ITIMER_PROF 2
struct itimerval {
struct timeval it_interval;
struct timeval it_value;
};
/* ats += bts, then carry whole seconds out of tv_nsec until it is
 * back in [0, 1e9). */
static inline void
ts_add(struct timespec *ats, const struct timespec *bts)
{
	ats->tv_sec += bts->tv_sec;
	ats->tv_nsec += bts->tv_nsec;
	for (; ats->tv_nsec >= 1000000000; ats->tv_nsec -= 1000000000)
		ats->tv_sec++;
}
/* ats -= bts, then borrow whole seconds into tv_nsec until it is
 * back in [0, 1e9). */
static inline void
ts_sub(struct timespec *ats, const struct timespec *bts)
{
	ats->tv_sec -= bts->tv_sec;
	ats->tv_nsec -= bts->tv_nsec;
	for (; ats->tv_nsec < 0; ats->tv_nsec += 1000000000)
		ats->tv_sec--;
}
/* ats += bts, then carry whole seconds out of tv_usec until it is
 * back in [0, 1e6). */
static inline void
tv_add(struct timeval *ats, const struct timeval *bts)
{
	ats->tv_sec += bts->tv_sec;
	ats->tv_usec += bts->tv_usec;
	for (; ats->tv_usec >= 1000000; ats->tv_usec -= 1000000)
		ats->tv_sec++;
}
/* ats -= bts, then borrow whole seconds into tv_usec until it is
 * back in [0, 1e6). */
static inline void
tv_sub(struct timeval *ats, const struct timeval *bts)
{
	ats->tv_sec -= bts->tv_sec;
	ats->tv_usec -= bts->tv_usec;
	for (; ats->tv_usec < 0; ats->tv_usec += 1000000)
		ats->tv_sec--;
}
/* Convert a timeval to a timespec (microseconds -> nanoseconds). */
static inline void
tv_to_ts(struct timespec *ats, const struct timeval *bts)
{
	*ats = (struct timespec){
		.tv_sec = bts->tv_sec,
		.tv_nsec = bts->tv_usec * 1000,
	};
}
/* Convert a timespec to a timeval (nanoseconds -> microseconds,
 * truncating toward zero). */
static inline void
ts_to_tv(struct timeval *ats, const struct timespec *bts)
{
	*ats = (struct timeval){
		.tv_sec = bts->tv_sec,
		.tv_usec = bts->tv_nsec / 1000,
	};
}
#endif // __TIME_H

View File

@ -36,7 +36,7 @@ struct timer {
uint64_t timeout;
struct waitq processes;
struct list_head list;
struct process *proc;
struct thread *thread;
};
uint64_t schedule_timeout(uint64_t timeout);

View File

@ -19,7 +19,7 @@
#include <ihk/lock.h>
#include <list.h>
struct process;
struct thread;
struct waitq_entry;
typedef int (*waitq_func_t)(struct waitq_entry *wait, unsigned mode,
@ -58,7 +58,7 @@ typedef struct waitq_entry {
}
extern void waitq_init(waitq_t *waitq);
extern void waitq_init_entry(waitq_entry_t *entry, struct process *proc);
extern void waitq_init_entry(waitq_entry_t *entry, struct thread *proc);
extern int waitq_active(waitq_t *waitq);
extern void waitq_add_entry(waitq_t *waitq, waitq_entry_t *entry);
extern void waitq_add_entry_locked(waitq_t *waitq, waitq_entry_t *entry);

Some files were not shown because too many files have changed in this diff Show More