Compare commits

..

182 Commits
1.0.0 ... 1.1.0

Author SHA1 Message Date
d90900b6e6 Make executor code include executer/config.h
Make the code "executer/kernel/mcctrl/arch/x86_64/archdeps.c"
to include "executer/config.h" instead of
non-existent "executer/kernel/mcctrl/config.h".
2016-06-09 18:40:39 +09:00
6d9a88e9f4 binfmt_mcexec: support post-K specification 2016-06-08 09:53:39 +09:00
d0ee60f9e3 mcoverlayfs: supported only Linux kernel 4.0 2016-06-03 18:36:55 +09:00
14ec92518e KVM support: detect KVM and avoid touching unimplemented MSRs 2016-05-26 01:11:08 +09:00
435e2bdeb4 support for Linux 4.6: use get_user_pages_remote() 2016-05-24 09:39:04 +09:00
f06d8041e3 don't send SIGCONT when sending SIGSTOP derived from PTRACE_ATTACH
refs #747
2016-05-19 10:54:12 +09:00
9b35eaca42 remote_flush_tlb_cpumask() dead locking
refs #728
2016-05-10 14:02:25 +09:00
130b1f4327 update PAPI support. other process and child process monitoring. 2016-04-26 19:01:47 +09:00
921280f85c Docker support: use task_XX_vnr() functions for accessing correct namespace 2016-04-21 09:59:49 -07:00
d4a0b32f06 support large pages 2016-04-21 23:22:55 +09:00
b3bec32e99 update_process_page_table: refactor 2016-04-21 23:22:55 +09:00
2048980820 remove ihk_mc_pt_alloc_range() 2016-04-21 23:22:54 +09:00
176f6d23a9 ihk_mc_pt_virt_to_pagemap: refactor 2016-04-21 23:22:54 +09:00
328175547f Revert "fix REQ-37: remap_one_page: remove to check page size"
This reverts commit 6790126a23.

- reverted commit should remove a 'pgsize' check in remap_one_page()
  instead of a 'pgsize' check in pte_make_fileoff().
- In IA-32e, PTE format varies with page size. Therefore 'pgsize'
  parameter of pte_make_fileoff() is preferable.
2016-04-21 23:22:54 +09:00
e2e0fad849 arch_clear_host_user_space: set zero to args[2]
to avoid duplicated per_proc_list entry.
2016-04-21 23:22:54 +09:00
397bf3f4a6 wait_zombie: don't wait attached process
refs #726
2016-04-21 20:28:36 +09:00
aa77228453 resupport ptrace(PTRACE_ATTACH)
refs #733
2016-04-21 20:13:27 +09:00
82cb8f95ed update PAPI support. 2016-04-18 13:07:45 +09:00
3f2b4e7282 do_wait: unlink child from children_list if child terminated
refs #724
2016-04-14 10:25:12 +09:00
d6784bb4a5 update auto-generated files 2016-04-11 22:25:53 +09:00
1bb948f43b hwloc support 2016-04-11 22:25:27 +09:00
2a1823d52c vdso: set enable bit of pvti_msr 2016-04-11 22:20:39 +09:00
89943dc5ba vdso: set physical address at pvti_msr 2016-04-11 22:20:39 +09:00
fceb02a44a vdso: add zero clear for pvti 2016-04-11 22:20:38 +09:00
7298d8e179 vdso: correct pvti array element type
struct pvclock_vsyscall_time_info <-- struct pvclock_vcpu_time_info
2016-04-11 22:20:38 +09:00
6f32544dde vdso: add static cast 2016-04-11 22:20:38 +09:00
10d248b3cc mcexec: include config.h 2016-04-11 22:20:38 +09:00
fb32120659 make mcoverlayfs optional (default: enabled) 2016-04-02 15:43:35 -04:00
73de203c16 update auto-generated files 2016-03-28 22:57:45 +09:00
41bb2ab5e6 support vdso which borrows clocksource from linux 2016-03-28 22:57:44 +09:00
a587c8f5e5 x86: encode cpu# in IA32_TSC_AUX and size of GDTe#15 2016-03-28 22:57:44 +09:00
0c53a5ca35 add NOPHYS which means no physical memory 2016-03-28 22:57:44 +09:00
c760a01a79 add pte_get_attr() 2016-03-28 22:57:44 +09:00
a2c29e8abf correct the value of tod_data.origin
tod_data.origin should hold a time when TSC is zero.
2016-03-28 22:57:39 +09:00
18add6a9bd shmctl(IPC_RMID): fix wrong owner/creator checking (revised)
Don't check owner/creator of the segment in case of superuser.
2016-03-28 16:02:24 +09:00
a083e6c2bf Revert "shmctl(IPC_RMID): fix wrong owner/creator checking"
This reverts commit 8b5b075f4c.

The reverted commit modifies IPC_SET instead of IPC_RMID.
2016-03-28 16:00:39 +09:00
a2548f5421 Revert "fix REQ-42"
This reverts commit 4a0682bbc1.

The reverted commit appears to be wrong, for example:
- arch_range_check()'s arguments and parameters are mismatch.
- arch_range_check() implementation is not checking range.

Conflicts:
	kernel/syscall.c
2016-03-28 13:51:57 +09:00
6790126a23 fix REQ-37: remap_one_page: remove to check page size 2016-03-27 14:05:00 +09:00
1195549f41 fix REQ-19: some syscalls change how to access user space 2016-03-27 11:43:53 +09:00
b0096a2740 fix REQ-51 2016-03-26 12:23:51 +09:00
a11479eba8 fix REQ-48 2016-03-25 13:05:53 +09:00
12eaea401e fix REQ-46 2016-03-25 12:59:18 +09:00
31595b7409 fix REQ-43 2016-03-25 12:57:31 +09:00
4a0682bbc1 fix REQ-42 2016-03-24 19:14:50 +09:00
932a287437 fix REQ-40 2016-03-24 13:46:13 +09:00
670741ae40 fix REQ-39 2016-03-24 13:45:15 +09:00
70b27e06ff eclair: change default kernel to ./mckernel.img 2016-03-23 20:00:57 +09:00
4c38ddb623 update auto-generated files 2016-03-23 20:00:57 +09:00
6f00ddced6 move eclair from ihk repository 2016-03-23 20:00:57 +09:00
c0eecd63c9 update auto-generated files 2016-03-23 20:00:57 +09:00
1fd0b03e78 move config.h.in
from executer/kernel/mcctrl/config.h.in
to   executer/config.h.in
2016-03-23 20:00:57 +09:00
6c59de9300 expand AC_PROT_CC only once 2016-03-23 20:00:57 +09:00
b1309a5d53 map PIE at map_end instead of at user_start 2016-03-23 19:14:28 +09:00
489cd6d1a2 refactor prepare_process_ranges_args_envs() 2016-03-23 19:14:28 +09:00
c9cc4330c8 mincore: take into account pages cached in memobj 2016-03-23 19:14:28 +09:00
604f846cd2 mincore: check [start..start+len) is in user region 2016-03-23 19:14:28 +09:00
e939cf6862 mincore: cosmetic changes 2016-03-23 19:14:28 +09:00
72f2e5ebe0 shmobj: implement lookup_page method 2016-03-23 19:14:28 +09:00
bd7dddd415 fileobj: implement lookup_page method 2016-03-23 19:14:28 +09:00
fbd9dc878b memobj: add lookup_page method 2016-03-23 19:14:28 +09:00
d6c51ff997 treat memory devices as regular files,
to enable processes to mmap() /dev/zero
2016-03-23 19:14:27 +09:00
86ac51157c add error checks to shmctl(SHM_UNLOCK) 2016-03-23 19:14:27 +09:00
b73fa2b972 add error checks to shmctl(SHM_LOCK) 2016-03-23 19:14:27 +09:00
798f69bceb add has_cap_ipc_lock() 2016-03-23 19:14:27 +09:00
e8be52a1ff shm: trace the amount of locked segment per user 2016-03-23 19:14:27 +09:00
8b5b075f4c shmctl(IPC_RMID): fix wrong owner/creator checking
Don't check owner/creator of the segment in case of superuser.
2016-03-23 19:14:27 +09:00
b214fc278a add has_cap_sys_admin() 2016-03-23 19:14:27 +09:00
b3ae7f46bd add rlim_t (a type of rlim_cur and rlim_max) 2016-03-23 19:14:27 +09:00
48167d3223 shmget: add "shmflg" checks for SHM_HUGE* 2016-03-23 19:14:27 +09:00
d65135c040 move sys_shmget() into arch-dependent code 2016-03-23 19:14:27 +09:00
1761acc4c3 eliminate geteuid(), getegid() and getpid() 2016-03-23 19:04:32 +09:00
d4d93df032 mmap: add "flags" checks for MAP_HUGE* 2016-03-23 19:04:32 +09:00
261bddb999 add a member pgshift into struct vm_range
pgshift indicates a page size in the range.
2016-03-23 19:04:32 +09:00
1a3bc851af mprotect: return -ENOMEM if specified range is out of range 2016-03-23 19:04:32 +09:00
15f572ef9c mmap: return -ENOMEM if specified range is out of range 2016-03-23 19:04:32 +09:00
81690c5b5a mmap: cosmetic changes 2016-03-23 19:04:32 +09:00
832c0f9afd refactor copy_user_ranges() 2016-03-23 19:04:32 +09:00
f92cac7751 add type casting to the argument of getlong_user() 2016-03-23 19:04:32 +09:00
e74eb1dd51 add some prototypes to <memory.h> 2016-03-23 19:04:32 +09:00
8f7b9072ea refactor some copyin/copyout functions
- copy_from_user()
- getlong_user()
- getint_user()
- copy_to_user()
- setlong_user()
- setint_user()
2016-03-23 19:04:32 +09:00
4595aa3079 pte_visitor_t(): change "pgsize" into "pgshift" 2016-03-23 19:04:32 +09:00
807d294ac4 signalfd4: fix initialize 2016-06-03 20:58:02 +09:00
c947dd0d49 sysfs: support /sys/devices/system/cpu/online 2016-03-22 20:25:34 +09:00
d192e6c0fe modify PAPI support 2016-03-22 15:52:59 +09:00
7dbbcb362f add PAPI support 2016-03-22 15:27:19 +09:00
593cf98015 add ACSL annotation 2016-03-16 15:42:32 +09:00
8dd9f5ef3f support profil 2016-03-12 16:47:19 +09:00
0eaf058a4f mcexec: -lrt to Makefile.in for supporting clock_gettime() on SUSE 2016-03-12 05:24:14 +09:00
1aac2c8e23 add CPU timer initialization (refs #402)
There is no actual initialization in x86 now.
The initialization rely on hardware reset and Linux initialization.
2016-03-11 19:20:37 +09:00
70e8dd7979 remove initialization of TSC (refs #362) 2016-03-11 19:17:29 +09:00
eb0700359b fix REQ-36 2016-03-10 10:33:38 +09:00
3f16a9443e ptrace_report_signal: save debug regs before to send SIGCHLD to tracer 2016-03-09 22:29:51 +09:00
bf0cf0a346 fix REQ-31 2016-03-08 15:19:03 +09:00
14b868907b fix REQ-27 2016-03-07 18:52:08 +09:00
dbc778e4fa support getrusage (work in progress) 2016-03-07 17:06:44 +09:00
7fac03d4de sysfs: support /sys/devices/system/cpu/offline,online,possible,present 2016-03-04 13:48:06 +09:00
26c0180374 rwlock_reader_lock: fix lock list jammed up 2016-03-03 22:47:48 +09:00
8ebb3a4231 schedule: migration free last thread if terminated 2016-03-03 22:44:44 +09:00
f1f1ba9c8c mcs_rwlock_reader_lock: temporary fix 2016-03-01 19:11:42 +09:00
6ce00b5f0f sysfs: samples of snooping ops 2016-02-29 19:59:04 +09:00
4ec0e02a89 sysfs: add snooping ops 2016-02-29 19:23:01 +09:00
8f9192ac36 mcctrl: workaround for out-of-tree build (2/2)
- update auto-generated file
2016-02-29 19:18:08 +09:00
80ce123ab6 mcctrl: workaround for out-of-tree build (1/2) 2016-02-29 19:18:08 +09:00
1dc8513cd3 fix REQ-20 2016-02-26 16:18:30 +09:00
b0054643c0 REQ-18 2016-02-26 16:17:23 +09:00
972ff73ecf mcexec: fix readlink
refs #692
2016-02-25 16:08:42 +09:00
1f8a859b47 mcctrl: update auto-generated files 2016-02-24 21:34:48 +09:00
2601d8a36f mcctrl: use zap_page_range() instead of madvise() 2016-02-24 21:34:48 +09:00
a713c2fcaa fix REQ-16 2016-02-24 20:58:07 +09:00
c4c5e435cc fix REQ-12 2016-02-24 20:57:45 +09:00
853b56c784 mcreboot-smp-x86.sh: add mount to create /tmp/mcos/linux_proc from /proc 2016-02-24 19:24:37 +09:00
863a5c5e5f fix REQ-2, REQ-6, REQ-8 2016-02-23 16:32:17 +09:00
ebce1cb031 Merge branch 'master' of postpeta.pccluster.org:mckernel 2016-02-22 13:34:00 +09:00
fff7744907 mcklogd support 2016-02-22 13:32:20 +09:00
27c3ed7e96 remove debug print 2016-02-21 15:17:42 +09:00
e2b28da32f signal handler support gdb stepi command 2016-02-21 14:55:34 +09:00
2c50b716fd support setitimer/getitimer 2016-02-19 15:25:05 +09:00
307b2b8da5 clock_gettime: support clock_id CLOCK_PROCESS_CPUTIME_ID and CLOCK_THREAD_CPUTIME_ID 2016-02-18 17:43:13 +09:00
eba2be8a35 support times 2016-02-18 13:14:18 +09:00
a997af71be support tkill
refs #664
2016-02-17 12:48:12 +09:00
e7c37b8000 mcreboot-smp-x86.sh: fix Failed to mount /sys/devices/virtual/mcos/mcos0/sys 2016-02-16 16:05:40 +09:00
8c40f94aa8 /proc/<PID>/mem: support read/write 2016-02-16 13:21:29 +09:00
da13bd408a mcexec: add to initialize some structures (REQ-56)
refs #718
2016-02-15 18:20:58 +09:00
c328d26b8d procfs(/proc/<PID>/task/<TID>/stat): fix memory corruption
refs #722
2016-02-15 15:10:00 +09:00
6cda6792a9 process_msg_init_acked: don't use PA 2016-02-14 22:47:52 +09:00
2d3fda1d0b flatten_strings: fix align (REQ-1) 2016-02-14 22:36:58 +09:00
5d43c135db procfs: (temporary fix) unsupported files are closed 2016-02-10 17:10:54 +09:00
a866192db7 refactoring /proc 2016-02-10 08:11:02 +09:00
c0cc6ac6db Add skeleton for perf_event_open. 2016-02-09 14:54:53 +09:00
14c5bc08c2 mcexec: check Linux version from actual kernel tree instead of system wide include 2016-02-09 14:07:08 +09:00
7f01d273d0 mcctrl: fix out-of-tree build (not finding config.h) 2016-02-09 12:45:58 +09:00
137e0a799c mcexec: unshare and mount request through mcctrl 2016-02-08 16:27:03 +09:00
f214ff1b57 mcctrl: add MCEXEC_UP_SYS_MOUNT, MCEXEC_UP_SYS_UNSHARE 2016-02-08 16:00:52 +09:00
0ce698eb1f mcexec: support for /sys mounted by mcoverlayfs 2016-02-08 11:36:03 +09:00
e601248bdc procfs: fix mcos%d/PID/auxv size 2016-02-08 09:38:27 +09:00
d8eeab9b89 mcoverlayfs: enable out of tree compilation 2016-02-01 00:35:53 +09:00
fdf031ac16 procfs: chown procfs entries (temporary hack)
refs #651
refs #699
2016-01-28 16:29:46 +09:00
1ffe740153 sysfs sample 2016-01-26 18:08:25 +09:00
72968d613e support sysfs interface for mcctrl 2016-01-26 18:08:25 +09:00
2e98f875c3 sysfs: attempt to remove empty directories only 2016-01-26 18:08:25 +09:00
a6cb9a6b93 sysfs: lookup_i(): refactoring 2016-01-26 18:08:25 +09:00
da0a91b9f7 mcctrl: denote full path in /proc/PID/exe 2016-01-26 16:21:52 +09:00
f093786bec x86: populating PML4e and PDPTe is now lock-free 2016-01-25 09:17:06 +09:00
368f155328 sigaction: support SA_NODEFER
refs #698
2016-01-21 18:48:10 +09:00
425f920013 mcctrl: delete procfs entries recursively to avoid leaking 2016-01-21 18:15:59 +09:00
dbddf37579 set termsig to mcexec spawned process 2016-01-21 12:08:47 +09:00
fa7a5ccd11 support /proc/self/exe (needed for GDB to attach to an existing process) 2016-01-19 18:23:02 +09:00
172bf0a389 sched_setaffinity: add permission check 2016-01-15 12:05:18 +09:00
9bafd166e3 futex: support FUTEX_CLOCK_REALTIME 2016-01-14 16:18:49 +09:00
2e31b8abd1 clock_gettime: clock_id != CLOCK_REALTIME -> offload to linux 2016-01-13 14:04:06 +09:00
a42ee00101 NR_execve: initialize local variable 'shell'
refs #696
2016-01-13 11:16:19 +09:00
f6935b0869 ptrace_setsiginfo: update received siginfo 2016-01-11 17:37:29 +09:00
03a7763a5e ptrace_conf: set received siginfo to default siginfo 2016-01-11 17:10:30 +09:00
3a2f7b0106 clone: support CLONE_PARENT 2016-01-11 16:49:02 +09:00
2819ec2197 fix extra copy which might cause page faults 2016-01-06 21:12:57 +09:00
f7d81a9281 fix typo 2016-01-06 21:12:57 +09:00
914faf042d add missing kfree() for channel lookup table 2016-01-06 21:12:57 +09:00
75c6a94839 delete struct member 'type' from address_space structure 2016-01-06 20:17:00 +09:00
f7b5b48266 support x2apic 2016-01-06 13:53:02 +09:00
f9bd83c726 ptrace: fix PTRACE_GETREGSET, PTRACE_SETREGSET bug
refs #608
2015-12-28 19:45:50 +09:00
edc275ce4f delete free_list_lock 2015-12-28 11:31:42 +09:00
d00ea61d1a ptrace_wakeup_sig: fix thread lock 2015-12-28 10:33:07 +09:00
01117e92c9 append file path to symlink if link path is absolute
refs #643
2015-12-25 15:50:39 +09:00
d477096cb0 getrlimit, setrlimit: offload to linux when an unknown parameter was specified
refs #660
2015-12-25 15:35:33 +09:00
f44ddfa3b3 support sigtimedwait 2015-12-24 12:35:45 +09:00
e0acd254b1 do_process_vm_read_writev: use process hash for remote process search 2015-12-22 09:47:00 +09:00
d0507f7e9f process_read/write_vm(): fix LTP bugs 2015-12-18 15:58:51 +09:00
0f8b2aba22 reset signal handlers when execve called 2015-12-18 12:46:53 +09:00
7e5c7445e2 fix ptrace_detach bug
refs #662
2015-12-16 17:41:57 +09:00
a055fb525d sysfs sample 2015-12-16 13:42:30 +09:00
8cb72df663 support McKernel's sysfs tree 2015-12-16 13:42:30 +09:00
e805249651 add strrchr() 2015-12-16 13:42:30 +09:00
06a7889e1f chown root mcexec 2015-12-15 16:22:14 +09:00
20deed09f0 mcexec: support for /proc mounted by mcoverlayfs 2015-12-14 14:47:05 +09:00
bb81f84709 support PIE executable for PVAS 2015-12-14 11:05:28 +09:00
5c1dad1660 GDB: async-shell.exp
refs #650
2015-11-26 17:07:13 +09:00
7f2220b8e9 set '\0' termination to readlink result.
refs #643
2015-11-26 16:58:15 +09:00
65dda3f24e mcoverlayfs: support mount options(nocopyupw, nofscheck) 2015-11-25 15:34:58 +09:00
544971d665 modify for PVAS 2015-11-25 14:27:20 +09:00
dbddab4356 mcoverlayfs: add overlayfs of the original(kernel 4.0.9) 2015-11-25 13:23:49 +09:00
12eb8a9bb0 mcctrl: move mcctrl to executer/kernel/mcctrl 2015-11-24 15:42:04 +09:00
828a3ea57a futex(): support for cross address-space futexes 2015-11-24 14:58:04 +09:00
109 changed files with 18833 additions and 4099 deletions

View File

@ -3,7 +3,8 @@ SBINDIR = @SBINDIR@
MANDIR = @MANDIR@ MANDIR = @MANDIR@
all:: all::
@(cd executer/kernel; make modules) @(cd executer/kernel/mcctrl; make modules)
@(cd executer/kernel/mcoverlayfs; make modules)
@(cd executer/user; make) @(cd executer/user; make)
@case "$(TARGET)" in \ @case "$(TARGET)" in \
attached-mic | builtin-x86 | builtin-mic | smp-x86) \ attached-mic | builtin-x86 | builtin-mic | smp-x86) \
@ -16,7 +17,8 @@ all::
esac esac
install:: install::
@(cd executer/kernel; make install) @(cd executer/kernel/mcctrl; make install)
@(cd executer/kernel/mcoverlayfs; make install)
@(cd executer/user; make install) @(cd executer/user; make install)
@case "$(TARGET)" in \ @case "$(TARGET)" in \
attached-mic | builtin-x86 | builtin-mic | smp-x86) \ attached-mic | builtin-x86 | builtin-mic | smp-x86) \
@ -56,7 +58,8 @@ install::
esac esac
clean:: clean::
@(cd executer/kernel; make clean) @(cd executer/kernel/mcctrl; make clean)
@(cd executer/kernel/mcoverlayfs; make clean)
@(cd executer/user; make clean) @(cd executer/user; make clean)
@case "$(TARGET)" in \ @case "$(TARGET)" in \
attached-mic | builtin-x86 | builtin-mic | smp-x86) \ attached-mic | builtin-x86 | builtin-mic | smp-x86) \

View File

@ -10,7 +10,7 @@
* HISTORY * HISTORY
*/ */
#define X86_CPU_LOCAL_OFFSET_TSS 128 #define X86_CPU_LOCAL_OFFSET_TSS 176
#define X86_TSS_OFFSET_SP0 4 #define X86_TSS_OFFSET_SP0 4
#define X86_CPU_LOCAL_OFFSET_SP0 \ #define X86_CPU_LOCAL_OFFSET_SP0 \
(X86_CPU_LOCAL_OFFSET_TSS + X86_TSS_OFFSET_SP0) (X86_CPU_LOCAL_OFFSET_TSS + X86_TSS_OFFSET_SP0)

View File

@ -28,9 +28,12 @@
#include <signal.h> #include <signal.h>
#include <process.h> #include <process.h>
#include <cls.h> #include <cls.h>
#include <prctl.h>
#include <page.h>
#define LAPIC_ID 0x020 #define LAPIC_ID 0x020
#define LAPIC_TIMER 0x320 #define LAPIC_TIMER 0x320
#define LAPIC_LVTPC 0x340
#define LAPIC_TIMER_INITIAL 0x380 #define LAPIC_TIMER_INITIAL 0x380
#define LAPIC_TIMER_CURRENT 0x390 #define LAPIC_TIMER_CURRENT 0x390
#define LAPIC_TIMER_DIVIDE 0x3e0 #define LAPIC_TIMER_DIVIDE 0x3e0
@ -40,6 +43,7 @@
#define LAPIC_ICR2 0x310 #define LAPIC_ICR2 0x310
#define LAPIC_ESR 0x280 #define LAPIC_ESR 0x280
#define LOCAL_TIMER_VECTOR 0xef #define LOCAL_TIMER_VECTOR 0xef
#define LOCAL_PERF_VECTOR 0xf0
#define APIC_INT_LEVELTRIG 0x08000 #define APIC_INT_LEVELTRIG 0x08000
#define APIC_INT_ASSERT 0x04000 #define APIC_INT_ASSERT 0x04000
@ -52,15 +56,30 @@
#define APIC_DIVISOR 16 #define APIC_DIVISOR 16
#define APIC_LVT_TIMER_PERIODIC (1 << 17) #define APIC_LVT_TIMER_PERIODIC (1 << 17)
#define APIC_BASE_MSR 0x800
#define IA32_X2APIC_APICID 0x802
#define IA32_X2APIC_ICR 0x830
#define X2APIC_ENABLE (1UL << 10)
#define NMI_VECTOR 0x02
//#define DEBUG_PRINT_CPU //#define DEBUG_PRINT_CPU
#ifdef DEBUG_PRINT_CPU #ifdef DEBUG_PRINT_CPU
#define dkprintf kprintf #define dkprintf kprintf
#define ekprintf kprintf
#else #else
#define dkprintf(...) do { if (0) kprintf(__VA_ARGS__); } while (0) #define dkprintf(...) do { if (0) kprintf(__VA_ARGS__); } while (0)
#define ekprintf kprintf
#endif #endif
static void *lapic_vp;
static int x2apic;
static void (*lapic_write)(int reg, unsigned int value);
static unsigned int (*lapic_read)(int reg);
static void (*lapic_icr_write)(unsigned int h, unsigned int l);
static void (*lapic_wait_icr_idle)(void);
void (*x86_issue_ipi)(unsigned int apicid, unsigned int low);
int running_on_kvm(void);
void init_processors_local(int max_id); void init_processors_local(int max_id);
void assign_processor_id(void); void assign_processor_id(void);
@ -69,7 +88,9 @@ void x86_set_warm_reset(unsigned long ip, char *first_page_va);
void x86_init_perfctr(void); void x86_init_perfctr(void);
int gettime_local_support = 0; int gettime_local_support = 0;
extern int ihk_mc_pt_print_pte(struct page_table *pt, void *virt);
extern int kprintf(const char *format, ...); extern int kprintf(const char *format, ...);
extern int interrupt_from_user(void *);
static struct idt_entry{ static struct idt_entry{
uint32_t desc[4]; uint32_t desc[4];
@ -88,6 +109,12 @@ static uint64_t gdt[] __attribute__((aligned(16))) = {
0x00aff3000000ffff, /* 56 : USER_DS */ 0x00aff3000000ffff, /* 56 : USER_DS */
0x0000890000000067, /* 64 : TSS */ 0x0000890000000067, /* 64 : TSS */
0, /* (72: TSS) */ 0, /* (72: TSS) */
0, /* 80 */
0, /* 88 */
0, /* 96 */
0, /* 104 */
0, /* 112 */
0x0000f10000000000, /* 120 : GETCPU */
}; };
struct tss64 tss __attribute__((aligned(16))); struct tss64 tss __attribute__((aligned(16)));
@ -123,6 +150,12 @@ extern char debug_exception[], int3_exception[];
uint64_t boot_pat_state = 0; uint64_t boot_pat_state = 0;
int no_turbo = 0; /* May be updated by early parsing of kargs */ int no_turbo = 0; /* May be updated by early parsing of kargs */
extern int num_processors; /* kernel/ap.c */
struct pvclock_vsyscall_time_info *pvti = NULL;
int pvti_npages;
static long pvti_msr = -1;
static void init_idt(void) static void init_idt(void)
{ {
int i; int i;
@ -238,25 +271,39 @@ void init_gdt(void)
reload_gdt(&gdt_desc); reload_gdt(&gdt_desc);
} }
static void *lapic_vp; static void
void lapic_write(int reg, unsigned int value) apic_write(int reg, unsigned int value)
{ {
*(volatile unsigned int *)((char *)lapic_vp + reg) = value; *(volatile unsigned int *)((char *)lapic_vp + reg) = value;
} }
unsigned int lapic_read(int reg) static void
x2apic_write(int reg, unsigned int value)
{
reg >>= 4;
reg |= APIC_BASE_MSR;
wrmsr(reg, value);
}
static unsigned int
apic_read(int reg)
{ {
return *(volatile unsigned int *)((char *)lapic_vp + reg); return *(volatile unsigned int *)((char *)lapic_vp + reg);
} }
void lapic_icr_write(unsigned int h, unsigned int l) static unsigned int
x2apic_read(int reg)
{ {
lapic_write(LAPIC_ICR2, (unsigned int)h); unsigned long value;
lapic_write(LAPIC_ICR0, l);
reg >>= 4;
reg |= APIC_BASE_MSR;
value = rdmsr(reg);
return (int)value;
} }
void
void lapic_timer_enable(unsigned int clocks) lapic_timer_enable(unsigned int clocks)
{ {
unsigned int lvtt_value; unsigned int lvtt_value;
@ -268,11 +315,117 @@ void lapic_timer_enable(unsigned int clocks)
lapic_write(LAPIC_TIMER, lvtt_value); lapic_write(LAPIC_TIMER, lvtt_value);
} }
void lapic_timer_disable() void
lapic_timer_disable()
{ {
lapic_write(LAPIC_TIMER_INITIAL, 0); lapic_write(LAPIC_TIMER_INITIAL, 0);
} }
/*
 * Write 0 to the local APIC EOI (end-of-interrupt) register.
 *
 * The write goes through the lapic_write hook, so the same call works
 * for whichever APIC access method was installed by init_lapic_bsp().
 */
void lapic_ack(void)
{
	lapic_write(LAPIC_EOI, 0);
}
/*
 * x2APIC implementation of the "wait until the ICR is idle" hook.
 *
 * Deliberately empty: nothing is polled in x2APIC mode.
 * NOTE(review): presumably because the x2APIC ICR has no delivery-status
 * bit to wait on -- confirm against the Intel SDM.
 */
static void x2apic_wait_icr_idle(void)
{
}
/*
 * Memory-mapped APIC implementation of the "wait until the ICR is idle"
 * hook: poll LAPIC_ICR0 until APIC_ICR_BUSY is no longer set, calling
 * cpu_pause() between polls.
 */
static void apic_wait_icr_idle(void)
{
	for (;;) {
		if (!(lapic_read(LAPIC_ICR0) & APIC_ICR_BUSY)) {
			break;
		}
		cpu_pause();
	}
}
/*
 * Write the x2APIC interrupt command register with a single MSR write.
 *
 * low:    placed in bits 31:0 of the value written to IA32_X2APIC_ICR
 * apicid: placed in bits 63:32 of that value
 *
 * NOTE(review): the parameter order here is (low, apicid), whereas the
 * lapic_icr_write hook this function is assigned to is declared as
 * (h, l) and apic_icr_write() takes the high word first. A call made
 * through the hook would therefore hand the two words over in opposite
 * roles depending on the APIC mode -- confirm no caller relies on the
 * hook before using it.
 */
static void x2apic_icr_write(unsigned int low, unsigned int apicid)
{
	unsigned long icr;

	icr = (unsigned long)apicid << 32;
	icr |= low;

	wrmsr(IA32_X2APIC_ICR, icr);
}
/*
 * Write the interrupt command register of the memory-mapped APIC as two
 * 32-bit register writes.
 *
 * h: value for LAPIC_ICR2 (written first)
 * l: value for LAPIC_ICR0 (written second)
 *
 * Keep the write order as is.
 * NOTE(review): presumably the ICR0 write is what triggers the IPI, so
 * the destination in ICR2 must already be in place -- confirm against
 * the Intel SDM.
 */
static void apic_icr_write(unsigned int h, unsigned int l)
{
	lapic_write(LAPIC_ICR2, h);
	lapic_write(LAPIC_ICR0, l);
}
/*
 * x2APIC implementation of x86_issue_ipi().
 *
 * apicid: destination APIC id, passed on to x2apic_icr_write()
 * low:    low ICR word, passed on to x2apic_icr_write()
 *
 * A memory barrier is issued first, then the ICR is written with
 * interrupts disabled on the local CPU; the previous interrupt state is
 * restored afterwards. No ICR-idle wait is performed on this path.
 */
static void x2apic_x86_issue_ipi(unsigned int apicid, unsigned int low)
{
	unsigned long irqstate;

	ihk_mc_mb();

	irqstate = cpu_disable_interrupt_save();
	x2apic_icr_write(low, apicid);
	cpu_restore_interrupt(irqstate);
}
/*
 * Memory-mapped APIC implementation of x86_issue_ipi().
 *
 * apicid: destination APIC id; shifted left by LAPIC_ICR_ID_SHIFT to
 *         form the high ICR word
 * low:    low ICR word, written unchanged
 *
 * With interrupts disabled on the local CPU, waits for the ICR to become
 * idle and then writes both ICR words; the previous interrupt state is
 * restored afterwards.
 */
static void apic_x86_issue_ipi(unsigned int apicid, unsigned int low)
{
	unsigned long irqstate;
	unsigned int dest;

	irqstate = cpu_disable_interrupt_save();

	apic_wait_icr_idle();
	dest = apicid << LAPIC_ICR_ID_SHIFT;
	apic_icr_write(dest, low);

	cpu_restore_interrupt(irqstate);
}
/*
 * Test the X2APIC_ENABLE bit of MSR_IA32_APIC_BASE.
 *
 * Returns the masked bit itself: non-zero (not necessarily 1) when the
 * bit is set, 0 when it is clear.
 */
unsigned long x2apic_is_enabled()
{
	unsigned long apic_base = rdmsr(MSR_IA32_APIC_BASE);

	return apic_base & X2APIC_ENABLE;
}
/*
 * Select the local APIC access method and install the matching hooks.
 *
 * Records in `x2apic` whether x2apic_is_enabled() reported the x2APIC
 * enable bit, then points lapic_write, lapic_read, lapic_icr_write,
 * lapic_wait_icr_idle and x86_issue_ipi at either the x2apic_* (MSR
 * based) or the apic_* (memory-mapped) implementations.
 */
void init_lapic_bsp(void)
{
	x2apic = x2apic_is_enabled() ? 1 : 0;

	if (!x2apic) {
		/* memory-mapped APIC accessors */
		lapic_write = apic_write;
		lapic_read = apic_read;
		lapic_icr_write = apic_icr_write;
		lapic_wait_icr_idle = apic_wait_icr_idle;
		x86_issue_ipi = apic_x86_issue_ipi;
		return;
	}

	/* MSR-based x2APIC accessors */
	lapic_write = x2apic_write;
	lapic_read = x2apic_read;
	lapic_icr_write = x2apic_icr_write;
	lapic_wait_icr_idle = x2apic_wait_icr_idle;
	x86_issue_ipi = x2apic_x86_issue_ipi;
}
/*
 * Initialise the local APIC.
 *
 * When not in x2APIC mode:
 *   - read MSR_IA32_APIC_BASE and, if lapic_vp is still unset, map one
 *     page at the page-aligned base address for register access;
 *   - write the MSR back with bit 11 (0x800) set to enable the local
 *     APIC.
 * In x2APIC mode this MSR setup is skipped entirely.
 *
 * In both modes, LAPIC_SPURIOUS is then set to 0x1ff and LAPIC_LVTPC to
 * LOCAL_PERF_VECTOR via the lapic_write hook.
 * NOTE(review): 0x1ff is presumably spurious vector 0xff plus the
 * software-enable bit (bit 8) -- confirm against the Intel SDM.
 */
void init_lapic()
{
	if (!x2apic) {
		unsigned long apic_base = rdmsr(MSR_IA32_APIC_BASE);

		if (!lapic_vp) {
			lapic_vp = map_fixed_area(apic_base & PAGE_MASK,
			                          PAGE_SIZE, 1);
		}

		/* Enable Local APIC */
		wrmsr(MSR_IA32_APIC_BASE, apic_base | 0x800);
	}

	lapic_write(LAPIC_SPURIOUS, 0x1ff);
	lapic_write(LAPIC_LVTPC, LOCAL_PERF_VECTOR);
}
void print_msr(int idx) void print_msr(int idx)
{ {
int bit; int bit;
@ -302,6 +455,8 @@ void init_pstate_and_turbo(void)
uint64_t value; uint64_t value;
uint64_t eax, ecx; uint64_t eax, ecx;
if (running_on_kvm()) return;
asm volatile("cpuid" : "=a" (eax), "=c" (ecx) : "a" (0x6) : "%rbx", "%rdx"); asm volatile("cpuid" : "=a" (eax), "=c" (ecx) : "a" (0x6) : "%rbx", "%rdx");
if (!(ecx & 0x01)) { if (!(ecx & 0x01)) {
/* P-states and/or Turbo Boost are not supported. */ /* P-states and/or Turbo Boost are not supported. */
@ -423,26 +578,6 @@ void init_pat(void)
dkprintf("PAT support detected and reconfigured.\n"); dkprintf("PAT support detected and reconfigured.\n");
} }
void init_lapic(void)
{
unsigned long baseaddr;
/* Enable Local APIC */
baseaddr = rdmsr(MSR_IA32_APIC_BASE);
if (!lapic_vp) {
lapic_vp = map_fixed_area(baseaddr & PAGE_MASK, PAGE_SIZE, 1);
}
baseaddr |= 0x800;
wrmsr(MSR_IA32_APIC_BASE, baseaddr);
lapic_write(LAPIC_SPURIOUS, 0x1ff);
}
void lapic_ack(void)
{
lapic_write(LAPIC_EOI, 0);
}
static void set_kstack(unsigned long ptr) static void set_kstack(unsigned long ptr)
{ {
struct x86_cpu_local_variables *v; struct x86_cpu_local_variables *v;
@ -456,11 +591,17 @@ static void init_smp_processor(void)
{ {
struct x86_cpu_local_variables *v; struct x86_cpu_local_variables *v;
unsigned long tss_addr; unsigned long tss_addr;
unsigned node_cpu;
v = get_x86_this_cpu_local(); v = get_x86_this_cpu_local();
tss_addr = (unsigned long)&v->tss; tss_addr = (unsigned long)&v->tss;
v->apic_id = lapic_read(LAPIC_ID) >> LAPIC_ID_SHIFT; if(x2apic_is_enabled()){
v->apic_id = rdmsr(IA32_X2APIC_APICID);
}
else{
v->apic_id = lapic_read(LAPIC_ID) >> LAPIC_ID_SHIFT;
}
memcpy(v->gdt, gdt, sizeof(v->gdt)); memcpy(v->gdt, gdt, sizeof(v->gdt));
@ -471,6 +612,9 @@ static void init_smp_processor(void)
| (0x89UL << 40) | ((tss_addr & 0xff000000) << 32); | (0x89UL << 40) | ((tss_addr & 0xff000000) << 32);
v->gdt[GLOBAL_TSS_ENTRY + 1] = (tss_addr >> 32); v->gdt[GLOBAL_TSS_ENTRY + 1] = (tss_addr >> 32);
node_cpu = v->processor_id; /* assumes NUMA node 0 */
v->gdt[GETCPU_ENTRY] |= node_cpu;
v->gdt_ptr.size = sizeof(v->gdt) - 1; v->gdt_ptr.size = sizeof(v->gdt) - 1;
v->gdt_ptr.address = (unsigned long)v->gdt; v->gdt_ptr.address = (unsigned long)v->gdt;
@ -478,6 +622,11 @@ static void init_smp_processor(void)
reload_gdt(&v->gdt_ptr); reload_gdt(&v->gdt_ptr);
set_kstack((unsigned long)get_x86_this_cpu_kstack()); set_kstack((unsigned long)get_x86_this_cpu_kstack());
/* MSR_IA32_TSC_AUX on KVM seems broken */
if (running_on_kvm()) return;
#define MSR_IA32_TSC_AUX 0xc0000103
wrmsr(MSR_IA32_TSC_AUX, node_cpu);
} }
static char *trampoline_va, *first_page_va; static char *trampoline_va, *first_page_va;
@ -497,9 +646,6 @@ void ihk_mc_init_ap(void)
kprintf("# of cpus : %d\n", cpu_info->ncpus); kprintf("# of cpus : %d\n", cpu_info->ncpus);
init_processors_local(cpu_info->ncpus); init_processors_local(cpu_info->ncpus);
kprintf("IKC IRQ vector: %d, IKC target CPU APIC: %d\n",
ihk_ikc_irq, ihk_ikc_irq_apicid);
/* Do initialization for THIS cpu (BSP) */ /* Do initialization for THIS cpu (BSP) */
assign_processor_id(); assign_processor_id();
@ -621,6 +767,8 @@ void setup_x86(void)
check_no_execute(); check_no_execute();
init_lapic_bsp();
init_cpu(); init_cpu();
init_gettime_support(); init_gettime_support();
@ -671,6 +819,8 @@ void handle_interrupt(int vector, struct x86_user_context *regs)
lapic_ack(); lapic_ack();
++v->in_interrupt; ++v->in_interrupt;
set_cputime(interrupt_from_user(regs)? 1: 2);
dkprintf("CPU[%d] got interrupt, vector: %d, RIP: 0x%lX\n", dkprintf("CPU[%d] got interrupt, vector: %d, RIP: 0x%lX\n",
ihk_mc_get_processor_id(), vector, regs->gpr.rip); ihk_mc_get_processor_id(), vector, regs->gpr.rip);
@ -732,6 +882,15 @@ void handle_interrupt(int vector, struct x86_user_context *regs)
ihk_mc_spinlock_unlock(&v->runq_lock, irqstate); ihk_mc_spinlock_unlock(&v->runq_lock, irqstate);
dkprintf("timer[%lu]: CPU_FLAG_NEED_RESCHED \n", rdtsc()); dkprintf("timer[%lu]: CPU_FLAG_NEED_RESCHED \n", rdtsc());
} }
else if (vector == LOCAL_PERF_VECTOR) {
unsigned long value;
value = rdmsr(MSR_PERF_GLOBAL_STATUS);
wrmsr(MSR_PERF_GLOBAL_OVF_CTRL, value);
wrmsr(MSR_PERF_GLOBAL_OVF_CTRL, 0);
//TODO: counter overflow signal
//set_signal(0x1d, regs, NULL); // SIGIO
}
else if (vector >= IHK_TLB_FLUSH_IRQ_VECTOR_START && else if (vector >= IHK_TLB_FLUSH_IRQ_VECTOR_START &&
vector < IHK_TLB_FLUSH_IRQ_VECTOR_END) { vector < IHK_TLB_FLUSH_IRQ_VECTOR_END) {
@ -745,14 +904,19 @@ void handle_interrupt(int vector, struct x86_user_context *regs)
} }
} }
check_signal(0, regs, 0); if(interrupt_from_user(regs)){
check_need_resched(); cpu_enable_interrupt();
check_signal(0, regs, 0);
check_need_resched();
}
set_cputime(0);
--v->in_interrupt; --v->in_interrupt;
} }
void gpe_handler(struct x86_user_context *regs) void gpe_handler(struct x86_user_context *regs)
{ {
set_cputime(interrupt_from_user(regs)? 1: 2);
kprintf("General protection fault (err: %lx, %lx:%lx)\n", kprintf("General protection fault (err: %lx, %lx:%lx)\n",
regs->gpr.error, regs->gpr.cs, regs->gpr.rip); regs->gpr.error, regs->gpr.cs, regs->gpr.rip);
arch_show_interrupt_context(regs); arch_show_interrupt_context(regs);
@ -760,8 +924,12 @@ void gpe_handler(struct x86_user_context *regs)
panic("gpe_handler"); panic("gpe_handler");
} }
set_signal(SIGSEGV, regs, NULL); set_signal(SIGSEGV, regs, NULL);
check_signal(0, regs, 0); if(interrupt_from_user(regs)){
check_need_resched(); cpu_enable_interrupt();
check_signal(0, regs, 0);
check_need_resched();
}
set_cputime(0);
// panic("GPF"); // panic("GPF");
} }
@ -771,6 +939,7 @@ void debug_handler(struct x86_user_context *regs)
int si_code = 0; int si_code = 0;
struct siginfo info; struct siginfo info;
set_cputime(interrupt_from_user(regs)? 1: 2);
#ifdef DEBUG_PRINT_CPU #ifdef DEBUG_PRINT_CPU
kprintf("debug exception (err: %lx, %lx:%lx)\n", kprintf("debug exception (err: %lx, %lx:%lx)\n",
regs->gpr.error, regs->gpr.cs, regs->gpr.rip); regs->gpr.error, regs->gpr.cs, regs->gpr.rip);
@ -788,14 +957,19 @@ void debug_handler(struct x86_user_context *regs)
memset(&info, '\0', sizeof info); memset(&info, '\0', sizeof info);
info.si_code = si_code; info.si_code = si_code;
set_signal(SIGTRAP, regs, &info); set_signal(SIGTRAP, regs, &info);
check_signal(0, regs, 0); if(interrupt_from_user(regs)){
check_need_resched(); cpu_enable_interrupt();
check_signal(0, regs, 0);
check_need_resched();
}
set_cputime(0);
} }
void int3_handler(struct x86_user_context *regs) void int3_handler(struct x86_user_context *regs)
{ {
struct siginfo info; struct siginfo info;
set_cputime(interrupt_from_user(regs)? 1: 2);
#ifdef DEBUG_PRINT_CPU #ifdef DEBUG_PRINT_CPU
kprintf("int3 exception (err: %lx, %lx:%lx)\n", kprintf("int3 exception (err: %lx, %lx:%lx)\n",
regs->gpr.error, regs->gpr.cs, regs->gpr.rip); regs->gpr.error, regs->gpr.cs, regs->gpr.rip);
@ -805,25 +979,68 @@ void int3_handler(struct x86_user_context *regs)
memset(&info, '\0', sizeof info); memset(&info, '\0', sizeof info);
info.si_code = TRAP_BRKPT; info.si_code = TRAP_BRKPT;
set_signal(SIGTRAP, regs, &info); set_signal(SIGTRAP, regs, &info);
check_signal(0, regs, 0); if(interrupt_from_user(regs)){
check_need_resched(); cpu_enable_interrupt();
} check_signal(0, regs, 0);
check_need_resched();
static void wait_icr_idle(void)
{
while (lapic_read(LAPIC_ICR0) & APIC_ICR_BUSY) {
cpu_pause();
} }
set_cputime(0);
} }
void x86_issue_ipi(unsigned int apicid, unsigned int low) void
unhandled_page_fault(struct thread *thread, void *fault_addr, void *regs)
{ {
unsigned long flags; const uintptr_t address = (uintptr_t)fault_addr;
struct process_vm *vm = thread->vm;
struct vm_range *range;
char found;
unsigned long irqflags;
unsigned long error = ((struct x86_user_context *)regs)->gpr.error;
flags = cpu_disable_interrupt_save(); irqflags = kprintf_lock();
wait_icr_idle(); dkprintf("[%d] Page fault for 0x%lX\n",
lapic_icr_write(apicid << LAPIC_ICR_ID_SHIFT, low); ihk_mc_get_processor_id(), address);
cpu_restore_interrupt(flags); dkprintf("%s for %s access in %s mode (reserved bit %s set), "
"it %s an instruction fetch\n",
(error & PF_PROT ? "protection fault" : "no page found"),
(error & PF_WRITE ? "write" : "read"),
(error & PF_USER ? "user" : "kernel"),
(error & PF_RSVD ? "was" : "wasn't"),
(error & PF_INSTR ? "was" : "wasn't"));
found = 0;
list_for_each_entry(range, &vm->vm_range_list, list) {
if (range->start <= address && range->end > address) {
found = 1;
dkprintf("address is in range, flag: 0x%X! \n",
range->flag);
ihk_mc_pt_print_pte(vm->address_space->page_table, (void*)address);
break;
}
}
if (!found) {
dkprintf("address is out of range! \n");
}
kprintf_unlock(irqflags);
/* TODO */
ihk_mc_debug_show_interrupt_context(regs);
//dkprintf("now dump a core file\n");
//coredump(proc, regs);
#ifdef DEBUG_PRINT_MEM
{
uint64_t *sp = (void *)REGS_GET_STACK_POINTER(regs);
kprintf("*rsp:%lx,*rsp+8:%lx,*rsp+16:%lx,*rsp+24:%lx,\n",
sp[0], sp[1], sp[2], sp[3]);
}
#endif
return;
} }
static void outb(uint8_t v, uint16_t port) static void outb(uint8_t v, uint16_t port)
@ -852,12 +1069,12 @@ static void __x86_wakeup(int apicid, unsigned long ip)
x86_issue_ipi(apicid, x86_issue_ipi(apicid,
APIC_INT_LEVELTRIG | APIC_DM_INIT); APIC_INT_LEVELTRIG | APIC_DM_INIT);
wait_icr_idle(); lapic_wait_icr_idle();
while (retry--) { while (retry--) {
lapic_read(LAPIC_ESR); lapic_read(LAPIC_ESR);
x86_issue_ipi(apicid, APIC_DM_STARTUP | (ip >> 12)); x86_issue_ipi(apicid, APIC_DM_STARTUP | (ip >> 12));
wait_icr_idle(); lapic_wait_icr_idle();
arch_delay(200); arch_delay(200);
@ -868,6 +1085,10 @@ static void __x86_wakeup(int apicid, unsigned long ip)
/** IHK Functions **/ /** IHK Functions **/
/*@
@ assigns \nothing;
@ ensures \interrupt_disabled == 0;
@*/
void cpu_halt(void) void cpu_halt(void)
{ {
asm volatile("hlt"); asm volatile("hlt");
@ -1170,6 +1391,10 @@ void arch_show_extended_context(void)
} }
#endif #endif
/*@
@ requires \valid(reg);
@ assigns \nothing;
@*/
void arch_show_interrupt_context(const void *reg) void arch_show_interrupt_context(const void *reg)
{ {
const struct x86_user_context *uctx = reg; const struct x86_user_context *uctx = reg;
@ -1258,8 +1483,8 @@ int ihk_mc_interrupt_cpu(int cpu, int vector)
} }
/*@ /*@
@ requires \valid(proc); @ requires \valid(thread);
@ ensures proc->fp_regs == NULL; @ ensures thread->fp_regs == NULL;
@*/ @*/
void void
release_fp_regs(struct thread *thread) release_fp_regs(struct thread *thread)
@ -1274,6 +1499,9 @@ release_fp_regs(struct thread *thread)
thread->fp_regs = NULL; thread->fp_regs = NULL;
} }
/*@
@ requires \valid(thread);
@*/
void void
save_fp_regs(struct thread *thread) save_fp_regs(struct thread *thread)
{ {
@ -1305,6 +1533,10 @@ save_fp_regs(struct thread *thread)
} }
} }
/*@
@ requires \valid(thread);
@ assigns thread->fp_regs;
@*/
void void
restore_fp_regs(struct thread *thread) restore_fp_regs(struct thread *thread)
{ {
@ -1353,8 +1585,186 @@ ihk_mc_user_context_t *lookup_user_context(struct thread *thread)
return uctx; return uctx;
} /* lookup_user_context() */ } /* lookup_user_context() */
extern long do_arch_prctl(unsigned long code, unsigned long address);
void zero_tsc(void) void
ihk_mc_init_user_tlsbase(ihk_mc_user_context_t *ctx,
unsigned long tls_base_addr)
{ {
wrmsr(MSR_IA32_TIME_STAMP_COUNTER, 0); do_arch_prctl(ARCH_SET_FS, tls_base_addr);
} }
/*@
@ assigns \nothing;
@*/
/* Tick set-up hook: currently does nothing except emit a debug trace. */
void init_tick(void)
{
	dkprintf("init_tick():\n");
}
/*@
@ assigns \nothing;
@*/
/* Delay-calibration hook: currently does nothing except emit a debug trace. */
void init_delay(void)
{
	dkprintf("init_delay():\n");
}
/*@
@ assigns \nothing;
@*/
/* Tick-synchronisation hook: currently does nothing except emit a debug trace. */
void sync_tick(void)
{
	dkprintf("sync_tick():\n");
}
#define KVM_CPUID_SIGNATURE 0x40000000
#define KVM_CPUID_FEATURES 0x40000001
#define KVM_FEATURE_CLOCKSOURCE 0
#define KVM_FEATURE_CLOCKSOURCE2 3
#define MSR_KVM_SYSTEM_TIME 0x12
#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01

/*
 * Probe for the KVM paravirtual clock.
 *
 * Queries CPUID leaf KVM_CPUID_SIGNATURE and requires the "KVMKVMKVM"
 * signature in ebx/ecx/edx; eax must be either 0 or at least
 * KVM_CPUID_FEATURES.  It then reads the feature leaf and picks the MSR to
 * use, preferring CLOCKSOURCE2 over CLOCKSOURCE.
 *
 * Side effect: stores the selected MSR number in pvti_msr.
 * Returns 1 when a clock source is available, 0 otherwise.
 */
static int is_pvclock_available(void)
{
	uint32_t eax;
	uint32_t ebx;
	uint32_t ecx;
	uint32_t edx;
	int is_kvm;

	dkprintf("is_pvclock_available()\n");

	asm ("cpuid" : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
			: "a" (KVM_CPUID_SIGNATURE));

	/* "KVMK" "VMKV" "M\0\0\0" as little-endian 32-bit words */
	is_kvm = ((eax == 0) || (eax >= 0x40000001))
		&& (ebx == 0x4b4d564b)
		&& (ecx == 0x564b4d56)
		&& (edx == 0x0000004d);
	if (!is_kvm) {
		dkprintf("is_pvclock_available(): false (not kvm)\n");
		return 0;
	}

	asm ("cpuid" : "=a"(eax)
			: "a"(KVM_CPUID_FEATURES)
			: "%ebx", "%ecx", "%edx");

	if (eax & (1 << KVM_FEATURE_CLOCKSOURCE2)) {
		pvti_msr = MSR_KVM_SYSTEM_TIME_NEW;
		dkprintf("is_pvclock_available(): true (new)\n");
		return 1;
	}

	if (eax & (1 << KVM_FEATURE_CLOCKSOURCE)) {
		pvti_msr = MSR_KVM_SYSTEM_TIME;
		dkprintf("is_pvclock_available(): true (old)\n");
		return 1;
	}

	dkprintf("is_pvclock_available(): false (not supported)\n");
	return 0;
} /* is_pvclock_available() */
/*
 * Allocate and zero the per-CPU pvclock area (one *pvti entry per
 * processor, rounded up to whole pages).
 *
 * Returns 0 on success and also when the pvclock is not available (in which
 * case pvti is left untouched), or -ENOMEM if the pages cannot be allocated.
 */
int arch_setup_pvclock(void)
{
	size_t bytes;
	int page_count;

	dkprintf("arch_setup_pvclock()\n");

	if (!is_pvclock_available()) {
		dkprintf("arch_setup_pvclock(): not supported\n");
		return 0;
	}

	bytes = num_processors * sizeof(*pvti);
	page_count = (bytes + PAGE_SIZE - 1) / PAGE_SIZE;
	pvti_npages = page_count;

	pvti = allocate_pages(page_count, IHK_MC_AP_NOWAIT);
	if (pvti == NULL) {
		ekprintf("arch_setup_pvclock: allocate_pages failed.\n");
		return -ENOMEM;
	}

	memset(pvti, 0, PAGE_SIZE*page_count);
	dkprintf("arch_setup_pvclock(): ok\n");
	return 0;
} /* arch_setup_pvclock() */
void arch_start_pvclock(void)
{
int cpu;
intptr_t phys;
dkprintf("arch_start_pvclock()\n");
if (!pvti) {
dkprintf("arch_start_pvclock(): not supported\n");
return;
}
cpu = ihk_mc_get_processor_id();
phys = virt_to_phys(&pvti[cpu]);
#define KVM_SYSTEM_TIME_ENABLE 0x1
wrmsr(pvti_msr, phys|KVM_SYSTEM_TIME_ENABLE);
dkprintf("arch_start_pvclock(): ok\n");
return;
} /* arch_start_pvclock() */
static struct cpu_mapping *cpu_mapping = NULL;
/*
 * Hand out the table that maps logical CPU numbers to hardware (APIC) ids.
 *
 * The table is built on the first call and cached in cpu_mapping; later
 * calls return the same pointer.
 *
 * @buf:     receives the pointer to the table
 * @nelemsp: receives the number of entries (num_processors)
 *
 * Returns 0 on success or -ENOMEM if the table cannot be allocated, in
 * which case *buf and *nelemsp are not written.
 */
int arch_get_cpu_mapping(struct cpu_mapping **buf, int *nelemsp)
{
	struct cpu_mapping *table;
	struct x86_cpu_local_variables *local;
	size_t bytes;
	int page_count;
	int i;

	if (cpu_mapping == NULL) {
		bytes = sizeof(*table) * num_processors;
		page_count = (bytes + PAGE_SIZE - 1) >> PAGE_SHIFT;

		table = allocate_pages(page_count, IHK_MC_AP_NOWAIT);
		if (table == NULL) {
			ekprintf("arch_get_cpu_mapping:allocate_pages failed. %d\n", -ENOMEM);
			return -ENOMEM;
		}

		for (i = 0; i < num_processors; ++i) {
			local = get_x86_cpu_local_variable(i);
			table[i].cpu_number = i;
			table[i].hw_id = local->apic_id;
		}

		/* publish only once the table is completely filled in */
		cpu_mapping = table;
	}

	*buf = cpu_mapping;
	*nelemsp = num_processors;
	return 0;
} /* arch_get_cpu_mapping() */
#define KVM_CPUID_SIGNATURE 0x40000000

/*
 * Tell whether the kernel is running as a KVM guest.
 *
 * Executes CPUID leaf KVM_CPUID_SIGNATURE and compares ebx/ecx/edx with the
 * 12-byte signature "KVMKVMKVM\0\0\0", expressed as three little-endian
 * 32-bit words.  Comparing against integer constants avoids reading a char
 * array through a uint32_t pointer, which breaks the strict-aliasing rule
 * and does not guarantee alignment.
 *
 * Returns 1 when the signature matches, 0 otherwise.
 */
int running_on_kvm(void) {
	uint32_t eax;
	uint32_t ebx;
	uint32_t ecx;
	uint32_t edx;

	__asm__ __volatile__("cpuid"
			: "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
			: "a"(KVM_CPUID_SIGNATURE));

	/* eax (highest hypervisor leaf) is not part of the signature */
	(void)eax;

	return (ebx == 0x4b4d564b	/* "KVMK" */
		&& ecx == 0x564b4d56	/* "VMKV" */
		&& edx == 0x0000004d);	/* "M\0\0\0" */
}
/*** end of file ***/

View File

@ -271,6 +271,17 @@ void fill_note(void *note, struct thread *thread, void *regs)
* should be zero. * should be zero.
*/ */
/*@
@ requires \valid(thread);
@ requires \valid(regs);
@ requires \valid(coretable);
@ requires \valid(chunks);
@ behavior success:
@ ensures \result == 0;
@ assigns coretable;
@ behavior failure:
@ ensures \result == -1;
@*/
int gencore(struct thread *thread, void *regs, int gencore(struct thread *thread, void *regs,
struct coretable **coretable, int *chunks) struct coretable **coretable, int *chunks)
{ {
@ -510,6 +521,10 @@ int gencore(struct thread *thread, void *regs,
* \param coretable An array of core chunks. * \param coretable An array of core chunks.
*/ */
/*@
@ requires \valid(coretable);
@ assigns \nothing;
@*/
void freecore(struct coretable **coretable) void freecore(struct coretable **coretable)
{ {
struct coretable *ct = *coretable; struct coretable *ct = *coretable;

View File

@ -0,0 +1,96 @@
/**
* \file arch-bitops.h
* License details are found in the file LICENSE.
* \brief
* Find last set bit in word.
* \author Taku Shimosawa <shimosawa@is.s.u-tokyo.ac.jp> \par
* Copyright (C) 2011 - 2012 Taku Shimosawa
*/
/*
* HISTORY
*/
#ifndef HEADER_X86_COMMON_ARCH_BITOPS_H
#define HEADER_X86_COMMON_ARCH_BITOPS_H
/**
 * fls - find last (most significant) set bit in word
 * @x: the word to search
 *
 * Returns 0 if @x is 0, otherwise the 1-based position of the most
 * significant set bit (so fls(1) == 1 and fls(0x80000000) == 32).
 */
static inline int fls(int x)
{
	/* __builtin_clz() is undefined for 0, so that case is handled here. */
	if (x == 0)
		return 0;

	/* 32 == width of the word examined, as with the former bsrl */
	return 32 - __builtin_clz((unsigned int)x);
}
/**
* ffs - find first set bit in word
* @x: the word to search
*
* This is defined the same way as the libc and compiler builtin ffs
* routines, therefore differs in spirit from the other bitops.
*
* ffs(value) returns 0 if value is 0 or the position of the first
* set bit if value is nonzero. The first (least significant) bit
* is at position 1.
*/
static inline int ffs(int x)
{
	/*
	 * Written with __builtin_ctz() instead of __builtin_ffs(): the
	 * compiler is allowed to lower the latter to a call to ffs(), which
	 * would be this very function.  __builtin_ctz() is undefined for 0,
	 * hence the explicit check.
	 */
	if (x == 0)
		return 0;

	return __builtin_ctz((unsigned int)x) + 1;
}
/**
* __ffs - find first set bit in word
* @word: The word to search
*
* Undefined if no bit exists, so code should check against 0 first.
*/
static inline unsigned long __ffs(unsigned long word)
{
	/*
	 * Zero-based index of the least significant set bit.  As with the
	 * bsf instruction this replaces, the result is undefined for 0.
	 */
	return (unsigned long)__builtin_ctzl(word);
}
/**
* ffz - find first zero bit in word
* @word: The word to search
*
* Undefined if no zero exists, so code should check against ~0UL first.
*/
static inline unsigned long ffz(unsigned long word)
{
	/*
	 * The first zero bit of word is the first set bit of ~word.
	 * Undefined when word is ~0UL, because __builtin_ctzl(0) is undefined.
	 */
	return (unsigned long)__builtin_ctzl(~word);
}
#define ADDR (*(volatile long *)addr)
/**
 * set_bit - atomically set a bit in memory
 * @nr: index of the bit to set
 * @addr: address the bit index is relative to
 *
 * Implemented with a lock-prefixed btsl on ADDR, i.e. on
 * *(volatile long *)addr.  The "memory" clobber additionally makes the
 * statement a compiler barrier.
 */
static inline void set_bit(int nr, volatile unsigned long *addr)
{
asm volatile("lock; btsl %1,%0"
: "+m" (ADDR)
: "Ir" (nr)
: "memory");
}
/**
 * clear_bit - atomically clear a bit in memory
 * @nr: index of the bit to clear
 * @addr: address the bit index is relative to
 *
 * Implemented with a lock-prefixed btrl on ADDR, i.e. on
 * *(volatile long *)addr.  The "memory" clobber additionally makes the
 * statement a compiler barrier.
 */
static inline void clear_bit(int nr, volatile unsigned long *addr)
{
asm volatile("lock; btrl %1,%0"
: "+m" (ADDR)
: "Ir" (nr)
: "memory");
}
#endif

View File

@ -0,0 +1,67 @@
/**
* \file futex.h
* License details are found in the file LICENSE.
*
* \brief
* Futex adaptation to McKernel
*
* \author Balazs Gerofi <bgerofi@riken.jp> \par
* Copyright (C) 2012 RIKEN AICS
*
*
* HISTORY:
*
*/
#ifndef _ARCH_FUTEX_H
#define _ARCH_FUTEX_H
#include <asm.h>
/*
 * __futex_atomic_op1 - futex operation that is a single instruction.
 *
 * insn:   assembler template executed at label 1
 * ret:    lvalue, preloaded with 0 (constraint "1" (0)); the fixup code at
 *         label 3 overwrites it with -EFAULT
 * oldval: lvalue sharing a register with oparg on entry (constraint "0")
 * uaddr:  pointer to the futex word, passed as a read-write memory operand
 * oparg:  operand of the operation
 *
 * The fixup code lives in the .fixup section and jumps back to label 2.
 * NOTE(review): _ASM_EXTABLE(1b, 3b) presumably registers label 3 as the
 * fault handler for the instruction at label 1 - confirm against <asm.h>.
 */
#define __futex_atomic_op1(insn, ret, oldval, uaddr, oparg) \
asm volatile("1:\t" insn "\n" \
"2:\t.section .fixup,\"ax\"\n" \
"3:\tmov\t%3, %1\n" \
"\tjmp\t2b\n" \
"\t.previous\n" \
_ASM_EXTABLE(1b, 3b) \
: "=r" (oldval), "=r" (ret), "+m" (*uaddr) \
: "i" (-EFAULT), "0" (oparg), "1" (0))
/*
 * __futex_atomic_op2 - futex operation built as a compare-and-exchange loop.
 *
 * Label 1 loads *uaddr into oldval (eax) and copies it to tem; insn then
 * transforms tem using oparg (%4).  Label 2 tries to store tem with
 * "lock; cmpxchgl" and loops back to label 1 when the futex word changed
 * in the meantime.
 *
 * ret is preloaded with 0 (constraint "1" (0)); the fixup code at label 4
 * overwrites it with -EFAULT and resumes at label 3.
 *
 * The expansion uses a variable named tem that is not a macro parameter,
 * so the caller must have one in scope.
 * NOTE(review): the two _ASM_EXTABLE entries presumably route faults at
 * labels 1 and 2 to label 4 - confirm against <asm.h>.
 */
#define __futex_atomic_op2(insn, ret, oldval, uaddr, oparg) \
asm volatile("1:\tmovl %2, %0\n" \
"\tmovl\t%0, %3\n" \
"\t" insn "\n" \
"2:\tlock; cmpxchgl %3, %2\n" \
"\tjnz\t1b\n" \
"3:\t.section .fixup,\"ax\"\n" \
"4:\tmov\t%5, %1\n" \
"\tjmp\t3b\n" \
"\t.previous\n" \
_ASM_EXTABLE(1b, 4b) \
_ASM_EXTABLE(2b, 4b) \
: "=&a" (oldval), "=&r" (ret), \
"+m" (*uaddr), "=&r" (tem) \
: "r" (oparg), "i" (-EFAULT), "1" (0))
/*
 * futex_atomic_cmpxchg_inatomic - compare-and-exchange on a user futex word.
 *
 * uaddr:  user-space address of the futex word
 * oldval: value the word is expected to hold
 * newval: value to store if the word equals oldval
 *
 * Returns whatever "lock; cmpxchgl" leaves in eax, which is the value found
 * at *uaddr (equal to oldval when the exchange happened).  The fixup code at
 * label 3 replaces that with -EFAULT, so -EFAULT is indistinguishable from a
 * futex word holding the same value.
 * NOTE(review): _ASM_EXTABLE(1b, 3b) presumably makes a fault on the
 * cmpxchgl land on label 3 - confirm against <asm.h>.
 */
static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval,
int newval)
{
#ifdef __UACCESS__
/* Range check is compiled in only when __UACCESS__ is defined. */
if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
return -EFAULT;
#endif
/* %0 = oldval (eax, in/out), %1 = *uaddr, %2 = -EFAULT, %3 = newval */
asm volatile("1:\tlock; cmpxchgl %3, %1\n"
"2:\t.section .fixup, \"ax\"\n"
"3:\tmov %2, %0\n"
"\tjmp 2b\n"
"\t.previous\n"
_ASM_EXTABLE(1b, 3b)
: "=a" (oldval), "+m" (*uaddr)
: "i" (-EFAULT), "r" (newval), "0" (oldval)
: "memory"
);
return oldval;
}
#endif

View File

@ -248,6 +248,7 @@ mcs_rwlock_unlock_readers(struct mcs_rwlock_lock *lock)
struct mcs_rwlock_node *p; struct mcs_rwlock_node *p;
struct mcs_rwlock_node *f = NULL; struct mcs_rwlock_node *f = NULL;
struct mcs_rwlock_node *n; struct mcs_rwlock_node *n;
int breakf = 0;
ihk_atomic_inc(&lock->reader.count); // protect to unlock reader ihk_atomic_inc(&lock->reader.count); // protect to unlock reader
for(p = &lock->reader; p->next; p = n){ for(p = &lock->reader; p->next; p = n){
@ -268,6 +269,9 @@ mcs_rwlock_unlock_readers(struct mcs_rwlock_lock *lock)
} }
p->next = n->next; p->next = n->next;
} }
else{
breakf = 1;
}
} }
else if(p->next == NULL){ else if(p->next == NULL){
while (n->next == NULL) { while (n->next == NULL) {
@ -282,6 +286,8 @@ mcs_rwlock_unlock_readers(struct mcs_rwlock_lock *lock)
else else
f = n; f = n;
n = p; n = p;
if(breakf)
break;
} }
if(n->next == NULL && lock->node != n){ if(n->next == NULL && lock->node != n){
while (n->next == NULL && lock->node != n) { while (n->next == NULL && lock->node != n) {
@ -340,6 +346,24 @@ __kprintf("[%d] ret mcs_rwlock_reader_lock_noirq\n", ihk_mc_get_processor_id());
#else #else
#define mcs_rwlock_reader_lock_noirq __mcs_rwlock_reader_lock_noirq #define mcs_rwlock_reader_lock_noirq __mcs_rwlock_reader_lock_noirq
#endif #endif
/*
 * Atomically increment the counter of v unless it is zero.
 *
 * Returns the value observed before the increment; a return value of 0
 * means the counter was zero and has been left unchanged.
 */
static inline unsigned int
atomic_inc_ifnot0(ihk_atomic_t *v)
{
	unsigned int *counter = (unsigned int *)(&(v)->counter);
	unsigned int seen;

	for (;;) {
		seen = *counter;
		if (seen == 0)
			return 0;

		/* retry if another CPU changed the counter in between */
		if ((unsigned int)atomic_cmpxchg4(counter, seen, seen + 1) == seen)
			return seen;
	}
}
static void static void
__mcs_rwlock_reader_lock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node *node) __mcs_rwlock_reader_lock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node *node)
{ {
@ -356,7 +380,7 @@ __mcs_rwlock_reader_lock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_n
if (pred) { if (pred) {
if(pred == &lock->reader){ if(pred == &lock->reader){
if(ihk_atomic_inc_return(&pred->count) != 1){ if(atomic_inc_ifnot0(&pred->count)){
struct mcs_rwlock_node *old; struct mcs_rwlock_node *old;
old = (struct mcs_rwlock_node *)atomic_cmpxchg8( old = (struct mcs_rwlock_node *)atomic_cmpxchg8(
@ -372,12 +396,12 @@ __mcs_rwlock_reader_lock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_n
cpu_pause(); cpu_pause();
} }
pred->next = node->next; node->locked = MCS_RWLOCK_LOCKED;
if(node->next->type == MCS_RWLOCK_TYPE_READER) lock->reader.next = node;
mcs_rwlock_unlock_readers(lock); mcs_rwlock_unlock_readers(lock);
ihk_atomic_dec(&pred->count);
goto out; goto out;
} }
ihk_atomic_dec(&pred->count);
} }
node->locked = MCS_RWLOCK_LOCKED; node->locked = MCS_RWLOCK_LOCKED;
pred->next = node; pred->next = node;

View File

@ -22,6 +22,7 @@
#define USER_CS_ENTRY 6 #define USER_CS_ENTRY 6
#define USER_DS_ENTRY 7 #define USER_DS_ENTRY 7
#define GLOBAL_TSS_ENTRY 8 #define GLOBAL_TSS_ENTRY 8
#define GETCPU_ENTRY 15
#define KERNEL_CS (KERNEL_CS_ENTRY * 8) #define KERNEL_CS (KERNEL_CS_ENTRY * 8)
#define KERNEL_DS (KERNEL_DS_ENTRY * 8) #define KERNEL_DS (KERNEL_DS_ENTRY * 8)
@ -40,10 +41,12 @@
#define LARGE_PAGE_P2ALIGN (LARGE_PAGE_SHIFT - PAGE_SHIFT) #define LARGE_PAGE_P2ALIGN (LARGE_PAGE_SHIFT - PAGE_SHIFT)
#define USER_END 0x0000800000000000UL #define USER_END 0x0000800000000000UL
#define TASK_UNMAPPED_BASE 0x00002AAAAAA00000UL
#define MAP_ST_START 0xffff800000000000UL #define MAP_ST_START 0xffff800000000000UL
#define MAP_VMAP_START 0xfffff00000000000UL #define MAP_VMAP_START 0xfffff00000000000UL
#define MAP_FIXED_START 0xffffffff70000000UL #define MAP_FIXED_START 0xffffffff70000000UL
#define MAP_KERNEL_START 0xffffffff80000000UL #define MAP_KERNEL_START 0xffffffff80000000UL
#define STACK_TOP(region) ((region)->user_end)
#define MAP_VMAP_SIZE 0x0000000100000000UL #define MAP_VMAP_SIZE 0x0000000100000000UL
@ -65,6 +68,8 @@
#define PF_PRESENT ((pte_t)0x01) /* entry is valid */ #define PF_PRESENT ((pte_t)0x01) /* entry is valid */
#define PF_WRITABLE ((pte_t)0x02) #define PF_WRITABLE ((pte_t)0x02)
#define PFLX_PWT ((pte_t)0x08)
#define PFLX_PCD ((pte_t)0x10)
#define PF_SIZE ((pte_t)0x80) /* entry points large page */ #define PF_SIZE ((pte_t)0x80) /* entry points large page */
#define PFL4_PRESENT ((pte_t)0x01) #define PFL4_PRESENT ((pte_t)0x01)
@ -74,8 +79,8 @@
#define PFL3_PRESENT ((pte_t)0x01) #define PFL3_PRESENT ((pte_t)0x01)
#define PFL3_WRITABLE ((pte_t)0x02) #define PFL3_WRITABLE ((pte_t)0x02)
#define PFL3_USER ((pte_t)0x04) #define PFL3_USER ((pte_t)0x04)
#define PFL3_PWT ((pte_t)0x08) #define PFL3_PWT PFLX_PWT
#define PFL3_PCD ((pte_t)0x10) #define PFL3_PCD PFLX_PCD
#define PFL3_ACCESSED ((pte_t)0x20) #define PFL3_ACCESSED ((pte_t)0x20)
#define PFL3_DIRTY ((pte_t)0x40) #define PFL3_DIRTY ((pte_t)0x40)
#define PFL3_SIZE ((pte_t)0x80) /* Used in 1G page */ #define PFL3_SIZE ((pte_t)0x80) /* Used in 1G page */
@ -86,8 +91,8 @@
#define PFL2_PRESENT ((pte_t)0x01) #define PFL2_PRESENT ((pte_t)0x01)
#define PFL2_WRITABLE ((pte_t)0x02) #define PFL2_WRITABLE ((pte_t)0x02)
#define PFL2_USER ((pte_t)0x04) #define PFL2_USER ((pte_t)0x04)
#define PFL2_PWT ((pte_t)0x08) #define PFL2_PWT PFLX_PWT
#define PFL2_PCD ((pte_t)0x10) #define PFL2_PCD PFLX_PCD
#define PFL2_ACCESSED ((pte_t)0x20) #define PFL2_ACCESSED ((pte_t)0x20)
#define PFL2_DIRTY ((pte_t)0x40) #define PFL2_DIRTY ((pte_t)0x40)
#define PFL2_SIZE ((pte_t)0x80) /* Used in 2M page */ #define PFL2_SIZE ((pte_t)0x80) /* Used in 2M page */
@ -98,8 +103,8 @@
#define PFL1_PRESENT ((pte_t)0x01) #define PFL1_PRESENT ((pte_t)0x01)
#define PFL1_WRITABLE ((pte_t)0x02) #define PFL1_WRITABLE ((pte_t)0x02)
#define PFL1_USER ((pte_t)0x04) #define PFL1_USER ((pte_t)0x04)
#define PFL1_PWT ((pte_t)0x08) #define PFL1_PWT PFLX_PWT
#define PFL1_PCD ((pte_t)0x10) #define PFL1_PCD PFLX_PCD
#define PFL1_ACCESSED ((pte_t)0x20) #define PFL1_ACCESSED ((pte_t)0x20)
#define PFL1_DIRTY ((pte_t)0x40) #define PFL1_DIRTY ((pte_t)0x40)
#define PFL1_IGNORED_11 ((pte_t)1 << 11) #define PFL1_IGNORED_11 ((pte_t)1 << 11)
@ -152,6 +157,8 @@ enum ihk_mc_pt_attribute {
PTATTR_WRITE_COMBINED = 0x40000, PTATTR_WRITE_COMBINED = 0x40000,
}; };
enum ihk_mc_pt_attribute attr_mask;
static inline int pte_is_null(pte_t *ptep) static inline int pte_is_null(pte_t *ptep)
{ {
return (*ptep == PTE_NULL); return (*ptep == PTE_NULL);
@ -207,6 +214,27 @@ static inline off_t pte_get_off(pte_t *ptep, size_t pgsize)
return (off_t)(*ptep & PAGE_MASK); return (off_t)(*ptep & PAGE_MASK);
} }
/*
 * Translate a page table entry into ihk_mc_pt_attribute flags.
 *
 * ptep:   entry to decode
 * pgsize: size of the page the entry maps; used to decide whether the
 *         entry's size bit denotes a large page
 *
 * Bits selected by attr_mask are copied unchanged.  PWT together with PCD
 * yields PTATTR_UNCACHABLE, PWT alone yields PTATTR_WRITE_COMBINED, and PCD
 * without PWT adds nothing.
 */
static inline enum ihk_mc_pt_attribute pte_get_attr(pte_t *ptep, size_t pgsize)
{
	enum ihk_mc_pt_attribute attr = *ptep & attr_mask;
	int large;

	if (*ptep & PFLX_PWT) {
		attr |= (*ptep & PFLX_PCD) ? PTATTR_UNCACHABLE
					   : PTATTR_WRITE_COMBINED;
	}

	large = ((pgsize == PTL2_SIZE) && (*ptep & PFL2_SIZE))
		|| ((pgsize == PTL3_SIZE) && (*ptep & PFL3_SIZE));
	if (large) {
		attr |= PTATTR_LARGEPAGE;
	}

	return attr;
} /* pte_get_attr() */
static inline void pte_make_null(pte_t *ptep, size_t pgsize) static inline void pte_make_null(pte_t *ptep, size_t pgsize)
{ {
*ptep = PTE_NULL; *ptep = PTE_NULL;

View File

@ -0,0 +1,18 @@
/**
* \file auxvec.h
* License details are found in the file LICENSE.
* \brief
* Declare architecture-dependent constants for auxiliary vector
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com>
* Copyright (C) 2016 RIKEN AICS
*/
/*
* HISTORY
*/
#ifndef ARCH_AUXVEC_H
#define ARCH_AUXVEC_H
#define AT_SYSINFO_EHDR 33
#endif

View File

@ -25,4 +25,13 @@ static inline void wmb(void)
barrier(); barrier();
} }
/*
 * Read the time-stamp counter with rdtsc and return it as one 64-bit value
 * (edx supplies the high half, eax the low half).
 *
 * Declared inline because this lives in a header: a plain static function
 * would be instantiated, and reported as unused, in every file including it.
 */
static inline unsigned long read_tsc(void)
{
	unsigned int low, high;

	__asm__ __volatile__("rdtsc" : "=a"(low), "=d"(high));

	return (low | ((unsigned long)high << 32));
}
#endif /* ARCH_CPU_H */ #endif /* ARCH_CPU_H */

View File

@ -0,0 +1,16 @@
#ifndef __ARCH_MM_H
#define __ARCH_MM_H
struct process_vm;
/*
 * Empty stub for this architecture.  Declared with (void) so the definition
 * is a real prototype and calls with arguments are rejected at compile time.
 */
static inline void
flush_nfo_tlb(void)
{
}
/* Empty stub for this architecture; vm is not used. */
static inline void
flush_nfo_tlb_mm(struct process_vm *vm)
{
}
#endif

View File

@ -27,6 +27,10 @@
#define MAP_STACK 0x00020000 #define MAP_STACK 0x00020000
#define MAP_HUGETLB 0x00040000 #define MAP_HUGETLB 0x00040000
/*
 * Huge page size selection: log2 of the page size, stored in the flag bits
 * starting at MAP_HUGE_SHIFT (21 -> 2 MiB, 30 -> 1 GiB).
 */
#define MAP_HUGE_SHIFT 26
#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT)
#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT)
/* /*
* for mlockall() * for mlockall()
*/ */

View File

@ -13,6 +13,11 @@
#ifndef HEADER_ARCH_SHM_H #ifndef HEADER_ARCH_SHM_H
#define HEADER_ARCH_SHM_H #define HEADER_ARCH_SHM_H
/* shmflg */
/*
 * Huge page size selection: log2 of the page size, stored in the flag bits
 * starting at SHM_HUGE_SHIFT (21 -> 2 MiB, 30 -> 1 GiB).
 */
#define SHM_HUGE_SHIFT 26
#define SHM_HUGE_2MB (21 << SHM_HUGE_SHIFT)
#define SHM_HUGE_1GB (30 << SHM_HUGE_SHIFT)
struct ipc_perm { struct ipc_perm {
key_t key; key_t key;
uid_t uid; uid_t uid;
@ -34,7 +39,8 @@ struct shmid_ds {
pid_t shm_cpid; pid_t shm_cpid;
pid_t shm_lpid; pid_t shm_lpid;
uint64_t shm_nattch; uint64_t shm_nattch;
uint8_t padding[16]; uint8_t padding[12];
int init_pgshift;
}; };
#endif /* HEADER_ARCH_SHM_H */ #endif /* HEADER_ARCH_SHM_H */

View File

@ -22,7 +22,7 @@
* - 4096 : kernel stack * - 4096 : kernel stack
*/ */
#define X86_CPU_LOCAL_OFFSET_TSS 128 #define X86_CPU_LOCAL_OFFSET_TSS 176
#define X86_CPU_LOCAL_OFFSET_KSTACK 16 #define X86_CPU_LOCAL_OFFSET_KSTACK 16
#define X86_CPU_LOCAL_OFFSET_USTACK 24 #define X86_CPU_LOCAL_OFFSET_USTACK 24
@ -39,13 +39,13 @@ struct x86_cpu_local_variables {
struct x86_desc_ptr gdt_ptr; struct x86_desc_ptr gdt_ptr;
unsigned short pad[3]; unsigned short pad[3];
/* 48 */ /* 48 */
uint64_t gdt[10]; uint64_t gdt[16];
/* 128 */ /* 176 */
struct tss64 tss; struct tss64 tss;
/* 232 */ /* 280 */
unsigned long paniced; unsigned long paniced;
uint64_t panic_regs[21]; uint64_t panic_regs[21];
/* 408 */ /* 456 */
} __attribute__((packed)); } __attribute__((packed));
struct x86_cpu_local_variables *get_x86_cpu_local_variable(int id); struct x86_cpu_local_variables *get_x86_cpu_local_variable(int id);

View File

@ -1,40 +1,7 @@
#ifndef _ASM_GENERIC_ERRNO_BASE_H #ifndef _ERRNO_BASE_H
#define _ASM_GENERIC_ERRNO_BASE_H #define _ERRNO_BASE_H
#define EPERM 1 /* Operation not permitted */ #include <generic-errno.h>
#define ENOENT 2 /* No such file or directory */
#define ESRCH 3 /* No such process */
#define EINTR 4 /* Interrupted system call */
#define EIO 5 /* I/O error */
#define ENXIO 6 /* No such device or address */
#define E2BIG 7 /* Argument list too long */
#define ENOEXEC 8 /* Exec format error */
#define EBADF 9 /* Bad file number */
#define ECHILD 10 /* No child processes */
#define EAGAIN 11 /* Try again */
#define ENOMEM 12 /* Out of memory */
#define EACCES 13 /* Permission denied */
#define EFAULT 14 /* Bad address */
#define ENOTBLK 15 /* Block device required */
#define EBUSY 16 /* Device or resource busy */
#define EEXIST 17 /* File exists */
#define EXDEV 18 /* Cross-device link */
#define ENODEV 19 /* No such device */
#define ENOTDIR 20 /* Not a directory */
#define EISDIR 21 /* Is a directory */
#define EINVAL 22 /* Invalid argument */
#define ENFILE 23 /* File table overflow */
#define EMFILE 24 /* Too many open files */
#define ENOTTY 25 /* Not a typewriter */
#define ETXTBSY 26 /* Text file busy */
#define EFBIG 27 /* File too large */
#define ENOSPC 28 /* No space left on device */
#define ESPIPE 29 /* Illegal seek */
#define EROFS 30 /* Read-only file system */
#define EMLINK 31 /* Too many links */
#define EPIPE 32 /* Broken pipe */
#define EDOM 33 /* Math argument out of domain of func */
#define ERANGE 34 /* Math result not representable */
#define EDEADLK 35 /* Resource deadlock would occur */ #define EDEADLK 35 /* Resource deadlock would occur */
#define ENAMETOOLONG 36 /* File name too long */ #define ENAMETOOLONG 36 /* File name too long */
@ -141,29 +108,4 @@
#define ERFKILL 132 /* Operation not possible due to RF-kill */ #define ERFKILL 132 /* Operation not possible due to RF-kill */
#ifdef __KERNEL__
/* Should never be seen by user programs */
#define ERESTARTSYS 512
#define ERESTARTNOINTR 513
#define ERESTARTNOHAND 514 /* restart if no handler.. */
#define ENOIOCTLCMD 515 /* No ioctl command */
#define ERESTART_RESTARTBLOCK 516 /* restart by calling sys_restart_syscall */
/* Defined for the NFSv3 protocol */
#define EBADHANDLE 521 /* Illegal NFS file handle */
#define ENOTSYNC 522 /* Update synchronization mismatch */
#define EBADCOOKIE 523 /* Cookie is stale */
#define ENOTSUPP 524 /* Operation is not supported */
#define ETOOSMALL 525 /* Buffer or request is too small */
#define ESERVERFAULT 526 /* An untranslatable error occurred */
#define EBADTYPE 527 /* Type not supported by server */
#define EJUKEBOX 528 /* Request initiated, but will not complete before timeout */
#define EIOCBQUEUED 529 /* iocb queued, will get completion event */
#define EIOCBRETRY 530 /* iocb queued, will trigger a retry */
#endif
#endif #endif

View File

@ -202,4 +202,17 @@ static inline unsigned long atomic_cmpxchg8(unsigned long *addr,
return oldval; return oldval;
} }
/*
 * atomic_cmpxchg4 - 32-bit atomic compare-and-exchange.
 *
 * addr:   word to operate on
 * oldval: value *addr is expected to hold
 * newval: value stored into *addr if it equals oldval
 *
 * Returns the value "lock; cmpxchgl" leaves in eax: the previous contents
 * of *addr, which equal oldval exactly when the store took place.  The
 * "memory" clobber also makes the statement a compiler barrier.
 */
static inline unsigned long atomic_cmpxchg4(unsigned int *addr,
unsigned int oldval,
unsigned int newval)
{
/* %0 = oldval (eax, in/out), %1 = *addr, %2 = newval */
asm volatile("lock; cmpxchgl %2, %1\n"
: "=a" (oldval), "+m" (*addr)
: "r" (newval), "0" (oldval)
: "memory"
);
return oldval;
}
#endif #endif

View File

@ -31,9 +31,5 @@ typedef int64_t off_t;
#define NULL ((void *)0) #define NULL ((void *)0)
#define BITS_PER_LONG_SHIFT 6
#define BITS_PER_LONG (1 << BITS_PER_LONG_SHIFT)
#endif #endif

View File

@ -0,0 +1,17 @@
/**
* \file prctl.h
* License details are found in the file LICENSE.
*/
/*
* HISTORY
*/
#ifndef __ARCH_PRCTL_H
#define __ARCH_PRCTL_H
#define ARCH_SET_GS 0x1001
#define ARCH_SET_FS 0x1002
#define ARCH_GET_FS 0x1003
#define ARCH_GET_GS 0x1004
#endif

View File

@ -90,10 +90,6 @@ enum __rlimit_resource
#define RLIM_NLIMITS __RLIM_NLIMITS #define RLIM_NLIMITS __RLIM_NLIMITS
}; };
#include <generic-rlimit.h>
struct rlimit {
uint64_t rlim_cur; /* Soft limit */
uint64_t rlim_max; /* Hard limit (ceiling for rlim_cur) */
};
#endif #endif

View File

@ -20,7 +20,7 @@
* syscall_name[] only, no handler exists. * syscall_name[] only, no handler exists.
*/ */
SYSCALL_DELEGATED(0, read) SYSCALL_HANDLED(0, read)
SYSCALL_DELEGATED(1, write) SYSCALL_DELEGATED(1, write)
SYSCALL_DELEGATED(2, open) SYSCALL_DELEGATED(2, open)
SYSCALL_HANDLED(3, close) SYSCALL_HANDLED(3, close)
@ -35,7 +35,7 @@ SYSCALL_HANDLED(12, brk)
SYSCALL_HANDLED(13, rt_sigaction) SYSCALL_HANDLED(13, rt_sigaction)
SYSCALL_HANDLED(14, rt_sigprocmask) SYSCALL_HANDLED(14, rt_sigprocmask)
SYSCALL_HANDLED(15, rt_sigreturn) SYSCALL_HANDLED(15, rt_sigreturn)
SYSCALL_DELEGATED(16, ioctl) SYSCALL_HANDLED(16, ioctl)
SYSCALL_DELEGATED(17, pread64) SYSCALL_DELEGATED(17, pread64)
SYSCALL_DELEGATED(18, pwrite64) SYSCALL_DELEGATED(18, pwrite64)
SYSCALL_DELEGATED(20, writev) SYSCALL_DELEGATED(20, writev)
@ -51,6 +51,8 @@ SYSCALL_HANDLED(30, shmat)
SYSCALL_HANDLED(31, shmctl) SYSCALL_HANDLED(31, shmctl)
SYSCALL_HANDLED(34, pause) SYSCALL_HANDLED(34, pause)
SYSCALL_HANDLED(35, nanosleep) SYSCALL_HANDLED(35, nanosleep)
SYSCALL_HANDLED(36, getitimer)
SYSCALL_HANDLED(38, setitimer)
SYSCALL_HANDLED(39, getpid) SYSCALL_HANDLED(39, getpid)
SYSCALL_HANDLED(56, clone) SYSCALL_HANDLED(56, clone)
SYSCALL_DELEGATED(57, fork) SYSCALL_DELEGATED(57, fork)
@ -69,6 +71,8 @@ SYSCALL_DELEGATED(79, getcwd)
SYSCALL_DELEGATED(89, readlink) SYSCALL_DELEGATED(89, readlink)
SYSCALL_HANDLED(96, gettimeofday) SYSCALL_HANDLED(96, gettimeofday)
SYSCALL_HANDLED(97, getrlimit) SYSCALL_HANDLED(97, getrlimit)
SYSCALL_HANDLED(98, getrusage)
SYSCALL_HANDLED(100, times)
SYSCALL_HANDLED(101, ptrace) SYSCALL_HANDLED(101, ptrace)
SYSCALL_HANDLED(102, getuid) SYSCALL_HANDLED(102, getuid)
SYSCALL_HANDLED(104, getgid) SYSCALL_HANDLED(104, getgid)
@ -107,6 +111,7 @@ SYSCALL_HANDLED(158, arch_prctl)
SYSCALL_HANDLED(160, setrlimit) SYSCALL_HANDLED(160, setrlimit)
SYSCALL_HANDLED(164, settimeofday) SYSCALL_HANDLED(164, settimeofday)
SYSCALL_HANDLED(186, gettid) SYSCALL_HANDLED(186, gettid)
SYSCALL_HANDLED(200, tkill)
SYSCALL_DELEGATED(201, time) SYSCALL_DELEGATED(201, time)
SYSCALL_HANDLED(202, futex) SYSCALL_HANDLED(202, futex)
SYSCALL_HANDLED(203, sched_setaffinity) SYSCALL_HANDLED(203, sched_setaffinity)
@ -116,6 +121,7 @@ SYSCALL_HANDLED(216, remap_file_pages)
SYSCALL_DELEGATED(217, getdents64) SYSCALL_DELEGATED(217, getdents64)
SYSCALL_HANDLED(218, set_tid_address) SYSCALL_HANDLED(218, set_tid_address)
SYSCALL_DELEGATED(220, semtimedop) SYSCALL_DELEGATED(220, semtimedop)
SYSCALL_HANDLED(228, clock_gettime)
SYSCALL_DELEGATED(230, clock_nanosleep) SYSCALL_DELEGATED(230, clock_nanosleep)
SYSCALL_HANDLED(231, exit_group) SYSCALL_HANDLED(231, exit_group)
SYSCALL_DELEGATED(232, epoll_wait) SYSCALL_DELEGATED(232, epoll_wait)
@ -132,6 +138,7 @@ SYSCALL_HANDLED(279, move_pages)
SYSCALL_DELEGATED(281, epoll_pwait) SYSCALL_DELEGATED(281, epoll_pwait)
SYSCALL_HANDLED(282, signalfd) SYSCALL_HANDLED(282, signalfd)
SYSCALL_HANDLED(289, signalfd4) SYSCALL_HANDLED(289, signalfd4)
SYSCALL_HANDLED(298, perf_event_open)
#ifdef DCFA_KMOD #ifdef DCFA_KMOD
SYSCALL_HANDLED(303, mod_call) SYSCALL_HANDLED(303, mod_call)
#endif #endif

View File

@ -13,7 +13,7 @@
* 2013/?? - bgerofi + shimosawa: handle rsp correctly for nested interrupts * 2013/?? - bgerofi + shimosawa: handle rsp correctly for nested interrupts
*/ */
#define X86_CPU_LOCAL_OFFSET_TSS 128 #define X86_CPU_LOCAL_OFFSET_TSS 176
#define X86_TSS_OFFSET_SP0 4 #define X86_TSS_OFFSET_SP0 4
#define X86_CPU_LOCAL_OFFSET_SP0 \ #define X86_CPU_LOCAL_OFFSET_SP0 \
(X86_CPU_LOCAL_OFFSET_TSS + X86_TSS_OFFSET_SP0) (X86_CPU_LOCAL_OFFSET_TSS + X86_TSS_OFFSET_SP0)
@ -209,7 +209,9 @@ enter_user_mode:
callq release_runq_lock callq release_runq_lock
movq $0, %rdi movq $0, %rdi
movq %rsp, %rsi movq %rsp, %rsi
call check_signal call check_signal
movq $0, %rdi
call set_cputime
POP_ALL_REGS POP_ALL_REGS
addq $8, %rsp addq $8, %rsp
iretq iretq

View File

@ -38,6 +38,11 @@ void init_processors_local(int max_id)
kprintf("locals = %p\n", locals); kprintf("locals = %p\n", locals);
} }
/*@
@ requires \valid(id);
@ ensures \result == locals + (LOCALS_SPAN * id);
@ assigns \nothing;
@*/
struct x86_cpu_local_variables *get_x86_cpu_local_variable(int id) struct x86_cpu_local_variables *get_x86_cpu_local_variable(int id)
{ {
return (struct x86_cpu_local_variables *) return (struct x86_cpu_local_variables *)
@ -98,6 +103,10 @@ void init_boot_processor_local(void)
} }
/** IHK **/ /** IHK **/
/*@
@ ensures \result == %gs;
@ assigns \nothing;
*/
int ihk_mc_get_processor_id(void) int ihk_mc_get_processor_id(void)
{ {
int id; int id;
@ -107,6 +116,10 @@ int ihk_mc_get_processor_id(void)
return id; return id;
} }
/*@
@ ensures \result == (locals + (LOCALS_SPAN * %gs))->apic_id;
@ assigns \nothing;
*/
int ihk_mc_get_hardware_processor_id(void) int ihk_mc_get_hardware_processor_id(void)
{ {
struct x86_cpu_local_variables *v = get_x86_this_cpu_local(); struct x86_cpu_local_variables *v = get_x86_this_cpu_local();

File diff suppressed because it is too large Load Diff

View File

@ -14,17 +14,28 @@
#include <registers.h> #include <registers.h>
extern unsigned int *x86_march_perfmap; extern unsigned int *x86_march_perfmap;
extern int running_on_kvm(void);
#define X86_CR4_PCE 0x00000100 #define X86_CR4_PCE 0x00000100
void x86_init_perfctr(void) void x86_init_perfctr(void)
{ {
unsigned long reg; unsigned long reg;
unsigned long value = 0;
/* Do not do it on KVM */
if (running_on_kvm()) return;
/* Allow PMC to be read from user space */ /* Allow PMC to be read from user space */
asm volatile("movq %%cr4, %0" : "=r"(reg)); asm volatile("movq %%cr4, %0" : "=r"(reg));
reg |= X86_CR4_PCE; reg |= X86_CR4_PCE;
asm volatile("movq %0, %%cr4" : : "r"(reg)); asm volatile("movq %0, %%cr4" : : "r"(reg));
/* Enable PMC Control */
value = rdmsr(MSR_PERF_GLOBAL_CTRL);
value |= X86_IA32_PERF_COUNTERS_MASK;
value |= X86_IA32_FIXED_PERF_COUNTERS_MASK;
wrmsr(MSR_PERF_GLOBAL_CTRL, value);
} }
static int set_perfctr_x86_direct(int counter, int mode, unsigned int value) static int set_perfctr_x86_direct(int counter, int mode, unsigned int value)
@ -33,20 +44,51 @@ static int set_perfctr_x86_direct(int counter, int mode, unsigned int value)
return -EINVAL; return -EINVAL;
} }
if (mode & PERFCTR_USER_MODE) { // clear mode flags
value &= ~(3 << 16);
// set mode flags
if(mode & PERFCTR_USER_MODE) {
value |= 1 << 16; value |= 1 << 16;
} }
if (mode & PERFCTR_KERNEL_MODE) { if(mode & PERFCTR_KERNEL_MODE) {
value |= 1 << 17; value |= 1 << 17;
} }
// wrmsr(MSR_PERF_GLOBAL_CTRL, 0); // wrmsr(MSR_PERF_GLOBAL_CTRL, 0);
value |= (1 << 22) | (1 << 18); /* EN */ value |= (1 << 22) | (1 << 18); /* EN */
value |= (1 << 20); /* Enable overflow interrupt */
wrmsr(MSR_IA32_PERFEVTSEL0 + counter, value); wrmsr(MSR_IA32_PERFEVTSEL0 + counter, value);
kprintf("wrmsr: %d <= %x\n", MSR_PERF_GLOBAL_CTRL, 0); //kprintf("wrmsr: %d <= %x\n", MSR_PERF_GLOBAL_CTRL, 0);
kprintf("wrmsr: %d <= %x\n", MSR_IA32_PERFEVTSEL0 + counter, value); kprintf("wrmsr: %d <= %x\n", MSR_IA32_PERFEVTSEL0 + counter, value);
return 0;
}
static int set_pmc_x86_direct(int counter, unsigned long val)
{
unsigned long cnt_bit = 0;
if (counter < 0) {
return -EINVAL;
}
cnt_bit = 1UL << counter;
if ( cnt_bit & X86_IA32_PERF_COUNTERS_MASK ) {
// set generic pmc
wrmsr(MSR_IA32_PMC0 + counter, val);
}
else if ( cnt_bit & X86_IA32_FIXED_PERF_COUNTERS_MASK ) {
// set fixed pmc
wrmsr(MSR_IA32_FIXED_CTR0 + counter - X86_IA32_BASE_FIXED_PERF_COUNTERS, val);
}
else {
return -EINVAL;
}
return 0; return 0;
} }
@ -57,6 +99,45 @@ static int set_perfctr_x86(int counter, int event, int mask, int inv, int count,
CVAL2(event, mask, inv, count)); CVAL2(event, mask, inv, count));
} }
/*
 * Program the user/kernel enable bits of one fixed-function counter in
 * MSR_PERF_FIXED_CTRL, leaving the fields of the other counters untouched.
 *
 * counter: counter number; fixed counters are numbered starting at
 *          X86_IA32_BASE_FIXED_PERF_COUNTERS.
 * mode:    bitwise OR of PERFCTR_USER_MODE and/or PERFCTR_KERNEL_MODE.
 *
 * Returns 0 on success, -EINVAL when counter is not a fixed counter.
 */
static int set_fixed_counter(int counter, int mode)
{
	const int counter_idx = counter - X86_IA32_BASE_FIXED_PERF_COUNTERS;
	unsigned long value;
	/*
	 * The mask and the new bits are kept 64 bits wide.  With a 32-bit
	 * mask, "value &= ~ctr_mask" would zero-extend the complement and
	 * clear bits 63:32 of the MSR image as a side effect.
	 */
	unsigned long ctr_mask = 0x7;
	unsigned long set_val = 0;

	if (counter_idx < 0 || counter_idx >= X86_IA32_NUM_FIXED_PERF_COUNTERS) {
		return -EINVAL;
	}

	// clear specified fixed counter info (each counter owns a 4-bit field)
	value = rdmsr(MSR_PERF_FIXED_CTRL);
	ctr_mask <<= counter_idx * 4;
	value &= ~ctr_mask;

	if (mode & PERFCTR_USER_MODE) {
		set_val |= 1UL << 1;
	}
	if (mode & PERFCTR_KERNEL_MODE) {
		set_val |= 1UL;
	}

	// move the new enable bits into this counter's field and write back
	set_val <<= counter_idx * 4;
	value |= set_val;
	wrmsr(MSR_PERF_FIXED_CTRL, value);

	return 0;
}
/*
 * Configure a general-purpose counter from a raw event-select code.
 *
 * Returns -EINVAL when counter is outside
 * [0, X86_IA32_NUM_PERF_COUNTERS); otherwise returns whatever
 * set_perfctr_x86_direct() returns for (counter, mode, code).
 */
int ihk_mc_perfctr_init_raw(int counter, unsigned int code, int mode)
{
	const int in_range = (counter >= 0) &&
		(counter < X86_IA32_NUM_PERF_COUNTERS);

	return in_range ? set_perfctr_x86_direct(counter, mode, code) : -EINVAL;
}
int ihk_mc_perfctr_init(int counter, enum ihk_perfctr_type type, int mode) int ihk_mc_perfctr_init(int counter, enum ihk_perfctr_type type, int mode)
{ {
if (counter < 0 || counter >= X86_IA32_NUM_PERF_COUNTERS) { if (counter < 0 || counter >= X86_IA32_NUM_PERF_COUNTERS) {
@ -78,14 +159,15 @@ extern void x86_march_perfctr_start(unsigned long counter_mask);
int ihk_mc_perfctr_start(unsigned long counter_mask) int ihk_mc_perfctr_start(unsigned long counter_mask)
{ {
unsigned int value = 0; unsigned long value = 0;
unsigned long mask = X86_IA32_PERF_COUNTERS_MASK | X86_IA32_FIXED_PERF_COUNTERS_MASK;
#ifdef HAVE_MARCH_PERFCTR_START #ifdef HAVE_MARCH_PERFCTR_START
x86_march_perfctr_start(counter_mask); x86_march_perfctr_start(counter_mask);
#endif #endif
counter_mask &= ((1 << X86_IA32_NUM_PERF_COUNTERS) - 1); counter_mask &= mask;
value = rdmsr(MSR_PERF_GLOBAL_CTRL); value = rdmsr(MSR_PERF_GLOBAL_CTRL);
value |= counter_mask; value |= counter_mask;
wrmsr(MSR_PERF_GLOBAL_CTRL, value); wrmsr(MSR_PERF_GLOBAL_CTRL, value);
return 0; return 0;
@ -93,9 +175,10 @@ int ihk_mc_perfctr_start(unsigned long counter_mask)
int ihk_mc_perfctr_stop(unsigned long counter_mask) int ihk_mc_perfctr_stop(unsigned long counter_mask)
{ {
unsigned int value; unsigned long value;
unsigned long mask = X86_IA32_PERF_COUNTERS_MASK | X86_IA32_FIXED_PERF_COUNTERS_MASK;
counter_mask &= ((1 << X86_IA32_NUM_PERF_COUNTERS) - 1); counter_mask &= mask;
value = rdmsr(MSR_PERF_GLOBAL_CTRL); value = rdmsr(MSR_PERF_GLOBAL_CTRL);
value &= ~counter_mask; value &= ~counter_mask;
wrmsr(MSR_PERF_GLOBAL_CTRL, value); wrmsr(MSR_PERF_GLOBAL_CTRL, value);
@ -103,17 +186,48 @@ int ihk_mc_perfctr_stop(unsigned long counter_mask)
return 0; return 0;
} }
int ihk_mc_perfctr_reset(int counter) // init for fixed counter
int ihk_mc_perfctr_fixed_init(int counter, int mode)
{ {
if (counter < 0 || counter >= X86_IA32_NUM_PERF_COUNTERS) { unsigned long value = 0;
unsigned int ctr_mask = 0x7;
int counter_idx = counter - X86_IA32_BASE_FIXED_PERF_COUNTERS ;
unsigned int set_val = 0;
if (counter_idx < 0 || counter_idx >= X86_IA32_NUM_FIXED_PERF_COUNTERS) {
return -EINVAL; return -EINVAL;
} }
wrmsr(MSR_IA32_PMC0 + counter, 0); // clear specified fixed counter info
value = rdmsr(MSR_PERF_FIXED_CTRL);
ctr_mask <<= counter_idx * 4;
value &= ~ctr_mask;
if (mode & PERFCTR_USER_MODE) {
set_val |= 1 << 1;
}
if (mode & PERFCTR_KERNEL_MODE) {
set_val |= 1;
}
set_val <<= counter_idx * 4;
value |= set_val;
wrmsr(MSR_PERF_FIXED_CTRL, value);
return 0; return 0;
} }
int ihk_mc_perfctr_reset(int counter)
{
return set_pmc_x86_direct(counter, 0);
}
int ihk_mc_perfctr_set(int counter, unsigned long val)
{
return set_pmc_x86_direct(counter, val);
}
int ihk_mc_perfctr_read_mask(unsigned long counter_mask, unsigned long *value) int ihk_mc_perfctr_read_mask(unsigned long counter_mask, unsigned long *value)
{ {
int i, j; int i, j;
@ -129,10 +243,77 @@ int ihk_mc_perfctr_read_mask(unsigned long counter_mask, unsigned long *value)
unsigned long ihk_mc_perfctr_read(int counter) unsigned long ihk_mc_perfctr_read(int counter)
{ {
if (counter < 0 || counter >= X86_IA32_NUM_PERF_COUNTERS) { unsigned long retval = 0;
unsigned long cnt_bit = 0;
if (counter < 0) {
return -EINVAL; return -EINVAL;
} }
return rdpmc(counter); cnt_bit = 1UL << counter;
if ( cnt_bit & X86_IA32_PERF_COUNTERS_MASK ) {
// read generic pmc
retval = rdpmc(counter);
}
else if ( cnt_bit & X86_IA32_FIXED_PERF_COUNTERS_MASK ) {
// read fixed pmc
retval = rdpmc((1 << 30) + (counter - X86_IA32_BASE_FIXED_PERF_COUNTERS));
}
else {
retval = -EINVAL;
}
return retval;
} }
// read by rdmsr
unsigned long ihk_mc_perfctr_read_msr(int counter)
{
unsigned int idx = 0;
unsigned long retval = 0;
unsigned long cnt_bit = 0;
if (counter < 0) {
return -EINVAL;
}
cnt_bit = 1UL << counter;
if ( cnt_bit & X86_IA32_PERF_COUNTERS_MASK ) {
// read generic pmc
idx = MSR_IA32_PMC0 + counter;
retval = (unsigned long) rdmsr(idx);
}
else if ( cnt_bit & X86_IA32_FIXED_PERF_COUNTERS_MASK ) {
// read fixed pmc
idx = MSR_IA32_FIXED_CTR0 + counter;
retval = (unsigned long) rdmsr(idx);
}
else {
retval = -EINVAL;
}
return retval;
}
/*
 * Pick a free general-purpose counter.
 *
 * pmc_status: bitmap of counters already in use (bit i set = counter i
 *             taken).  It is passed by value, so the caller is
 *             responsible for recording the allocation.
 *
 * Returns the lowest index in [0, X86_IA32_NUM_PERF_COUNTERS) whose bit
 * is clear, or -1 when every general-purpose counter is in use.
 */
int ihk_mc_perfctr_alloc_counter(unsigned long pmc_status)
{
	int i;

	// find avail generic counter
	for (i = 0; i < X86_IA32_NUM_PERF_COUNTERS; i++) {
		/* 1UL keeps the shift in the same width as pmc_status */
		if (!(pmc_status & (1UL << i))) {
			return i;
		}
	}

	return -1;
}

View File

@ -24,21 +24,29 @@
#include <errno.h> #include <errno.h>
#include <kmalloc.h> #include <kmalloc.h>
#include <uio.h> #include <uio.h>
#include <mman.h>
#include <shm.h>
#include <prctl.h>
#include <ihk/ikc.h>
#include <page.h>
void terminate(int, int); void terminate(int, int);
int copy_from_user(void *dst, const void *src, size_t siz); extern long do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact);
int copy_to_user(void *dst, const void *src, size_t siz);
int write_process_vm(struct process_vm *vm, void *dst, const void *src, size_t siz);
long do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact);
long syscall(int num, ihk_mc_user_context_t *ctx); long syscall(int num, ihk_mc_user_context_t *ctx);
extern void save_fp_regs(struct thread *proc); extern void save_fp_regs(struct thread *proc);
void set_signal(int sig, void *regs0, siginfo_t *info);
void check_signal(unsigned long rc, void *regs0, int num);
extern unsigned long do_fork(int, unsigned long, unsigned long, unsigned long,
unsigned long, unsigned long, unsigned long);
//#define DEBUG_PRINT_SC //#define DEBUG_PRINT_SC
#ifdef DEBUG_PRINT_SC #ifdef DEBUG_PRINT_SC
#define dkprintf kprintf #define dkprintf kprintf
#define ekprintf(...) kprintf(__VA_ARGS__)
#else #else
#define dkprintf(...) do { if (0) kprintf(__VA_ARGS__); } while (0) #define dkprintf(...) do { if (0) kprintf(__VA_ARGS__); } while (0)
#define ekprintf(...) kprintf(__VA_ARGS__)
#endif #endif
uintptr_t debug_constants[] = { uintptr_t debug_constants[] = {
@ -55,6 +63,10 @@ uintptr_t debug_constants[] = {
-1, -1,
}; };
static struct vdso vdso;
static size_t container_size = 0;
static ptrdiff_t vdso_offset;
/* /*
See dkprintf("BSP HW ID = %d, ", bsp_hw_id); (in ./mcos/kernel/ap.c) See dkprintf("BSP HW ID = %d, ", bsp_hw_id); (in ./mcos/kernel/ap.c)
@ -122,6 +134,17 @@ int obtain_clone_cpuid() {
return cpuid; return cpuid;
} }
/*
 * Call clear_host_pte() over the current thread's whole user region,
 * i.e. starting at region.user_start with a length of
 * (region.user_end - region.user_start).
 *
 * Always returns 0; the result of clear_host_pte(), if any, is not checked.
 */
int
arch_clear_host_user_space()
{
/* thread currently running on this CPU */
struct thread *th = cpu_local_var(current);
/* XXX: might be unnecessary */
clear_host_pte(th->vm->region.user_start,
(th->vm->region.user_end - th->vm->region.user_start));
return 0;
}
SYSCALL_DECLARE(rt_sigaction) SYSCALL_DECLARE(rt_sigaction)
{ {
int sig = ihk_mc_syscall_arg0(ctx); int sig = ihk_mc_syscall_arg0(ctx);
@ -152,12 +175,41 @@ fault:
} }
struct sigsp { struct sigsp {
struct x86_user_context regs; unsigned long flags;
void *link;
stack_t sigstack;
unsigned long regs[23];
#define _r8 regs[0]
#define _r9 regs[1]
#define _r10 regs[2]
#define _r11 regs[3]
#define _r12 regs[4]
#define _r13 regs[5]
#define _r14 regs[6]
#define _r15 regs[7]
#define _rdi regs[8]
#define _rsi regs[9]
#define _rbp regs[10]
#define _rbx regs[11]
#define _rdx regs[12]
#define _rax regs[13]
#define _rcx regs[14]
#define _rsp regs[15]
#define _rip regs[16]
#define _rflags regs[17]
#define _csgsfs regs[18]
#define _error regs[19]
#define _trapno regs[20]
#define _oldmask regs[21]
#define _cr2 regs[22]
void *fpregs;
unsigned long reserve[8];
unsigned long sigrc; unsigned long sigrc;
unsigned long sigmask; unsigned long sigmask;
int ssflags;
int num; int num;
int restart; int restart;
unsigned long ss;
siginfo_t info; siginfo_t info;
}; };
@ -165,19 +217,54 @@ SYSCALL_DECLARE(rt_sigreturn)
{ {
struct thread *thread = cpu_local_var(current); struct thread *thread = cpu_local_var(current);
struct x86_user_context *regs; struct x86_user_context *regs;
struct sigsp ksigsp;
struct sigsp *sigsp; struct sigsp *sigsp;
asm("movq %%gs:132, %0" : "=r" (regs)); asm ("movq %%gs:(%1),%0"
: "=r"(regs)
: "r"(offsetof(struct x86_cpu_local_variables, tss.rsp0)));
--regs; --regs;
sigsp = (struct sigsp *)regs->gpr.rsp; sigsp = (struct sigsp *)regs->gpr.rsp;
if(copy_from_user(regs, &sigsp->regs, sizeof(struct x86_user_context))) if(copy_from_user(&ksigsp, sigsp, sizeof ksigsp))
return -EFAULT; return -EFAULT;
thread->sigmask.__val[0] = sigsp->sigmask;
thread->sigstack.ss_flags = sigsp->ssflags; regs->gpr.r15 = ksigsp._r15;
regs->gpr.r14 = ksigsp._r14;
regs->gpr.r13 = ksigsp._r13;
regs->gpr.r12 = ksigsp._r12;
regs->gpr.rbp = ksigsp._rbp;
regs->gpr.rbx = ksigsp._rbx;
regs->gpr.r11 = ksigsp._r11;
regs->gpr.r10 = ksigsp._r10;
regs->gpr.r9 = ksigsp._r9;
regs->gpr.r8 = ksigsp._r8;
regs->gpr.rax = ksigsp._rax;
regs->gpr.rcx = ksigsp._rcx;
regs->gpr.rdx = ksigsp._rdx;
regs->gpr.rsi = ksigsp._rsi;
regs->gpr.rdi = ksigsp._rdi;
regs->gpr.error = ksigsp._error;
regs->gpr.rip = ksigsp._rip;
regs->gpr.rflags = ksigsp._rflags;
regs->gpr.rsp = ksigsp._rsp;
thread->sigmask.__val[0] = ksigsp._oldmask;
memcpy(&thread->sigstack, &ksigsp.sigstack, sizeof(stack_t));
if(sigsp->restart){ if(sigsp->restart){
return syscall(sigsp->num, (ihk_mc_user_context_t *)regs); return syscall(sigsp->num, (ihk_mc_user_context_t *)regs);
} }
if(regs->gpr.rflags & RFLAGS_TF){
struct siginfo info;
regs->gpr.rax = sigsp->sigrc;
memset(&info, '\0', sizeof info);
regs->gpr.rflags &= ~RFLAGS_TF;
info.si_code = TRAP_TRACE;
set_signal(SIGTRAP, regs, &info);
check_signal(0, regs, 0);
check_need_resched();
}
return sigsp->sigrc; return sigsp->sigrc;
} }
@ -389,7 +476,6 @@ long ptrace_read_regset(struct thread *thread, long type, struct iovec *iov)
switch (type) { switch (type) {
case NT_X86_XSTATE: case NT_X86_XSTATE:
save_fp_regs(thread);
if (thread->fp_regs == NULL) { if (thread->fp_regs == NULL) {
return -ENOMEM; return -ENOMEM;
} }
@ -411,7 +497,6 @@ long ptrace_write_regset(struct thread *thread, long type, struct iovec *iov)
switch (type) { switch (type) {
case NT_X86_XSTATE: case NT_X86_XSTATE:
save_fp_regs(thread);
if (thread->fp_regs == NULL) { if (thread->fp_regs == NULL) {
return -ENOMEM; return -ENOMEM;
} }
@ -455,6 +540,7 @@ void ptrace_report_signal(struct thread *thread, int sig)
proc->signal_flags &= ~SIGNAL_STOP_STOPPED; proc->signal_flags &= ~SIGNAL_STOP_STOPPED;
} }
parent_pid = proc->parent->pid; parent_pid = proc->parent->pid;
save_debugreg(thread->ptrace_debugreg);
mcs_rwlock_writer_unlock(&proc->update_lock, &lock); mcs_rwlock_writer_unlock(&proc->update_lock, &lock);
memset(&info, '\0', sizeof info); memset(&info, '\0', sizeof info);
@ -471,6 +557,75 @@ void ptrace_report_signal(struct thread *thread, int sig)
schedule(); schedule();
dkprintf("ptrace_report_signal,wake up\n"); dkprintf("ptrace_report_signal,wake up\n");
} }
/*
 * PTRACE_ARCH_PRCTL backend: read or write the fs_base/gs_base slot of a
 * traced thread's user_regs_struct.
 *
 * pid:  looked up with find_thread(pid, pid, ...).
 * code: ARCH_GET_FS, ARCH_GET_GS, ARCH_SET_FS or ARCH_SET_GS.
 * addr: for GET, destination passed to copy_to_user(); for SET, the new
 *       base value.
 *
 * Returns -ESRCH when the thread is not found, -EIO when its process
 * status has neither PS_TRACED nor PS_STOPPED set, -EINVAL for an unknown
 * code, otherwise the result of the read/copy or write helper.
 */
static long
ptrace_arch_prctl(int pid, long code, long addr)
{
	struct mcs_rwlock_node_irqsave lock;
	struct thread *child;
	unsigned long regoff;
	unsigned long value;
	long rc = -EIO;

	child = find_thread(pid, pid, &lock);
	if (child == NULL) {
		return -ESRCH;
	}

	if (!(child->proc->status & (PS_TRACED | PS_STOPPED))) {
		goto unlock;
	}

	/* Select which register slot the request refers to. */
	switch (code) {
	case ARCH_GET_FS:
	case ARCH_SET_FS:
		regoff = offsetof(struct user_regs_struct, fs_base);
		break;
	case ARCH_GET_GS:
	case ARCH_SET_GS:
		regoff = offsetof(struct user_regs_struct, gs_base);
		break;
	default:
		rc = -EINVAL;
		goto unlock;
	}

	if (code == ARCH_SET_FS || code == ARCH_SET_GS) {
		rc = ptrace_write_user(child, regoff, (unsigned long)addr);
	}
	else {
		rc = ptrace_read_user(child, regoff, &value);
		if (rc == 0) {
			rc = copy_to_user((unsigned long *)addr, (char *)&value,
					sizeof(value));
		}
	}

unlock:
	thread_unlock(child, &lock);
	return rc;
}
/*
 * Architecture-specific ptrace requests.
 *
 * Only PTRACE_ARCH_PRCTL is handled; note that ptrace_arch_prctl() takes
 * (pid, code, addr), so data is passed as the code and addr as the address.
 * Every other request yields -EOPNOTSUPP.
 */
long
arch_ptrace(long request, int pid, long addr, long data)
{
	if (request == PTRACE_ARCH_PRCTL) {
		return ptrace_arch_prctl(pid, data, addr);
	}

	return -EOPNOTSUPP;
}
static int static int
isrestart(int num, unsigned long rc, int sig, int restart) isrestart(int num, unsigned long rc, int sig, int restart)
{ {
@ -531,7 +686,9 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
} }
if(regs == NULL){ /* call from syscall */ if(regs == NULL){ /* call from syscall */
asm("movq %%gs:132, %0" : "=r" (regs)); asm ("movq %%gs:(%1),%0"
: "=r"(regs)
: "r"(offsetof(struct x86_cpu_local_variables, tss.rsp0)));
--regs; --regs;
} }
else{ else{
@ -548,9 +705,8 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
} }
else if(k->sa.sa_handler){ else if(k->sa.sa_handler){
unsigned long *usp; /* user stack */ unsigned long *usp; /* user stack */
struct sigsp ksigsp;
struct sigsp *sigsp; struct sigsp *sigsp;
int ssflags = thread->sigstack.ss_flags;
unsigned long mask = (unsigned long)thread->sigmask.__val[0];
if((k->sa.sa_flags & SA_ONSTACK) && if((k->sa.sa_flags & SA_ONSTACK) &&
!(thread->sigstack.ss_flags & SS_DISABLE) && !(thread->sigstack.ss_flags & SS_DISABLE) &&
@ -565,37 +721,73 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
} }
sigsp = ((struct sigsp *)usp) - 1; sigsp = ((struct sigsp *)usp) - 1;
sigsp = (struct sigsp *)((unsigned long)sigsp & 0xfffffffffffffff0UL); sigsp = (struct sigsp *)((unsigned long)sigsp & 0xfffffffffffffff0UL);
if(write_process_vm(thread->vm, &sigsp->regs, regs, sizeof(struct x86_user_context)) || memset(&ksigsp, '\0', sizeof ksigsp);
write_process_vm(thread->vm, &sigsp->sigrc, &rc, sizeof(long))){
ksigsp._r15 = regs->gpr.r15;
ksigsp._r14 = regs->gpr.r14;
ksigsp._r13 = regs->gpr.r13;
ksigsp._r12 = regs->gpr.r12;
ksigsp._rbp = regs->gpr.rbp;
ksigsp._rbx = regs->gpr.rbx;
ksigsp._r11 = regs->gpr.r11;
ksigsp._r10 = regs->gpr.r10;
ksigsp._r9 = regs->gpr.r9;
ksigsp._r8 = regs->gpr.r8;
ksigsp._rax = regs->gpr.rax;
ksigsp._rcx = regs->gpr.rcx;
ksigsp._rdx = regs->gpr.rdx;
ksigsp._rsi = regs->gpr.rsi;
ksigsp._rdi = regs->gpr.rdi;
ksigsp._error = regs->gpr.error;
ksigsp._rip = regs->gpr.rip;
ksigsp._rflags = regs->gpr.rflags;
ksigsp._rsp = regs->gpr.rsp;
ksigsp._cr2 = (unsigned long)pending->info._sifields._sigfault.si_addr;
ksigsp._oldmask = thread->sigmask.__val[0];
memcpy(&ksigsp.sigstack, &thread->sigstack, sizeof(stack_t));
ksigsp.sigrc = rc;
ksigsp.num = num;
ksigsp.restart = isrestart(num, rc, sig, k->sa.sa_flags & SA_RESTART);
if(num != 0 && rc == -EINTR && sig == SIGCHLD)
ksigsp.restart = 1;
memcpy(&ksigsp.info, &pending->info, sizeof(siginfo_t));
if(copy_to_user(sigsp, &ksigsp, sizeof ksigsp)){
kfree(pending); kfree(pending);
ihk_mc_spinlock_unlock(&thread->sigcommon->lock, irqstate); ihk_mc_spinlock_unlock(&thread->sigcommon->lock, irqstate);
kprintf("do_signal,write_process_vm failed\n"); kprintf("do_signal,write_process_vm failed\n");
terminate(0, sig); terminate(0, sig);
return; return;
} }
sigsp->sigmask = mask;
sigsp->ssflags = ssflags;
sigsp->num = num;
sigsp->restart = isrestart(num, rc, sig, k->sa.sa_flags & SA_RESTART);
if(num != 0 && rc == -EINTR && sig == SIGCHLD)
sigsp->restart = 1;
memcpy(&sigsp->info, &pending->info, sizeof(siginfo_t));
usp = (unsigned long *)sigsp; usp = (unsigned long *)sigsp;
usp--; usp--;
*usp = (unsigned long)k->sa.sa_restorer; *usp = (unsigned long)k->sa.sa_restorer;
regs->gpr.rdi = (unsigned long)sig; regs->gpr.rdi = (unsigned long)sig;
if(k->sa.sa_flags & SA_SIGINFO){ regs->gpr.rsi = (unsigned long)&sigsp->info;
regs->gpr.rsi = (unsigned long)&sigsp->info; regs->gpr.rdx = (unsigned long)sigsp;
regs->gpr.rdx = 0;
}
regs->gpr.rip = (unsigned long)k->sa.sa_handler; regs->gpr.rip = (unsigned long)k->sa.sa_handler;
regs->gpr.rsp = (unsigned long)usp; regs->gpr.rsp = (unsigned long)usp;
thread->sigmask.__val[0] |= pending->sigmask.__val[0]; if(!(k->sa.sa_flags & SA_NODEFER))
thread->sigmask.__val[0] |= pending->sigmask.__val[0];
kfree(pending); kfree(pending);
ihk_mc_spinlock_unlock(&thread->sigcommon->lock, irqstate); ihk_mc_spinlock_unlock(&thread->sigcommon->lock, irqstate);
if(regs->gpr.rflags & RFLAGS_TF){
struct siginfo info;
memset(&info, '\0', sizeof info);
regs->gpr.rflags &= ~RFLAGS_TF;
info.si_code = TRAP_TRACE;
set_signal(SIGTRAP, regs, &info);
check_signal(0, regs, 0);
check_need_resched();
}
} }
else { else {
int coredumped = 0; int coredumped = 0;
@ -617,16 +809,16 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
case SIGTSTP: case SIGTSTP:
case SIGTTIN: case SIGTTIN:
case SIGTTOU: case SIGTTOU:
memset(&info, '\0', sizeof info);
info.si_signo = SIGCHLD;
info.si_code = CLD_STOPPED;
info._sifields._sigchld.si_pid = thread->proc->pid;
info._sifields._sigchld.si_status = (sig << 8) | 0x7f;
do_kill(cpu_local_var(current), thread->proc->parent->pid, -1, SIGCHLD, &info, 0);
if(ptraceflag){ if(ptraceflag){
ptrace_report_signal(thread, orgsig); ptrace_report_signal(thread, orgsig);
} }
else{ else{
memset(&info, '\0', sizeof info);
info.si_signo = SIGCHLD;
info.si_code = CLD_STOPPED;
info._sifields._sigchld.si_pid = thread->proc->pid;
info._sifields._sigchld.si_status = (sig << 8) | 0x7f;
do_kill(cpu_local_var(current), thread->proc->parent->pid, -1, SIGCHLD, &info, 0);
dkprintf("do_signal,SIGSTOP,changing state\n"); dkprintf("do_signal,SIGSTOP,changing state\n");
/* Update thread state in fork tree */ /* Update thread state in fork tree */
@ -679,6 +871,7 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
info._sifields._sigchld.si_status = 0x0000ffff; info._sifields._sigchld.si_status = 0x0000ffff;
do_kill(cpu_local_var(current), proc->parent->pid, -1, SIGCHLD, &info, 0); do_kill(cpu_local_var(current), proc->parent->pid, -1, SIGCHLD, &info, 0);
proc->signal_flags = SIGNAL_STOP_CONTINUED; proc->signal_flags = SIGNAL_STOP_CONTINUED;
proc->status = PS_RUNNING;
dkprintf("do_signal,SIGCONT,do nothing\n"); dkprintf("do_signal,SIGCONT,do nothing\n");
break; break;
case SIGQUIT: case SIGQUIT:
@ -757,6 +950,14 @@ hassigpending(struct thread *thread)
return getsigpending(thread, 0); return getsigpending(thread, 0);
} }
int
interrupt_from_user(void *regs0)
{
struct x86_user_context *regs = regs0;
return !(regs->gpr.rsp & 0x8000000000000000);
}
void void
check_signal(unsigned long rc, void *regs0, int num) check_signal(unsigned long rc, void *regs0, int num)
{ {
@ -786,7 +987,7 @@ check_signal(unsigned long rc, void *regs0, int num)
return; return;
} }
if(regs != NULL && (regs->gpr.rsp & 0x8000000000000000)) { if(regs != NULL && !interrupt_from_user(regs)) {
return; return;
} }
@ -1057,9 +1258,10 @@ done:
/* Wake up the target only when stopped by ptrace-reporting */ /* Wake up the target only when stopped by ptrace-reporting */
sched_wakeup_thread(tthread, PS_TRACED | PS_STOPPED); sched_wakeup_thread(tthread, PS_TRACED | PS_STOPPED);
} }
else if(sig == SIGCONT || ptracecont){ else if(sig == SIGCONT || ptracecont == 1){
/* Wake up the target only when stopped by SIGSTOP */ /* Wake up the target only when stopped by SIGSTOP */
sched_wakeup_thread(tthread, PS_STOPPED); sched_wakeup_thread(tthread, PS_STOPPED);
tthread->proc->status = PS_RUNNING;
} }
} }
} }
@ -1083,3 +1285,505 @@ set_signal(int sig, void *regs0, siginfo_t *info)
} }
do_kill(thread, thread->proc->pid, thread->tid, sig, info, 0); do_kill(thread, thread->proc->pid, thread->tid, sig, info, 0);
} }
/*
 * mmap system call entry: validates flags, alignment and address range,
 * then delegates to do_mmap().
 *
 * Arguments (syscall arg0..arg5): addr, len, prot, flags, fd, offset.
 *
 * Returns the value of do_mmap() on success; -EINVAL for bad alignment,
 * zero length, a missing or contradictory MAP_SHARED/MAP_PRIVATE, an
 * unsupported huge page size or unsupported flags; -ENOMEM when the
 * requested range does not fit inside the user region.
 */
SYSCALL_DECLARE(mmap)
{
	const int supported_flags = 0
		| MAP_SHARED		// 01
		| MAP_PRIVATE		// 02
		| MAP_FIXED		// 10
		| MAP_ANONYMOUS		// 20
		| MAP_LOCKED		// 2000
		| MAP_POPULATE		// 8000
		| MAP_HUGETLB		// 00040000
		| (0x3F << MAP_HUGE_SHIFT) // FC000000
		;
	const int ignored_flags = 0
#ifdef USE_NOCACHE_MMAP
		| MAP_32BIT		// 40
#endif /* USE_NOCACHE_MMAP */
		| MAP_DENYWRITE		// 0800
		| MAP_NORESERVE		// 4000
		| MAP_STACK		// 00020000
		;
	const int error_flags = 0
#ifndef USE_NOCACHE_MMAP
		| MAP_32BIT		// 40
#endif /* ndef USE_NOCACHE_MMAP */
		| MAP_GROWSDOWN		// 0100
		| MAP_EXECUTABLE	// 1000
		| MAP_NONBLOCK		// 00010000
		;

	const intptr_t addr0 = ihk_mc_syscall_arg0(ctx);
	const size_t len0 = ihk_mc_syscall_arg1(ctx);
	const int prot = ihk_mc_syscall_arg2(ctx);
	const int flags0 = ihk_mc_syscall_arg3(ctx);
	const int fd = ihk_mc_syscall_arg4(ctx);
	const off_t off0 = ihk_mc_syscall_arg5(ctx);
	struct thread *thread = cpu_local_var(current);
	struct vm_regions *region = &thread->vm->region;
	int error;
	/* initialized because the exit trace prints it on early error paths */
	intptr_t addr = 0;
	size_t len;
	int flags = flags0;
	size_t pgsize;

	dkprintf("sys_mmap(%lx,%lx,%x,%x,%d,%lx)\n",
			addr0, len0, prot, flags0, fd, off0);

	/* check constants for flags */
	if (1) {
		int dup_flags;

		/* the three flag classes must be pairwise disjoint */
		dup_flags = (supported_flags & ignored_flags);
		dup_flags |= (ignored_flags & error_flags);
		dup_flags |= (error_flags & supported_flags);

		if (dup_flags) {
			/* dup_flags is an int, so it is printed with %x */
			ekprintf("sys_mmap:duplicate flags: %x\n", dup_flags);
			ekprintf("s-flags: %08x\n", supported_flags);
			ekprintf("i-flags: %08x\n", ignored_flags);
			ekprintf("e-flags: %08x\n", error_flags);
			panic("sys_mmap:duplicate flags\n");
			/* no return */
		}
	}

	/* check arguments */
	pgsize = PAGE_SIZE;
	if (flags & MAP_HUGETLB) {
		switch (flags & (0x3F << MAP_HUGE_SHIFT)) {
		case 0:
			flags |= MAP_HUGE_2MB;	/* default hugepage size */
			break;

		case MAP_HUGE_2MB:
		case MAP_HUGE_1GB:
			break;

		default:
			ekprintf("sys_mmap(%lx,%lx,%x,%x,%x,%lx):"
					"not supported page size.\n",
					addr0, len0, prot, flags0, fd, off0);
			error = -EINVAL;
			goto out;
		}

		/* the size field holds log2 of the page size */
		pgsize = (size_t)1 << ((flags >> MAP_HUGE_SHIFT) & 0x3F);
	}

#define	VALID_DUMMY_ADDR ((region->user_start + PTL3_SIZE - 1) & ~(PTL3_SIZE - 1))
	/* without MAP_FIXED the hint is replaced by an aligned in-range address */
	addr = (flags & MAP_FIXED)? addr0: VALID_DUMMY_ADDR;
	len = (len0 + pgsize - 1) & ~(pgsize - 1);
	if ((addr & (pgsize - 1))
			|| (len == 0)
			|| !(flags & (MAP_SHARED | MAP_PRIVATE))
			|| ((flags & MAP_SHARED) && (flags & MAP_PRIVATE))
			|| (off0 & (pgsize - 1))) {
		ekprintf("sys_mmap(%lx,%lx,%x,%x,%x,%lx):EINVAL\n",
				addr0, len0, prot, flags0, fd, off0);
		error = -EINVAL;
		goto out;
	}

	if ((addr < region->user_start)
			|| (region->user_end <= addr)
			|| ((region->user_end - addr) < len)) {
		ekprintf("sys_mmap(%lx,%lx,%x,%x,%x,%lx):ENOMEM\n",
				addr0, len0, prot, flags0, fd, off0);
		error = -ENOMEM;
		goto out;
	}

	/* check not supported requests */
	if ((flags & error_flags)
			|| (flags & ~(supported_flags | ignored_flags))) {
		ekprintf("sys_mmap(%lx,%lx,%x,%x,%x,%lx):unknown flags %x\n",
				addr0, len0, prot, flags0, fd, off0,
				(flags & ~(supported_flags | ignored_flags)));
		error = -EINVAL;
		goto out;
	}

	addr = do_mmap(addr, len, prot, flags, fd, off0);

	error = 0;
out:
	/* error is an int, so it is printed with %d */
	dkprintf("sys_mmap(%lx,%lx,%x,%x,%d,%lx): %d %lx\n",
			addr0, len0, prot, flags0, fd, off0, error, addr);
	return (!error)? addr: error;
}
/*
 * clone system call entry: forwards syscall arguments 0..4 together with
 * the caller's saved program counter and stack pointer to do_fork().
 * The first argument is narrowed to int, as do_fork() expects.
 */
SYSCALL_DECLARE(clone)
{
	const int a0 = (int)ihk_mc_syscall_arg0(ctx);
	const unsigned long a1 = ihk_mc_syscall_arg1(ctx);
	const unsigned long a2 = ihk_mc_syscall_arg2(ctx);
	const unsigned long a3 = ihk_mc_syscall_arg3(ctx);
	const unsigned long a4 = ihk_mc_syscall_arg4(ctx);
	const unsigned long user_pc = ihk_mc_syscall_pc(ctx);
	const unsigned long user_sp = ihk_mc_syscall_sp(ctx);

	return do_fork(a0, a1, a2, a3, a4, user_pc, user_sp);
}
/*
 * shmget system call entry: normalizes the huge page size request in
 * shmflg and delegates to do_shmget().
 *
 * Arguments (syscall arg0..arg2): key, size, shmflg.
 *
 * Returns -EINVAL when SHM_HUGETLB is combined with a page size other
 * than 2MB or 1GB; otherwise returns whatever do_shmget() returns.
 */
SYSCALL_DECLARE(shmget)
{
	const key_t key = ihk_mc_syscall_arg0(ctx);
	const size_t size = ihk_mc_syscall_arg1(ctx);
	const int shmflg0 = ihk_mc_syscall_arg2(ctx);
	/* initialized because the exit trace prints it on the error path */
	int shmid = -1;
	int error;
	int shmflg = shmflg0;

	dkprintf("shmget(%#lx,%#lx,%#x)\n", key, size, shmflg0);

	if (shmflg & SHM_HUGETLB) {
		switch (shmflg & (0x3F << SHM_HUGE_SHIFT)) {
		case 0:
			shmflg |= SHM_HUGE_2MB;	/* default hugepage size */
			break;

		case SHM_HUGE_2MB:
		case SHM_HUGE_1GB:
			break;

		default:
			error = -EINVAL;
			goto out;
		}
	}

	shmid = do_shmget(key, size, shmflg);

	error = 0;
out:
	dkprintf("shmget(%#lx,%#lx,%#x): %d %d\n", key, size, shmflg0, error, shmid);
	return error ? error : shmid;
} /* sys_shmget() */
/*
 * arch_prctl backend for the calling thread.
 *
 * code:    ARCH_SET_FS, ARCH_GET_FS, ARCH_GET_GS or ARCH_SET_GS.
 * address: new base for SET; for GET it is cast to unsigned long * and
 *          handed to ihk_mc_arch_get_special_register() as the destination.
 *          NOTE(review): that pointer comes straight from the caller;
 *          confirm the helper validates it.
 *
 * Returns the helper's result for the FS/GS operations, -ENOTSUPP for
 * ARCH_SET_GS and -EINVAL for any other code.
 */
long do_arch_prctl(unsigned long code, unsigned long address)
{
	int err = 0;

	switch (code) {
	case ARCH_SET_FS:
		dkprintf("[%d] arch_prctl: ARCH_SET_FS: 0x%lX\n",
				ihk_mc_get_processor_id(), address);
		/* remember the new base in the thread before loading it */
		cpu_local_var(current)->tlsblock_base = address;
		err = ihk_mc_arch_set_special_register(IHK_ASR_X86_FS, address);
		break;

	case ARCH_GET_FS:
		err = ihk_mc_arch_get_special_register(IHK_ASR_X86_FS,
				(unsigned long*)address);
		break;

	case ARCH_GET_GS:
		err = ihk_mc_arch_get_special_register(IHK_ASR_X86_GS,
				(unsigned long*)address);
		break;

	case ARCH_SET_GS:
		/* setting the GS base is rejected */
		return -ENOTSUPP;

	default:
		return -EINVAL;
	}

	return err;
}
/*
 * arch_prctl system call entry: syscall arg0 is the code and arg1 the
 * address; both are forwarded to do_arch_prctl() and its result returned.
 */
SYSCALL_DECLARE(arch_prctl)
{
	const unsigned long code = ihk_mc_syscall_arg0(ctx);
	const unsigned long address = ihk_mc_syscall_arg1(ctx);

	return do_arch_prctl(code, address);
}
/*
 * Request the vdso description by sending SCD_MSG_GET_VDSO_INFO, carrying
 * the physical address of the file-scope "vdso" structure, over this CPU's
 * syscall channel, then spin until vdso.busy is cleared.
 *
 * Returns 0 on success or the ihk_ikc_send() error; on error
 * vdso.vdso_npages is left at 0.
 */
static int vdso_get_vdso_info(void)
{
	struct ihk_ikc_channel_desc *ch = cpu_local_var(syscall_channel);
	struct ikc_scd_packet packet;
	int error;

	dkprintf("vdso_get_vdso_info()\n");

	/* mark the request as in flight before it is sent */
	vdso.busy = 1;
	vdso.vdso_npages = 0;

	packet.msg = SCD_MSG_GET_VDSO_INFO;
	packet.arg = virt_to_phys(&vdso);

	error = ihk_ikc_send(ch, &packet, 0);
	if (error) {
		ekprintf("vdso_get_vdso_info: ihk_ikc_send failed. %d\n", error);
		vdso.vdso_npages = 0;
	}
	else {
		/* wait for the busy flag to be cleared */
		while (vdso.busy) {
			cpu_pause();
		}
	}

	dkprintf("vdso_get_vdso_info(): %d\n", error);
	return error;
} /* vdso_get_vdso_info() */
/*
 * Map the vdso data pages that are flagged as global (vvar, hpet, pvti)
 * with ihk_mc_pt_set_page(), passing NULL as the page table argument.
 * A page is mapped only when its *_virt address is non-zero and its
 * *_is_global flag is set.
 *
 * Returns 0 on success, or the first error reported by
 * ihk_mc_pt_set_page() / arch_setup_pvclock().  Mappings made before a
 * failure are not undone here.
 */
static int vdso_map_global_pages(void)
{
int error;
enum ihk_mc_pt_attribute attr;
int i;
void *virt;
intptr_t phys;
dkprintf("vdso_map_global_pages()\n");
/* vvar: user-accessible, non-executable */
if (vdso.vvar_virt && vdso.vvar_is_global) {
attr = PTATTR_ACTIVE | PTATTR_USER | PTATTR_NO_EXECUTE;
error = ihk_mc_pt_set_page(NULL, vdso.vvar_virt, vdso.vvar_phys, attr);
if (error) {
ekprintf("vdso_map_global_pages: mapping vvar failed. %d\n", error);
goto out;
}
}
/* hpet: as above, plus uncachable */
if (vdso.hpet_virt && vdso.hpet_is_global) {
attr = PTATTR_ACTIVE | PTATTR_USER | PTATTR_NO_EXECUTE | PTATTR_UNCACHABLE;
error = ihk_mc_pt_set_page(NULL, vdso.hpet_virt, vdso.hpet_phys, attr);
if (error) {
ekprintf("vdso_map_global_pages: mapping hpet failed. %d\n", error);
goto out;
}
}
/* pvti: set up pvclock first, then map pvti_npages pages */
if (vdso.pvti_virt && vdso.pvti_is_global) {
error = arch_setup_pvclock();
if (error) {
ekprintf("vdso_map_global_pages: arch_setup_pvclock failed. %d\n", error);
goto out;
}
attr = PTATTR_ACTIVE | PTATTR_USER | PTATTR_NO_EXECUTE;
for (i = 0; i < pvti_npages; ++i) {
/* virtual addresses go downwards from pvti_virt, one page per step */
virt = vdso.pvti_virt - (i * PAGE_SIZE);
/* NOTE(review): the stride of pvti + (i * PAGE_SIZE) depends on pvti's
 * pointer type, which is declared elsewhere -- confirm it is byte-sized */
phys = virt_to_phys(pvti + (i * PAGE_SIZE));
error = ihk_mc_pt_set_page(NULL, virt, phys, attr);
if (error) {
ekprintf("vdso_map_global_pages: mapping pvti failed. %d\n", error);
goto out;
}
}
}
error = 0;
out:
dkprintf("vdso_map_global_pages(): %d\n", error);
return error;
} /* vdso_map_global_pages() */
/*
 * Compute the size of the address range needed to hold the vdso pages
 * plus every non-global data page (vvar, hpet, pvti), and the offset of
 * the vdso pages inside that range.
 *
 * The range starts as [0, vdso_npages * PAGE_SIZE) and is widened to
 * include one page at each non-global *_virt value.  Results are stored
 * in the file-scope variables container_size and vdso_offset.
 */
static void vdso_calc_container_size(void)
{
	intptr_t start = 0;
	intptr_t end;
	intptr_t lo, hi;

	dkprintf("vdso_calc_container_size()\n");

	end = vdso.vdso_npages * PAGE_SIZE;

	if (vdso.vvar_virt && !vdso.vvar_is_global) {
		lo = (intptr_t)vdso.vvar_virt;
		hi = lo + PAGE_SIZE;
		start = (lo < start) ? lo : start;
		end = (end < hi) ? hi : end;
	}

	if (vdso.hpet_virt && !vdso.hpet_is_global) {
		lo = (intptr_t)vdso.hpet_virt;
		hi = lo + PAGE_SIZE;
		start = (lo < start) ? lo : start;
		end = (end < hi) ? hi : end;
	}

	if (vdso.pvti_virt && !vdso.pvti_is_global) {
		lo = (intptr_t)vdso.pvti_virt;
		hi = lo + PAGE_SIZE;
		start = (lo < start) ? lo : start;
		end = (end < hi) ? hi : end;
	}

	/* a negative start means data pages sit below the vdso pages */
	vdso_offset = (start < 0) ? -start : 0;
	container_size = end - start;

	dkprintf("vdso_calc_container_size(): %#lx %#lx\n", container_size, vdso_offset);
	return;
} /* vdso_calc_container_size() */
/*
 * One-time vdso setup: fetch the vdso description, map the global data
 * pages and compute the container size.
 *
 * When no vdso pages are reported (vdso_npages <= 0) the remaining steps
 * are skipped and 0 is returned.  Logs whether the vdso ends up enabled
 * (container_size > 0) or disabled.
 *
 * Returns 0 on success or the error of the failing step.
 */
int arch_setup_vdso()
{
	int error;

	dkprintf("arch_setup_vdso()\n");

	error = vdso_get_vdso_info();
	if (error) {
		ekprintf("arch_setup_vdso: vdso_get_vdso_info failed. %d\n", error);
	}
	else if (vdso.vdso_npages > 0) {
		error = vdso_map_global_pages();
		if (error) {
			ekprintf("arch_setup_vdso: vdso_map_global_pages failed. %d\n", error);
		}
		else {
			vdso_calc_container_size();
		}
	}

	kprintf((container_size > 0) ? "vdso is enabled\n" : "vdso is disabled\n");

	dkprintf("arch_setup_vdso(): %d\n", error);
	return error;
} /* arch_setup_vdso() */
/*
 * Map the vdso pages, and any non-global vvar/hpet/pvti data pages, into
 * the address space of the given process VM.
 *
 * A container of container_size bytes is taken at vm->region.map_end
 * (which is advanced past it).  The vdso pages are placed at
 * container + vdso_offset and recorded in vm->vdso_addr; the start of the
 * remaining part of the container is recorded in vm->vvar_addr.
 *
 * Returns 0 on success (including when no vdso is available), or the
 * first error from add_process_memory_range() / ihk_mc_pt_set_range().
 * Error paths return without removing ranges or mappings made earlier in
 * this function, and without restoring vm->region.map_end.
 */
int arch_map_vdso(struct process_vm *vm)
{
struct address_space *as = vm->address_space;
page_table_t pt = as->page_table;
void *container;
void *s;
void *e;
unsigned long vrflags;
enum ihk_mc_pt_attribute attr;
int error;
int i;
dkprintf("arch_map_vdso()\n");
if (container_size <= 0) {
/* vdso pages are not available */
dkprintf("arch_map_vdso(): not available\n");
error = 0;
goto out;
}
/* reserve the container at the current end of the map area */
container = (void *)vm->region.map_end;
vm->region.map_end += container_size;
/* range occupied by the vdso pages themselves */
s = container + vdso_offset;
e = s + (vdso.vdso_npages * PAGE_SIZE);
/* readable and executable; max protection derived from the same bits */
vrflags = VR_REMOTE;
vrflags |= VR_PROT_READ | VR_PROT_EXEC;
vrflags |= VRFLAG_PROT_TO_MAXPROT(vrflags);
error = add_process_memory_range(vm, (intptr_t)s, (intptr_t)e, NOPHYS, vrflags, NULL, 0, PAGE_SHIFT);
if (error) {
ekprintf("ERROR: adding memory range for vdso. %d\n", error);
goto out;
}
vm->vdso_addr = s;
/* map each vdso page to the physical page listed in vdso_physlist */
attr = PTATTR_ACTIVE | PTATTR_USER;
for (i = 0; i < vdso.vdso_npages; ++i) {
s = vm->vdso_addr + (i * PAGE_SIZE);
e = s + PAGE_SIZE;
error = ihk_mc_pt_set_range(pt, vm, s, e,
vdso.vdso_physlist[i], attr, 0);
if (error) {
ekprintf("ihk_mc_pt_set_range failed. %d\n", error);
goto out;
}
}
/* the container is larger than the vdso pages: it also holds data pages */
if (container_size > (vdso.vdso_npages * PAGE_SIZE)) {
if (vdso_offset) {
/* data area lies below the vdso pages */
s = container;
e = container + vdso_offset;
}
else {
/* data area lies above the vdso pages */
s = container + (vdso.vdso_npages * PAGE_SIZE);
e = container + container_size;
}
/* data area is read-only, not executable */
vrflags = VR_REMOTE;
vrflags |= VR_PROT_READ;
vrflags |= VRFLAG_PROT_TO_MAXPROT(vrflags);
error = add_process_memory_range(vm, (intptr_t)s, (intptr_t)e, NOPHYS, vrflags, NULL, 0, PAGE_SHIFT);
if (error) {
ekprintf("ERROR: adding memory range for vvar. %d\n", error);
goto out;
}
vm->vvar_addr = s;
/* for non-global pages, *_virt is used as an offset from vdso_addr */
if (vdso.vvar_virt && !vdso.vvar_is_global) {
s = vm->vdso_addr + (intptr_t)vdso.vvar_virt;
e = s + PAGE_SIZE;
attr = PTATTR_ACTIVE | PTATTR_USER | PTATTR_NO_EXECUTE;
error = ihk_mc_pt_set_range(pt, vm, s, e,
vdso.vvar_phys, attr, 0);
if (error) {
ekprintf("ihk_mc_pt_set_range failed. %d\n", error);
goto out;
}
}
/* hpet page is additionally mapped uncachable */
if (vdso.hpet_virt && !vdso.hpet_is_global) {
s = vm->vdso_addr + (intptr_t)vdso.hpet_virt;
e = s + PAGE_SIZE;
attr = PTATTR_ACTIVE | PTATTR_USER | PTATTR_NO_EXECUTE | PTATTR_UNCACHABLE;
error = ihk_mc_pt_set_range(pt, vm, s, e,
vdso.hpet_phys, attr, 0);
if (error) {
ekprintf("ihk_mc_pt_set_range failed. %d\n", error);
goto out;
}
}
/* a single pvti page is mapped here, backed by vdso.pvti_phys */
if (vdso.pvti_virt && !vdso.pvti_is_global) {
s = vm->vdso_addr + (intptr_t)vdso.pvti_virt;
e = s + PAGE_SIZE;
attr = PTATTR_ACTIVE | PTATTR_USER | PTATTR_NO_EXECUTE;
error = ihk_mc_pt_set_range(pt, vm, s, e,
vdso.pvti_phys, attr, 0);
if (error) {
ekprintf("ihk_mc_pt_set_range failed. %d\n", error);
goto out;
}
}
}
error = 0;
out:
dkprintf("arch_map_vdso(): %d %p\n", error, vm->vdso_addr);
return error;
} /* arch_map_vdso() */
/*** End of File ***/

View File

@ -17,11 +17,60 @@ BINDIR="@BINDIR@"
SBINDIR="@SBINDIR@" SBINDIR="@SBINDIR@"
KMODDIR="@KMODDIR@" KMODDIR="@KMODDIR@"
KERNDIR="@KERNDIR@" KERNDIR="@KERNDIR@"
ENABLE_MCOVERLAYFS="@ENABLE_MCOVERLAYFS@"
INTERVAL=1
LOGMODE=0
while getopts :i:k: OPT
do
case ${OPT} in
i) INTERVAL=${OPTARG}
expr "${INTERVAL}" + 1 > /dev/null 2>&1
if [ $? -ge 2 ]
then
echo "invalid -i value"
exit 1
fi
if [ ${INTERVAL} -le 0 ]
then
echo "invalid -i value"
exit 1
fi
;;
k) LOGMODE=${OPTARG}
expr "${LOGMODE}" + 1 > /dev/null 2>&1
if [ $? -ge 2 ]
then
echo "invalid -k value"
exit 1
fi
if [ ${LOGMODE} -lt 0 -o ${LOGMODE} -gt 2 ]
then
echo "invalid -k value"
exit 1
fi
;;
*) echo "invalid option -${OPT}"
exit 1
esac
done
mem="512M@0" mem="512M@0"
cpus="" cpus=""
ihk_ikc_irq_core=0 ihk_ikc_irq_core=0
release=`uname -r`
major=`echo ${release} | sed -e 's/^\([0-9]*\).*/\1/'`
minor=`echo ${release} | sed -e 's/^[0-9]*.\([0-9]*\).*/\1/'`
patch=`echo ${release} | sed -e 's/^[0-9]*.[0-9]*.\([0-9]*\).*/\1/'`
linux_version_code=`expr \( ${major} \* 65536 \) + \( ${minor} \* 256 \) + ${patch}`
rhel_release=`echo ${release} | sed -e 's/^[0-9]*.[0-9]*.[0-9]*-\([0-9]*\).*/\1/'`
if [ "${release}" == "${rhel_release}" ]; then rhel_release=""; fi
if [ "${ENABLE_MCOVERLAYFS}" == "yes" ]; then
enable_mcoverlay=`if ( [ ${linux_version_code} -ge 262144 ] && [ ${linux_version_code} -lt 262400 ] ); then echo "yes"; else echo "no"; fi`
else
enable_mcoverlay=no
fi
if [ "$cpus" == "" ]; then if [ "$cpus" == "" ]; then
# Get the number of CPUs on NUMA node 0 # Get the number of CPUs on NUMA node 0
@ -38,6 +87,18 @@ if [ "`lsmod | grep mcctrl`" != "" ]; then
if ! rmmod mcctrl; then echo "error: removing mcctrl"; exit; fi if ! rmmod mcctrl; then echo "error: removing mcctrl"; exit; fi
fi fi
# Remove mcoverlay if loaded
if [ "$enable_mcoverlay" == "yes" ]; then
if [ "`lsmod | grep mcoverlay`" != "" ]; then
if [ "`cat /proc/mounts | grep /tmp/mcos/mcos0_sys`" != "" ]; then umount -l /tmp/mcos/mcos0_sys; fi
if [ "`cat /proc/mounts | grep /tmp/mcos/mcos0_proc`" != "" ]; then umount -l /tmp/mcos/mcos0_proc; fi
if [ "`cat /proc/mounts | grep /tmp/mcos/linux_proc`" != "" ]; then umount -l /tmp/mcos/linux_proc; fi
if [ "`cat /proc/mounts | grep /tmp/mcos`" != "" ]; then umount -l /tmp/mcos; fi
if [ -e /tmp/mcos ]; then rm -rf /tmp/mcos; fi
if ! rmmod mcoverlay; then echo "error: removing mcoverlay"; exit; fi
fi
fi
# Load IHK if not loaded # Load IHK if not loaded
if [ "`lsmod | grep ihk`" == "" ]; then if [ "`lsmod | grep ihk`" == "" ]; then
if ! insmod ${KMODDIR}/ihk.ko; then echo "error: loading ihk"; exit; fi; if ! insmod ${KMODDIR}/ihk.ko; then echo "error: loading ihk"; exit; fi;
@ -92,7 +153,47 @@ if ! ${SBINDIR}/ihkconfig 0 create; then echo "error: create"; exit; fi
if ! ${SBINDIR}/ihkosctl 0 assign cpu ${cpus}; then echo "error: assign CPUs"; exit; fi if ! ${SBINDIR}/ihkosctl 0 assign cpu ${cpus}; then echo "error: assign CPUs"; exit; fi
if ! ${SBINDIR}/ihkosctl 0 assign mem ${mem}; then echo "error: assign memory"; exit; fi if ! ${SBINDIR}/ihkosctl 0 assign mem ${mem}; then echo "error: assign memory"; exit; fi
if ! ${SBINDIR}/ihkosctl 0 load ${KERNDIR}/mckernel.img; then echo "error: loading kernel image"; exit; fi if ! ${SBINDIR}/ihkosctl 0 load ${KERNDIR}/mckernel.img; then echo "error: loading kernel image"; exit; fi
if ! ${SBINDIR}/ihkosctl 0 kargs hidos; then echo "error: setting kernel arguments"; exit; fi if ! ${SBINDIR}/ihkosctl 0 kargs "hidos ksyslogd=${LOGMODE}"; then echo "error: setting kernel arguments"; exit; fi
if ! ${SBINDIR}/ihkosctl 0 boot; then echo "error: booting"; exit; fi if ! ${SBINDIR}/ihkosctl 0 boot; then echo "error: booting"; exit; fi
if ! insmod ${KMODDIR}/mcctrl.ko; then echo "error: inserting mcctrl.ko"; exit; fi if ! insmod ${KMODDIR}/mcctrl.ko; then echo "error: inserting mcctrl.ko"; exit; fi
if ! chown `logname` /dev/mcd* /dev/mcos*; then echo "error: chowning device files"; exit; fi if ! chown `logname` /dev/mcd* /dev/mcos*; then echo "error: chowning device files"; exit; fi
if [ "$enable_mcoverlay" == "yes" ]; then
if [ ! -e /tmp/mcos ]; then mkdir -p /tmp/mcos; fi
if ! mount -t tmpfs tmpfs /tmp/mcos; then echo "error: mount /tmp/mcos"; exit; fi
if [ ! -e /tmp/mcos/linux_proc ]; then mkdir -p /tmp/mcos/linux_proc; fi
if ! mount --bind /proc /tmp/mcos/linux_proc; then echo "error: mount /tmp/mcos/linux_proc"; exit; fi
if ! insmod ${KMODDIR}/mcoverlay.ko; then echo "error: inserting mcoverlay.ko"; exit; fi
while [ ! -e /proc/mcos0 ]
do
sleep 1
done
if [ ! -e /tmp/mcos/mcos0_proc ]; then mkdir -p /tmp/mcos/mcos0_proc; fi
if [ ! -e /tmp/mcos/mcos0_proc_upper ]; then mkdir -p /tmp/mcos/mcos0_proc_upper; fi
if [ ! -e /tmp/mcos/mcos0_proc_work ]; then mkdir -p /tmp/mcos/mcos0_proc_work; fi
if ! mount -t mcoverlay mcoverlay -o lowerdir=/proc/mcos0:/proc,upperdir=/tmp/mcos/mcos0_proc_upper,workdir=/tmp/mcos/mcos0_proc_work,nocopyupw,nofscheck /tmp/mcos/mcos0_proc; then echo "error: mount /tmp/mcos/mcos0_proc"; exit; fi
mount --make-rprivate /proc
while [ ! -e /sys/devices/virtual/mcos/mcos0/sys ]
do
sleep 1
done
if [ ! -e /tmp/mcos/mcos0_sys ]; then mkdir -p /tmp/mcos/mcos0_sys; fi
if [ ! -e /tmp/mcos/mcos0_sys_upper ]; then mkdir -p /tmp/mcos/mcos0_sys_upper; fi
if [ ! -e /tmp/mcos/mcos0_sys_work ]; then mkdir -p /tmp/mcos/mcos0_sys_work; fi
if ! mount -t mcoverlay mcoverlay -o lowerdir=/sys/devices/virtual/mcos/mcos0/sys:/sys,upperdir=/tmp/mcos/mcos0_sys_upper,workdir=/tmp/mcos/mcos0_sys_work,nocopyupw,nofscheck /tmp/mcos/mcos0_sys; then echo "error: mount /tmp/mcos/mcos0_sys"; exit; fi
mount --make-rprivate /sys
for cpuid in `find /sys/devices/system/cpu/* -maxdepth 0 -name "cpu[0123456789]*" -printf "%f "`; do
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/devices/system/cpu/$cpuid" ]; then
rm -rf /tmp/mcos/mcos0_sys/devices/system/cpu/$cpuid
fi
done
for cpuid in `find /sys/bus/cpu/devices/* -maxdepth 0 -name "cpu[0123456789]*" -printf "%f "`; do
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/bus/cpu/devices/$cpuid" ]; then
rm -rf /tmp/mcos/mcos0_sys/bus/cpu/devices/$cpuid
fi
done
fi
if [ ${LOGMODE} -ne 0 ]
then
SBINDIR=${SBINDIR} ${SBINDIR}/mcklogd -i ${INTERVAL}
fi

2473
configure vendored

File diff suppressed because it is too large Load Diff

View File

@ -27,10 +27,27 @@ AC_ARG_WITH([target],
[--with-target={attached-mic | builtin-mic | builtin-x86 | smp-x86}],[target, default is attached-mic]), [--with-target={attached-mic | builtin-mic | builtin-x86 | smp-x86}],[target, default is attached-mic]),
[WITH_TARGET=$withval],[WITH_TARGET=yes]) [WITH_TARGET=$withval],[WITH_TARGET=yes])
AC_ARG_WITH([system_map],
AS_HELP_STRING(
[--with-system_map=path],[Path to 'System.map file', default is /boot/System.map-uname_r]),
[WITH_SYSTEM_MAP=$withval],[WITH_SYSTEM_MAP=yes])
AC_ARG_ENABLE([dcfa], AC_ARG_ENABLE([dcfa],
[AS_HELP_STRING( [AS_HELP_STRING(
[--enable-dcfa],[Enable DCFA modules])],[],[enable_dcfa=no]) [--enable-dcfa],[Enable DCFA modules])],[],[enable_dcfa=no])
AC_ARG_ENABLE([memdump],
AC_HELP_STRING([--enable-memdump],
[enable dumping memory and analyzing a dump]),
[ENABLE_MEMDUMP=$enableval],
[ENABLE_MEMDUMP=default])
AC_ARG_ENABLE([mcoverlayfs],
AC_HELP_STRING([--enable-mcoverlayfs],
[enable mcoverlayfs implementation]),
[ENABLE_MCOVERLAYFS=$enableval],
[ENABLE_MCOVERLAYFS=yes])
case "X$WITH_KERNELSRC" in case "X$WITH_KERNELSRC" in
Xyes | Xno | X) Xyes | Xno | X)
WITH_KERNELSRC='/lib/modules/`uname -r`/build' WITH_KERNELSRC='/lib/modules/`uname -r`/build'
@ -49,9 +66,26 @@ fi
test "x$prefix" = xNONE && prefix="$ac_default_prefix" test "x$prefix" = xNONE && prefix="$ac_default_prefix"
case $WITH_TARGET in case $WITH_TARGET in
attached-mic) attached-mic|builtin-x86|smp-x86)
ARCH=`uname -m` ARCH=`uname -m`
AC_PROG_CC AC_PROG_CC
XCC=$CC
;;
builtin-mic)
ARCH=k1om
AC_CHECK_PROG(XCC,
[x86_64-$ARCH-linux-gcc],
[x86_64-$ARCH-linux-gcc],
[no])
CC=$XCC
;;
*)
AC_MSG_ERROR([target $WITH_TARGET is unknwon])
;;
esac
case $WITH_TARGET in
attached-mic)
if test "X$KERNDIR" = X; then if test "X$KERNDIR" = X; then
KERNDIR="$prefix/attached/kernel" KERNDIR="$prefix/attached/kernel"
fi fi
@ -69,12 +103,6 @@ case $WITH_TARGET in
fi fi
;; ;;
builtin-mic) builtin-mic)
ARCH=k1om
AC_CHECK_PROG(XCC,
[x86_64-$ARCH-linux-gcc],
[x86_64-$ARCH-linux-gcc],
[no])
CC=$XCC
if test "X$KERNDIR" = X; then if test "X$KERNDIR" = X; then
KERNDIR="$prefix/attached/kernel" KERNDIR="$prefix/attached/kernel"
fi fi
@ -92,9 +120,6 @@ case $WITH_TARGET in
fi fi
;; ;;
builtin-x86) builtin-x86)
ARCH=`uname -m`
AC_PROG_CC
XCC=$CC
if test "X$KERNDIR" = X; then if test "X$KERNDIR" = X; then
KERNDIR="$prefix/attached/kernel" KERNDIR="$prefix/attached/kernel"
fi fi
@ -112,9 +137,6 @@ case $WITH_TARGET in
fi fi
;; ;;
smp-x86) smp-x86)
ARCH=`uname -m`
AC_PROG_CC
XCC=$CC
if test "X$KERNDIR" = X; then if test "X$KERNDIR" = X; then
KERNDIR="$prefix/smp-x86/kernel" KERNDIR="$prefix/smp-x86/kernel"
fi fi
@ -139,6 +161,116 @@ esac
KDIR="$WITH_KERNELSRC" KDIR="$WITH_KERNELSRC"
TARGET="$WITH_TARGET" TARGET="$WITH_TARGET"
MCCTRL_LINUX_SYMTAB=""
case "X$WITH_SYSTEM_MAP" in
Xyes | Xno | X)
MCCTRL_LINUX_SYMTAB=""
;;
*)
MCCTRL_LINUX_SYMTAB="$WITH_SYSTEM_MAP"
;;
esac
AC_MSG_CHECKING([[for System.map]])
if test -f "$MCCTRL_LINUX_SYMTAB"; then
MCCTRL_LINUX_SYMTAB="$MCCTRL_LINUX_SYMTAB"
elif test -f "/boot/System.map-`uname -r`"; then
MCCTRL_LINUX_SYMTAB="/boot/System.map-`uname -r`"
elif test -f "$KDIR/System.map"; then
MCCTRL_LINUX_SYMTAB="$KDIR/System.map"
fi
if test "$MCCTRL_LINUX_SYMTAB" == ""; then
AC_MSG_ERROR([could not find])
fi
if test -z "`eval cat $MCCTRL_LINUX_SYMTAB`"; then
AC_MSG_ERROR([could not read System.map file, no read permission?])
fi
AC_MSG_RESULT([$MCCTRL_LINUX_SYMTAB])
MCCTRL_LINUX_SYMTAB_CMD="cat $MCCTRL_LINUX_SYMTAB"
# MCCTRL_FIND_KSYM(SYMBOL)
# ------------------------------------------------------
# Search System.map for address of the given symbol and
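# do one of three things in config.h:
# If not found, leave MCCTRL_KSYM_foo undefined
# If found to be exported, "#define MCCTRL_KSYM_foo 0"
# If found not to be exported, "#define MCCTRL_KSYM_foo 0x<value>"
# When a non-empty second argument is passed, the exported check is skipped
# and a found symbol is always defined to its address.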
AC_DEFUN([MCCTRL_FIND_KSYM],[
AC_MSG_CHECKING([[System.map for symbol $1]])
mcctrl_addr=`eval $MCCTRL_LINUX_SYMTAB_CMD | grep " $1\$" | cut -d\ -f1`
if test -z $mcctrl_addr; then
AC_MSG_RESULT([not found])
else
mcctrl_result=$mcctrl_addr
mcctrl_addr="0x$mcctrl_addr"
m4_ifval([$2],[],[
if `eval $MCCTRL_LINUX_SYMTAB_CMD | grep " __ksymtab_$1\$" >/dev/null`; then
mcctrl_result="exported"
mcctrl_addr="0"
fi
])
AC_MSG_RESULT([$mcctrl_result])
AC_DEFINE_UNQUOTED(MCCTRL_KSYM_[]$1,$mcctrl_addr,[Define to address of kernel symbol $1, or 0 if exported])
fi
])
MCCTRL_FIND_KSYM([sys_mount])
MCCTRL_FIND_KSYM([sys_unshare])
MCCTRL_FIND_KSYM([zap_page_range])
MCCTRL_FIND_KSYM([vdso_image_64])
MCCTRL_FIND_KSYM([vdso_start])
MCCTRL_FIND_KSYM([vdso_end])
MCCTRL_FIND_KSYM([vdso_pages])
MCCTRL_FIND_KSYM([__vvar_page])
MCCTRL_FIND_KSYM([hpet_address])
MCCTRL_FIND_KSYM([hv_clock])
MCCTRL_FIND_KSYM([sys_readlink])
case $ENABLE_MEMDUMP in
yes|no|auto)
;;
default)
if test "x$WITH_TARGET" = "xsmp-x86" ; then
ENABLE_MEMDUMP=auto
else
ENABLE_MEMDUMP=no
fi
;;
*)
AC_MSG_ERROR([unknown memdump argument: $ENABLE_MEMDUMP])
;;
esac
if test "x$ENABLE_MEMDUMP" != "xno" ; then
enableval=yes
AC_CHECK_LIB([bfd],[bfd_init],[],[enableval=no])
AC_CHECK_HEADER([bfd.h],[],[enableval=no])
if test "x$ENABLE_MEMDUMP" = "xyes" -a "x$enableval" = "xno" ; then
AC_MSG_ERROR([memdump feature needs bfd.h and libbfd a.k.a bunutils-devel])
fi
ENABLE_MEMDUMP=$enableval
fi
if test "x$ENABLE_MEMDUMP" = "xyes" ; then
AC_MSG_NOTICE([memdump feature is enabled])
AC_DEFINE([ENABLE_MEMDUMP],[1],[whether memdump feature is enabled])
uncomment_if_ENABLE_MEMDUMP=''
else
AC_MSG_NOTICE([memdump feature is disabled])
uncomment_if_ENABLE_MEMDUMP='#'
fi
if test "x$ENABLE_MCOVERLAYFS" = "xyes" ; then
AC_DEFINE([ENABLE_MCOVERLAYFS],[1],[whether mcoverlayfs is enabled])
AC_MSG_NOTICE([mcoverlayfs is enabled])
else
AC_MSG_NOTICE([mcoverlayfs is disabled])
fi
AC_SUBST(CC) AC_SUBST(CC)
AC_SUBST(XCC) AC_SUBST(XCC)
AC_SUBST(ARCH) AC_SUBST(ARCH)
@ -149,6 +281,7 @@ AC_SUBST(SBINDIR)
AC_SUBST(KMODDIR) AC_SUBST(KMODDIR)
AC_SUBST(KERNDIR) AC_SUBST(KERNDIR)
AC_SUBST(MANDIR) AC_SUBST(MANDIR)
AC_SUBST(ENABLE_MCOVERLAYFS)
AC_SUBST(IHK_VERSION) AC_SUBST(IHK_VERSION)
AC_SUBST(MCKERNEL_VERSION) AC_SUBST(MCKERNEL_VERSION)
@ -156,11 +289,15 @@ AC_SUBST(DCFA_VERSION)
AC_SUBST(IHK_RELEASE_DATE) AC_SUBST(IHK_RELEASE_DATE)
AC_SUBST(MCKERNEL_RELEASE_DATE) AC_SUBST(MCKERNEL_RELEASE_DATE)
AC_SUBST(DCFA_RESEASE_DATE) AC_SUBST(DCFA_RESEASE_DATE)
AC_SUBST(uncomment_if_ENABLE_MEMDUMP)
AC_CONFIG_HEADERS([executer/config.h])
AC_CONFIG_FILES([ AC_CONFIG_FILES([
Makefile Makefile
executer/user/Makefile executer/user/Makefile
executer/kernel/Makefile executer/kernel/mcctrl/Makefile
executer/kernel/mcctrl/arch/x86_64/Makefile
executer/kernel/mcoverlayfs/Makefile
kernel/Makefile kernel/Makefile
kernel/Makefile.build kernel/Makefile.build
arch/x86/tools/mcreboot-attached-mic.sh arch/x86/tools/mcreboot-attached-mic.sh

91
executer/config.h.in Normal file
View File

@ -0,0 +1,91 @@
/* executer/config.h.in. Generated from configure.ac by autoheader. */
/* whether mcoverlayfs is enabled */
#undef ENABLE_MCOVERLAYFS
/* whether memdump feature is enabled */
#undef ENABLE_MEMDUMP
/* Define to 1 if you have the <inttypes.h> header file. */
#undef HAVE_INTTYPES_H
/* Define to 1 if you have the `bfd' library (-lbfd). */
#undef HAVE_LIBBFD
/* Define to 1 if you have the <memory.h> header file. */
#undef HAVE_MEMORY_H
/* Define to 1 if you have the <stdint.h> header file. */
#undef HAVE_STDINT_H
/* Define to 1 if you have the <stdlib.h> header file. */
#undef HAVE_STDLIB_H
/* Define to 1 if you have the <strings.h> header file. */
#undef HAVE_STRINGS_H
/* Define to 1 if you have the <string.h> header file. */
#undef HAVE_STRING_H
/* Define to 1 if you have the <sys/stat.h> header file. */
#undef HAVE_SYS_STAT_H
/* Define to 1 if you have the <sys/types.h> header file. */
#undef HAVE_SYS_TYPES_H
/* Define to 1 if you have the <unistd.h> header file. */
#undef HAVE_UNISTD_H
/* Define to address of kernel symbol __vvar_page, or 0 if exported */
#undef MCCTRL_KSYM___vvar_page
/* Define to address of kernel symbol hpet_address, or 0 if exported */
#undef MCCTRL_KSYM_hpet_address
/* Define to address of kernel symbol hv_clock, or 0 if exported */
#undef MCCTRL_KSYM_hv_clock
/* Define to address of kernel symbol sys_mount, or 0 if exported */
#undef MCCTRL_KSYM_sys_mount
/* Define to address of kernel symbol sys_readlink, or 0 if exported */
#undef MCCTRL_KSYM_sys_readlink
/* Define to address of kernel symbol sys_unshare, or 0 if exported */
#undef MCCTRL_KSYM_sys_unshare
/* Define to address of kernel symbol vdso_end, or 0 if exported */
#undef MCCTRL_KSYM_vdso_end
/* Define to address of kernel symbol vdso_image_64, or 0 if exported */
#undef MCCTRL_KSYM_vdso_image_64
/* Define to address of kernel symbol vdso_pages, or 0 if exported */
#undef MCCTRL_KSYM_vdso_pages
/* Define to address of kernel symbol vdso_start, or 0 if exported */
#undef MCCTRL_KSYM_vdso_start
/* Define to address of kernel symbol zap_page_range, or 0 if exported */
#undef MCCTRL_KSYM_zap_page_range
/* Define to the address where bug reports for this package should be sent. */
#undef PACKAGE_BUGREPORT
/* Define to the full name of this package. */
#undef PACKAGE_NAME
/* Define to the full name and version of this package. */
#undef PACKAGE_STRING
/* Define to the one symbol short name of this package. */
#undef PACKAGE_TARNAME
/* Define to the home page for this package. */
#undef PACKAGE_URL
/* Define to the version of this package. */
#undef PACKAGE_VERSION
/* Define to 1 if you have the ANSI C header files. */
#undef STDC_HEADERS

View File

@ -48,6 +48,9 @@
#define MCEXEC_UP_OPEN_EXEC 0x30a02912 #define MCEXEC_UP_OPEN_EXEC 0x30a02912
#define MCEXEC_UP_CLOSE_EXEC 0x30a02913 #define MCEXEC_UP_CLOSE_EXEC 0x30a02913
#define MCEXEC_UP_SYS_MOUNT 0x30a02914
#define MCEXEC_UP_SYS_UNSHARE 0x30a02915
#define MCEXEC_UP_DEBUG_LOG 0x40000000 #define MCEXEC_UP_DEBUG_LOG 0x40000000
#define MCEXEC_UP_TRANSFER_TO_REMOTE 0 #define MCEXEC_UP_TRANSFER_TO_REMOTE 0
@ -83,6 +86,9 @@ struct program_load_desc {
int stack_prot; int stack_prot;
int pgid; int pgid;
int cred[8]; int cred[8];
int reloc;
char enable_vdso;
char padding[7];
unsigned long entry; unsigned long entry;
unsigned long user_start; unsigned long user_start;
unsigned long user_end; unsigned long user_end;
@ -166,4 +172,16 @@ struct newprocess_desc {
int pid; int pid;
}; };
/*
 * Argument block describing a mount request.  The fields match, in order,
 * the parameters of a mount(2)-style call: device name, target directory,
 * filesystem type, mount flags and filesystem-specific data.
 * NOTE(review): presumably the payload of the MCEXEC_UP_SYS_MOUNT request;
 * confirm against the ioctl handler.
 */
struct sys_mount_desc {
char *dev_name;
char *dir_name;
char *type;
unsigned long flags;
void *data;
};
/*
 * Argument block describing an unshare request: a single flags word.
 * NOTE(review): presumably the payload of the MCEXEC_UP_SYS_UNSHARE request;
 * confirm against the ioctl handler.
 */
struct sys_unshare_desc {
unsigned long unshare_flags;
};
#endif #endif

View File

@ -1,26 +0,0 @@
KDIR ?= @KDIR@
ARCH ?= @ARCH@
src = @abs_srcdir@
KMODDIR=@KMODDIR@
BINDIR=@BINDIR@
IHK_BASE=$(src)/../../../ihk
obj-m += mcctrl.o
ccflags-y := -I$(IHK_BASE)/linux/include -I$(IHK_BASE)/ikc/include -I$(IHK_BASE)/include -I$(src)/../include -mcmodel=kernel -mno-red-zone -DMCEXEC_PATH=\"$(BINDIR)/mcexec\"
mcctrl-y := driver.o control.o ikc.o syscall.o procfs.o binfmt_mcexec.o
KBUILD_EXTRA_SYMBOLS = @abs_builddir@/../../../ihk/linux/core/Module.symvers
.PHONY: clean install modules
modules:
$(MAKE) -C $(KDIR) M=$(PWD) SUBDIRS=$(PWD) ARCH=$(ARCH) modules
clean:
$(RM) .*.cmd *.mod.c *.o *.ko* Module.symvers modules.order -r .tmp*
install:
mkdir -p -m 755 $(KMODDIR)
install -m 644 mcctrl.ko $(KMODDIR)

View File

@ -0,0 +1,27 @@
KDIR ?= @KDIR@
ARCH ?= @ARCH@
src = @abs_srcdir@
KMODDIR=@KMODDIR@
BINDIR=@BINDIR@
IHK_BASE=$(src)/../../../../ihk
obj-m += mcctrl.o
ccflags-y := -I$(IHK_BASE)/linux/include -I$(IHK_BASE)/linux/include/ihk/arch/$(ARCH) -I$(IHK_BASE)/ikc/include -I$(IHK_BASE)/ikc/include/ikc/arch/$(ARCH) -I$(IHK_BASE)/include -I$(IHK_BASE)/include/arch/$(ARCH) -I$(src)/../../include -mcmodel=kernel -mno-red-zone -DMCEXEC_PATH=\"$(BINDIR)/mcexec\" -I@abs_builddir@
mcctrl-y := driver.o control.o ikc.o syscall.o procfs.o binfmt_mcexec.o
mcctrl-y += sysfs.o sysfs_files.o arch/$(ARCH)/archdeps.o
KBUILD_EXTRA_SYMBOLS = @abs_builddir@/../../../../ihk/linux/core/Module.symvers
.PHONY: clean install modules
modules:
$(MAKE) -C $(KDIR) M=$(PWD) SUBDIRS=$(PWD) ARCH=$(ARCH) modules
clean:
$(RM) .*.cmd *.mod.c *.o *.ko* Module.symvers modules.order -r .tmp*
install:
mkdir -p -m 755 $(KMODDIR)
install -m 644 mcctrl.ko $(KMODDIR)

View File

@ -0,0 +1 @@
# dummy file

View File

@ -0,0 +1,194 @@
#include <linux/version.h>
#include "../../../../config.h"
#include "../../mcctrl.h"
/*
 * Linux kernel symbols needed to locate the vDSO and vvar pages.
 *
 * Each MCCTRL_KSYM_<symbol> macro comes from config.h.  It is left undefined
 * when the symbol was not found, is 0 when the symbol is exported, and is
 * the symbol's address otherwise.  A pointer is defined here only in the last
 * case (macro defined and non-zero), initialized with that address.
 */
#ifdef MCCTRL_KSYM_vdso_image_64
#if MCCTRL_KSYM_vdso_image_64
struct vdso_image *vdso_image = (void *)MCCTRL_KSYM_vdso_image_64;
#endif
#endif
#ifdef MCCTRL_KSYM_vdso_start
#if MCCTRL_KSYM_vdso_start
void *vdso_start = (void *)MCCTRL_KSYM_vdso_start;
#endif
#endif
#ifdef MCCTRL_KSYM_vdso_end
#if MCCTRL_KSYM_vdso_end
void *vdso_end = (void *)MCCTRL_KSYM_vdso_end;
#endif
#endif
#ifdef MCCTRL_KSYM_vdso_pages
#if MCCTRL_KSYM_vdso_pages
struct page **vdso_pages = (void *)MCCTRL_KSYM_vdso_pages;
#endif
#endif
#ifdef MCCTRL_KSYM___vvar_page
#if MCCTRL_KSYM___vvar_page
void *__vvar_page = (void *)MCCTRL_KSYM___vvar_page;
#endif
#endif
/*
 * Pointer to the kernel's hpet_address variable.  The initializer is chosen
 * by the preprocessor:
 *   - MCCTRL_KSYM_hpet_address non-zero: that value is used as the address;
 *   - MCCTRL_KSYM_hpet_address defined as 0: the hpet_address symbol is
 *     referenced directly;
 *   - MCCTRL_KSYM_hpet_address undefined: NULL.
 * get_vdso_info() checks the pointer for NULL before dereferencing it.
 */
long *hpet_addressp
#ifdef MCCTRL_KSYM_hpet_address
#if MCCTRL_KSYM_hpet_address
= (void *)MCCTRL_KSYM_hpet_address;
#else
= &hpet_address;
#endif
#else
= NULL;
#endif
/*
 * Pointer to the kernel's hv_clock variable.  The initializer is chosen by
 * the preprocessor:
 *   - MCCTRL_KSYM_hv_clock non-zero: that value is used as the address;
 *   - MCCTRL_KSYM_hv_clock defined as 0: the hv_clock symbol is referenced
 *     directly;
 *   - MCCTRL_KSYM_hv_clock undefined: NULL.
 * get_vdso_info() checks both the pointer and the value it points to before
 * using it.
 */
void **hv_clockp
#ifdef MCCTRL_KSYM_hv_clock
#if MCCTRL_KSYM_hv_clock
= (void *)MCCTRL_KSYM_hv_clock;
#else
= &hv_clock;
#endif
#else
= NULL;
#endif
unsigned long
reserve_user_space_common(struct mcctrl_usrdata *usrdata, unsigned long start, unsigned long end);
/*
 * Pick the upper bound of the user address range and hand it, together with
 * a start of 0, to reserve_user_space_common().
 *
 * usrdata: passed through unchanged to reserve_user_space_common().
 * startp:  on success receives the value returned by
 *          reserve_user_space_common().
 * endp:    on success receives the chosen upper bound.
 *
 * Returns 0 on success.  If reserve_user_space_common() returns an error
 * value (IS_ERR_VALUE), that value is returned and *startp / *endp are left
 * untouched.
 */
int
reserve_user_space(struct mcctrl_usrdata *usrdata, unsigned long *startp, unsigned long *endp)
{
struct vm_area_struct *vma;
unsigned long start = 0L;
unsigned long end;
#define DESIRED_USER_END 0x800000000000
#define GAP_FOR_MCEXEC 0x008000000000UL
/* Default upper bound, used when the current process has no VMA at all. */
end = DESIRED_USER_END;
down_write(&current->mm->mmap_sem);
vma = find_vma(current->mm, 0);
if (vma) {
/*
 * Leave GAP_FOR_MCEXEC below the VMA found, then round down to a
 * GAP_FOR_MCEXEC boundary.
 * NOTE(review): the subtraction wraps if vm_start < GAP_FOR_MCEXEC;
 * confirm that cannot happen for the calling process.
 */
end = (vma->vm_start - GAP_FOR_MCEXEC) & ~(GAP_FOR_MCEXEC - 1);
}
/*
 * mmap_sem is dropped before the call on kernels >= 3.5.0 and after it on
 * older kernels.
 * NOTE(review): presumably reserve_user_space_common() takes the lock
 * itself on newer kernels; confirm.
 */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,5,0)
up_write(&current->mm->mmap_sem);
#endif
start = reserve_user_space_common(usrdata, start, end);
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,5,0)
up_write(&current->mm->mmap_sem);
#endif
if (IS_ERR_VALUE(start)) {
/* The unsigned long error value is converted to the int return type. */
return start;
}
*startp = start;
*endp = end;
return 0;
}
/*
 * Fill in a struct vdso with the location of the Linux vDSO pages and of the
 * vvar, HPET and pvclock pages, selected by the Linux kernel version this
 * module is compiled for.
 *
 * os:       OS instance; its device is used to map the structure.
 * vdso_rpa: address handed to ihk_device_map_memory() to reach the struct
 *           vdso (presumably a remote physical address; confirm).
 *
 * The structure is zeroed first, so every field not assigned below stays 0.
 * For each of vvar/hpet/pvti:
 *   - *_is_global == 0: *_virt is an offset (possibly negative) that
 *     arch_map_vdso() adds to the vDSO base address;
 *   - *_is_global == 1: *_virt is a fixmap address from fix_to_virt().
 *
 * NOTE(review): the results of ihk_device_map_memory() and
 * ihk_device_map_virtual() are used without being checked.
 */
void get_vdso_info(ihk_os_t os, long vdso_rpa)
{
ihk_device_t dev = ihk_os_to_dev(os);
long vdso_pa;
struct vdso *vdso;
size_t size;
int i;
vdso_pa = ihk_device_map_memory(dev, vdso_rpa, sizeof(*vdso));
vdso = ihk_device_map_virtual(dev, vdso_pa, sizeof(*vdso), NULL, 0);
memset(vdso, 0, sizeof(*vdso));
/* VDSO pages */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,16,0)
/* 3.16 and later: size and contents come from vdso_image. */
size = vdso_image->size;
vdso->vdso_npages = size >> PAGE_SHIFT;
if (vdso->vdso_npages > VDSO_MAXPAGES) {
/* Too many pages to record: report no vDSO and skip everything else. */
vdso->vdso_npages = 0;
goto out;
}
for (i = 0; i < vdso->vdso_npages; ++i) {
vdso->vdso_physlist[i] = virt_to_phys(
vdso_image->data + (i * PAGE_SIZE));
}
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,23)
/* 2.6.23 up to 3.15: size is vdso_end - vdso_start, rounded up to pages. */
size = vdso_end - vdso_start;
size = (size + PAGE_SIZE - 1) & PAGE_MASK;
vdso->vdso_npages = size >> PAGE_SHIFT;
if (vdso->vdso_npages > VDSO_MAXPAGES) {
/* Too many pages to record: report no vDSO and skip everything else. */
vdso->vdso_npages = 0;
goto out;
}
for (i = 0; i < vdso->vdso_npages; ++i) {
vdso->vdso_physlist[i] = page_to_phys(vdso_pages[i]);
}
#endif
/* VVAR page */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,5,0)
vdso->vvar_is_global = 0;
vdso->vvar_virt = (void *)(-3 * PAGE_SIZE);
vdso->vvar_phys = virt_to_phys(__vvar_page);
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,17,0)
vdso->vvar_is_global = 0;
vdso->vvar_virt = (void *)(-2 * PAGE_SIZE);
vdso->vvar_phys = virt_to_phys(__vvar_page);
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,16,0)
/* 3.16 only: the vvar page is placed right after the vDSO pages. */
vdso->vvar_is_global = 0;
vdso->vvar_virt = (void *)(vdso->vdso_npages * PAGE_SIZE);
vdso->vvar_phys = virt_to_phys(__vvar_page);
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,1,0)
vdso->vvar_is_global = 1;
vdso->vvar_virt = (void *)fix_to_virt(VVAR_PAGE);
vdso->vvar_phys = virt_to_phys(__vvar_page);
#endif
/* HPET page */
/* Only filled in when the kernel's hpet_address is reachable and non-zero. */
if (hpet_addressp && *hpet_addressp) {
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,5,0)
vdso->hpet_is_global = 0;
vdso->hpet_virt = (void *)(-2 * PAGE_SIZE);
vdso->hpet_phys = *hpet_addressp;
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,17,0)
vdso->hpet_is_global = 0;
vdso->hpet_virt = (void *)(-1 * PAGE_SIZE);
vdso->hpet_phys = *hpet_addressp;
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,16,0)
vdso->hpet_is_global = 0;
vdso->hpet_virt = (void *)((vdso->vdso_npages + 1) * PAGE_SIZE);
vdso->hpet_phys = *hpet_addressp;
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,23)
vdso->hpet_is_global = 1;
vdso->hpet_virt = (void *)fix_to_virt(VSYSCALL_HPET);
vdso->hpet_phys = *hpet_addressp;
#endif
}
/* struct pvclock_vcpu_time_info table */
/* Only filled in when the kernel's hv_clock is reachable and non-NULL. */
if (hv_clockp && *hv_clockp) {
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,5,0)
vdso->pvti_is_global = 0;
vdso->pvti_virt = (void *)(-1 * PAGE_SIZE);
vdso->pvti_phys = virt_to_phys(*hv_clockp);
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,8,0)
vdso->pvti_is_global = 1;
vdso->pvti_virt = (void *)fix_to_virt(PVCLOCK_FIXMAP_BEGIN);
vdso->pvti_phys = virt_to_phys(*hv_clockp);
#endif
}
out:
/*
 * Order all stores above before clearing busy.
 * NOTE(review): presumably the requester waits for busy to become 0 before
 * reading the structure; confirm.
 */
wmb();
vdso->busy = 0;
ihk_device_unmap_virtual(dev, vdso, sizeof(*vdso));
ihk_device_unmap_memory(dev, vdso_pa, sizeof(*vdso));
return;
} /* get_vdso_info() */

View File

@ -45,7 +45,6 @@ static int load_elf(struct linux_binprm *bprm
#endif #endif
) )
{ {
char mcexec[BINPRM_BUF_SIZE];
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,36) #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,36)
const const
#endif #endif
@ -60,12 +59,8 @@ static int load_elf(struct linux_binprm *bprm
int l; int l;
} envdata; } envdata;
envdata env[] = { envdata env[] = {
{.name = "MCEXEC"},
#define env_mcexec (env[0].val)
{.name = "MCEXEC_WL"}, {.name = "MCEXEC_WL"},
#define env_mcexec_wl (env[1].val) #define env_mcexec_wl (env[0].val)
{.name = "MCEXEC_BL"},
#define env_mcexec_bl (env[2].val)
{.name = NULL} {.name = NULL}
}; };
envdata *ep; envdata *ep;
@ -120,9 +115,15 @@ static int load_elf(struct linux_binprm *bprm
for(i = 0, st = 0; mode != 2;){ for(i = 0, st = 0; mode != 2;){
if(st == 0){ if(st == 0){
off = p & ~PAGE_MASK; off = p & ~PAGE_MASK;
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,6,0)
rc = get_user_pages_remote(current, bprm->mm,
bprm->p, 1, 0, 1,
&page, NULL);
#else
rc = get_user_pages(current, bprm->mm, rc = get_user_pages(current, bprm->mm,
bprm->p, 1, 0, 1, bprm->p, 1, 0, 1,
&page, NULL); &page, NULL);
#endif
if(rc <= 0) if(rc <= 0)
return -EFAULT; return -EFAULT;
addr = kmap_atomic(page addr = kmap_atomic(page
@ -190,23 +191,10 @@ static int load_elf(struct linux_binprm *bprm
} }
} }
if(!env_mcexec || !strcmp(env_mcexec, "0") || !strcmp(env_mcexec, "off")) if(env_mcexec_wl)
rc = 1;
else{
rc = 0;
if(strchr(env_mcexec, '/') && strlen(env_mcexec) < BINPRM_BUF_SIZE)
strcpy(mcexec, env_mcexec);
else
strcpy(mcexec, MCEXEC_PATH);
}
if(rc);
else if(env_mcexec_wl)
rc = !pathcheck(path, env_mcexec_wl); rc = !pathcheck(path, env_mcexec_wl);
else if(env_mcexec_bl)
rc = pathcheck(path, env_mcexec_bl);
else else
rc = pathcheck(path, "/usr:/bin:/sbin:/opt"); rc = 1;
for(ep = env; ep->name; ep++) for(ep = env; ep->name; ep++)
if(ep->val) if(ep->val)
@ -214,7 +202,7 @@ static int load_elf(struct linux_binprm *bprm
if(rc) if(rc)
return -ENOEXEC; return -ENOEXEC;
file = open_exec(mcexec); file = open_exec(MCEXEC_PATH);
if (IS_ERR(file)) if (IS_ERR(file))
return -ENOEXEC; return -ENOEXEC;
@ -229,29 +217,18 @@ static int load_elf(struct linux_binprm *bprm
return rc; return rc;
} }
bprm->argc++; bprm->argc++;
wp = mcexec; wp = MCEXEC_PATH;
rc = copy_strings_kernel(1, &wp, bprm); rc = copy_strings_kernel(1, &wp, bprm);
if (rc){ if (rc){
fput(file); fput(file);
return rc; return rc;
} }
bprm->argc++; bprm->argc++;
#if 1 rc = bprm_change_interp(MCEXEC_PATH, bprm);
rc = bprm_change_interp(mcexec, bprm);
if (rc < 0){ if (rc < 0){
fput(file); fput(file);
return rc; return rc;
} }
#else
if(brpm->interp != bprm->filename)
kfree(brpm->interp);
kfree(brpm->filename);
bprm->filename = bprm->interp = kstrdup(mcexec, GFP_KERNEL);
if(!bprm->interp){
fput(file);
return -ENOMEM;
}
#endif
allow_write_access(bprm->file); allow_write_access(bprm->file);
fput(bprm->file); fput(bprm->file);

View File

@ -34,8 +34,8 @@
#include <linux/version.h> #include <linux/version.h>
#include <asm/uaccess.h> #include <asm/uaccess.h>
#include <asm/delay.h> #include <asm/delay.h>
#include <asm/msr.h>
#include <asm/io.h> #include <asm/io.h>
#include "../../config.h"
#include "mcctrl.h" #include "mcctrl.h"
//#define DEBUG //#define DEBUG
@ -46,6 +46,28 @@
#define dprintk(...) #define dprintk(...)
#endif #endif
/*
 * Function pointer for the kernel's sys_unshare(unshare_flags).
 *   - MCCTRL_KSYM_sys_unshare non-zero: initialized to that address;
 *   - MCCTRL_KSYM_sys_unshare defined as 0 (symbol exported): NULL;
 *   - MCCTRL_KSYM_sys_unshare undefined: the pointer is not defined at all.
 * NOTE(review): users must therefore handle both a NULL pointer and a
 * missing definition; confirm against the call sites.
 */
#ifdef MCCTRL_KSYM_sys_unshare
#if MCCTRL_KSYM_sys_unshare
typedef int (*int_star_fn_ulong_t)(unsigned long);
int (*mcctrl_sys_unshare)(unsigned long unshare_flags) =
(int_star_fn_ulong_t)
MCCTRL_KSYM_sys_unshare;
#else // exported
int (*mcctrl_sys_unshare)(unsigned long unshare_flags) = NULL;
#endif
#endif
/*
 * Function pointer for the kernel's
 * sys_mount(dev_name, dir_name, type, flags, data).
 *   - MCCTRL_KSYM_sys_mount non-zero: initialized to that address;
 *   - MCCTRL_KSYM_sys_mount defined as 0 (symbol exported): NULL;
 *   - MCCTRL_KSYM_sys_mount undefined: the pointer is not defined at all.
 * NOTE(review): users must therefore handle both a NULL pointer and a
 * missing definition; confirm against the call sites.
 */
#ifdef MCCTRL_KSYM_sys_mount
#if MCCTRL_KSYM_sys_mount
typedef int (*int_star_fn_char_char_char_ulong_void_t)(char *, char *, char *, unsigned long, void *);
int (*mcctrl_sys_mount)(char *dev_name,char *dir_name, char *type, unsigned long flags, void *data) =
(int_star_fn_char_char_char_ulong_void_t)
MCCTRL_KSYM_sys_mount;
#else // exported
int (*mcctrl_sys_mount)(char *dev_name,char *dir_name, char *type, unsigned long flags, void *data) = NULL;
#endif
#endif
//static DECLARE_WAIT_QUEUE_HEAD(wq_prepare); //static DECLARE_WAIT_QUEUE_HEAD(wq_prepare);
//extern struct mcctrl_channel *channels; //extern struct mcctrl_channel *channels;
int mcctrl_ikc_set_recv_cpu(ihk_os_t os, int cpu); int mcctrl_ikc_set_recv_cpu(ihk_os_t os, int cpu);
@ -102,10 +124,10 @@ static long mcexec_prepare_image(ihk_os_t os,
pdesc->args = (void*)virt_to_phys(args); pdesc->args = (void*)virt_to_phys(args);
printk("args: 0x%lX\n", (unsigned long)pdesc->args); printk("args: 0x%lX\n", (unsigned long)pdesc->args);
printk("argc: %d\n", *(int*)args); printk("argc: %ld\n", *(long *)args);
pdesc->envs = (void*)virt_to_phys(envs); pdesc->envs = (void*)virt_to_phys(envs);
printk("envs: 0x%lX\n", (unsigned long)pdesc->envs); printk("envs: 0x%lX\n", (unsigned long)pdesc->envs);
printk("envc: %d\n", *(int*)envs); printk("envc: %ld\n", *(long *)envs);
isp.msg = SCD_MSG_PREPARE_PROCESS; isp.msg = SCD_MSG_PREPARE_PROCESS;
isp.ref = pdesc->cpu; isp.ref = pdesc->cpu;
@ -264,12 +286,15 @@ static void release_handler(ihk_os_t os, void *param)
{ {
struct handlerinfo *info = param; struct handlerinfo *info = param;
struct ikc_scd_packet isp; struct ikc_scd_packet isp;
int os_ind = ihk_host_os_get_index(os);
memset(&isp, '\0', sizeof isp); memset(&isp, '\0', sizeof isp);
isp.msg = SCD_MSG_CLEANUP_PROCESS; isp.msg = SCD_MSG_CLEANUP_PROCESS;
isp.pid = info->pid; isp.pid = info->pid;
mcctrl_ikc_send(os, 0, &isp); mcctrl_ikc_send(os, 0, &isp);
if(os_ind >= 0)
delete_pid_entry(os_ind, info->pid);
kfree(param); kfree(param);
} }
@ -480,7 +505,7 @@ retry_alloc:
irqflags = ihk_ikc_spinlock_lock(&c->wq_list_lock); irqflags = ihk_ikc_spinlock_lock(&c->wq_list_lock);
/* First see if there is one wait queue already */ /* First see if there is one wait queue already */
list_for_each_entry(wqhln_iter, &c->wq_list, list) { list_for_each_entry(wqhln_iter, &c->wq_list, list) {
if (wqhln_iter->pid == current->tgid) { if (wqhln_iter->pid == task_tgid_vnr(current)) {
kfree(wqhln); kfree(wqhln);
wqhln = wqhln_iter; wqhln = wqhln_iter;
list_del(&wqhln->list); list_del(&wqhln->list);
@ -507,8 +532,8 @@ retry_alloc:
c->param.request_va->args[0] == swd.pid) { c->param.request_va->args[0] == swd.pid) {
dprintk("pid: %d, tid: %d: SC %d, swd.cpu: %d, WARNING: wait4() for self?\n", dprintk("pid: %d, tid: %d: SC %d, swd.cpu: %d, WARNING: wait4() for self?\n",
current->tgid, task_tgid_vnr(current),
current->pid, task_pid_vnr(current);
c->param.request_va->number, c->param.request_va->number,
swd.cpu); swd.cpu);
@ -525,12 +550,12 @@ printk("mcexec_wait_syscall:stray wakeup\n");
#else #else
while (1) { while (1) {
c = usrdata->channels + swd.cpu; c = usrdata->channels + swd.cpu;
rdtscll(s); ihk_get_tsc(s);
if (!usrdata->remaining_job) { if (!usrdata->remaining_job) {
while (!(*c->param.doorbell_va)) { while (!(*c->param.doorbell_va)) {
mb(); mb();
cpu_relax(); cpu_relax();
rdtscll(w); ihk_get_tsc(w);
if (w > s + 1024UL * 1024 * 1024 * 10) { if (w > s + 1024UL * 1024 * 1024 * 10) {
return -EINTR; return -EINTR;
} }
@ -834,11 +859,28 @@ int mcexec_open_exec(ihk_os_t os, char * __user filename)
struct mckernel_exec_file *mcef; struct mckernel_exec_file *mcef;
struct mckernel_exec_file *mcef_iter; struct mckernel_exec_file *mcef_iter;
int retval; int retval;
int os_ind = ihk_host_os_get_index(os);
char *pathbuf, *fullpath;
if (os_ind < 0) {
return EINVAL;
}
pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY);
if (!pathbuf) {
return ENOMEM;
}
file = open_exec(filename); file = open_exec(filename);
retval = PTR_ERR(file); retval = PTR_ERR(file);
if (IS_ERR(file)) { if (IS_ERR(file)) {
goto out_return; goto out_error_free;
}
fullpath = d_path(&file->f_path, pathbuf, PATH_MAX);
if (IS_ERR(fullpath)) {
retval = PTR_ERR(fullpath);
goto out_error_free;
} }
mcef = kmalloc(sizeof(*mcef), GFP_KERNEL); mcef = kmalloc(sizeof(*mcef), GFP_KERNEL);
@ -850,30 +892,37 @@ int mcexec_open_exec(ihk_os_t os, char * __user filename)
spin_lock_irq(&mckernel_exec_file_lock); spin_lock_irq(&mckernel_exec_file_lock);
/* Find previous file (if exists) and drop it */ /* Find previous file (if exists) and drop it */
list_for_each_entry(mcef_iter, &mckernel_exec_files, list) { list_for_each_entry(mcef_iter, &mckernel_exec_files, list) {
if (mcef_iter->os == os && mcef_iter->pid == current->tgid) { if (mcef_iter->os == os && mcef_iter->pid == task_tgid_vnr(current)) {
allow_write_access(mcef_iter->fp); allow_write_access(mcef_iter->fp);
fput(mcef_iter->fp); fput(mcef_iter->fp);
list_del(&mcef_iter->list); list_del(&mcef_iter->list);
kfree(mcef_iter); kfree(mcef_iter);
dprintk("%d open_exec dropped previous executable \n", (int)current->tgid);
break; break;
} }
} }
/* Add new exec file to the list */ /* Add new exec file to the list */
mcef->os = os; mcef->os = os;
mcef->pid = current->tgid; mcef->pid = task_tgid_vnr(current);
mcef->fp = file; mcef->fp = file;
list_add_tail(&mcef->list, &mckernel_exec_files); list_add_tail(&mcef->list, &mckernel_exec_files);
/* Create /proc/self/exe entry */
add_pid_entry(os_ind, task_tgid_vnr(current));
proc_exe_link(os_ind, task_tgid_vnr(current), fullpath);
spin_unlock(&mckernel_exec_file_lock); spin_unlock(&mckernel_exec_file_lock);
dprintk("%d open_exec and holding file: %s\n", (int)current->tgid, filename); dprintk("%d open_exec and holding file: %s\n", (int)task_tgid_vnr(current), filename);
kfree(pathbuf);
return 0; return 0;
out_put_file: out_put_file:
fput(file); fput(file);
out_return: out_error_free:
kfree(pathbuf);
return -retval; return -retval;
} }
@ -882,19 +931,25 @@ int mcexec_close_exec(ihk_os_t os)
{ {
struct mckernel_exec_file *mcef = NULL; struct mckernel_exec_file *mcef = NULL;
int found = 0; int found = 0;
int os_ind = ihk_host_os_get_index(os);
if (os_ind < 0) {
return EINVAL;
}
spin_lock_irq(&mckernel_exec_file_lock); spin_lock_irq(&mckernel_exec_file_lock);
list_for_each_entry(mcef, &mckernel_exec_files, list) { list_for_each_entry(mcef, &mckernel_exec_files, list) {
if (mcef->os == os && mcef->pid == current->tgid) { if (mcef->os == os && mcef->pid == task_tgid_vnr(current)) {
allow_write_access(mcef->fp); allow_write_access(mcef->fp);
fput(mcef->fp); fput(mcef->fp);
list_del(&mcef->list); list_del(&mcef->list);
kfree(mcef); kfree(mcef);
found = 1; found = 1;
dprintk("%d close_exec dropped executable \n", (int)current->tgid); dprintk("%d close_exec dropped executable \n", (int)task_tgid_vnr(current));
break; break;
} }
} }
spin_unlock(&mckernel_exec_file_lock); spin_unlock(&mckernel_exec_file_lock);
return (found ? 0 : EINVAL); return (found ? 0 : EINVAL);
@ -952,6 +1007,67 @@ long mcexec_strncpy_from_user(ihk_os_t os, struct strncpy_from_user_desc * __use
return 0; return 0;
} }
/*
 * Perform a mount on behalf of the calling process with CAP_SYS_ADMIN
 * temporarily raised in its effective capability set.
 *
 * arg: user-space pointer to a struct sys_mount_desc describing the mount.
 *
 * Returns the result of mcctrl_sys_mount(), -EFAULT when the descriptor
 * cannot be copied in or when MCCTRL_KSYM_sys_mount is not available, and
 * -ENOMEM when no credentials can be prepared.
 */
long mcexec_sys_mount(struct sys_mount_desc *__user arg)
{
	struct sys_mount_desc req;
	const struct cred *saved;
	struct cred *elevated;
	int rc = -EFAULT;	/* result when sys_mount is not resolvable */

	if (copy_from_user(&req, arg, sizeof(req)) != 0)
		return -EFAULT;

	elevated = prepare_creds();
	if (elevated == NULL)
		return -ENOMEM;

	/* Run the mount with CAP_SYS_ADMIN, then restore the caller's creds. */
	cap_raise(elevated->cap_effective, CAP_SYS_ADMIN);
	saved = override_creds(elevated);

#if MCCTRL_KSYM_sys_mount
	rc = mcctrl_sys_mount(req.dev_name, req.dir_name, req.type,
			req.flags, req.data);
#endif

	revert_creds(saved);
	put_cred(elevated);

	return rc;
}
/*
 * Perform unshare() on behalf of the calling process with CAP_SYS_ADMIN
 * temporarily raised in its effective capability set.
 *
 * arg: user-space pointer to a struct sys_unshare_desc holding the flags.
 *
 * Returns the result of mcctrl_sys_unshare(), -EFAULT when the descriptor
 * cannot be copied in or when MCCTRL_KSYM_sys_unshare is not available,
 * and -ENOMEM when no credentials can be prepared.
 */
long mcexec_sys_unshare(struct sys_unshare_desc *__user arg)
{
	struct sys_unshare_desc req;
	const struct cred *saved;
	struct cred *elevated;
	int rc = -EFAULT;	/* result when sys_unshare is not resolvable */

	if (copy_from_user(&req, arg, sizeof(req)) != 0)
		return -EFAULT;

	elevated = prepare_creds();
	if (elevated == NULL)
		return -ENOMEM;

	/* Run unshare with CAP_SYS_ADMIN, then restore the caller's creds. */
	cap_raise(elevated->cap_effective, CAP_SYS_ADMIN);
	saved = override_creds(elevated);

#if MCCTRL_KSYM_sys_unshare
	rc = mcctrl_sys_unshare(req.unshare_flags);
#endif

	revert_creds(saved);
	put_cred(elevated);

	return rc;
}
long __mcctrl_control(ihk_os_t os, unsigned int req, unsigned long arg, long __mcctrl_control(ihk_os_t os, unsigned int req, unsigned long arg,
struct file *file) struct file *file)
{ {
@ -1006,6 +1122,12 @@ long __mcctrl_control(ihk_os_t os, unsigned int req, unsigned long arg,
case MCEXEC_UP_GET_CREDV: case MCEXEC_UP_GET_CREDV:
return mcexec_getcredv((int *)arg); return mcexec_getcredv((int *)arg);
case MCEXEC_UP_SYS_MOUNT:
return mcexec_sys_mount((struct sys_mount_desc *)arg);
case MCEXEC_UP_SYS_UNSHARE:
return mcexec_sys_unshare((struct sys_unshare_desc *)arg);
case MCEXEC_UP_DEBUG_LOG: case MCEXEC_UP_DEBUG_LOG:
return mcexec_debug_log(os, arg); return mcexec_debug_log(os, arg);
} }

View File

@ -25,6 +25,7 @@
#include <linux/fs.h> #include <linux/fs.h>
#include <linux/miscdevice.h> #include <linux/miscdevice.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/device.h>
#include "mcctrl.h" #include "mcctrl.h"
#define OS_MAX_MINOR 64 #define OS_MAX_MINOR 64
@ -67,6 +68,8 @@ static struct ihk_os_user_call_handler mcctrl_uchs[] = {
{ .request = MCEXEC_UP_CLOSE_EXEC, .func = mcctrl_ioctl }, { .request = MCEXEC_UP_CLOSE_EXEC, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_GET_CRED, .func = mcctrl_ioctl }, { .request = MCEXEC_UP_GET_CRED, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_GET_CREDV, .func = mcctrl_ioctl }, { .request = MCEXEC_UP_GET_CREDV, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_SYS_MOUNT, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_SYS_UNSHARE, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_DEBUG_LOG, .func = mcctrl_ioctl }, { .request = MCEXEC_UP_DEBUG_LOG, .func = mcctrl_ioctl },
}; };
@ -79,6 +82,12 @@ static struct ihk_os_user_call mcctrl_uc[OS_MAX_MINOR];
static ihk_os_t os[OS_MAX_MINOR]; static ihk_os_t os[OS_MAX_MINOR];
ihk_os_t
osnum_to_os(int n)
{
return os[n];
}
static int __init mcctrl_init(void) static int __init mcctrl_init(void)
{ {
int i; int i;
@ -137,6 +146,8 @@ static void __exit mcctrl_exit(void)
printk("mcctrl: unregistered.\n"); printk("mcctrl: unregistered.\n");
for(i = 0; i < OS_MAX_MINOR; i++){ for(i = 0; i < OS_MAX_MINOR; i++){
if(os[i]){ if(os[i]){
sysfsm_cleanup(os[i]);
free_topology_info(os[i]);
ihk_os_unregister_user_call_handlers(os[i], mcctrl_uc + i); ihk_os_unregister_user_call_handlers(os[i], mcctrl_uc + i);
destroy_ikc_channels(os[i]); destroy_ikc_channels(os[i]);
procfs_exit(i); procfs_exit(i);

View File

@ -41,9 +41,6 @@
void mcexec_prepare_ack(ihk_os_t os, unsigned long arg, int err); void mcexec_prepare_ack(ihk_os_t os, unsigned long arg, int err);
static void mcctrl_ikc_init(ihk_os_t os, int cpu, unsigned long rphys, struct ihk_ikc_channel_desc *c); static void mcctrl_ikc_init(ihk_os_t os, int cpu, unsigned long rphys, struct ihk_ikc_channel_desc *c);
int mcexec_syscall(struct mcctrl_channel *c, int pid, unsigned long arg); int mcexec_syscall(struct mcctrl_channel *c, int pid, unsigned long arg);
void procfs_create(void *__os, int ref, int osnum, int pid, unsigned long arg);
void procfs_delete(void *__os, int osnum, unsigned long arg);
void procfs_answer(unsigned long arg, int err);
void sig_done(unsigned long arg, int err); void sig_done(unsigned long arg, int err);
static int syscall_packet_handler(struct ihk_ikc_channel_desc *c, static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
@ -69,14 +66,6 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
mcexec_syscall(usrdata->channels + pisp->ref, pisp->pid, pisp->arg); mcexec_syscall(usrdata->channels + pisp->ref, pisp->pid, pisp->arg);
break; break;
case SCD_MSG_PROCFS_CREATE:
procfs_create(__os, pisp->ref, pisp->osnum, pisp->pid, pisp->arg);
break;
case SCD_MSG_PROCFS_DELETE:
procfs_delete(__os, pisp->osnum, pisp->arg);
break;
case SCD_MSG_PROCFS_ANSWER: case SCD_MSG_PROCFS_ANSWER:
procfs_answer(pisp->arg, pisp->err); procfs_answer(pisp->arg, pisp->err);
break; break;
@ -84,6 +73,42 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
case SCD_MSG_SEND_SIGNAL: case SCD_MSG_SEND_SIGNAL:
sig_done(pisp->arg, pisp->err); sig_done(pisp->arg, pisp->err);
break; break;
case SCD_MSG_SYSFS_REQ_CREATE:
case SCD_MSG_SYSFS_REQ_MKDIR:
case SCD_MSG_SYSFS_REQ_SYMLINK:
case SCD_MSG_SYSFS_REQ_LOOKUP:
case SCD_MSG_SYSFS_REQ_UNLINK:
case SCD_MSG_SYSFS_REQ_SETUP:
case SCD_MSG_SYSFS_RESP_SHOW:
case SCD_MSG_SYSFS_RESP_STORE:
case SCD_MSG_SYSFS_RESP_RELEASE:
sysfsm_packet_handler(__os, pisp->msg, pisp->err,
pisp->sysfs_arg1, pisp->sysfs_arg2);
break;
case SCD_MSG_PROCFS_TID_CREATE:
add_tid_entry(ihk_host_os_get_index(__os), pisp->pid, pisp->arg);
break;
case SCD_MSG_PROCFS_TID_DELETE:
delete_tid_entry(ihk_host_os_get_index(__os), pisp->pid, pisp->arg);
break;
case SCD_MSG_GET_VDSO_INFO:
get_vdso_info(__os, pisp->arg);
break;
case SCD_MSG_REPLY_GET_CPU_MAPPING:
reply_get_cpu_mapping(pisp->arg);
break;
default:
printk(KERN_ERR "mcctrl:syscall_packet_handler:"
"unknown message (%d.%d.%d.%d.%d.%#lx)\n",
pisp->msg, pisp->ref, pisp->osnum, pisp->pid,
pisp->err, pisp->arg);
break;
} }
return 0; return 0;
} }
@ -325,6 +350,9 @@ int prepare_ikc_channels(ihk_os_t os)
INIT_LIST_HEAD(&usrdata->per_proc_list); INIT_LIST_HEAD(&usrdata->per_proc_list);
spin_lock_init(&usrdata->per_proc_list_lock); spin_lock_init(&usrdata->per_proc_list_lock);
INIT_LIST_HEAD(&usrdata->cpu_topology_list);
INIT_LIST_HEAD(&usrdata->node_topology_list);
error = init_peer_channel_registry(usrdata); error = init_peer_channel_registry(usrdata);
if (error) { if (error) {
return error; return error;
@ -368,6 +396,7 @@ void destroy_ikc_channels(ihk_os_t os)
} }
free_page((unsigned long)usrdata->mcctrl_doorbell_va); free_page((unsigned long)usrdata->mcctrl_doorbell_va);
destroy_peer_channel_registry(usrdata);
kfree(usrdata->channels); kfree(usrdata->channels);
kfree(usrdata); kfree(usrdata);
} }

View File

@ -32,11 +32,17 @@
#ifndef HEADER_MCCTRL_H #ifndef HEADER_MCCTRL_H
#define HEADER_MCCTRL_H #define HEADER_MCCTRL_H
#include <linux/fs.h>
#include <ihk/ihk_host_driver.h> #include <ihk/ihk_host_driver.h>
#include <linux/resource.h>
#include <uprotocol.h> #include <uprotocol.h>
#include <linux/wait.h> #include <linux/wait.h>
#include <ihk/ikc.h> #include <ihk/ikc.h>
#include <ikc/master.h> #include <ikc/master.h>
#include <ihk/msr.h>
#include <linux/semaphore.h>
#include <linux/threads.h>
#include "sysfs.h"
#define SCD_MSG_PREPARE_PROCESS 0x1 #define SCD_MSG_PREPARE_PROCESS 0x1
#define SCD_MSG_PREPARE_PROCESS_ACKED 0x2 #define SCD_MSG_PREPARE_PROCESS_ACKED 0x2
@ -49,6 +55,10 @@
#define SCD_MSG_SYSCALL_ONESIDE 0x4 #define SCD_MSG_SYSCALL_ONESIDE 0x4
#define SCD_MSG_SEND_SIGNAL 0x8 #define SCD_MSG_SEND_SIGNAL 0x8
#define SCD_MSG_CLEANUP_PROCESS 0x9 #define SCD_MSG_CLEANUP_PROCESS 0x9
#define SCD_MSG_GET_VDSO_INFO 0xa
#define SCD_MSG_GET_CPU_MAPPING 0xc
#define SCD_MSG_REPLY_GET_CPU_MAPPING 0xd
#define SCD_MSG_PROCFS_CREATE 0x10 #define SCD_MSG_PROCFS_CREATE 0x10
#define SCD_MSG_PROCFS_DELETE 0x11 #define SCD_MSG_PROCFS_DELETE 0x11
@ -57,6 +67,29 @@
#define SCD_MSG_DEBUG_LOG 0x20 #define SCD_MSG_DEBUG_LOG 0x20
#define SCD_MSG_SYSFS_REQ_CREATE 0x30
/* #define SCD_MSG_SYSFS_RESP_CREATE 0x31 */
#define SCD_MSG_SYSFS_REQ_MKDIR 0x32
/* #define SCD_MSG_SYSFS_RESP_MKDIR 0x33 */
#define SCD_MSG_SYSFS_REQ_SYMLINK 0x34
/* #define SCD_MSG_SYSFS_RESP_SYMLINK 0x35 */
#define SCD_MSG_SYSFS_REQ_LOOKUP 0x36
/* #define SCD_MSG_SYSFS_RESP_LOOKUP 0x37 */
#define SCD_MSG_SYSFS_REQ_UNLINK 0x38
/* #define SCD_MSG_SYSFS_RESP_UNLINK 0x39 */
#define SCD_MSG_SYSFS_REQ_SHOW 0x3a
#define SCD_MSG_SYSFS_RESP_SHOW 0x3b
#define SCD_MSG_SYSFS_REQ_STORE 0x3c
#define SCD_MSG_SYSFS_RESP_STORE 0x3d
#define SCD_MSG_SYSFS_REQ_RELEASE 0x3e
#define SCD_MSG_SYSFS_RESP_RELEASE 0x3f
#define SCD_MSG_SYSFS_REQ_SETUP 0x40
#define SCD_MSG_SYSFS_RESP_SETUP 0x41
/* #define SCD_MSG_SYSFS_REQ_CLEANUP 0x42 */
/* #define SCD_MSG_SYSFS_RESP_CLEANUP 0x43 */
#define SCD_MSG_PROCFS_TID_CREATE 0x44
#define SCD_MSG_PROCFS_TID_DELETE 0x45
#define DMA_PIN_SHIFT 21 #define DMA_PIN_SHIFT 21
#define DO_USER_MODE #define DO_USER_MODE
@ -70,11 +103,24 @@ struct coretable {
struct ikc_scd_packet { struct ikc_scd_packet {
int msg; int msg;
int ref;
int osnum;
int pid;
int err; int err;
unsigned long arg; union {
/* for traditional SCD_MSG_* */
struct {
int ref;
int osnum;
int pid;
int padding;
unsigned long arg;
};
/* for SCD_MSG_SYSFS_* */
struct {
long sysfs_arg1;
long sysfs_arg2;
long sysfs_arg3;
};
};
}; };
struct mcctrl_priv { struct mcctrl_priv {
@ -128,6 +174,62 @@ struct mcctrl_per_proc_data {
unsigned long rpgtable; /* per process, not per OS */ unsigned long rpgtable; /* per process, not per OS */
}; };
struct sysfsm_req {
int busy;
int padding;
long lresult;
wait_queue_head_t wq;
};
struct sysfsm_data {
size_t sysfs_bufsize;
void *sysfs_buf;
long sysfs_buf_rpa;
long sysfs_buf_pa;
struct kobject *sysfs_kobj;
struct sysfsm_node *sysfs_root;
struct semaphore sysfs_tree_sem;
struct semaphore sysfs_io_sem;
struct sysfsm_req sysfs_req;
ihk_os_t sysfs_os;
};
static inline int sysfs_inited(struct sysfsm_data *sdp)
{
return !!(sdp->sysfs_buf);
} /* sysfs_inited() */
struct cpu_mapping {
int cpu_number;
int hw_id;
};
struct cache_topology {
struct ihk_cache_topology *saved;
cpumask_t shared_cpu_map;
struct list_head chain;
};
struct cpu_topology {
struct cpu_mapping *cpu_mapping;
struct ihk_cpu_topology *saved;
cpumask_t core_siblings;
cpumask_t thread_siblings;
struct list_head chain;
struct list_head cache_list;
};
struct node_topology {
struct ihk_node_topology *saved;
cpumask_t cpumap;
struct list_head chain;
};
#define CPU_LONGS (((NR_CPUS) + (BITS_PER_LONG) - 1) / (BITS_PER_LONG))
struct mcctrl_usrdata { struct mcctrl_usrdata {
struct ihk_ikc_listen_param listen_param; struct ihk_ikc_listen_param listen_param;
struct ihk_ikc_listen_param listen_param2; struct ihk_ikc_listen_param listen_param2;
@ -146,6 +248,14 @@ struct mcctrl_usrdata {
struct list_head per_proc_list; struct list_head per_proc_list;
ihk_spinlock_t per_proc_list_lock; ihk_spinlock_t per_proc_list_lock;
void **keys; void **keys;
struct sysfsm_data sysfsm_data;
unsigned long cpu_online[CPU_LONGS];
int cpu_mapping_elems;
int padding;
struct cpu_mapping *cpu_mapping;
long cpu_mapping_pa;
struct list_head cpu_topology_list;
struct list_head node_topology_list;
}; };
struct mcctrl_signal { struct mcctrl_signal {
@ -159,11 +269,12 @@ struct mcctrl_signal {
int mcctrl_ikc_send(ihk_os_t os, int cpu, struct ikc_scd_packet *pisp); int mcctrl_ikc_send(ihk_os_t os, int cpu, struct ikc_scd_packet *pisp);
int mcctrl_ikc_send_msg(ihk_os_t os, int cpu, int msg, int ref, unsigned long arg); int mcctrl_ikc_send_msg(ihk_os_t os, int cpu, int msg, int ref, unsigned long arg);
int mcctrl_ikc_is_valid_thread(ihk_os_t os, int cpu); int mcctrl_ikc_is_valid_thread(ihk_os_t os, int cpu);
int reserve_user_space(struct mcctrl_usrdata *usrdata, unsigned long *startp,
unsigned long *endp); ihk_os_t osnum_to_os(int n);
/* syscall.c */ /* syscall.c */
int init_peer_channel_registry(struct mcctrl_usrdata *ud); int init_peer_channel_registry(struct mcctrl_usrdata *ud);
void destroy_peer_channel_registry(struct mcctrl_usrdata *ud);
int register_peer_channel(struct mcctrl_usrdata *ud, void *key, struct mcctrl_channel *ch); int register_peer_channel(struct mcctrl_usrdata *ud, void *key, struct mcctrl_channel *ch);
int deregister_peer_channel(struct mcctrl_usrdata *ud, void *key, struct mcctrl_channel *ch); int deregister_peer_channel(struct mcctrl_usrdata *ud, void *key, struct mcctrl_channel *ch);
struct mcctrl_channel *get_peer_channel(struct mcctrl_usrdata *ud, void *key); struct mcctrl_channel *get_peer_channel(struct mcctrl_usrdata *ud, void *key);
@ -179,6 +290,7 @@ struct procfs_read {
int ret; /* read bytes (answer) */ int ret; /* read bytes (answer) */
int status; /* non-zero if done (answer) */ int status; /* non-zero if done (answer) */
int newcpu; /* migrated new cpu (answer) */ int newcpu; /* migrated new cpu (answer) */
int readwrite; /* 0:read, 1:write */
char fname[PROCFS_NAME_MAX]; /* procfs filename (request) */ char fname[PROCFS_NAME_MAX]; /* procfs filename (request) */
}; };
@ -188,4 +300,51 @@ struct procfs_file {
char fname[PROCFS_NAME_MAX]; /* procfs filename (request) */ char fname[PROCFS_NAME_MAX]; /* procfs filename (request) */
}; };
void procfs_answer(unsigned int arg, int err);
void add_tid_entry(int osnum, int pid, int tid);
void add_pid_entry(int osnum, int pid);
void delete_tid_entry(int osnum, int pid, int tid);
void delete_pid_entry(int osnum, int pid);
void proc_exe_link(int osnum, int pid, const char *path);
void procfs_init(int osnum);
void procfs_exit(int osnum);
/* sysfs_files.c */
void setup_sysfs_files(ihk_os_t os);
void reply_get_cpu_mapping(long req_pa);
void free_topology_info(ihk_os_t os);
/* archdep.c */
#define VDSO_MAXPAGES 2
struct vdso {
long busy;
int vdso_npages;
char vvar_is_global;
char hpet_is_global;
char pvti_is_global;
char padding;
long vdso_physlist[VDSO_MAXPAGES];
void *vvar_virt;
long vvar_phys;
void *hpet_virt;
long hpet_phys;
void *pvti_virt;
long pvti_phys;
};
int reserve_user_space(struct mcctrl_usrdata *usrdata, unsigned long *startp,
unsigned long *endp);
void get_vdso_info(ihk_os_t os, long vdso_pa);
struct get_cpu_mapping_req {
int busy; /* INOUT: */
int error; /* OUT: */
long buf_rpa; /* OUT: physical address of struct cpu_mapping */
int buf_elems; /* OUT: # of elements of buf */
int padding;
/* work for mcctrl */
wait_queue_head_t wq;
};
#endif #endif

View File

@ -0,0 +1,791 @@
/**
* \file procfs.c
* License details are found in the file LICENSE.
* \brief
* mcctrl procfs
* \author Naoki Hamada <nao@axe.bz> \par
* Copyright (C) 2014 AXE, Inc.
*/
/*
* HISTORY:
*/
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/proc_fs.h>
#include <linux/list.h>
#include <linux/uaccess.h>
#include <linux/fs.h>
#include <linux/resource.h>
#include "mcctrl.h"
#include <linux/version.h>
//#define PROCFS_DEBUG
#ifdef PROCFS_DEBUG
#define dprintk(...) printk(__VA_ARGS__)
#else
#define dprintk(...)
#endif
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,5,0)
typedef uid_t kuid_t;
typedef gid_t kgid_t;
#endif
/*
 * One row of a static table describing a procfs node to create.
 * Tables of these are walked by add_procfs_entries() until a row whose
 * name is NULL is reached.
 */
struct procfs_entry {
char *name; /* node name; NULL terminates a table */
mode_t mode; /* file type bits (S_IFDIR/S_IFREG) and permission bits */
const struct file_operations *fops; /* handlers; NULL selects a default in add_procfs_entry() */
};
/* Initializer for one struct procfs_entry row. */
#define NOD(NAME, MODE, FOP) { \
.name = (NAME), \
.mode = MODE, \
.fops = FOP, \
}
/* Row describing a directory: S_IFDIR is or-ed into MODE, no fops. */
#define PROC_DIR(NAME, MODE) \
NOD(NAME, (S_IFDIR|(MODE)), NULL)
/* Row describing a regular file: S_IFREG is or-ed into MODE. */
#define PROC_REG(NAME, MODE, fops) \
NOD(NAME, (S_IFREG|(MODE)), fops)
/* Table terminator: a row whose name is NULL. */
#define PROC_TERM \
NOD(NULL, 0, NULL)
static const struct procfs_entry tid_entry_stuff[];
static const struct procfs_entry pid_entry_stuff[];
static const struct procfs_entry base_entry_stuff[];
static const struct file_operations mckernel_forward_ro;
static const struct file_operations mckernel_forward;
static DECLARE_WAIT_QUEUE_HEAD(procfsq);
static ssize_t mckernel_procfs_read(struct file *file, char __user *buf,
size_t nbytes, loff_t *ppos);
/*
 * Private bookkeeping node of the procfs driver.
 * Each node mirrors one entry created in the Linux procfs; nodes form a
 * tree through parent/children, with top-level nodes linked on
 * procfs_file_list.
 */
struct procfs_list_entry;
struct procfs_list_entry {
struct list_head list; /* link in parent->children, or in procfs_file_list when parent is NULL */
struct proc_dir_entry *entry; /* the Linux procfs entry created for this node */
struct procfs_list_entry *parent; /* enclosing directory node, NULL at the top level */
struct list_head children; /* nodes created below this one */
int osnum; /* OS index; inherited from the parent when the node is added */
char *data; /* optional kmalloc'ed payload (proc_exe_link() stores the exe path here); kfree'd on delete */
char name[0]; /* NUL-terminated node name, allocated together with the node */
};
/*
 * In the procfs_file_list, McKernel procfs files are
 * listed in such a manner that a leaf file is always
 * located nearer to the top of the list than its
 * parent node file.
 */
LIST_HEAD(procfs_file_list);
static ihk_spinlock_t procfs_file_list_lock;
static char *
getpath(struct procfs_list_entry *e, char *buf, int bufsize)
{
char *w = buf + bufsize - 1;
*w = '\0';
for(;;){
int l = strlen(e->name);
w -= l;
memcpy(w, e->name, l);
e = e->parent;
if(!e)
return w;
w--;
*w = '/';
}
}
/**
* \brief Process SCD_MSG_PROCFS_ANSWER message.
*
* Wakes up every task sleeping on the procfsq wait queue; the procfs
* read/write handlers in this file sleep there until the status field
* of their request becomes non-zero.
*
* \param arg sent argument (not used by this function)
* \param err error info (redundant; only shown in the debug message)
*/
void
procfs_answer(unsigned int arg, int err)
{
dprintk("procfs: received SCD_MSG_PROCFS_ANSWER message(err = %d).\n", err);
wake_up_interruptible(&procfsq);
}
static struct procfs_list_entry *
find_procfs_entry(struct procfs_list_entry *parent, const char *name)
{
struct list_head *list;
struct procfs_list_entry *e;
if(parent == NULL)
list = &procfs_file_list;
else
list = &parent->children;
list_for_each_entry(e, list, list) {
if(!strcmp(e->name, name))
return e;
}
return NULL;
}
/*
 * Remove a node and, recursively, everything below it.
 *
 * The node is unlinked from its sibling list, all children are deleted
 * first, then the Linux procfs entry is removed and the node (including
 * its optional data payload) is freed.
 */
static void
delete_procfs_entries(struct procfs_list_entry *top)
{
	struct procfs_list_entry *e;
	struct procfs_list_entry *n;

	list_del(&top->list);
	list_for_each_entry_safe(e, n, &top->children, list) {
		delete_procfs_entries(e);
	}
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
	/*
	 * Detach our node from the proc entry being removed.  This must act
	 * on 'top': after the loop above, 'e' no longer refers to a valid
	 * node.  Only a data pointer that refers back to this node is
	 * cleared; any other value is not ours to drop.
	 */
	top->entry->read_proc = NULL;
	if (top->entry->data == top)
		top->entry->data = NULL;
#endif
	remove_proc_entry(top->name, top->parent? top->parent->entry: NULL);
	kfree(top->data);	/* kfree(NULL) is a no-op */
	kfree(top);
}
/*
 * Create one procfs node (directory, symlink or regular file) below parent.
 *
 * parent: directory node to create in, or NULL for the procfs root
 * name:   name of the new node; an existing node of that name (and its
 *         subtree) is deleted first
 * mode:   S_IFDIR / S_IFLNK / S_IFREG plus permission bits
 * uid, gid: owner applied to the node (on kernels >= 3.10 only regular
 *         files get it, via proc_set_user())
 * opaque: symlink target string for S_IFLNK; file_operations for a regular
 *         file, where NULL selects mckernel_forward if S_IWUSR is set and
 *         mckernel_forward_ro otherwise; unused for directories
 *
 * Returns the new node, or NULL on allocation or procfs failure.
 *
 * NOTE(review): this allocates with GFP_KERNEL while the callers in this
 * file hold procfs_file_list_lock -- confirm that sleeping is allowed there.
 */
static struct procfs_list_entry *
add_procfs_entry(struct procfs_list_entry *parent, const char *name, int mode,
		kuid_t uid, kgid_t gid, const void *opaque)
{
	struct procfs_list_entry *e = find_procfs_entry(parent, name);
	struct proc_dir_entry *pde;
	struct proc_dir_entry *parent_pde = NULL;
	int f_mode = mode & 0777;
	int is_dir = (mode & S_IFDIR) != 0;
	int is_symlink = !is_dir && ((mode & S_IFLNK) == S_IFLNK);

	if(e)
		delete_procfs_entries(e);

	e = kmalloc(sizeof(struct procfs_list_entry) + strlen(name) + 1,
			GFP_KERNEL);
	if(!e){
		kprintf("ERROR: not enough memory to create PROCFS entry.\n");
		return NULL;
	}
	memset(e, '\0', sizeof(struct procfs_list_entry));
	INIT_LIST_HEAD(&e->children);
	strcpy(e->name, name);

	if(parent)
		parent_pde = parent->entry;

	if (is_dir) {
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
		pde = proc_mkdir(name, parent_pde);
#else
		pde = proc_mkdir_data(name, f_mode, parent_pde, e);
#endif
	}
	else if (is_symlink) {
		pde = proc_symlink(name, parent_pde, (char *)opaque);
	}
	else {
		const struct file_operations *fop;

		if(opaque)
			fop = (const struct file_operations *)opaque;
		else if(mode & S_IWUSR)
			fop = &mckernel_forward;
		else
			fop = &mckernel_forward_ro;

#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
		pde = create_proc_entry(name, f_mode, parent_pde);
		if(pde)
			pde->proc_fops = fop;
#else
		pde = proc_create_data(name, f_mode, parent_pde, fop, e);
		if(pde)
			proc_set_user(pde, uid, gid);
#endif
	}
	if(!pde){
		kprintf("ERROR: cannot create a PROCFS entry for %s.\n", name);
		kfree(e);
		return NULL;
	}
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
	pde->uid = uid;
	pde->gid = gid;
	/*
	 * On these kernels proc_symlink() keeps the link target in
	 * pde->data; overwriting it would leave the symlink pointing at
	 * garbage, so only directories and files get the back pointer.
	 */
	if (!is_symlink)
		pde->data = e;
#endif

	if(parent)
		e->osnum = parent->osnum;
	e->entry = pde;
	e->parent = parent;
	list_add(&(e->list), parent? &(parent->children): &procfs_file_list);

	return e;
}
/*
 * Create every node described by a table below parent.
 *
 * entries: table of struct procfs_entry rows, ended by a row with a NULL
 *          name; each row is passed to add_procfs_entry() with the given
 *          owner.  Failures of individual rows are not reported.
 */
static void
add_procfs_entries(struct procfs_list_entry *parent,
		const struct procfs_entry *entries, kuid_t uid, kgid_t gid)
{
	const struct procfs_entry *row = entries;

	while (row->name) {
		add_procfs_entry(parent, row->name, row->mode, uid, gid,
				row->fops);
		row++;
	}
}
/*
 * Return the credentials of the task with the given pid as seen from the
 * caller's pid namespace, or NULL when pid is not positive or no such task
 * is found.
 *
 * NOTE(review): no reference is taken on the returned cred and no RCU read
 * lock is visible here -- confirm the lifetime guarantees with the callers.
 */
static const struct cred *
get_pid_cred(int pid)
{
	struct task_struct *owner;

	if (pid <= 0)
		return NULL;

	owner = pid_task(find_vpid(pid), PIDTYPE_PID);
	if (!owner)
		return NULL;

	return __task_cred(owner);
}
/*
 * Find the top-level "mcos<osnum>" directory node.
 *
 * Returns the node, or NULL when it has not been created.
 */
static struct procfs_list_entry *
find_base_entry(int osnum)
{
	/* "mcos" + up to 11 characters of a formatted int + NUL */
	char name[16];

	snprintf(name, sizeof(name), "mcos%d", osnum);
	return find_procfs_entry(NULL, name);
}
/*
 * Find the "<pid>" directory node below "mcos<osnum>".
 *
 * Returns the node, or NULL when either level does not exist.
 */
static struct procfs_list_entry *
find_pid_entry(int osnum, int pid)
{
	struct procfs_list_entry *base = find_base_entry(osnum);
	char name[12];

	if (!base)
		return NULL;

	sprintf(name, "%d", pid);
	return find_procfs_entry(base, name);
}
/*
 * Find the "mcos<osnum>/<pid>/task/<tid>" directory node.
 *
 * Returns the node, or NULL when any level of the path does not exist.
 */
static struct procfs_list_entry *
find_tid_entry(int osnum, int pid, int tid)
{
	struct procfs_list_entry *pid_dir;
	struct procfs_list_entry *task_dir;
	char name[12];

	pid_dir = find_pid_entry(osnum, pid);
	if (!pid_dir)
		return NULL;

	task_dir = find_procfs_entry(pid_dir, "task");
	if (!task_dir)
		return NULL;

	sprintf(name, "%d", tid);
	return find_procfs_entry(task_dir, name);
}
/*
 * Find the top-level "mcos<osnum>" directory node, creating it (root
 * owned, mode 0555) when it does not exist yet.
 *
 * Returns the node, or NULL when it had to be created and creation failed.
 */
static struct procfs_list_entry *
get_base_entry(int osnum)
{
	struct procfs_list_entry *e;
	/* "mcos" + up to 11 characters of a formatted int + NUL */
	char name[16];
	kuid_t uid = KUIDT_INIT(0);
	kgid_t gid = KGIDT_INIT(0);

	snprintf(name, sizeof(name), "mcos%d", osnum);
	e = find_procfs_entry(NULL, name);
	if(!e){
		e = add_procfs_entry(NULL, name, S_IFDIR | 0555,
				uid, gid, NULL);
		/* add_procfs_entry() returns NULL on failure. */
		if(e)
			e->osnum = osnum;
	}
	return e;
}
/*
 * Find the "<pid>" directory node below "mcos<osnum>", creating it (root
 * owned, mode 0555) when it does not exist yet.
 *
 * Returns the node, or NULL when the "mcos<osnum>" directory does not
 * exist or the node could not be created.
 */
static struct procfs_list_entry *
get_pid_entry(int osnum, int pid)
{
	struct procfs_list_entry *parent;
	struct procfs_list_entry *e;
	/* sized for "mcos" + up to 11 characters of a formatted int + NUL */
	char name[16];
	kuid_t uid = KUIDT_INIT(0);
	kgid_t gid = KGIDT_INIT(0);

	snprintf(name, sizeof(name), "mcos%d", osnum);
	if(!(parent = find_procfs_entry(NULL, name)))
		return NULL;
	snprintf(name, sizeof(name), "%d", pid);
	e = find_procfs_entry(parent, name);
	if(!e)
		e = add_procfs_entry(parent, name, S_IFDIR | 0555,
				uid, gid, NULL);
	return e;
}
/*
 * Find the "mcos<osnum>/<pid>/task/<tid>" directory node, creating the
 * final "<tid>" level (root owned, mode 0555) when it does not exist yet.
 *
 * Returns the node, or NULL when one of the upper levels does not exist or
 * the node could not be created.
 */
static struct procfs_list_entry *
get_tid_entry(int osnum, int pid, int tid)
{
	struct procfs_list_entry *parent;
	struct procfs_list_entry *e;
	/* sized for "mcos" + up to 11 characters of a formatted int + NUL */
	char name[16];
	kuid_t uid = KUIDT_INIT(0);
	kgid_t gid = KGIDT_INIT(0);

	snprintf(name, sizeof(name), "mcos%d", osnum);
	if(!(parent = find_procfs_entry(NULL, name)))
		return NULL;
	snprintf(name, sizeof(name), "%d", pid);
	if(!(parent = find_procfs_entry(parent, name)))
		return NULL;
	if(!(parent = find_procfs_entry(parent, "task")))
		return NULL;
	snprintf(name, sizeof(name), "%d", tid);
	e = find_procfs_entry(parent, name);
	if(!e)
		e = add_procfs_entry(parent, name, S_IFDIR | 0555,
				uid, gid, NULL);
	return e;
}
/*
 * Populate the "task/<tid>" directory of a process.
 *
 * Creates the directory when needed, adds the tid_entry_stuff files owned
 * by cred, and, when the process directory already has an "exe" node,
 * adds an "exe" symlink with the same target to the thread directory.
 * Does nothing when the thread directory cannot be obtained.
 */
static void
_add_tid_entry(int osnum, int pid, int tid, const struct cred *cred)
{
	struct procfs_list_entry *tid_dir = get_tid_entry(osnum, pid, tid);
	struct procfs_list_entry *pid_dir;
	struct procfs_list_entry *pid_exe;

	if (!tid_dir)
		return;

	add_procfs_entries(tid_dir, tid_entry_stuff, cred->uid, cred->gid);

	/* <tid> sits in "task", which sits in the process directory. */
	pid_dir = tid_dir->parent->parent;
	pid_exe = find_procfs_entry(pid_dir, "exe");
	if (!pid_exe)
		return;

	add_procfs_entry(tid_dir, "exe", S_IFLNK | 0777,
			cred->uid, cred->gid, pid_exe->data);
}
void
add_tid_entry(int osnum, int pid, int tid)
{
unsigned long irqflag;
const struct cred *cred = get_pid_cred(pid);
if(!cred)
return;
irqflag = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
_add_tid_entry(osnum, pid, tid, cred);
ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflag);
}
/*
 * Create the procfs nodes of a process: its "<pid>" directory with the
 * pid_entry_stuff files, and the "task/<pid>" directory of its main thread.
 *
 * Nothing is done when the credentials of pid cannot be found, or when the
 * "<pid>" directory cannot be obtained (for instance because the
 * "mcos<osnum>" directory does not exist).
 */
void
add_pid_entry(int osnum, int pid)
{
	struct procfs_list_entry *parent;
	unsigned long irqflag;
	const struct cred *cred = get_pid_cred(pid);

	if(!cred)
		return;
	irqflag = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
	parent = get_pid_entry(osnum, pid);
	/*
	 * A NULL parent must not reach add_procfs_entries(): it would be
	 * taken as "top level" and the files would be created directly in
	 * the procfs root.
	 */
	if(parent){
		add_procfs_entries(parent, pid_entry_stuff,
				cred->uid, cred->gid);
		_add_tid_entry(osnum, pid, pid, cred);
	}
	ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflag);
}
void
delete_tid_entry(int osnum, int pid, int tid)
{
unsigned long irqflag;
struct procfs_list_entry *e;
irqflag = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
e = find_tid_entry(osnum, pid, tid);
if(e)
delete_procfs_entries(e);
ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflag);
}
void
delete_pid_entry(int osnum, int pid)
{
unsigned long irqflag;
struct procfs_list_entry *e;
irqflag = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
e = find_pid_entry(osnum, pid);
if(e)
delete_procfs_entries(e);
ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflag);
}
void
proc_exe_link(int osnum, int pid, const char *path)
{
struct procfs_list_entry *parent;
unsigned long irqflag;
kuid_t uid = KUIDT_INIT(0);
kgid_t gid = KGIDT_INIT(0);
irqflag = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
parent = find_pid_entry(osnum, pid);
if(parent){
struct procfs_list_entry *task;
struct procfs_list_entry *e;
e = add_procfs_entry(parent, "exe", S_IFLNK | 0777, uid, gid,
path);
e->data = kmalloc(strlen(path) + 1, GFP_KERNEL);
strcpy(e->data, path);
task = find_procfs_entry(parent, "task");
list_for_each_entry(parent, &task->children, list) {
add_procfs_entry(parent, "exe", S_IFLNK | 0777,
uid, gid, path);
}
}
ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflag);
}
/**
* \brief Initialization for procfs
*
* \param osnum os number
*/
/*
 * Creates the "mcos<osnum>" directory when needed and adds the
 * base_entry_stuff nodes to it, all owned by root.  Nothing is added when
 * the directory cannot be obtained.
 */
void
procfs_init(int osnum)
{
	struct procfs_list_entry *parent;
	unsigned long irqflag;
	kuid_t uid = KUIDT_INIT(0);
	kgid_t gid = KGIDT_INIT(0);

	irqflag = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
	parent = get_base_entry(osnum);
	/*
	 * A NULL parent would make add_procfs_entries() create the nodes
	 * directly in the procfs root.
	 */
	if(parent)
		add_procfs_entries(parent, base_entry_stuff, uid, gid);
	ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflag);
}
/**
* \brief Finalization for procfs
*
* \param osnum os number
*/
void
procfs_exit(int osnum)
{
unsigned long irqflag;
struct procfs_list_entry *e;
irqflag = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
e = find_base_entry(osnum);
if(e)
delete_procfs_entries(e);
ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflag);
}
/**
 * \brief Forward one read or write on a McKernel procfs file to McKernel.
 *
 * A physically contiguous bounce buffer and a struct procfs_read request
 * are allocated, the request is sent with SCD_MSG_PROCFS_REQUEST, and the
 * caller sleeps on procfsq until the request's status field becomes
 * non-zero (at most one second).
 *
 * \param file      procfs file being accessed
 * \param buf       user buffer (destination for read, source for write)
 * \param nbytes    number of bytes requested
 * \param ppos      file position; advanced by the number of bytes handled
 * \param readwrite 0 for read, 1 for write (stored in the request)
 *
 * \return number of bytes handled (never more than nbytes), 0 when nbytes
 *         is 0 or the position is negative, -ENOMEM on allocation failure,
 *         -EFAULT when the user buffer cannot be accessed, -EIO on timeout,
 *         -ERESTARTSYS when interrupted by a signal, or the negative
 *         result of the send / of the request.
 *
 * NOTE(review): on timeout or signal the buffers are freed although the
 * request has been sent -- confirm the peer cannot still write to them.
 */
static ssize_t
mckernel_procfs_forward(struct file *file, char __user *buf, size_t nbytes,
		loff_t *ppos, int readwrite)
{
	const char *who = readwrite ?
		"mckernel_procfs_write" : "mckernel_procfs_read";
	struct inode * inode = file->f_path.dentry->d_inode;
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
	struct proc_dir_entry *dp = PDE(inode);
	struct procfs_list_entry *e = dp->data;
#else
	struct procfs_list_entry *e = PDE_DATA(inode);
#endif
	volatile struct procfs_read *r = NULL;
	struct ikc_scd_packet isp;
	char pathbuf[PROCFS_NAME_MAX];
	char *kern_buffer;
	char *path;
	unsigned long count = nbytes;
	loff_t offset = *ppos;
	int order = 0;
	long waited;
	int ret;

	/* Size the path by the buffer that actually receives it. */
	path = getpath(e, pathbuf, sizeof(pathbuf));
	dprintk("%s: invoked for %s, offset: %lld, count: %lu\n",
		who, path, (long long)offset, count);

	if (count == 0 || offset < 0) {
		return 0;
	}

	/*
	 * Smallest power of two covering count, converted to a page order
	 * assuming 4KiB (2^12) pages; at least two pages are allocated.
	 * The shift is done in unsigned long so it cannot overflow an int.
	 */
	while (order < BITS_PER_LONG - 1 && (1UL << order) < count)
		++order;
	order = (order > 12) ? order - 12 : 1;

	/* NOTE: we need physically contiguous memory to pass through IKC */
	kern_buffer = (char *)__get_free_pages(GFP_KERNEL, order);
	if (!kern_buffer) {
		printk("%s(): ERROR: allocating kernel buffer\n", who);
		return -ENOMEM;
	}
	if (readwrite && copy_from_user(kern_buffer, buf, count)) {
		ret = -EFAULT;
		goto out;
	}

	r = kmalloc(sizeof(struct procfs_read), GFP_KERNEL);
	if (r == NULL) {
		ret = -ENOMEM;
		goto out;
	}
	dprintk("offset: %llx, count: %lu, osnum: %d\n",
		(long long)offset, count, e->osnum);

	r->pbuf = virt_to_phys(kern_buffer);
	r->eof = 0;
	r->ret = -EIO; /* default */
	r->status = 0;
	r->offset = offset;
	r->count = count;
	r->readwrite = readwrite;
	strncpy((char *)r->fname, path, PROCFS_NAME_MAX);

	/* Do not send uninitialised stack bytes in the unused fields. */
	memset(&isp, 0, sizeof(isp));
	isp.msg = SCD_MSG_PROCFS_REQUEST;
	isp.ref = 0;
	isp.arg = virt_to_phys((void *)r);
	ret = mcctrl_ikc_send(osnum_to_os(e->osnum), 0, &isp);
	if (ret < 0) {
		goto out; /* error */
	}

	/* Wait for a reply. */
	ret = -EIO; /* default exit code */
	dprintk("now wait for a reply\n");

	/* Wait for the status field of the procfs_read structure set ready. */
	waited = wait_event_interruptible_timeout(procfsq, r->status != 0, HZ);
	if (waited == 0) {
		kprintf("ERROR: %s: timeout (1 sec).\n", who);
		goto out;
	}
	if (waited < 0) {
		/* Interrupted by a signal before any answer arrived. */
		ret = waited;
		goto out;
	}

	/* Woken up: take one snapshot of the result and validate it. */
	ret = r->ret;
	dprintk("%s: woke up. ret: %d, eof: %d\n", who, ret, r->eof);
	if (ret > 0) {
		/* Never trust the peer with more than was asked for. */
		if ((unsigned long)ret > count)
			ret = count;
		if (!readwrite && copy_to_user(buf, kern_buffer, ret)) {
			kprintf("ERROR: %s: copy_to_user failed.\n", who);
			ret = -EFAULT;
			goto out;
		}
		*ppos += ret;
	}

out:
	free_pages((uintptr_t)kern_buffer, order);
	kfree((void *)r);	/* kfree(NULL) is a no-op */

	return ret;
}

/**
 * \brief The read callback for McKernel procfs files.
 *
 * Forwards the request to McKernel and copies the answer to user space.
 */
static ssize_t
mckernel_procfs_read(struct file *file, char __user *buf, size_t nbytes,
		loff_t *ppos)
{
	return mckernel_procfs_forward(file, buf, nbytes, ppos, 0);
}

/**
 * \brief The write callback for McKernel procfs files.
 *
 * Copies the user data into a kernel buffer and forwards it to McKernel.
 */
static ssize_t
mckernel_procfs_write(struct file *file, const char __user *buf, size_t nbytes,
		loff_t *ppos)
{
	/* The helper only reads from buf when readwrite is 1. */
	return mckernel_procfs_forward(file, (char __user *)buf, nbytes,
			ppos, 1);
}
/*
 * llseek handler for the forwarded McKernel procfs files.
 *
 * Only an absolute seek (orig == 0) and a seek relative to the current
 * position (orig == 1) are supported; every other origin is rejected with
 * -EINVAL.  On success the updated file position is returned.
 */
static loff_t
mckernel_procfs_lseek(struct file *file, loff_t offset, int orig)
{
	if (orig == 0) {
		/* absolute position */
		file->f_pos = offset;
	}
	else if (orig == 1) {
		/* relative to the current position */
		file->f_pos += offset;
	}
	else {
		return -EINVAL;
	}

	return file->f_pos;
}
/*
 * File operations for read-only forwarded procfs entries: seeking and
 * reading go through the mckernel_procfs_* handlers, no write handler is
 * installed.
 */
static const struct file_operations mckernel_forward_ro = {
.llseek = mckernel_procfs_lseek,
.read = mckernel_procfs_read,
.write = NULL,
};
/*
 * File operations for read/write forwarded procfs entries: seek, read and
 * write are all routed through the mckernel_procfs_* handlers.
 */
static const struct file_operations mckernel_forward = {
.llseek = mckernel_procfs_lseek,
.read = mckernel_procfs_read,
.write = mckernel_procfs_write,
};
/*
 * Table of procfs entries, terminated by PROC_TERM.  Only "mem" and "stat"
 * are enabled; the commented-out lines are entries that are currently
 * disabled.
 * NOTE(review): judging by the name this is the per-thread (tid) entry set
 * -- confirm against the code that consumes the table.
 */
static const struct procfs_entry tid_entry_stuff[] = {
// PROC_REG("auxv", S_IRUSR, NULL),
// PROC_REG("clear_refs", S_IWUSR, NULL),
// PROC_REG("cmdline", S_IRUGO, NULL),
// PROC_REG("comm", S_IRUGO|S_IWUSR, NULL),
// PROC_REG("environ", S_IRUSR, NULL),
// PROC_LNK("exe", mckernel_readlink),
// PROC_REG("limits", S_IRUSR|S_IWUSR, NULL),
// PROC_REG("maps", S_IRUGO, NULL),
PROC_REG("mem", S_IRUSR|S_IWUSR, NULL),
// PROC_REG("pagemap", S_IRUGO, NULL),
// PROC_REG("smaps", S_IRUGO, NULL),
PROC_REG("stat", S_IRUGO, NULL),
// PROC_REG("statm", S_IRUGO, NULL),
// PROC_REG("status", S_IRUGO, NULL),
// PROC_REG("syscall", S_IRUGO, NULL),
// PROC_REG("wchan", S_IRUGO, NULL),
PROC_TERM
};
/*
 * Table of procfs entries, terminated by PROC_TERM.  Besides regular files
 * it contains the "task" directory; the commented-out lines are entries
 * that are currently disabled.
 * NOTE(review): judging by the name this is the per-process (pid) entry set
 * -- confirm against the code that consumes the table.
 * NOTE(review): "cgroup" and "cpuset" are registered with S_IXUSR only,
 * which is unusual for a regular file -- confirm this is intentional.
 */
static const struct procfs_entry pid_entry_stuff[] = {
PROC_REG("auxv", S_IRUSR, NULL),
PROC_REG("cgroup", S_IXUSR, NULL),
// PROC_REG("clear_refs", S_IWUSR, NULL),
PROC_REG("cmdline", S_IRUGO, NULL),
// PROC_REG("comm", S_IRUGO|S_IWUSR, NULL),
// PROC_REG("coredump_filter", S_IRUGO|S_IWUSR, NULL),
PROC_REG("cpuset", S_IXUSR, NULL),
// PROC_REG("environ", S_IRUSR, NULL),
// PROC_LNK("exe", mckernel_readlink),
// PROC_REG("limits", S_IRUSR|S_IWUSR, NULL),
PROC_REG("maps", S_IRUGO, NULL),
PROC_REG("mem", S_IRUSR|S_IWUSR, NULL),
PROC_REG("pagemap", S_IRUGO, NULL),
PROC_REG("smaps", S_IRUGO, NULL),
// PROC_REG("stat", S_IRUGO, NULL),
// PROC_REG("statm", S_IRUGO, NULL),
PROC_REG("status", S_IRUGO, NULL),
// PROC_REG("syscall", S_IRUGO, NULL),
PROC_DIR("task", S_IRUGO|S_IXUGO),
// PROC_REG("wchan", S_IRUGO, NULL),
PROC_TERM
};
/*
 * Table of procfs entries, terminated by PROC_TERM.  Only "stat" is
 * enabled; the commented-out lines are entries that are currently disabled.
 * NOTE(review): judging by the name this is the top-level (non per-process)
 * entry set -- confirm against the code that consumes the table.
 */
static const struct procfs_entry base_entry_stuff[] = {
// PROC_REG("cmdline", S_IRUGO, NULL),
// PROC_REG("cpuinfo", S_IRUGO, NULL),
// PROC_REG("meminfo", S_IRUGO, NULL),
// PROC_REG("pagetypeinfo",S_IRUGO, NULL),
// PROC_REG("softirq", S_IRUGO, NULL),
PROC_REG("stat", S_IRUGO, NULL),
// PROC_REG("uptime", S_IRUGO, NULL),
// PROC_REG("version", S_IRUGO, NULL),
// PROC_REG("vmallocinfo",S_IRUSR, NULL),
// PROC_REG("vmstat", S_IRUGO, NULL),
// PROC_REG("zoneinfo", S_IRUGO, NULL),
PROC_TERM
};

View File

@ -44,6 +44,7 @@
#include <asm/uaccess.h> #include <asm/uaccess.h>
#include <asm/delay.h> #include <asm/delay.h>
#include <asm/io.h> #include <asm/io.h>
#include "../../config.h"
#include "mcctrl.h" #include "mcctrl.h"
#include <linux/version.h> #include <linux/version.h>
@ -57,6 +58,17 @@
#define dprintk(...) #define dprintk(...)
#endif #endif
#ifdef MCCTRL_KSYM_zap_page_range
static void
(*mcctrl_zap_page_range)(struct vm_area_struct *vma, unsigned long start,
unsigned long size, struct zap_details *details)
#if MCCTRL_KSYM_zap_page_range
= (void *)MCCTRL_KSYM_zap_page_range;
#else
= &zap_page_range;
#endif
#endif
static long pager_call(ihk_os_t os, struct syscall_request *req); static long pager_call(ihk_os_t os, struct syscall_request *req);
#ifdef SC_DEBUG #ifdef SC_DEBUG
@ -83,6 +95,13 @@ int init_peer_channel_registry(struct mcctrl_usrdata *ud)
return 0; return 0;
} }
void destroy_peer_channel_registry(struct mcctrl_usrdata *ud)
{
kfree(ud->keys);
ud->keys = NULL;
return;
}
int register_peer_channel(struct mcctrl_usrdata *ud, void *key, struct mcctrl_channel *ch) int register_peer_channel(struct mcctrl_usrdata *ud, void *key, struct mcctrl_channel *ch)
{ {
int cpu; int cpu;
@ -257,14 +276,14 @@ retry_alloc:
} }
/* Prepare per-process wait queue head */ /* Prepare per-process wait queue head */
wqhln->pid = current->tgid; wqhln->pid = task_tgid_vnr(current);
wqhln->req = 0; wqhln->req = 0;
init_waitqueue_head(&wqhln->wq_syscall); init_waitqueue_head(&wqhln->wq_syscall);
irqflags = ihk_ikc_spinlock_lock(&channel->wq_list_lock); irqflags = ihk_ikc_spinlock_lock(&channel->wq_list_lock);
/* First see if there is a wait queue already */ /* First see if there is a wait queue already */
list_for_each_entry(wqhln_iter, &channel->wq_list, list) { list_for_each_entry(wqhln_iter, &channel->wq_list, list) {
if (wqhln_iter->pid == current->tgid) { if (wqhln_iter->pid == task_tgid_vnr(current)) {
kfree(wqhln); kfree(wqhln);
wqhln = wqhln_iter; wqhln = wqhln_iter;
list_del(&wqhln->list); list_del(&wqhln->list);
@ -463,7 +482,7 @@ static int rus_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
flags = ihk_ikc_spinlock_lock(&usrdata->per_proc_list_lock); flags = ihk_ikc_spinlock_lock(&usrdata->per_proc_list_lock);
list_for_each_entry(ppd_iter, &usrdata->per_proc_list, list) { list_for_each_entry(ppd_iter, &usrdata->per_proc_list, list) {
if (ppd_iter->pid == current->tgid) { if (ppd_iter->pid == task_tgid_vnr(current)) {
ppd = ppd_iter; ppd = ppd_iter;
break; break;
} }
@ -471,7 +490,7 @@ static int rus_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
ihk_ikc_spinlock_unlock(&usrdata->per_proc_list_lock, flags); ihk_ikc_spinlock_unlock(&usrdata->per_proc_list_lock, flags);
if (!ppd) { if (!ppd) {
printk("ERROR: no per process data for pid %d\n", current->tgid); printk("ERROR: no per process data for pid %d\n", task_tgid_vnr(current));
return VM_FAULT_SIGBUS; return VM_FAULT_SIGBUS;
} }
@ -564,12 +583,10 @@ static struct file_operations rus_fops = {
.mmap = &rus_mmap, .mmap = &rus_mmap,
}; };
int reserve_user_space(struct mcctrl_usrdata *usrdata, unsigned long *startp, unsigned long *endp) unsigned long
reserve_user_space_common(struct mcctrl_usrdata *usrdata, unsigned long start, unsigned long end)
{ {
struct file *file; struct file *file;
struct vm_area_struct *vma;
unsigned long start;
unsigned long end;
struct cred *promoted; struct cred *promoted;
const struct cred *original; const struct cred *original;
@ -590,38 +607,22 @@ int reserve_user_space(struct mcctrl_usrdata *usrdata, unsigned long *startp, un
cap_raise(promoted->cap_effective, CAP_SYS_RAWIO); cap_raise(promoted->cap_effective, CAP_SYS_RAWIO);
original = override_creds(promoted); original = override_creds(promoted);
#define DESIRED_USER_END 0x800000000000
#define GAP_FOR_MCEXEC 0x008000000000UL
end = DESIRED_USER_END;
down_write(&current->mm->mmap_sem);
vma = find_vma(current->mm, 0);
if (vma) {
end = (vma->vm_start - GAP_FOR_MCEXEC) & ~(GAP_FOR_MCEXEC - 1);
}
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,5,0) #if LINUX_VERSION_CODE < KERNEL_VERSION(3,5,0)
start = do_mmap_pgoff(file, 0, end, start = vm_mmap_pgoff(file, start, end,
PROT_READ|PROT_WRITE, MAP_FIXED|MAP_SHARED, 0); PROT_READ|PROT_WRITE, MAP_FIXED|MAP_SHARED, 0);
#endif #else
start = vm_mmap(file, start, end,
up_write(&current->mm->mmap_sem);
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,5,0)
start = vm_mmap(file, 0, end,
PROT_READ|PROT_WRITE, MAP_FIXED|MAP_SHARED, 0); PROT_READ|PROT_WRITE, MAP_FIXED|MAP_SHARED, 0);
#endif #endif
revert_creds(original); revert_creds(original);
put_cred(promoted); put_cred(promoted);
fput(file); fput(file);
if (IS_ERR_VALUE(start)) { if (IS_ERR_VALUE(start)) {
printk("mcctrl:user space reservation failed.\n"); printk("mcctrl:user space reservation failed.\n");
return start;
} }
*startp = start; return start;
*endp = end;
return 0;
} }
//unsigned long last_thread_exec = 0; //unsigned long last_thread_exec = 0;
@ -901,7 +902,10 @@ static int pager_req_create(ihk_os_t os, int fd, uintptr_t result_pa)
dprintk("pager_req_create(%d,%lx):vfs_stat failed. %d\n", fd, (long)result_pa, error); dprintk("pager_req_create(%d,%lx):vfs_stat failed. %d\n", fd, (long)result_pa, error);
goto out; goto out;
} }
if (!S_ISREG(st.mode)) { if (S_ISCHR(st.mode) && (MAJOR(st.rdev) == 1)) {
/* treat memory devices as regular files */
}
else if (!S_ISREG(st.mode)) {
error = -ESRCH; error = -ESRCH;
dprintk("pager_req_create(%d,%lx):not VREG. %x\n", fd, (long)result_pa, st.mode); dprintk("pager_req_create(%d,%lx):not VREG. %x\n", fd, (long)result_pa, st.mode);
goto out; goto out;
@ -1509,6 +1513,10 @@ static int clear_pte_range(uintptr_t start, uintptr_t len)
} }
if (addr < end) { if (addr < end) {
error = zap_vma_ptes(vma, addr, end-addr); error = zap_vma_ptes(vma, addr, end-addr);
if (error) {
mcctrl_zap_page_range(vma, addr, end-addr, NULL);
error = 0;
}
if (ret == 0) { if (ret == 0) {
ret = error; ret = error;
} }
@ -1645,7 +1653,7 @@ int __do_in_kernel_syscall(ihk_os_t os, struct mcctrl_channel *c, struct syscall
goto out; goto out;
} }
ppd->pid = current->tgid; ppd->pid = task_tgid_vnr(current);
ppd->rpgtable = sc->args[2]; ppd->rpgtable = sc->args[2];
flags = ihk_ikc_spinlock_lock(&usrdata->per_proc_list_lock); flags = ihk_ikc_spinlock_lock(&usrdata->per_proc_list_lock);
@ -1656,12 +1664,7 @@ int __do_in_kernel_syscall(ihk_os_t os, struct mcctrl_channel *c, struct syscall
ppd->pid, ppd->rpgtable); ppd->pid, ppd->rpgtable);
} }
error = clear_pte_range(sc->args[0], sc->args[1]); ret = clear_pte_range(sc->args[0], sc->args[1]);
if (error) {
error = -ENOSYS;
goto out;
}
ret = 0;
break; break;
case __NR_mprotect: case __NR_mprotect:
@ -1676,7 +1679,7 @@ int __do_in_kernel_syscall(ihk_os_t os, struct mcctrl_channel *c, struct syscall
flags = ihk_ikc_spinlock_lock(&usrdata->per_proc_list_lock); flags = ihk_ikc_spinlock_lock(&usrdata->per_proc_list_lock);
list_for_each_entry(ppd_iter, &usrdata->per_proc_list, list) { list_for_each_entry(ppd_iter, &usrdata->per_proc_list, list) {
if (ppd_iter->pid == current->tgid) { if (ppd_iter->pid == task_tgid_vnr(current)) {
ppd = ppd_iter; ppd = ppd_iter;
break; break;
} }
@ -1686,13 +1689,13 @@ int __do_in_kernel_syscall(ihk_os_t os, struct mcctrl_channel *c, struct syscall
list_del(&ppd->list); list_del(&ppd->list);
dprintk("pid: %d, tid: %d: rpgtable for %d (0x%lx) removed\n", dprintk("pid: %d, tid: %d: rpgtable for %d (0x%lx) removed\n",
current->tgid, current->pid, ppd->pid, ppd->rpgtable); task_tgid_vnr(current), current->pid, ppd->pid, ppd->rpgtable);
kfree(ppd); kfree(ppd);
} }
else { else {
printk("WARNING: no per process data for pid %d ?\n", printk("WARNING: no per process data for pid %d ?\n",
current->tgid); task_tgid_vnr(current));
} }
ihk_ikc_spinlock_unlock(&usrdata->per_proc_list_lock, flags); ihk_ikc_spinlock_unlock(&usrdata->per_proc_list_lock, flags);

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,73 @@
/**
* \file sysfs.h
* License details are found in the file LICENSE.
* \brief
* sysfs framework API definitions
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com> \par
* Copyright (C) 2016 RIKEN AICS
*/
/*
* HISTORY:
*/
#ifndef MCCTRL_SYSFS_H
#define MCCTRL_SYSFS_H
/* NOTE(review): presumably the maximum length of a sysfs path handled by this framework -- confirm. */
#define SYSFS_PATH_MAX 1024
/* NOTE(review): presumably a bit for the 'flags' argument of sysfsm_unlinkf() -- confirm. */
#define SYSFS_UNLINK_KEEP_ANCESTOR 0x01
/*
 * Callback table attached to a sysfs file created with sysfsm_createf().
 * Every callback receives the ops table itself and the 'instance' pointer.
 * NOTE(review): presumably 'show' fills 'buf' on read, 'store' consumes
 * 'buf' on write and 'release' runs when the file goes away -- confirm
 * against the implementation.
 */
struct sysfsm_ops {
ssize_t (*show)(struct sysfsm_ops *ops, void *instance, void *buf,
size_t bufsize);
ssize_t (*store)(struct sysfsm_ops *ops, void *instance,
const void *buf, size_t bufsize);
void (*release)(struct sysfsm_ops *ops, void *instance);
};
/*
 * Opaque handle to a sysfs object, wrapped in a struct so it is passed by
 * type rather than as a bare long.  Produced by sysfsm_mkdirf() and
 * sysfsm_lookupf(), consumed by sysfsm_symlinkf().
 */
struct sysfs_handle {
long handle;
};
typedef struct sysfs_handle sysfs_handle_t;
/*
 * Describes a bitmap by its number of bits and a pointer to its storage.
 * NOTE(review): presumably used as the 'instance' for the pb/pbl snooping
 * ops -- confirm against the implementation.
 */
struct sysfsm_bitmap_param {
int nbits;
int padding; /* explicit filler between 'nbits' and the pointer */
void *ptr;
};
#define SYSFS_SPECIAL_OPS_MIN ((void *)1)
#define SYSFS_SPECIAL_OPS_MAX ((void *)1000)
#define SYSFS_SNOOPING_OPS_d32 ((void *)1)
#define SYSFS_SNOOPING_OPS_d64 ((void *)2)
#define SYSFS_SNOOPING_OPS_u32 ((void *)3)
#define SYSFS_SNOOPING_OPS_u64 ((void *)4)
#define SYSFS_SNOOPING_OPS_s ((void *)5)
#define SYSFS_SNOOPING_OPS_pbl ((void *)6)
#define SYSFS_SNOOPING_OPS_pb ((void *)7)
#define SYSFS_SNOOPING_OPS_u32K ((void *)8)

/*
 * Tell whether 'ops' is one of the reserved small-integer "special" values
 * rather than a pointer to a real ops table.
 *
 * Returns 1 when the pointer value lies in the inclusive range
 * [SYSFS_SPECIAL_OPS_MIN, SYSFS_SPECIAL_OPS_MAX], 0 otherwise.
 */
static inline int is_special_sysfs_ops(void *ops)
{
	const long value = (long)ops;
	const long lowest = (long)SYSFS_SPECIAL_OPS_MIN;
	const long highest = (long)SYSFS_SPECIAL_OPS_MAX;

	if (value < lowest)
		return 0;

	return value <= highest;
}
/*
 * sysfs framework entry points.  The *f() functions take a printf-style
 * format string plus arguments.
 * NOTE(review): presumably the formatted string is the sysfs path of the
 * object being operated on, and the int return is 0 / negative errno --
 * confirm against the implementation.
 */
extern int sysfsm_createf(ihk_os_t os, struct sysfsm_ops *ops, void *instance,
int mode, const char *fmt, ...);
extern int sysfsm_mkdirf(ihk_os_t os, sysfs_handle_t *dirhp,
const char *fmt, ...);
extern int sysfsm_symlinkf(ihk_os_t os, sysfs_handle_t targeth,
const char *fmt, ...);
extern int sysfsm_lookupf(ihk_os_t os, sysfs_handle_t *objhp,
const char *fmt, ...);
extern int sysfsm_unlinkf(ihk_os_t os, int flags, const char *fmt, ...);
extern void sysfsm_cleanup(ihk_os_t os);
extern void sysfsm_packet_handler(void *os, int msg, int err, long arg1,
long arg2);
#endif /* MCCTRL_SYSFS_H */

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,88 @@
/**
* \file sysfs_msg.h
* License details are found in the file LICENSE.
* \brief
* message declarations for sysfs framework
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com> \par
* Copyright (C) 2015 RIKEN AICS
*/
/*
* HISTORY:
*/
#ifndef MCKERNEL_SYSFS_MSG_H
#define MCKERNEL_SYSFS_MSG_H
/* Size of the 'path' buffer embedded in the request parameter blocks below. */
#define SYSFS_PATH_MAX 1024
/*
 * Parameter block for the sysfs "create" request.
 * NOTE(review): presumably 'error' carries the result back to the requester
 * and 'busy' is a completion flag -- confirm against the message handlers.
 */
struct sysfs_req_create_param {
int mode;
int error;
long client_ops;
long client_instance;
char path[SYSFS_PATH_MAX];
int padding;
int busy;
}; /* struct sysfs_req_create_param */
/*
 * Reserved small-integer values used in place of a real ops pointer; the
 * SNOOPING_OPS values all lie within [SPECIAL_OPS_MIN, SPECIAL_OPS_MAX].
 * NOTE(review): presumably the suffix names the data format of the snooped
 * value (d32/d64/u32/u64/string/bitmap...) -- confirm.
 */
#define SYSFS_SPECIAL_OPS_MIN ((void *)1)
#define SYSFS_SPECIAL_OPS_MAX ((void *)1000)
#define SYSFS_SNOOPING_OPS_d32 ((void *)1)
#define SYSFS_SNOOPING_OPS_d64 ((void *)2)
#define SYSFS_SNOOPING_OPS_u32 ((void *)3)
#define SYSFS_SNOOPING_OPS_u64 ((void *)4)
#define SYSFS_SNOOPING_OPS_s ((void *)5)
#define SYSFS_SNOOPING_OPS_pbl ((void *)6)
#define SYSFS_SNOOPING_OPS_pb ((void *)7)
#define SYSFS_SNOOPING_OPS_u32K ((void *)8)
/*
 * Parameter block for the sysfs "mkdir" request.
 * NOTE(review): presumably 'handle' returns the handle of the created
 * directory and 'busy' is a completion flag -- confirm.
 */
struct sysfs_req_mkdir_param {
int error;
int padding;
long handle;
char path[SYSFS_PATH_MAX];
int padding2;
int busy;
}; /* struct sysfs_req_mkdir_param */
/*
 * Parameter block for the sysfs "symlink" request.
 * NOTE(review): presumably 'target' is the handle of the object the link
 * at 'path' should point to -- confirm.
 */
struct sysfs_req_symlink_param {
int error;
int padding;
long target;
char path[SYSFS_PATH_MAX];
int padding2;
int busy;
}; /* struct sysfs_req_symlink_param */
/*
 * Parameter block for the sysfs "lookup" request.
 * NOTE(review): presumably 'handle' returns the handle of the object found
 * at 'path' -- confirm.
 */
struct sysfs_req_lookup_param {
int error;
int padding;
long handle;
char path[SYSFS_PATH_MAX];
int padding2;
int busy;
}; /* struct sysfs_req_lookup_param */
/* for sysfs_req_unlink_param.flags */
#define SYSFS_UNLINK_KEEP_ANCESTOR 0x01
/*
 * Parameter block for the sysfs "unlink" request; 'flags' takes the
 * SYSFS_UNLINK_* bits defined above.
 */
struct sysfs_req_unlink_param {
int flags;
int error;
char path[SYSFS_PATH_MAX];
int padding;
int busy;
}; /* struct sysfs_req_unlink_param */
/*
 * Parameter block for the sysfs "setup" request.  It carries no path; a
 * SYSFS_PATH_MAX-sized filler sits where the other requests hold 'path'.
 * NOTE(review): presumably 'buf_rpa'/'bufsize' describe a shared buffer by
 * physical address and size -- confirm.
 */
struct sysfs_req_setup_param {
int error;
int padding;
long buf_rpa;
long bufsize;
char padding3[SYSFS_PATH_MAX];
int padding2;
int busy;
}; /* struct sysfs_req_setup_param */
#endif /* MCKERNEL_SYSFS_MSG_H */

View File

@ -0,0 +1,39 @@
KDIR ?= @KDIR@
ARCH ?= @ARCH@
KMODDIR=@KMODDIR@
src = @abs_srcdir@
ENABLE_MCOVERLAYFS=@ENABLE_MCOVERLAYFS@

# Version of the *running* kernel (uname -r), split into numeric components
# and folded into the same encoding as the kernel's LINUX_VERSION_CODE.
RELEASE=$(shell uname -r)
MAJOR=$(shell echo ${RELEASE} | sed -e 's/^\([0-9]*\).*/\1/')
MINOR=$(shell echo ${RELEASE} | sed -e 's/^[0-9]*.\([0-9]*\).*/\1/')
PATCH=$(shell echo ${RELEASE} | sed -e 's/^[0-9]*.[0-9]*.\([0-9]*\).*/\1/')
LINUX_VERSION_CODE=$(shell expr \( ${MAJOR} \* 65536 \) + \( ${MINOR} \* 256 \) + ${PATCH})

# Distribution release number following "x.y.z-", or empty when the release
# string has no such suffix (sed leaves the string unchanged in that case).
# These use ':=' because the second assignment reads the value produced by the
# first; with '=' the variable would reference itself and make would abort
# with "Recursive variable references itself" as soon as it is expanded.
# The comparison uses '=' rather than '==' so it also works when /bin/sh is
# not bash.
RHEL_RELEASE:=$(shell echo ${RELEASE} | sed -e 's/^[0-9]*.[0-9]*.[0-9]*-\([0-9]*\).*/\1/')
RHEL_RELEASE:=$(shell if [ "${RELEASE}" = "${RHEL_RELEASE}" ]; then echo ""; else echo ${RHEL_RELEASE}; fi)

# The module is built only when enabled at configure time AND the kernel is
# 4.0.x: 262144 == 4*65536 (4.0.0), 262400 == 4*65536 + 1*256 (4.1.0).
ifeq ($(ENABLE_MCOVERLAYFS),yes)
ENABLE_BUILD=$(shell if ( [ ${LINUX_VERSION_CODE} -ge 262144 ] && [ ${LINUX_VERSION_CODE} -lt 262400 ] ); then echo "yes"; else echo "no"; fi)
else
ENABLE_BUILD=no
endif

obj-m += mcoverlay.o

mcoverlay-y := copy_up.o dir.o inode.o readdir.o super.o

.PHONY: clean install modules

# 'modules' and 'install' are no-ops when ENABLE_BUILD is not "yes".
modules:
ifeq ($(ENABLE_BUILD),yes)
	$(MAKE) -C $(KDIR) M=$(PWD) SUBDIRS=$(PWD) ARCH=$(ARCH) modules
endif

clean:
	$(RM) .*.cmd *.mod.c *.o *.ko* Module.symvers modules.order -r .tmp*

install:
ifeq ($(ENABLE_BUILD),yes)
	mkdir -p -m 755 $(KMODDIR)
	install -m 644 mcoverlay.ko $(KMODDIR)
endif

View File

@ -0,0 +1,416 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/splice.h>
#include <linux/xattr.h>
#include <linux/security.h>
#include <linux/uaccess.h>
#include <linux/sched.h>
#include <linux/namei.h>
#include "overlayfs.h"
#define OVL_COPY_UP_CHUNK_SIZE (1 << 20)
/*
 * Copy every extended attribute of @old onto @new.
 *
 * Returns 0 on success -- including when either inode lacks a getxattr
 * operation or listing xattrs on @old reports -EOPNOTSUPP -- and a negative
 * errno when listing, reading or setting an attribute fails.
 */
int ovl_copy_xattr(struct dentry *old, struct dentry *new)
{
	ssize_t list_size, size;
	char *buf, *name, *value;
	int error;

	if (!old->d_inode->i_op->getxattr ||
	    !new->d_inode->i_op->getxattr)
		return 0;

	/* A NULL buffer only queries the size needed for the name list. */
	list_size = vfs_listxattr(old, NULL, 0);
	if (list_size <= 0) {
		if (list_size == -EOPNOTSUPP)
			return 0;
		return list_size;
	}

	buf = kzalloc(list_size, GFP_KERNEL);
	if (!buf)
		return -ENOMEM;

	error = -ENOMEM;
	value = kmalloc(XATTR_SIZE_MAX, GFP_KERNEL);
	if (!value)
		goto out;

	list_size = vfs_listxattr(old, buf, list_size);
	if (list_size <= 0) {
		error = list_size;
		goto out_free_value;
	}

	/* @buf is a sequence of NUL-terminated attribute names. */
	for (name = buf; name < (buf + list_size); name += strlen(name) + 1) {
		size = vfs_getxattr(old, name, value, XATTR_SIZE_MAX);
		/*
		 * Only a negative result is an error.  An attribute may
		 * legitimately have a zero-length value; treating 0 as a
		 * failure would stop the loop, report success and silently
		 * drop this and all following attributes.
		 */
		if (size < 0) {
			error = size;
			goto out_free_value;
		}
		error = vfs_setxattr(new, name, value, size, 0);
		if (error)
			goto out_free_value;
	}

out_free_value:
	kfree(value);
out:
	kfree(buf);
	return error;
}
/*
 * Copy @len bytes of file data from the file at @old to the file at @new,
 * in chunks of at most OVL_COPY_UP_CHUNK_SIZE bytes.
 *
 * Returns 0 on success, -EINTR when a fatal signal is pending, or the
 * non-positive result of do_splice_direct() / the error from opening either
 * file.
 */
static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
{
	struct file *src;
	struct file *dst;
	loff_t src_pos = 0;
	loff_t dst_pos = 0;
	int error = 0;

	if (!len)
		return 0;

	src = ovl_path_open(old, O_RDONLY);
	if (IS_ERR(src))
		return PTR_ERR(src);

	dst = ovl_path_open(new, O_WRONLY);
	if (IS_ERR(dst)) {
		error = PTR_ERR(dst);
		goto out_put_src;
	}

	/* FIXME: copy up sparse files efficiently */
	while (len != 0) {
		size_t chunk = OVL_COPY_UP_CHUNK_SIZE;
		long copied;

		/* the last chunk may be shorter than the chunk size */
		if (len < chunk)
			chunk = len;

		if (signal_pending_state(TASK_KILLABLE, current)) {
			error = -EINTR;
			break;
		}

		copied = do_splice_direct(src, &src_pos,
					  dst, &dst_pos,
					  chunk, SPLICE_F_MOVE);
		if (copied <= 0) {
			error = copied;
			break;
		}
		WARN_ON(src_pos != dst_pos);

		len -= copied;
	}

	fput(dst);
out_put_src:
	fput(src);
	return error;
}
/*
 * Read the target of the symlink @realdentry into a freshly allocated page.
 *
 * Returns the NUL-terminated target (to be released with free_page()) or an
 * ERR_PTR: -EINVAL when the inode has no readlink operation, -ENOMEM when
 * the page cannot be allocated, or the error returned by readlink.
 */
static char *ovl_read_symlink(struct dentry *realdentry)
{
	struct inode *inode = realdentry->d_inode;
	mm_segment_t saved_fs;
	char *target;
	int len;

	if (!inode->i_op->readlink)
		return ERR_PTR(-EINVAL);

	target = (char *) __get_free_page(GFP_KERNEL);
	if (!target)
		return ERR_PTR(-ENOMEM);

	saved_fs = get_fs();
	set_fs(get_ds());
	/* The cast to a user pointer is valid due to the set_fs() */
	len = inode->i_op->readlink(realdentry,
				    (char __user *)target, PAGE_SIZE - 1);
	set_fs(saved_fs);

	if (len < 0) {
		free_page((unsigned long) target);
		return ERR_PTR(len);
	}

	target[len] = '\0';
	return target;
}
static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
{
struct iattr attr = {
.ia_valid =
ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET,
.ia_atime = stat->atime,
.ia_mtime = stat->mtime,
};
return notify_change(upperdentry, &attr, NULL);
}
int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
{
int err = 0;
if (!S_ISLNK(stat->mode)) {
struct iattr attr = {
.ia_valid = ATTR_MODE,
.ia_mode = stat->mode,
};
err = notify_change(upperdentry, &attr, NULL);
}
if (!err) {
struct iattr attr = {
.ia_valid = ATTR_UID | ATTR_GID,
.ia_uid = stat->uid,
.ia_gid = stat->gid,
};
err = notify_change(upperdentry, &attr, NULL);
}
if (!err)
ovl_set_timestamps(upperdentry, stat);
return err;
}
/*
 * Copy @dentry up while the caller holds the workdir/upperdir rename lock.
 *
 * The new object is created under a temporary name in @workdir, filled with
 * data (regular files), xattrs and attributes taken from @lowerpath/@stat,
 * optionally modified by @attr, and finally renamed into @upperdir.
 * @link is the symlink target passed through to ovl_create_real().
 *
 * Returns 0 on success or a negative errno; on failure after creation the
 * temporary object is removed again.
 */
static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
			      struct dentry *dentry, struct path *lowerpath,
			      struct kstat *stat, struct iattr *attr,
			      const char *link)
{
	struct inode *wdir = workdir->d_inode;
	struct inode *udir = upperdir->d_inode;
	struct dentry *newdentry = NULL;
	struct dentry *upper = NULL;
	umode_t mode = stat->mode;
	int err;

	newdentry = ovl_lookup_temp(workdir, dentry);
	err = PTR_ERR(newdentry);
	if (IS_ERR(newdentry))
		goto out;

	upper = lookup_one_len(dentry->d_name.name, upperdir,
			       dentry->d_name.len);
	err = PTR_ERR(upper);
	if (IS_ERR(upper))
		goto out1;

	/* Can't properly set mode on creation because of the umask */
	stat->mode &= S_IFMT;
	err = ovl_create_real(wdir, newdentry, stat, link, NULL, true);
	stat->mode = mode;
	if (err)
		goto out2;

	if (S_ISREG(stat->mode)) {
		struct path upperpath;

		ovl_path_upper(dentry, &upperpath);
		BUG_ON(upperpath.dentry != NULL);
		upperpath.dentry = newdentry;

		err = ovl_copy_up_data(lowerpath, &upperpath, stat->size);
		if (err)
			goto out_cleanup;
	}

	err = ovl_copy_xattr(lowerpath->dentry, newdentry);
	if (err)
		goto out_cleanup;

	mutex_lock(&newdentry->d_inode->i_mutex);
	err = ovl_set_attr(newdentry, stat);
	if (!err && attr)
		err = notify_change(newdentry, attr, NULL);
	mutex_unlock(&newdentry->d_inode->i_mutex);
	if (err)
		goto out_cleanup;

	err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
	if (err)
		goto out_cleanup;

	ovl_dentry_update(dentry, newdentry);
	/* the reference now belongs to the overlay dentry; skip the dput below */
	newdentry = NULL;

	/*
	 * Non-directories become opaque when copied up.
	 */
	if (!S_ISDIR(stat->mode))
		ovl_dentry_set_opaque(dentry, true);
out2:
	dput(upper);
out1:
	dput(newdentry);
out:
	return err;

out_cleanup:
	ovl_cleanup(wdir, newdentry);
	/*
	 * ovl_cleanup() does not consume the dentry, so both 'upper' and
	 * 'newdentry' still have to be released; jumping straight to 'out'
	 * would leak those references.
	 */
	goto out2;
}
/*
* Copy up a single dentry
*
* Directory renames only allowed on "pure upper" (already created on
* upper filesystem, never copied up). Directories which are on lower or
* are merged may not be renamed. For these -EXDEV is returned and
* userspace has to deal with it. This means, when copying up a
* directory we can rely on it and ancestors being stable.
*
* Non-directory renames start with copy up of source if necessary. The
* actual rename will only proceed once the copy up was successful. Copy
* up uses upper parent i_mutex for exclusion. Since rename can change
* d_parent it is possible that the copy up will lock the old parent. At
* that point the file will have already been copied up anyway.
*
* @parent is the dentry whose upper path receives the copy, @lowerpath and
* @stat describe the source object, and @attr (may be NULL) is an extra
* attribute change applied to the copied-up object.
*
* Returns 0 on success or a negative errno (-EROFS without a workdir,
* -ENOMEM when credentials cannot be prepared, -EIO when the directories
* cannot be locked, or an error from one of the steps).
*/
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
struct path *lowerpath, struct kstat *stat,
struct iattr *attr)
{
struct dentry *workdir = ovl_workdir(dentry);
int err;
struct kstat pstat;
struct path parentpath;
struct dentry *upperdir;
struct dentry *upperdentry;
const struct cred *old_cred;
struct cred *override_cred;
char *link = NULL;
if (WARN_ON(!workdir))
return -EROFS;
ovl_path_upper(parent, &parentpath);
upperdir = parentpath.dentry;
/* remember the parent's attributes so its timestamps can be restored later */
err = vfs_getattr(&parentpath, &pstat);
if (err)
return err;
if (S_ISLNK(stat->mode)) {
/* link is a page owned by this function; freed at out_free_link */
link = ovl_read_symlink(lowerpath->dentry);
if (IS_ERR(link))
return PTR_ERR(link);
}
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_free_link;
/* create the new object with the owner of the source object */
override_cred->fsuid = stat->uid;
override_cred->fsgid = stat->gid;
/*
* CAP_SYS_ADMIN for copying up extended attributes
* CAP_DAC_OVERRIDE for create
* CAP_FOWNER for chmod, timestamp update
* CAP_FSETID for chmod
* CAP_CHOWN for chown
* CAP_MKNOD for mknod
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
cap_raise(override_cred->cap_effective, CAP_FSETID);
cap_raise(override_cred->cap_effective, CAP_CHOWN);
cap_raise(override_cred->cap_effective, CAP_MKNOD);
old_cred = override_creds(override_cred);
err = -EIO;
/* a non-NULL result is treated as failure; out_unlock still drops the locks */
if (lock_rename(workdir, upperdir) != NULL) {
pr_err("overlayfs: failed to lock workdir+upperdir\n");
goto out_unlock;
}
upperdentry = ovl_dentry_upper(dentry);
if (upperdentry) {
/* already has an upper dentry: unlock here and bypass out_unlock */
unlock_rename(workdir, upperdir);
err = 0;
/* Raced with another copy-up? Do the setattr here */
if (attr) {
mutex_lock(&upperdentry->d_inode->i_mutex);
err = notify_change(upperdentry, attr, NULL);
mutex_unlock(&upperdentry->d_inode->i_mutex);
}
goto out_put_cred;
}
err = ovl_copy_up_locked(workdir, upperdir, dentry, lowerpath,
stat, attr, link);
if (!err) {
/* Restore timestamps on parent (best effort) */
ovl_set_timestamps(upperdir, &pstat);
}
out_unlock:
unlock_rename(workdir, upperdir);
out_put_cred:
revert_creds(old_cred);
put_cred(override_cred);
out_free_link:
if (link)
free_page((unsigned long) link);
return err;
}
/*
 * Copy @dentry and any not-yet-copied ancestors up to the upper layer.
 *
 * Each pass copies up the topmost ancestor that is not yet of "upper" type,
 * and the loop repeats until @dentry itself is upper or an error occurs.
 * Returns 0 on success or the first negative errno encountered.
 */
int ovl_copy_up(struct dentry *dentry)
{
	int err = 0;

	while (err == 0) {
		struct dentry *next;
		struct dentry *parent;
		struct path lowerpath;
		struct kstat stat;
		enum ovl_path_type type = ovl_path_type(dentry);

		if (OVL_TYPE_UPPER(type))
			break;

		/* find the topmost dentry not yet copied up */
		next = dget(dentry);
		parent = dget_parent(next);
		type = ovl_path_type(parent);
		while (!OVL_TYPE_UPPER(type)) {
			dput(next);
			next = parent;
			parent = dget_parent(next);
			type = ovl_path_type(parent);
		}

		ovl_path_lower(next, &lowerpath);
		err = vfs_getattr(&lowerpath, &stat);
		if (err == 0)
			err = ovl_copy_up_one(parent, next, &lowerpath, &stat, NULL);

		dput(parent);
		dput(next);
	}

	return err;
}

View File

@ -0,0 +1,951 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/xattr.h>
#include <linux/security.h>
#include <linux/cred.h>
#include "overlayfs.h"
/*
 * Remove @wdentry from directory @wdir, using rmdir for directories and
 * unlink for everything else.  A failure is only logged.  The dentry
 * reference taken here is dropped again, so the caller's reference is left
 * untouched.
 */
void ovl_cleanup(struct inode *wdir, struct dentry *wdentry)
{
	int err;

	dget(wdentry);
	err = d_is_dir(wdentry) ? ovl_do_rmdir(wdir, wdentry)
				: ovl_do_unlink(wdir, wdentry);
	dput(wdentry);

	if (!err)
		return;

	pr_err("overlayfs: cleanup of '%pd2' failed (%i)\n",
	       wdentry, err);
}
/*
 * Look up a temporary name in @workdir; the name is "#" followed by the
 * address of @dentry in hex.
 *
 * Returns the looked-up dentry (or the ERR_PTR from the lookup).  When an
 * object with that name already exists the dentry is released and
 * ERR_PTR(-EIO) is returned instead.
 */
struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry)
{
	char name[20];
	struct dentry *temp;

	snprintf(name, sizeof(name), "#%lx", (unsigned long) dentry);

	temp = lookup_one_len(name, workdir, strlen(name));
	if (IS_ERR(temp) || !temp->d_inode)
		return temp;

	pr_err("overlayfs: workdir/%s already exists\n", name);
	dput(temp);
	return ERR_PTR(-EIO);
}
/*
 * Create a whiteout under a temporary name in @workdir.
 * Returns the whiteout dentry or an ERR_PTR.
 *
 * caller holds i_mutex on workdir
 */
static struct dentry *ovl_whiteout(struct dentry *workdir,
				   struct dentry *dentry)
{
	struct dentry *whiteout = ovl_lookup_temp(workdir, dentry);
	int err;

	if (IS_ERR(whiteout))
		return whiteout;

	err = ovl_do_whiteout(workdir->d_inode, whiteout);
	if (!err)
		return whiteout;

	dput(whiteout);
	return ERR_PTR(err);
}
/*
 * Create the object described by @stat (or a hard link to @hardlink when
 * that is non-NULL) as @newdentry in directory @dir.  @link is the target
 * used for symlinks; @debug is handed through to the ovl_do_*() helpers.
 *
 * Returns 0 on success, -ESTALE when @newdentry is already positive,
 * -EPERM for an unknown file type, -ENOENT when creation reported success
 * but left the dentry negative, or the error from the ovl_do_*() helper.
 */
int ovl_create_real(struct inode *dir, struct dentry *newdentry,
		    struct kstat *stat, const char *link,
		    struct dentry *hardlink, bool debug)
{
	int err;

	if (newdentry->d_inode)
		return -ESTALE;

	if (hardlink) {
		err = ovl_do_link(hardlink, dir, newdentry, debug);
	} else {
		const umode_t fmt = stat->mode & S_IFMT;

		if (fmt == S_IFREG)
			err = ovl_do_create(dir, newdentry, stat->mode, debug);
		else if (fmt == S_IFDIR)
			err = ovl_do_mkdir(dir, newdentry, stat->mode, debug);
		else if (fmt == S_IFCHR || fmt == S_IFBLK ||
			 fmt == S_IFIFO || fmt == S_IFSOCK)
			err = ovl_do_mknod(dir, newdentry,
					   stat->mode, stat->rdev, debug);
		else if (fmt == S_IFLNK)
			err = ovl_do_symlink(dir, newdentry, link, debug);
		else
			err = -EPERM;
	}

	/*
	 * Not quite sure if non-instantiated dentry is legal or not.
	 * VFS doesn't seem to care so check and warn here.
	 */
	if (!err && WARN_ON(!newdentry->d_inode))
		err = -ENOENT;

	return err;
}
/*
 * Mark @upperdentry opaque by setting the OVL_XATTR_OPAQUE xattr to "y".
 * Returns the result of ovl_do_setxattr().
 */
static int ovl_set_opaque(struct dentry *upperdentry)
{
	int err;

	err = ovl_do_setxattr(upperdentry, OVL_XATTR_OPAQUE, "y", 1, 0);
	return err;
}
/*
 * Remove the OVL_XATTR_OPAQUE xattr from @upperdentry.  A failure is only
 * logged as a warning.
 */
static void ovl_remove_opaque(struct dentry *upperdentry)
{
	int err = ovl_do_removexattr(upperdentry, OVL_XATTR_OPAQUE);

	if (!err)
		return;

	pr_warn("overlayfs: failed to remove opaque from '%s' (%i)\n",
		upperdentry->d_name.name, err);
}
static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
int err;
enum ovl_path_type type;
struct path realpath;
type = ovl_path_real(dentry, &realpath);
err = vfs_getattr(&realpath, stat);
if (err)
return err;
stat->dev = dentry->d_sb->s_dev;
stat->ino = dentry->d_inode->i_ino;
/*
* It's probably not worth it to count subdirs to get the
* correct link count. nlink=1 seems to pacify 'find' and
* other utilities.
*/
if (OVL_TYPE_MERGE(type))
stat->nlink = 1;
return 0;
}
/*
 * Create a new object directly in the upper directory of @dentry's parent
 * and attach @inode to @dentry.  The upper directory's i_mutex is held for
 * the duration.
 *
 * Returns 0 on success or a negative errno from the lookup or creation.
 */
static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
			    struct kstat *stat, const char *link,
			    struct dentry *hardlink)
{
	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
	struct inode *udir = upperdir->d_inode;
	struct dentry *newdentry;
	int err;

	mutex_lock_nested(&udir->i_mutex, I_MUTEX_PARENT);

	newdentry = lookup_one_len(dentry->d_name.name, upperdir,
				   dentry->d_name.len);
	if (IS_ERR(newdentry)) {
		err = PTR_ERR(newdentry);
		goto out_unlock;
	}

	err = ovl_create_real(udir, newdentry, stat, link, hardlink, false);
	if (err) {
		dput(newdentry);
		goto out_unlock;
	}

	/* success: newdentry is handed to the overlay dentry, no dput here */
	ovl_dentry_version_inc(dentry->d_parent);
	ovl_dentry_update(dentry, newdentry);
	ovl_copyattr(newdentry->d_inode, inode);
	d_instantiate(dentry, inode);

out_unlock:
	mutex_unlock(&udir->i_mutex);
	return err;
}
/*
 * Take the rename lock on @workdir and @upperdir.
 *
 * Returns 0 with both directories locked, or -EIO (with nothing left
 * locked) when the two are the same dentry or lock_rename() returns
 * non-NULL.
 */
static int ovl_lock_rename_workdir(struct dentry *workdir,
				   struct dentry *upperdir)
{
	/* Workdir should not be the same as upperdir */
	if (workdir != upperdir) {
		/* Workdir should not be subdir of upperdir and vice versa */
		if (lock_rename(workdir, upperdir) == NULL)
			return 0;

		unlock_rename(workdir, upperdir);
	}

	pr_err("overlayfs: failed to lock workdir+upperdir\n");
	return -EIO;
}
/*
 * Replace the upper directory of @dentry with a fresh opaque directory.
 *
 * A new directory carrying the same xattrs and attributes is created under
 * a temporary name in the workdir, marked opaque and exchanged with the
 * existing upper directory; the whiteouts on @list are then cleaned from
 * the old directory and the old directory itself is removed.
 *
 * Returns the dentry of the new opaque directory, or an ERR_PTR (-EROFS
 * without a workdir, -ESTALE when the upper object is not a directory or
 * no longer lives in the expected parent, or an error from one of the
 * steps).
 */
static struct dentry *ovl_clear_empty(struct dentry *dentry,
				      struct list_head *list)
{
	struct dentry *workdir = ovl_workdir(dentry);
	struct inode *wdir;
	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
	struct inode *udir = upperdir->d_inode;
	struct path upperpath;
	struct dentry *upper;
	struct dentry *opaquedir;
	struct kstat stat;
	int err;

	if (WARN_ON(!workdir))
		return ERR_PTR(-EROFS);

	/* Only dereference workdir once it is known to be non-NULL. */
	wdir = workdir->d_inode;

	err = ovl_lock_rename_workdir(workdir, upperdir);
	if (err)
		goto out;

	ovl_path_upper(dentry, &upperpath);
	err = vfs_getattr(&upperpath, &stat);
	if (err)
		goto out_unlock;

	err = -ESTALE;
	if (!S_ISDIR(stat.mode))
		goto out_unlock;
	upper = upperpath.dentry;
	if (upper->d_parent->d_inode != udir)
		goto out_unlock;

	opaquedir = ovl_lookup_temp(workdir, dentry);
	err = PTR_ERR(opaquedir);
	if (IS_ERR(opaquedir))
		goto out_unlock;

	err = ovl_create_real(wdir, opaquedir, &stat, NULL, NULL, true);
	if (err)
		goto out_dput;

	err = ovl_copy_xattr(upper, opaquedir);
	if (err)
		goto out_cleanup;

	err = ovl_set_opaque(opaquedir);
	if (err)
		goto out_cleanup;

	mutex_lock(&opaquedir->d_inode->i_mutex);
	err = ovl_set_attr(opaquedir, &stat);
	mutex_unlock(&opaquedir->d_inode->i_mutex);
	if (err)
		goto out_cleanup;

	err = ovl_do_rename(wdir, opaquedir, udir, upper, RENAME_EXCHANGE);
	if (err)
		goto out_cleanup;

	/* after the exchange 'upper' names the old directory inside workdir */
	ovl_cleanup_whiteouts(upper, list);
	ovl_cleanup(wdir, upper);
	unlock_rename(workdir, upperdir);

	/* dentry's upper doesn't match now, get rid of it */
	d_drop(dentry);

	return opaquedir;

out_cleanup:
	ovl_cleanup(wdir, opaquedir);
out_dput:
	dput(opaquedir);
out_unlock:
	unlock_rename(workdir, upperdir);
out:
	return ERR_PTR(err);
}
/*
 * Check that the directory @dentry is empty and, when it has an upper
 * dentry, replace that with a cleared opaque directory.
 *
 * Returns an ERR_PTR when the emptiness check fails, the result of
 * ovl_clear_empty() when there is an upper dentry, and NULL otherwise.
 */
static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry)
{
	LIST_HEAD(list);
	struct dentry *ret = NULL;
	int err = ovl_check_empty_dir(dentry, &list);

	if (err) {
		ret = ERR_PTR(err);
	} else if (ovl_dentry_upper(dentry)) {
		/*
		 * If no upperdentry then skip clearing whiteouts.
		 *
		 * Can race with copy-up, since we don't hold the upperdir
		 * mutex.  Doesn't matter, since copy-up can't create a
		 * non-empty directory from an empty one.
		 */
		ret = ovl_clear_empty(dentry, &list);
	}

	ovl_cache_free(&list);
	return ret;
}
/*
 * Create a new object at a name currently occupied in the upper directory
 * (used by the caller when @dentry is opaque).
 *
 * The object is first created under a temporary name in the workdir and
 * then renamed over the existing upper entry.  Directories are marked
 * opaque and swapped in with RENAME_EXCHANGE, after which the displaced
 * entry is removed from the workdir.  On success @inode is attached to
 * @dentry.
 *
 * Returns 0 on success, -EROFS without a workdir, or a negative errno from
 * one of the steps.
 */
static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
				    struct kstat *stat, const char *link,
				    struct dentry *hardlink)
{
	struct dentry *workdir = ovl_workdir(dentry);
	struct inode *wdir;
	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
	struct inode *udir = upperdir->d_inode;
	struct dentry *upper;
	struct dentry *newdentry;
	int err;

	if (WARN_ON(!workdir))
		return -EROFS;

	/* Only dereference workdir once it is known to be non-NULL. */
	wdir = workdir->d_inode;

	err = ovl_lock_rename_workdir(workdir, upperdir);
	if (err)
		goto out;

	newdentry = ovl_lookup_temp(workdir, dentry);
	err = PTR_ERR(newdentry);
	if (IS_ERR(newdentry))
		goto out_unlock;

	upper = lookup_one_len(dentry->d_name.name, upperdir,
			       dentry->d_name.len);
	err = PTR_ERR(upper);
	if (IS_ERR(upper))
		goto out_dput;

	err = ovl_create_real(wdir, newdentry, stat, link, hardlink, true);
	if (err)
		goto out_dput2;

	if (S_ISDIR(stat->mode)) {
		err = ovl_set_opaque(newdentry);
		if (err)
			goto out_cleanup;

		err = ovl_do_rename(wdir, newdentry, udir, upper,
				    RENAME_EXCHANGE);
		if (err)
			goto out_cleanup;

		/* the exchange moved the old entry into workdir; remove it */
		ovl_cleanup(wdir, upper);
	} else {
		err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
		if (err)
			goto out_cleanup;
	}
	ovl_dentry_version_inc(dentry->d_parent);
	ovl_dentry_update(dentry, newdentry);
	ovl_copyattr(newdentry->d_inode, inode);
	d_instantiate(dentry, inode);
	/* the reference now belongs to the overlay dentry; skip the dput below */
	newdentry = NULL;
out_dput2:
	dput(upper);
out_dput:
	dput(newdentry);
out_unlock:
	unlock_rename(workdir, upperdir);
out:
	return err;

out_cleanup:
	ovl_cleanup(wdir, newdentry);
	goto out_dput2;
}
/*
 * Allocate an overlay inode for @dentry, copy up the parent directory and
 * create the object in the upper layer.
 *
 * @mode and @rdev seed the kstat handed to the create helpers; @link and
 * @hardlink are passed through unchanged (ovl_link() supplies the upper
 * dentry of the link source as @hardlink, ovl_create_object() passes NULL).
 *
 * A non-opaque dentry is created with ovl_create_upper().  An opaque one
 * goes through ovl_create_over_whiteout() under temporarily raised
 * capabilities.
 *
 * Returns 0 on success or a negative errno (-ENOMEM if the inode or the
 * credentials cannot be allocated).
 */
static int ovl_create_or_link(struct dentry *dentry, int mode, dev_t rdev,
const char *link, struct dentry *hardlink)
{
int err;
struct inode *inode;
struct kstat stat = {
.mode = mode,
.rdev = rdev,
};
err = -ENOMEM;
inode = ovl_new_inode(dentry->d_sb, mode, dentry->d_fsdata);
if (!inode)
goto out;
err = ovl_copy_up(dentry->d_parent);
if (err)
goto out_iput;
if (!ovl_dentry_is_opaque(dentry)) {
err = ovl_create_upper(dentry, inode, &stat, link, hardlink);
} else {
const struct cred *old_cred;
struct cred *override_cred;
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_iput;
/*
 * CAP_SYS_ADMIN for setting opaque xattr
 * CAP_DAC_OVERRIDE for create in workdir, rename
 * CAP_FOWNER for removing whiteout from sticky dir
 */
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
old_cred = override_creds(override_cred);
err = ovl_create_over_whiteout(dentry, inode, &stat, link,
hardlink);
/* Restore the caller's credentials before dropping our copy. */
revert_creds(old_cred);
put_cred(override_cred);
}
/* On success clear the local pointer so iput() below is handed NULL. */
if (!err)
inode = NULL;
out_iput:
iput(inode);
out:
return err;
}
/*
 * Common entry point for create/mkdir/mknod/symlink: take write access via
 * ovl_want_write(), create the object with ovl_create_or_link() (no hard
 * link source), then drop write access again.
 *
 * Returns 0 on success or the negative errno from either step.
 */
static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
			     const char *link)
{
	int rc = ovl_want_write(dentry);

	if (rc)
		return rc;

	rc = ovl_create_or_link(dentry, mode, rdev, link, NULL);
	ovl_drop_write(dentry);

	return rc;
}
/*
 * .create handler: create a regular file.  Only the low 07777 bits of @mode
 * are kept and S_IFREG is forced; @dir and @excl are unused.
 */
static int ovl_create(struct inode *dir, struct dentry *dentry, umode_t mode,
		      bool excl)
{
	int file_mode = (mode & 07777) | S_IFREG;

	return ovl_create_object(dentry, file_mode, 0, NULL);
}
/*
 * .mkdir handler: create a directory.  Only the low 07777 bits of @mode are
 * kept and S_IFDIR is forced; @dir is unused.
 */
static int ovl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
	int dir_mode = (mode & 07777) | S_IFDIR;

	return ovl_create_object(dentry, dir_mode, 0, NULL);
}
/*
 * .mknod handler: create a special file with the given @mode and @rdev.
 *
 * Returns -EPERM for a character device with device number WHITEOUT_DEV,
 * so that a "whiteout" cannot be created through the overlay itself.
 */
static int ovl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
		     dev_t rdev)
{
	bool looks_like_whiteout = S_ISCHR(mode) && rdev == WHITEOUT_DEV;

	if (looks_like_whiteout)
		return -EPERM;

	return ovl_create_object(dentry, mode, rdev, NULL);
}
/*
 * .symlink handler: create a symbolic link whose target is @link.  The mode
 * passed down is exactly S_IFLNK; @dir is unused.
 */
static int ovl_symlink(struct inode *dir, struct dentry *dentry,
		       const char *link)
{
	const int symlink_mode = S_IFLNK;

	return ovl_create_object(dentry, symlink_mode, 0, link);
}
/*
 * .link handler: copy up @old, then create @new as a hard link to the upper
 * dentry of @old, using that upper inode's mode.  @newdir is unused.
 *
 * Returns 0 on success or a negative errno.  Write access obtained with
 * ovl_want_write() is dropped on every path after it was acquired.
 */
static int ovl_link(struct dentry *old, struct inode *newdir,
		    struct dentry *new)
{
	struct dentry *upper;
	int rc;

	rc = ovl_want_write(old);
	if (rc)
		return rc;

	rc = ovl_copy_up(old);
	if (!rc) {
		upper = ovl_dentry_upper(old);
		rc = ovl_create_or_link(new, upper->d_inode->i_mode, 0, NULL,
					upper);
	}

	ovl_drop_write(old);
	return rc;
}
/*
 * Remove @dentry by putting a whiteout in its place in the upper parent
 * directory.
 *
 * @dentry: overlay dentry being removed
 * @is_dir: true when called for rmdir, false for unlink
 *
 * For directories emptiness is checked first.  A merge-or-lower directory is
 * additionally run through ovl_check_empty_and_clear(), which may return a
 * replacement upper dentry ("opaquedir") that is then used in place of the
 * dentry's own upper.
 *
 * The whiteout is created in the work directory and renamed into the upper
 * parent.  When the dentry has an upper directory, the rename is an exchange
 * and the displaced directory is removed from the work directory afterwards.
 *
 * Returns 0 on success, -EROFS if there is no work directory, -ESTALE if the
 * upper dentry is no longer a child of the upper parent, or another negative
 * errno from the helpers.
 */
static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir)
{
	struct dentry *workdir = ovl_workdir(dentry);
	struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
	struct inode *wdir;
	struct inode *udir;
	struct dentry *whiteout;
	struct dentry *upper;
	struct dentry *opaquedir = NULL;
	int err;

	/*
	 * Check workdir before touching workdir->d_inode; dereferencing it
	 * in the declarations above would defeat this NULL check.
	 */
	if (WARN_ON(!workdir))
		return -EROFS;

	wdir = workdir->d_inode;
	udir = upperdir->d_inode;

	if (is_dir) {
		if (OVL_TYPE_MERGE_OR_LOWER(ovl_path_type(dentry))) {
			opaquedir = ovl_check_empty_and_clear(dentry);
			err = PTR_ERR(opaquedir);
			if (IS_ERR(opaquedir))
				goto out;
		} else {
			LIST_HEAD(list);

			/*
			 * When removing an empty opaque directory, then it
			 * makes no sense to replace it with an exact replica
			 * of itself.  But emptiness still needs to be checked.
			 */
			err = ovl_check_empty_dir(dentry, &list);
			ovl_cache_free(&list);
			if (err)
				goto out;
		}
	}

	err = ovl_lock_rename_workdir(workdir, upperdir);
	if (err)
		goto out_dput;

	whiteout = ovl_whiteout(workdir, dentry);
	err = PTR_ERR(whiteout);
	if (IS_ERR(whiteout))
		goto out_unlock;

	upper = ovl_dentry_upper(dentry);
	if (!upper) {
		/* Nothing in the upper layer yet: rename onto a fresh lookup. */
		upper = lookup_one_len(dentry->d_name.name, upperdir,
				       dentry->d_name.len);
		err = PTR_ERR(upper);
		if (IS_ERR(upper))
			goto kill_whiteout;

		err = ovl_do_rename(wdir, whiteout, udir, upper, 0);
		dput(upper);
		if (err)
			goto kill_whiteout;
	} else {
		int flags = 0;

		if (opaquedir)
			upper = opaquedir;
		err = -ESTALE;
		if (upper->d_parent != upperdir)
			goto kill_whiteout;

		if (is_dir)
			flags |= RENAME_EXCHANGE;

		err = ovl_do_rename(wdir, whiteout, udir, upper, flags);
		if (err)
			goto kill_whiteout;

		/* After the exchange the directory sits in the work dir. */
		if (is_dir)
			ovl_cleanup(wdir, upper);
	}
	ovl_dentry_version_inc(dentry->d_parent);
out_d_drop:
	d_drop(dentry);
	dput(whiteout);
out_unlock:
	unlock_rename(workdir, upperdir);
out_dput:
	dput(opaquedir);
out:
	return err;

kill_whiteout:
	ovl_cleanup(wdir, whiteout);
	goto out_d_drop;
}
/*
 * Remove the upper dentry of @dentry directly from the upper parent
 * directory, using vfs_rmdir() when @is_dir is set and vfs_unlink()
 * otherwise.  The overlay dentry is unhashed in all cases.
 *
 * Returns the result of the VFS call, or -ESTALE if the upper dentry is no
 * longer a child of the upper parent.
 */
static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
{
	struct dentry *parent_upper = ovl_dentry_upper(dentry->d_parent);
	struct inode *parent_inode = parent_upper->d_inode;
	struct dentry *victim = ovl_dentry_upper(dentry);
	int rc = -ESTALE;

	mutex_lock_nested(&parent_inode->i_mutex, I_MUTEX_PARENT);

	if (victim->d_parent == parent_upper) {
		/* Hold a reference so d_delete() won't reset d_inode. */
		dget(victim);
		rc = is_dir ? vfs_rmdir(parent_inode, victim)
			    : vfs_unlink(parent_inode, victim, NULL);
		dput(victim);
		ovl_dentry_version_inc(dentry->d_parent);
	}

	/*
	 * A hashed dentry would require releasing upperpath/lowerpath, which
	 * is only possible for the sole user of the dentry.  That is too
	 * tricky, so simply unhash it for now.
	 */
	d_drop(dentry);

	mutex_unlock(&parent_inode->i_mutex);
	return rc;
}
/*
 * Apply check_sticky() to the real inodes behind @dentry and its parent.
 * Returns -EPERM when check_sticky() reports non-zero, otherwise 0.
 */
static inline int ovl_check_sticky(struct dentry *dentry)
{
	struct dentry *real_parent = ovl_dentry_real(dentry->d_parent);
	struct dentry *real_self = ovl_dentry_real(dentry);

	return check_sticky(real_parent->d_inode, real_self->d_inode) ?
		-EPERM : 0;
}
/*
 * Shared implementation of unlink (@is_dir == false) and rmdir
 * (@is_dir == true).
 *
 * After the sticky-bit check, write access is taken and the parent directory
 * is copied up.  A pure-upper dentry is removed directly with
 * ovl_remove_upper().  Anything else is replaced by a whiteout via
 * ovl_remove_and_whiteout(), run under temporarily raised capabilities.
 *
 * Returns 0 on success or a negative errno (-ENOMEM if the override
 * credentials cannot be allocated).
 */
static int ovl_do_remove(struct dentry *dentry, bool is_dir)
{
enum ovl_path_type type;
int err;
err = ovl_check_sticky(dentry);
if (err)
goto out;
err = ovl_want_write(dentry);
if (err)
goto out;
err = ovl_copy_up(dentry->d_parent);
if (err)
goto out_drop_write;
type = ovl_path_type(dentry);
if (OVL_TYPE_PURE_UPPER(type)) {
err = ovl_remove_upper(dentry, is_dir);
} else {
const struct cred *old_cred;
struct cred *override_cred;
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_drop_write;
/*
 * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
 * CAP_DAC_OVERRIDE for create in workdir, rename
 * CAP_FOWNER for removing whiteout from sticky dir
 * CAP_FSETID for chmod of opaque dir
 * CAP_CHOWN for chown of opaque dir
 */
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
cap_raise(override_cred->cap_effective, CAP_FSETID);
cap_raise(override_cred->cap_effective, CAP_CHOWN);
old_cred = override_creds(override_cred);
err = ovl_remove_and_whiteout(dentry, is_dir);
/* Restore the caller's credentials before dropping our copy. */
revert_creds(old_cred);
put_cred(override_cred);
}
out_drop_write:
ovl_drop_write(dentry);
out:
return err;
}
static int ovl_unlink(struct inode *dir, struct dentry *dentry)
{
return ovl_do_remove(dentry, false);
}
static int ovl_rmdir(struct inode *dir, struct dentry *dentry)
{
return ovl_do_remove(dentry, true);
}
/*
 * .rename2 handler for overlay directories.
 *
 * @olddir, @newdir: unused; the parents are taken from @old and @new
 * @old:   dentry being renamed
 * @new:   destination dentry (may be negative)
 * @flags: only RENAME_EXCHANGE and RENAME_NOREPLACE are accepted;
 *         RENAME_NOREPLACE is stripped before the flags are used further
 *
 * Outline:
 *  1. Validate flags, sticky bits and path types.  A merge-or-lower
 *     directory is never renamed (-EXDEV).  If source and destination share
 *     the same lower or the same upper inode the function returns 0
 *     without doing anything.
 *  2. Take write access and copy up @old, the parent of @new and, for an
 *     exchange, @new itself.
 *  3. If either side is not pure upper, raise capabilities for the rest of
 *     the operation.
 *  4. When overwriting a merge-or-lower directory, check that it is empty
 *     and obtain its cleared replacement.
 *  5. Choose additional rename flags (RENAME_WHITEOUT / RENAME_EXCHANGE),
 *     lock both upper parents, re-validate parentage and perform the rename.
 *  6. Fix up opaque markers and bump the directory versions.
 *
 * Returns 0 on success or a negative errno (-EINVAL, -EXDEV, -ENOMEM,
 * -ESTALE, or an error from a helper).
 */
static int ovl_rename2(struct inode *olddir, struct dentry *old,
struct inode *newdir, struct dentry *new,
unsigned int flags)
{
int err;
enum ovl_path_type old_type;
enum ovl_path_type new_type;
struct dentry *old_upperdir;
struct dentry *new_upperdir;
struct dentry *olddentry;
struct dentry *newdentry;
struct dentry *trap;
bool old_opaque;
bool new_opaque;
bool new_create = false;
bool cleanup_whiteout = false;
/* "overwrite" means a plain rename, i.e. not an exchange. */
bool overwrite = !(flags & RENAME_EXCHANGE);
bool is_dir = d_is_dir(old);
bool new_is_dir = false;
struct dentry *opaquedir = NULL;
const struct cred *old_cred = NULL;
struct cred *override_cred = NULL;
err = -EINVAL;
if (flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE))
goto out;
flags &= ~RENAME_NOREPLACE;
err = ovl_check_sticky(old);
if (err)
goto out;
/* Don't copy up directory trees */
old_type = ovl_path_type(old);
err = -EXDEV;
if (OVL_TYPE_MERGE_OR_LOWER(old_type) && is_dir)
goto out;
if (new->d_inode) {
err = ovl_check_sticky(new);
if (err)
goto out;
if (d_is_dir(new))
new_is_dir = true;
new_type = ovl_path_type(new);
err = -EXDEV;
if (!overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir)
goto out;
/* From here "goto out" reports success: same-inode renames are no-ops. */
err = 0;
if (!OVL_TYPE_UPPER(new_type) && !OVL_TYPE_UPPER(old_type)) {
if (ovl_dentry_lower(old)->d_inode ==
ovl_dentry_lower(new)->d_inode)
goto out;
}
if (OVL_TYPE_UPPER(new_type) && OVL_TYPE_UPPER(old_type)) {
if (ovl_dentry_upper(old)->d_inode ==
ovl_dentry_upper(new)->d_inode)
goto out;
}
} else {
/* Negative destination: synthesize a type from its opaque flag. */
if (ovl_dentry_is_opaque(new))
new_type = __OVL_PATH_UPPER;
else
new_type = __OVL_PATH_UPPER | __OVL_PATH_PURE;
}
err = ovl_want_write(old);
if (err)
goto out;
err = ovl_copy_up(old);
if (err)
goto out_drop_write;
err = ovl_copy_up(new->d_parent);
if (err)
goto out_drop_write;
if (!overwrite) {
err = ovl_copy_up(new);
if (err)
goto out_drop_write;
}
/* In this function "opaque" is shorthand for "not pure upper". */
old_opaque = !OVL_TYPE_PURE_UPPER(old_type);
new_opaque = !OVL_TYPE_PURE_UPPER(new_type);
if (old_opaque || new_opaque) {
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_drop_write;
/*
 * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
 * CAP_DAC_OVERRIDE for create in workdir
 * CAP_FOWNER for removing whiteout from sticky dir
 * CAP_FSETID for chmod of opaque dir
 * CAP_CHOWN for chown of opaque dir
 */
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
cap_raise(override_cred->cap_effective, CAP_FSETID);
cap_raise(override_cred->cap_effective, CAP_CHOWN);
old_cred = override_creds(override_cred);
}
if (overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir) {
opaquedir = ovl_check_empty_and_clear(new);
err = PTR_ERR(opaquedir);
if (IS_ERR(opaquedir)) {
/* Reset so the dput() at "out" is not handed an error pointer. */
opaquedir = NULL;
goto out_revert_creds;
}
}
if (overwrite) {
if (old_opaque) {
if (new->d_inode || !new_opaque) {
/* Whiteout source */
flags |= RENAME_WHITEOUT;
} else {
/* Switch whiteouts */
flags |= RENAME_EXCHANGE;
}
} else if (is_dir && !new->d_inode && new_opaque) {
flags |= RENAME_EXCHANGE;
cleanup_whiteout = true;
}
}
old_upperdir = ovl_dentry_upper(old->d_parent);
new_upperdir = ovl_dentry_upper(new->d_parent);
trap = lock_rename(new_upperdir, old_upperdir);
olddentry = ovl_dentry_upper(old);
newdentry = ovl_dentry_upper(new);
if (newdentry) {
if (opaquedir) {
/* Hand the opaquedir reference over to newdentry. */
newdentry = opaquedir;
opaquedir = NULL;
} else {
dget(newdentry);
}
} else {
new_create = true;
newdentry = lookup_one_len(new->d_name.name, new_upperdir,
new->d_name.len);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out_unlock;
}
/* Re-validate under the rename lock. */
err = -ESTALE;
if (olddentry->d_parent != old_upperdir)
goto out_dput;
if (newdentry->d_parent != new_upperdir)
goto out_dput;
if (olddentry == trap)
goto out_dput;
if (newdentry == trap)
goto out_dput;
if (is_dir && !old_opaque && new_opaque) {
err = ovl_set_opaque(olddentry);
if (err)
goto out_dput;
}
if (!overwrite && new_is_dir && old_opaque && !new_opaque) {
err = ovl_set_opaque(newdentry);
if (err)
goto out_dput;
}
if (old_opaque || new_opaque) {
err = ovl_do_rename(old_upperdir->d_inode, olddentry,
new_upperdir->d_inode, newdentry,
flags);
} else {
/* No debug for the plain case */
BUG_ON(flags & ~RENAME_EXCHANGE);
err = vfs_rename(old_upperdir->d_inode, olddentry,
new_upperdir->d_inode, newdentry,
NULL, flags);
}
if (err) {
/* Undo the opaque markers set just before the failed rename. */
if (is_dir && !old_opaque && new_opaque)
ovl_remove_opaque(olddentry);
if (!overwrite && new_is_dir && old_opaque && !new_opaque)
ovl_remove_opaque(newdentry);
goto out_dput;
}
if (is_dir && old_opaque && !new_opaque)
ovl_remove_opaque(olddentry);
if (!overwrite && new_is_dir && !old_opaque && new_opaque)
ovl_remove_opaque(newdentry);
if (old_opaque != new_opaque) {
ovl_dentry_set_opaque(old, new_opaque);
if (!overwrite)
ovl_dentry_set_opaque(new, old_opaque);
}
if (cleanup_whiteout)
ovl_cleanup(old_upperdir->d_inode, newdentry);
ovl_dentry_version_inc(old->d_parent);
ovl_dentry_version_inc(new->d_parent);
out_dput:
dput(newdentry);
out_unlock:
unlock_rename(new_upperdir, old_upperdir);
out_revert_creds:
if (old_opaque || new_opaque) {
revert_creds(old_cred);
put_cred(override_cred);
}
out_drop_write:
ovl_drop_write(old);
out:
dput(opaquedir);
return err;
}
/* Inode operations installed on overlay directory inodes. */
const struct inode_operations ovl_dir_inode_operations = {
	/* name lookup and entry creation */
	.lookup		= ovl_lookup,
	.create		= ovl_create,
	.mkdir		= ovl_mkdir,
	.mknod		= ovl_mknod,
	.symlink	= ovl_symlink,
	.link		= ovl_link,

	/* entry removal and rename */
	.unlink		= ovl_unlink,
	.rmdir		= ovl_rmdir,
	.rename2	= ovl_rename2,

	/* attributes and permission checks */
	.setattr	= ovl_setattr,
	.getattr	= ovl_dir_getattr,
	.permission	= ovl_permission,

	/* extended attributes */
	.setxattr	= ovl_setxattr,
	.getxattr	= ovl_getxattr,
	.listxattr	= ovl_listxattr,
	.removexattr	= ovl_removexattr,
};

View File

@ -0,0 +1,438 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/xattr.h>
#include "overlayfs.h"
/*
 * Copy up @dentry itself after making sure its parent is copied up.
 *
 * @attr is handed to ovl_copy_up_one().  When @no_data is true the size in
 * the lower stat is forced to 0 before the copy-up.
 *
 * Returns 0 on success or a negative errno.
 */
static int ovl_copy_up_last(struct dentry *dentry, struct iattr *attr,
			    bool no_data)
{
	struct dentry *parent = dget_parent(dentry);
	struct path lowerpath;
	struct kstat stat;
	int rc;

	rc = ovl_copy_up(parent);
	if (!rc) {
		ovl_path_lower(dentry, &lowerpath);
		rc = vfs_getattr(&lowerpath, &stat);
	}
	if (!rc) {
		if (no_data)
			stat.size = 0;
		rc = ovl_copy_up_one(parent, dentry, &lowerpath, &stat, attr);
	}

	dput(parent);
	return rc;
}
/*
 * .setattr handler.  If an upper dentry exists, the change is applied to it
 * with notify_change() under the upper inode's i_mutex.  Otherwise the
 * object is copied up with @attr applied as part of the copy-up.
 *
 * Returns 0 on success or a negative errno.
 */
int ovl_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct dentry *upper;
	int rc;

	rc = ovl_want_write(dentry);
	if (rc)
		return rc;

	upper = ovl_dentry_upper(dentry);
	if (!upper) {
		rc = ovl_copy_up_last(dentry, attr, false);
	} else {
		mutex_lock(&upper->d_inode->i_mutex);
		rc = notify_change(upper, attr, NULL);
		mutex_unlock(&upper->d_inode->i_mutex);
	}

	ovl_drop_write(dentry);
	return rc;
}
static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
struct path realpath;
ovl_path_real(dentry, &realpath);
return vfs_getattr(&realpath, stat);
}
/*
 * .permission handler: run the permission check against the real inode
 * behind the overlay @inode.
 *
 * For directories the ovl_entry comes from inode->i_private.  For other
 * inodes it is taken from any dentry alias, which cannot be done when
 * MAY_NOT_BLOCK is set, so -ECHILD is returned in that case.
 *
 * Returns the result of __inode_permission() on the real inode, -ENOENT if
 * no alias or real inode is found, or -EROFS for a write request when the
 * real inode is on a read-only upper layer (see the comment below).
 */
int ovl_permission(struct inode *inode, int mask)
{
struct ovl_entry *oe;
struct dentry *alias = NULL;
struct inode *realinode;
struct dentry *realdentry;
bool is_upper;
int err;
if (S_ISDIR(inode->i_mode)) {
oe = inode->i_private;
} else if (mask & MAY_NOT_BLOCK) {
return -ECHILD;
} else {
/*
 * For non-directories find an alias and get the info
 * from there.
 */
alias = d_find_any_alias(inode);
if (WARN_ON(!alias))
return -ENOENT;
oe = alias->d_fsdata;
}
realdentry = ovl_entry_real(oe, &is_upper);
/* Careful in RCU walk mode */
realinode = ACCESS_ONCE(realdentry->d_inode);
if (!realinode) {
/* A missing real inode is only expected when MAY_NOT_BLOCK is set. */
WARN_ON(!(mask & MAY_NOT_BLOCK));
err = -ENOENT;
goto out_dput;
}
if (mask & MAY_WRITE) {
umode_t mode = realinode->i_mode;
/*
 * Writes will always be redirected to upper layer, so
 * ignore lower layer being read-only.
 *
 * If the overlay itself is read-only then proceed
 * with the permission check, don't return EROFS.
 * This will only happen if this is the lower layer of
 * another overlayfs.
 *
 * If upper fs becomes read-only after the overlay was
 * constructed return EROFS to prevent modification of
 * upper layer.
 */
err = -EROFS;
if (is_upper && !IS_RDONLY(inode) && IS_RDONLY(realinode) &&
(S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
goto out_dput;
}
err = __inode_permission(realinode, mask);
out_dput:
/* alias is NULL on the directory path. */
dput(alias);
return err;
}
/*
 * Cookie returned by ovl_follow_link() and consumed by ovl_put_link() when
 * the real inode provides a put_link operation.
 */
struct ovl_link_data {
/* real dentry whose follow_link was called */
struct dentry *realdentry;
/* value returned by the real follow_link, passed back to its put_link */
void *cookie;
};
/*
 * .follow_link handler: delegate to the real inode's follow_link.
 *
 * Returns:
 *  - ERR_PTR(-EPERM) if the real inode has no follow_link operation;
 *  - the real follow_link's error pointer if it fails;
 *  - NULL if the real inode has no put_link operation;
 *  - otherwise an allocated struct ovl_link_data wrapping the real cookie,
 *    or ERR_PTR(-ENOMEM) (after calling the real put_link) if the
 *    allocation fails.
 */
static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd)
{
	struct dentry *realdentry = ovl_dentry_real(dentry);
	struct inode *realinode = realdentry->d_inode;
	struct ovl_link_data *data;
	void *cookie;

	if (WARN_ON(!realinode->i_op->follow_link))
		return ERR_PTR(-EPERM);

	cookie = realinode->i_op->follow_link(realdentry, nd);
	if (IS_ERR(cookie))
		return cookie;

	/* Nothing to release later, so no wrapper is needed. */
	if (!realinode->i_op->put_link)
		return NULL;

	data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL);
	if (!data) {
		realinode->i_op->put_link(realdentry, nd, cookie);
		return ERR_PTR(-ENOMEM);
	}

	data->realdentry = realdentry;
	data->cookie = cookie;
	return data;
}
/*
 * .put_link handler: if @c is a wrapper created by ovl_follow_link(), call
 * the real inode's put_link with the wrapped cookie and free the wrapper.
 * A NULL @c is a no-op.
 */
static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
{
	struct ovl_link_data *data = c;

	if (data) {
		struct inode *realinode = data->realdentry->d_inode;

		realinode->i_op->put_link(data->realdentry, nd, data->cookie);
		kfree(data);
	}
}
/*
 * .readlink handler: touch the atime of the real path and delegate to the
 * real inode's readlink.  Returns -EINVAL if the real inode has none.
 */
static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
{
	struct path real;
	struct inode *realinode;

	ovl_path_real(dentry, &real);
	realinode = real.dentry->d_inode;

	if (realinode->i_op->readlink) {
		touch_atime(&real);
		return realinode->i_op->readlink(real.dentry, buf, bufsiz);
	}

	return -EINVAL;
}
/* True when @name starts with the overlay xattr prefix OVL_XATTR_PRE_NAME. */
static bool ovl_is_private_xattr(const char *name)
{
	int diff = strncmp(name, OVL_XATTR_PRE_NAME, OVL_XATTR_PRE_LEN);

	return !diff;
}
/*
 * .setxattr handler: copy up @dentry and set the attribute on its upper
 * dentry.
 *
 * Returns -EPERM for names with the overlay-private prefix, otherwise 0 or
 * the negative errno from ovl_want_write(), ovl_copy_up() or vfs_setxattr().
 */
int ovl_setxattr(struct dentry *dentry, const char *name,
		 const void *value, size_t size, int flags)
{
	int rc = ovl_want_write(dentry);

	if (rc)
		return rc;

	if (ovl_is_private_xattr(name)) {
		rc = -EPERM;
	} else {
		rc = ovl_copy_up(dentry);
		if (!rc)
			rc = vfs_setxattr(ovl_dentry_upper(dentry), name,
					  value, size, flags);
	}

	ovl_drop_write(dentry);
	return rc;
}
/*
 * Private xattrs are filtered only for directories whose type has
 * __OVL_PATH_UPPER set and __OVL_PATH_PURE clear.
 */
static bool ovl_need_xattr_filter(struct dentry *dentry,
				  enum ovl_path_type type)
{
	bool upper_not_pure =
		(type & (__OVL_PATH_PURE | __OVL_PATH_UPPER)) == __OVL_PATH_UPPER;

	if (!upper_not_pure)
		return false;

	return S_ISDIR(dentry->d_inode->i_mode);
}
/*
 * .getxattr handler: read the attribute from the real dentry.  Returns
 * -ENODATA for overlay-private names when filtering applies to @dentry.
 */
ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
		     void *value, size_t size)
{
	struct path real;
	enum ovl_path_type type;
	bool hidden;

	type = ovl_path_real(dentry, &real);
	hidden = ovl_need_xattr_filter(dentry, type) &&
		 ovl_is_private_xattr(name);
	if (hidden)
		return -ENODATA;

	return vfs_getxattr(real.dentry, name, value, size);
}
/*
 * .listxattr handler: list the xattrs of the real dentry and, where
 * filtering applies, remove overlay-private names from the result.
 *
 * @list holds NUL-terminated names back to back.  The result of
 * vfs_listxattr() is returned unchanged when it is <= 0, when @size is 0
 * (a size query, so there is no buffer to edit) or when no filtering is
 * needed.  Otherwise the length of the filtered list is returned.
 */
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
{
struct path realpath;
enum ovl_path_type type = ovl_path_real(dentry, &realpath);
ssize_t res;
int off;
res = vfs_listxattr(realpath.dentry, list, size);
if (res <= 0 || size == 0)
return res;
if (!ovl_need_xattr_filter(dentry, type))
return res;
/* filter out private xattrs */
for (off = 0; off < res;) {
char *s = list + off;
/* length of this name including its terminating NUL */
size_t slen = strlen(s) + 1;
BUG_ON(off + slen > res);
if (ovl_is_private_xattr(s)) {
/* Drop this name: shrink the total and slide the tail down over it. */
res -= slen;
memmove(s, s + slen, res - off);
} else {
off += slen;
}
}
return res;
}
int ovl_removexattr(struct dentry *dentry, const char *name)
{
int err;
struct path realpath;
enum ovl_path_type type = ovl_path_real(dentry, &realpath);
err = ovl_want_write(dentry);
if (err)
goto out;
err = -ENODATA;
if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
goto out_drop_write;
if (!OVL_TYPE_UPPER(type)) {
err = vfs_getxattr(realpath.dentry, name, NULL, 0);
if (err < 0)
goto out_drop_write;
err = ovl_copy_up(dentry);
if (err)
goto out_drop_write;
ovl_path_upper(dentry, &realpath);
}
err = vfs_removexattr(realpath.dentry, name);
out_drop_write:
ovl_drop_write(dentry);
out:
return err;
}
/*
 * Decide whether opening with @flags requires a copy-up: only when the
 * object is not already upper, is not a special file, and the open either
 * requests write access or has O_TRUNC set.
 */
static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type,
				  struct dentry *realdentry)
{
	/* Short-circuit order matters: d_inode is only read if not upper. */
	return !OVL_TYPE_UPPER(type) &&
	       !special_file(realdentry->d_inode->i_mode) &&
	       ((OPEN_FMODE(flags) & FMODE_WRITE) || (flags & O_TRUNC));
}
static int ovl_dentry_open(struct dentry *dentry, struct file *file,
const struct cred *cred)
{
int err;
struct path realpath;
enum ovl_path_type type;
bool want_write = false;
type = ovl_path_real(dentry, &realpath);
if (!ovl_is_nocopyupw(dentry)) {
if (ovl_open_need_copy_up(file->f_flags, type, realpath.dentry)) {
want_write = true;
err = ovl_want_write(dentry);
if (err)
goto out;
if (file->f_flags & O_TRUNC)
err = ovl_copy_up_last(dentry, NULL, true);
else
err = ovl_copy_up(dentry);
if (err)
goto out_drop_write;
ovl_path_upper(dentry, &realpath);
}
}
err = vfs_open(&realpath, file, cred);
out_drop_write:
if (want_write)
ovl_drop_write(dentry);
out:
return err;
}
/*
 * Inode operations for overlay inodes of type regular file, socket, block
 * device, character device and FIFO (see ovl_new_inode()).
 */
static const struct inode_operations ovl_file_inode_operations = {
	/* opening */
	.dentry_open	= ovl_dentry_open,

	/* attributes and permission checks */
	.setattr	= ovl_setattr,
	.getattr	= ovl_getattr,
	.permission	= ovl_permission,

	/* extended attributes */
	.setxattr	= ovl_setxattr,
	.getxattr	= ovl_getxattr,
	.listxattr	= ovl_listxattr,
	.removexattr	= ovl_removexattr,
};
/* Inode operations for overlay symlink inodes (see ovl_new_inode()). */
static const struct inode_operations ovl_symlink_inode_operations = {
	/* link resolution */
	.follow_link	= ovl_follow_link,
	.put_link	= ovl_put_link,
	.readlink	= ovl_readlink,

	/* attributes */
	.setattr	= ovl_setattr,
	.getattr	= ovl_getattr,

	/* extended attributes */
	.setxattr	= ovl_setxattr,
	.getxattr	= ovl_getxattr,
	.listxattr	= ovl_listxattr,
	.removexattr	= ovl_removexattr,
};
/*
 * Allocate an overlay inode on @sb.
 *
 * Only the file-type bits of @mode (S_IFMT) are stored in i_mode.  The
 * inode gets a fresh number from get_next_ino() and the S_NOATIME and
 * S_NOCMTIME flags.  Directories also get @oe as i_private plus the
 * directory inode and file operations; symlinks and other file types get
 * their own inode operations tables.
 *
 * Returns the new inode, or NULL if allocation fails or the file type is
 * not recognized (which also triggers a WARN).
 */
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
			    struct ovl_entry *oe)
{
	struct inode *inode = new_inode(sb);

	if (!inode)
		return NULL;

	mode &= S_IFMT;

	inode->i_ino = get_next_ino();
	inode->i_mode = mode;
	inode->i_flags |= S_NOATIME | S_NOCMTIME;

	if (mode == S_IFDIR) {
		inode->i_private = oe;
		inode->i_op = &ovl_dir_inode_operations;
		inode->i_fop = &ovl_dir_operations;
	} else if (mode == S_IFLNK) {
		inode->i_op = &ovl_symlink_inode_operations;
	} else if (mode == S_IFREG || mode == S_IFSOCK || mode == S_IFBLK ||
		   mode == S_IFCHR || mode == S_IFIFO) {
		inode->i_op = &ovl_file_inode_operations;
	} else {
		WARN(1, "illegal file type: %i\n", mode);
		iput(inode);
		return NULL;
	}

	return inode;
}

View File

@ -0,0 +1,200 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/kernel.h>
struct ovl_entry;
/*
 * Bit flags describing where an overlay dentry's data lives.  The values
 * are combined with bitwise OR and tested with the OVL_TYPE_* macros below.
 * NOTE(review): the per-flag descriptions are inferred from the names and
 * the users of the macros — confirm against ovl_path_type().
 */
enum ovl_path_type {
/* presumably: upper only, nothing below it */
__OVL_PATH_PURE = (1 << 0),
/* presumably: an upper dentry exists */
__OVL_PATH_UPPER = (1 << 1),
/* presumably: directory merged from upper and lower layers */
__OVL_PATH_MERGE = (1 << 2),
};
/* Non-zero when the corresponding flag bit is set in (type). */
#define OVL_TYPE_UPPER(type) ((type) & __OVL_PATH_UPPER)
#define OVL_TYPE_MERGE(type) ((type) & __OVL_PATH_MERGE)
#define OVL_TYPE_PURE_UPPER(type) ((type) & __OVL_PATH_PURE)
/* True when the merge flag is set or the upper flag is clear. */
#define OVL_TYPE_MERGE_OR_LOWER(type) \
(OVL_TYPE_MERGE(type) || !OVL_TYPE_UPPER(type))
/* Prefix of overlay-private xattr names; PRE_LEN is its strlen(). */
#define OVL_XATTR_PRE_NAME "trusted.overlay."
#define OVL_XATTR_PRE_LEN 16
#define OVL_XATTR_OPAQUE OVL_XATTR_PRE_NAME"opaque"
/* vfs_rmdir() wrapper that reports the result through pr_debug(). */
static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry)
{
	int rc;

	rc = vfs_rmdir(dir, dentry);
	pr_debug("rmdir(%pd2) = %i\n", dentry, rc);

	return rc;
}
/* vfs_unlink() wrapper that reports the result through pr_debug(). */
static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry)
{
	int rc;

	rc = vfs_unlink(dir, dentry, NULL);
	pr_debug("unlink(%pd2) = %i\n", dentry, rc);

	return rc;
}
/* vfs_link() wrapper; the result is logged only when @debug is true. */
static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir,
			      struct dentry *new_dentry, bool debug)
{
	int rc = vfs_link(old_dentry, dir, new_dentry, NULL);

	if (!debug)
		return rc;

	pr_debug("link(%pd2, %pd2) = %i\n",
		 old_dentry, new_dentry, rc);
	return rc;
}
/*
 * vfs_create() wrapper (always called with want_excl == true); the result
 * is logged only when @debug is true.
 */
static inline int ovl_do_create(struct inode *dir, struct dentry *dentry,
				umode_t mode, bool debug)
{
	int rc = vfs_create(dir, dentry, mode, true);

	if (!debug)
		return rc;

	pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, rc);
	return rc;
}
/* vfs_mkdir() wrapper; the result is logged only when @debug is true. */
static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry,
			       umode_t mode, bool debug)
{
	int rc = vfs_mkdir(dir, dentry, mode);

	if (!debug)
		return rc;

	pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, rc);
	return rc;
}
/* vfs_mknod() wrapper; the result is logged only when @debug is true. */
static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry,
			       umode_t mode, dev_t dev, bool debug)
{
	int rc = vfs_mknod(dir, dentry, mode, dev);

	if (!debug)
		return rc;

	pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n",
		 dentry, mode, dev, rc);
	return rc;
}
/* vfs_symlink() wrapper; the result is logged only when @debug is true. */
static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry,
				 const char *oldname, bool debug)
{
	int rc = vfs_symlink(dir, dentry, oldname);

	if (!debug)
		return rc;

	pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, rc);
	return rc;
}
/*
 * vfs_setxattr() wrapper that reports the call and its result through
 * pr_debug().
 *
 * @value is an arbitrary byte buffer of @size bytes and need not be
 * NUL-terminated, so it is printed with a precision ("%.*s") that bounds
 * the read to @size bytes.  A field width ("%*s") would only pad the output
 * and let the formatter run past the end of the buffer.
 *
 * Returns the result of vfs_setxattr().
 */
static inline int ovl_do_setxattr(struct dentry *dentry, const char *name,
				  const void *value, size_t size, int flags)
{
	int err = vfs_setxattr(dentry, name, value, size, flags);

	pr_debug("setxattr(%pd2, \"%s\", \"%.*s\", 0x%x) = %i\n",
		 dentry, name, (int) size, (char *) value, flags, err);
	return err;
}
/* vfs_removexattr() wrapper that reports the result through pr_debug(). */
static inline int ovl_do_removexattr(struct dentry *dentry, const char *name)
{
	int rc;

	rc = vfs_removexattr(dentry, name);
	pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, rc);

	return rc;
}
/*
 * vfs_rename() wrapper.  The request is logged before the call; a second
 * debug line is emitted only when the rename fails.
 */
static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry,
				struct inode *newdir, struct dentry *newdentry,
				unsigned int flags)
{
	int rc;

	pr_debug("rename2(%pd2, %pd2, 0x%x)\n",
		 olddentry, newdentry, flags);

	rc = vfs_rename(olddir, olddentry, newdir, newdentry, NULL, flags);
	if (!rc)
		return 0;

	pr_debug("...rename2(%pd2, %pd2, ...) = %i\n",
		 olddentry, newdentry, rc);
	return rc;
}
/* vfs_whiteout() wrapper that reports the result through pr_debug(). */
static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry)
{
	int rc;

	rc = vfs_whiteout(dir, dentry);
	pr_debug("whiteout(%pd2) = %i\n", dentry, rc);

	return rc;
}
/*
 * Core dentry/entry accessors and helpers.
 * NOTE(review): presumably implemented in super.c, like the other sections
 * below name their source file — confirm.
 */
/* Checked in ovl_dentry_open(): when true, the copy-up on open is skipped. */
bool ovl_is_nocopyupw(struct dentry *dentry);
enum ovl_path_type ovl_path_type(struct dentry *dentry);
/* Directory version counter; bumped after changes in the upper directory. */
u64 ovl_dentry_version_get(struct dentry *dentry);
void ovl_dentry_version_inc(struct dentry *dentry);
void ovl_path_upper(struct dentry *dentry, struct path *path);
void ovl_path_lower(struct dentry *dentry, struct path *path);
enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path);
int ovl_path_next(int idx, struct dentry *dentry, struct path *path);
struct dentry *ovl_dentry_upper(struct dentry *dentry);
struct dentry *ovl_dentry_lower(struct dentry *dentry);
struct dentry *ovl_dentry_real(struct dentry *dentry);
struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper);
struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry);
void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache);
struct dentry *ovl_workdir(struct dentry *dentry);
/* Callers pair every successful ovl_want_write() with ovl_drop_write(). */
int ovl_want_write(struct dentry *dentry);
void ovl_drop_write(struct dentry *dentry);
bool ovl_dentry_is_opaque(struct dentry *dentry);
void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque);
bool ovl_is_whiteout(struct dentry *dentry);
void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry);
struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags);
struct file *ovl_path_open(struct path *path, int flags);
struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry,
struct kstat *stat, const char *link);
/* readdir.c */
extern const struct file_operations ovl_dir_operations;
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list);
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list);
void ovl_cache_free(struct list_head *list);
/* inode.c */
int ovl_setattr(struct dentry *dentry, struct iattr *attr);
int ovl_permission(struct inode *inode, int mask);
int ovl_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags);
ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
void *value, size_t size);
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
int ovl_removexattr(struct dentry *dentry, const char *name);
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
struct ovl_entry *oe);
/* Copy the owner (i_uid) and group (i_gid) from @from to @to. */
static inline void ovl_copyattr(struct inode *from, struct inode *to)
{
	to->i_gid = from->i_gid;
	to->i_uid = from->i_uid;
}
/* dir.c */
extern const struct inode_operations ovl_dir_inode_operations;
/* Look up a temporary name for @dentry inside @workdir. */
struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry);
/* Create the real object for @newdentry in @dir; @debug enables logging. */
int ovl_create_real(struct inode *dir, struct dentry *newdentry,
struct kstat *stat, const char *link,
struct dentry *hardlink, bool debug);
/* Remove @dentry from @dir; used on error paths and for displaced entries. */
void ovl_cleanup(struct inode *dir, struct dentry *dentry);
/* copy_up.c */
int ovl_copy_up(struct dentry *dentry);
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
struct path *lowerpath, struct kstat *stat,
struct iattr *attr);
int ovl_copy_xattr(struct dentry *old, struct dentry *new);
int ovl_set_attr(struct dentry *upper, struct kstat *stat);

View File

@ -0,0 +1,557 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/file.h>
#include <linux/xattr.h>
#include <linux/rbtree.h>
#include <linux/security.h>
#include <linux/cred.h>
#include "overlayfs.h"
/* One directory entry collected while reading an overlay directory. */
struct ovl_cache_entry {
/* length of name, not counting the terminating NUL */
unsigned int len;
/* d_type value reported for the entry */
unsigned int type;
/* inode number reported for the entry */
u64 ino;
/* linkage on an entry list (readdir list, "middle" list, dir cache) */
struct list_head l_node;
/* linkage in the per-readdir rb-tree used for name lookup */
struct rb_node node;
/* set from ovl_is_whiteout() for DT_CHR entries, otherwise false */
bool is_whiteout;
/* NUL-terminated copy of the entry name */
char name[];
};
/* Reference-counted list of cached entries for one directory. */
struct ovl_dir_cache {
/* number of users; the cache is freed when it drops to zero */
long refcount;
/* directory version the cache was built for; compared on reset */
u64 version;
/* list of struct ovl_cache_entry, linked through l_node */
struct list_head entries;
};
/* State shared between ovl_dir_read() and its fill callback. */
struct ovl_readdir_data {
/* context passed to iterate_dir(); the callback recovers rdd from it */
struct dir_context ctx;
/* selects ovl_fill_lower() (true) or ovl_cache_entry_add_rb() (false) */
bool is_merge;
/* rb-tree of entries seen so far, keyed by name */
struct rb_root root;
/* list that newly added rb-tree entries are appended to */
struct list_head *list;
/* list that ovl_fill_lower() moves or appends entries to */
struct list_head middle;
/* real directory dentry currently being read */
struct dentry *dir;
/* number of entries delivered during the current iterate_dir() call */
int count;
/* error recorded by the fill callback, 0 if none */
int err;
};
/*
 * Per-open-file state of an overlay directory (file->private_data).
 * NOTE(review): most users of these fields are outside this chunk; the
 * field notes marked "presumably" are inferred from the names — confirm.
 */
struct ovl_dir_file {
/* presumably: reads go straight to realfile, no merged cache */
bool is_real;
/* presumably: realfile refers to the upper layer */
bool is_upper;
/* directory cache in use, or NULL; dropped when the version changes */
struct ovl_dir_cache *cache;
/* position within the cache's entry list, or NULL */
struct list_head *cursor;
/* presumably: the opened real directory */
struct file *realfile;
/* presumably: the opened upper directory, if distinct from realfile */
struct file *upperfile;
};
/* Map an rb-tree node back to the cache entry that embeds it. */
static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n)
{
	return rb_entry(n, struct ovl_cache_entry, node);
}
/*
 * Look up the entry named @name (@len bytes) in the rb-tree @root.
 *
 * Ordering is by strncmp() over @len bytes.  When those bytes are equal, a
 * stored name longer than @len sorts after the key, so the search continues
 * to the left.
 *
 * Returns the matching entry or NULL.
 */
static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root,
						    const char *name, int len)
{
	struct rb_node *cur = root->rb_node;

	while (cur) {
		struct ovl_cache_entry *entry = ovl_cache_entry_from_node(cur);
		int cmp = strncmp(name, entry->name, len);

		if (cmp > 0) {
			cur = entry->node.rb_right;
			continue;
		}
		if (cmp < 0 || len < entry->len) {
			cur = entry->node.rb_left;
			continue;
		}
		return entry;
	}

	return NULL;
}
/*
 * Allocate and fill a cache entry for the name @name (@len bytes) found in
 * directory @dir.
 *
 * For DT_CHR entries the name is looked up in @dir, with CAP_DAC_OVERRIDE
 * temporarily raised, to find out whether it is a whiteout.  A failed
 * lookup leaves is_whiteout false.
 *
 * Returns the new entry, or NULL if the entry or the override credentials
 * cannot be allocated.  The list and rb-tree linkage are left for the
 * caller to set up.
 */
static struct ovl_cache_entry *ovl_cache_entry_new(struct dentry *dir,
const char *name, int len,
u64 ino, unsigned int d_type)
{
struct ovl_cache_entry *p;
/* header plus len name bytes plus the terminating NUL */
size_t size = offsetof(struct ovl_cache_entry, name[len + 1]);
p = kmalloc(size, GFP_KERNEL);
if (!p)
return NULL;
memcpy(p->name, name, len);
p->name[len] = '\0';
p->len = len;
p->type = d_type;
p->ino = ino;
p->is_whiteout = false;
if (d_type == DT_CHR) {
struct dentry *dentry;
const struct cred *old_cred;
struct cred *override_cred;
override_cred = prepare_creds();
if (!override_cred) {
kfree(p);
return NULL;
}
/*
 * CAP_DAC_OVERRIDE for lookup
 */
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
old_cred = override_creds(override_cred);
dentry = lookup_one_len(name, dir, len);
if (!IS_ERR(dentry)) {
p->is_whiteout = ovl_is_whiteout(dentry);
dput(dentry);
}
/* Restore the caller's credentials before dropping our copy. */
revert_creds(old_cred);
put_cred(override_cred);
}
return p;
}
/*
 * Insert a new entry for @name into the readdir rb-tree and append it to
 * rdd->list, unless an entry with the same name is already present.
 *
 * Returns 0 on success or when the name already exists, -ENOMEM if the
 * entry cannot be allocated.
 */
static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
				  const char *name, int len, u64 ino,
				  unsigned int d_type)
{
	struct rb_node **link = &rdd->root.rb_node;
	struct rb_node *parent = NULL;
	struct ovl_cache_entry *entry;

	/* Descend to the insertion point, bailing out on a duplicate. */
	while (*link) {
		struct ovl_cache_entry *cur = ovl_cache_entry_from_node(*link);
		int cmp = strncmp(name, cur->name, len);

		parent = *link;
		if (cmp > 0)
			link = &cur->node.rb_right;
		else if (cmp < 0 || len < cur->len)
			link = &cur->node.rb_left;
		else
			return 0;
	}

	entry = ovl_cache_entry_new(rdd->dir, name, len, ino, d_type);
	if (entry == NULL)
		return -ENOMEM;

	list_add_tail(&entry->l_node, rdd->list);
	rb_link_node(&entry->node, parent, link);
	rb_insert_color(&entry->node, &rdd->root);

	return 0;
}
/*
 * Fill callback used when rdd->is_merge is set: an entry already present in
 * the rb-tree is moved to the tail of rdd->middle, otherwise a new entry is
 * allocated and appended there.  @offset is unused.
 *
 * Returns rdd->err, which is set to -ENOMEM if the allocation fails.
 */
static int ovl_fill_lower(struct ovl_readdir_data *rdd,
			  const char *name, int namelen,
			  loff_t offset, u64 ino, unsigned int d_type)
{
	struct ovl_cache_entry *entry;

	entry = ovl_cache_entry_find(&rdd->root, name, namelen);
	if (!entry) {
		entry = ovl_cache_entry_new(rdd->dir, name, namelen, ino,
					    d_type);
		if (entry == NULL)
			rdd->err = -ENOMEM;
		else
			list_add_tail(&entry->l_node, &rdd->middle);
	} else {
		list_move_tail(&entry->l_node, &rdd->middle);
	}

	return rdd->err;
}
/*
 * Free every ovl_cache_entry linked on @list and leave @list as a freshly
 * initialised empty list head.
 */
void ovl_cache_free(struct list_head *list)
{
	struct list_head *pos;
	struct list_head *next;

	/* The "safe" walk fetches the successor before the node is freed. */
	list_for_each_safe(pos, next, list)
		kfree(list_entry(pos, struct ovl_cache_entry, l_node));

	INIT_LIST_HEAD(list);
}
/*
 * Drop one reference on od->cache.  When the last reference goes away the
 * cache is detached from @dentry (if it is still the dentry's current
 * cache), its entries are freed and the cache itself is released.
 */
static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry)
{
	struct ovl_dir_cache *cache = od->cache;

	WARN_ON(cache->refcount <= 0);

	cache->refcount--;
	if (cache->refcount)
		return;

	if (ovl_dir_cache(dentry) == cache)
		ovl_set_dir_cache(dentry, NULL);

	ovl_cache_free(&cache->entries);
	kfree(cache);
}
/*
 * dir_context actor used while building the merged listing.
 *
 * Counts the entry in rdd->count and hands it to ovl_fill_lower() when
 * rdd->is_merge is set, otherwise to ovl_cache_entry_add_rb().
 */
static int ovl_fill_merge(struct dir_context *ctx, const char *name,
			  int namelen, loff_t offset, u64 ino,
			  unsigned int d_type)
{
	struct ovl_readdir_data *rdd;

	rdd = container_of(ctx, struct ovl_readdir_data, ctx);
	rdd->count++;

	if (rdd->is_merge)
		return ovl_fill_lower(rdd, name, namelen, offset, ino, d_type);

	return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type);
}
/*
 * Read all entries of the real directory @realpath through rdd->ctx.
 *
 * iterate_dir() is called repeatedly until a pass reports an error (from
 * iterate_dir() itself or via rdd->err) or delivers no entries.
 * Returns 0 on success or a negative error code.
 */
static inline int ovl_dir_read(struct path *realpath,
			       struct ovl_readdir_data *rdd)
{
	struct file *dirfile;
	int err;

	dirfile = ovl_path_open(realpath, O_RDONLY | O_DIRECTORY);
	if (IS_ERR(dirfile))
		return PTR_ERR(dirfile);

	rdd->dir = realpath->dentry;
	rdd->ctx.pos = 0;

	for (;;) {
		rdd->count = 0;
		rdd->err = 0;

		err = iterate_dir(dirfile, &rdd->ctx);
		if (err >= 0)
			err = rdd->err;

		/* Stop on error, or once a pass produced nothing. */
		if (err || !rdd->count)
			break;
	}

	fput(dirfile);
	return err;
}
/*
 * Re-synchronise the per-open directory state with the dentry.
 *
 * A cache whose version no longer matches the dentry version is dropped
 * together with the cursor.  If the file was treated as "real" but the
 * dentry is now of merge type, the file is switched to merged mode.
 */
static void ovl_dir_reset(struct file *file)
{
	struct ovl_dir_file *od = file->private_data;
	struct dentry *dentry = file->f_path.dentry;
	struct ovl_dir_cache *held = od->cache;
	enum ovl_path_type type = ovl_path_type(dentry);

	if (held && ovl_dentry_version_get(dentry) != held->version) {
		ovl_cache_put(od, dentry);
		od->cache = NULL;
		od->cursor = NULL;
	}

	/* A non-real file on a non-merge dentry is unexpected. */
	WARN_ON(!(od->is_real || OVL_TYPE_MERGE(type)));

	if (od->is_real && OVL_TYPE_MERGE(type))
		od->is_real = false;
}
/*
 * Build the merged listing of @dentry on @list by reading every layer that
 * ovl_path_next() yields.
 *
 * All layers but the last are read in rb-tree mode.  The last layer is read
 * in merge mode, with its entries collected at the temporary rdd.middle
 * anchor placed at the front of the list.
 * Returns 0 on success or the first error from ovl_dir_read().
 */
static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list)
{
	struct ovl_readdir_data rdd = {
		.ctx.actor = ovl_fill_merge,
		.list = list,
		.root = RB_ROOT,
		.is_merge = false,
	};
	struct path realpath;
	int layer = 0;
	int following;
	int err;

	do {
		following = ovl_path_next(layer, dentry, &realpath);

		if (following == -1) {
			/*
			 * Insert lowest layer entries before upper ones, this
			 * allows offsets to be reasonably constant
			 */
			list_add(&rdd.middle, rdd.list);
			rdd.is_merge = true;
			err = ovl_dir_read(&realpath, &rdd);
			list_del(&rdd.middle);
		} else {
			err = ovl_dir_read(&realpath, &rdd);
			if (err)
				break;
		}

		layer = following;
	} while (layer != -1);

	return err;
}
/*
 * Point od->cursor at the @pos-th node of the cached entry list, or at the
 * list head itself when the list has @pos or fewer entries.
 */
static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos)
{
	struct list_head *head = &od->cache->entries;
	struct list_head *node = head->next;
	loff_t skipped;

	for (skipped = 0; node != head && skipped < pos; skipped++)
		node = node->next;

	/* Cursor is safe since the cache is stable */
	od->cursor = node;
}
/*
 * Obtain a referenced directory cache for @dentry.
 *
 * The cache attached to the dentry is reused when its version still matches
 * the dentry version.  Otherwise the dentry's cache pointer is cleared, a
 * new cache is filled via ovl_dir_read_merged() and attached to the dentry.
 * Returns the cache, or an ERR_PTR() on allocation or read failure.
 */
static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
{
	struct ovl_dir_cache *cache = ovl_dir_cache(dentry);
	int err;

	if (cache && ovl_dentry_version_get(dentry) == cache->version) {
		cache->refcount++;
		return cache;
	}

	/* No usable cache: detach whatever was there and build a new one. */
	ovl_set_dir_cache(dentry, NULL);

	cache = kzalloc(sizeof(*cache), GFP_KERNEL);
	if (!cache)
		return ERR_PTR(-ENOMEM);

	cache->refcount = 1;
	INIT_LIST_HEAD(&cache->entries);

	err = ovl_dir_read_merged(dentry, &cache->entries);
	if (err) {
		ovl_cache_free(&cache->entries);
		kfree(cache);
		return ERR_PTR(err);
	}

	cache->version = ovl_dentry_version_get(dentry);
	ovl_set_dir_cache(dentry, cache);
	return cache;
}
/*
 * ->iterate for overlay directories.
 *
 * A "real" directory is delegated to iterate_dir() on od->realfile.
 * Otherwise the cached merged listing is emitted from od->cursor onwards,
 * skipping whiteouts; ctx->pos advances by one per cache entry passed.
 * Returns 0, or a negative error if the cache could not be obtained.
 */
static int ovl_iterate(struct file *file, struct dir_context *ctx)
{
	struct ovl_dir_file *od = file->private_data;
	struct dentry *dentry = file->f_path.dentry;

	if (!ctx->pos)
		ovl_dir_reset(file);

	if (od->is_real)
		return iterate_dir(od->realfile, ctx);

	if (!od->cache) {
		struct ovl_dir_cache *fresh = ovl_cache_get(dentry);

		if (IS_ERR(fresh))
			return PTR_ERR(fresh);

		od->cache = fresh;
		ovl_seek_cursor(od, ctx->pos);
	}

	while (od->cursor != &od->cache->entries) {
		struct ovl_cache_entry *cur;

		cur = list_entry(od->cursor, struct ovl_cache_entry, l_node);

		/* Whiteouts are never reported; stop when the caller is full. */
		if (!cur->is_whiteout &&
		    !dir_emit(ctx, cur->name, cur->len, cur->ino, cur->type))
			break;

		od->cursor = cur->l_node.next;
		ctx->pos++;
	}
	return 0;
}
/*
 * ->llseek for overlay directories, serialised by the inode's i_mutex.
 *
 * Real directories forward to vfs_llseek() on od->realfile and mirror its
 * f_pos.  Merged directories accept only SEEK_SET and SEEK_CUR with a
 * non-negative result, and re-seat the cursor when a cache exists.
 * Returns the new position or -EINVAL.
 */
static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin)
{
	struct ovl_dir_file *od = file->private_data;
	loff_t res;

	mutex_lock(&file_inode(file)->i_mutex);

	if (!file->f_pos)
		ovl_dir_reset(file);

	if (od->is_real) {
		res = vfs_llseek(od->realfile, offset, origin);
		file->f_pos = od->realfile->f_pos;
		goto out_unlock;
	}

	res = -EINVAL;

	/* Translate the request into an absolute offset. */
	if (origin == SEEK_CUR)
		offset += file->f_pos;
	else if (origin != SEEK_SET)
		goto out_unlock;

	if (offset < 0)
		goto out_unlock;

	if (offset != file->f_pos) {
		file->f_pos = offset;
		if (od->cache)
			ovl_seek_cursor(od, offset);
	}
	res = offset;

out_unlock:
	mutex_unlock(&file_inode(file)->i_mutex);
	return res;
}
/*
 * ->fsync for overlay directories: syncs the real directory file behind
 * this open file with vfs_fsync_range().
 *
 * By default that is od->realfile.  When the file was not opened as upper
 * (!od->is_upper) but the dentry is of upper type now, the upper directory
 * is opened on demand, cached in od->upperfile and synced instead.
 *
 * Returns the vfs_fsync_range() result, or the ovl_path_open() error when
 * opening the upper directory failed and no other caller had cached one.
 */
static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
int datasync)
{
struct ovl_dir_file *od = file->private_data;
struct dentry *dentry = file->f_path.dentry;
struct file *realfile = od->realfile;
/*
* Need to check if we started out being a lower dir, but got copied up
*/
if (!od->is_upper && OVL_TYPE_UPPER(ovl_path_type(dentry))) {
struct inode *inode = file_inode(file);
/* Fast path: read the cached upper file without taking i_mutex. */
realfile = lockless_dereference(od->upperfile);
if (!realfile) {
struct path upperpath;
ovl_path_upper(dentry, &upperpath);
/* Open before locking; the result may be an ERR_PTR, checked below. */
realfile = ovl_path_open(&upperpath, O_RDONLY);
smp_mb__before_spinlock();
mutex_lock(&inode->i_mutex);
/* Re-check under the mutex: another caller may have installed one. */
if (!od->upperfile) {
if (IS_ERR(realfile)) {
mutex_unlock(&inode->i_mutex);
return PTR_ERR(realfile);
}
od->upperfile = realfile;
} else {
/* somebody has beaten us to it */
if (!IS_ERR(realfile))
fput(realfile);
realfile = od->upperfile;
}
mutex_unlock(&inode->i_mutex);
}
}
return vfs_fsync_range(realfile, start, end, datasync);
}
/*
 * ->release for overlay directories: drops the cache reference (under
 * i_mutex) if one is held, releases the real file and the optional upper
 * file, and frees the per-open state.  Always returns 0.
 */
static int ovl_dir_release(struct inode *inode, struct file *file)
{
	struct ovl_dir_file *od = file->private_data;
	struct file *upper;

	if (od->cache) {
		mutex_lock(&inode->i_mutex);
		ovl_cache_put(od, file->f_path.dentry);
		mutex_unlock(&inode->i_mutex);
	}

	fput(od->realfile);

	upper = od->upperfile;
	if (upper)
		fput(upper);

	kfree(od);
	return 0;
}
/*
 * ->open for overlay directories.
 *
 * Allocates the per-open ovl_dir_file, opens the real directory reported by
 * ovl_path_real() with the caller's f_flags, and records whether the
 * directory is real (not merge type) and whether it is of upper type.
 * Returns 0, -ENOMEM, or the error from ovl_path_open().
 */
static int ovl_dir_open(struct inode *inode, struct file *file)
{
	struct ovl_dir_file *od;
	struct path realpath;
	struct file *opened;
	enum ovl_path_type type;

	od = kzalloc(sizeof(*od), GFP_KERNEL);
	if (!od)
		return -ENOMEM;

	type = ovl_path_real(file->f_path.dentry, &realpath);
	opened = ovl_path_open(&realpath, file->f_flags);
	if (IS_ERR(opened)) {
		kfree(od);
		return PTR_ERR(opened);
	}

	od->realfile = opened;
	od->is_real = !OVL_TYPE_MERGE(type);
	od->is_upper = OVL_TYPE_UPPER(type);

	file->private_data = od;
	return 0;
}
/*
 * File operations for overlay directories.  Every hook is one of the
 * ovl_* directory handlers defined in this file, except .read, which uses
 * generic_read_dir.
 */
const struct file_operations ovl_dir_operations = {
.read = generic_read_dir,
.open = ovl_dir_open,
.iterate = ovl_iterate,
.llseek = ovl_dir_llseek,
.fsync = ovl_dir_fsync,
.release = ovl_dir_release,
};
/*
 * Read the merged listing of @dentry into @list and check that it holds
 * nothing but whiteouts, "." and "..".
 *
 * Returns 0 if the directory is empty in that sense, -ENOTEMPTY if any
 * other entry exists, or the error from ovl_dir_read_merged().
 */
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list)
{
	struct ovl_cache_entry *cur;
	int err;

	err = ovl_dir_read_merged(dentry, list);
	if (err)
		return err;

	list_for_each_entry(cur, list, l_node) {
		if (cur->is_whiteout)
			continue;

		/* "." and ".." do not count as content. */
		if (cur->name[0] == '.' &&
		    (cur->len == 1 ||
		     (cur->len == 2 && cur->name[1] == '.')))
			continue;

		return -ENOTEMPTY;
	}
	return 0;
}
/*
 * Remove from the directory @upper every entry on @list that is marked as a
 * whiteout.  Lookup failures are logged and skipped.  The directory's
 * i_mutex is held (I_MUTEX_CHILD class) for the whole walk.
 */
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
{
	struct ovl_cache_entry *cur;

	mutex_lock_nested(&upper->d_inode->i_mutex, I_MUTEX_CHILD);

	list_for_each_entry(cur, list, l_node) {
		struct dentry *victim;

		if (!cur->is_whiteout)
			continue;

		victim = lookup_one_len(cur->name, upper, cur->len);
		if (!IS_ERR(victim)) {
			ovl_cleanup(upper->d_inode, victim);
			dput(victim);
			continue;
		}

		pr_err("overlayfs: lookup '%s/%.*s' failed (%i)\n",
		       upper->d_name.name, cur->len, cur->name,
		       (int) PTR_ERR(victim));
	}

	mutex_unlock(&upper->d_inode->i_mutex);
}

File diff suppressed because it is too large Load Diff

View File

@ -1,488 +0,0 @@
/**
* \file procfs.c
* License details are found in the file LICENSE.
* \brief
* mcctrl procfs
* \author Naoki Hamada <nao@axe.bz> \par
* Copyright (C) 2014 AXE, Inc.
*/
/*
* HISTORY:
*/
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/proc_fs.h>
#include <linux/list.h>
#include <linux/uaccess.h>
#include <linux/fs.h>
#include <linux/resource.h>
#include "mcctrl.h"
#include <linux/version.h>
//#define PROCFS_DEBUG
#ifdef PROCFS_DEBUG
#define dprintk(...) printk(__VA_ARGS__)
#else
#define dprintk(...)
#endif
static DECLARE_WAIT_QUEUE_HEAD(procfsq);
static ssize_t mckernel_procfs_read(struct file *file, char __user *buf,
size_t nbytes, loff_t *ppos);
/*
 * Private per-entry data of the procfs driver: one node for every procfs
 * file or directory tracked on procfs_file_list.
 */
struct procfs_list_entry;
struct procfs_list_entry {
struct list_head list; /* link on procfs_file_list */
struct proc_dir_entry *entry; /* proc entry created for this name */
struct procfs_list_entry *parent; /* entry of the containing directory; NULL when the name has no '/' */
ihk_os_t os; /* set by procfs_create() from its __os argument */
int osnum; /* os number the entry was created for */
int pid; /* pid passed to procfs_create() */
int cpu; /* CPU that read requests are sent to; updated on retry by the read handler */
char fname[PROCFS_NAME_MAX]; /* full path name, used as the lookup key */
};
/*
* In procfs_file_list, McKernel procfs files are listed so that
* a leaf file is always located nearer to the head of the list
* than its parent directory.
*/
LIST_HEAD(procfs_file_list);
static ihk_spinlock_t procfs_file_list_lock;
/*
 * llseek handler for McKernel procfs files.
 *
 * Supports origin 0 (absolute) and origin 1 (relative to the current
 * position); any other origin yields -EINVAL.  No range check is applied to
 * the resulting position.  Returns the new file position.
 */
loff_t mckernel_procfs_lseek(struct file *file, loff_t offset, int orig)
{
	if (orig == 0)
		file->f_pos = offset;
	else if (orig == 1)
		file->f_pos += offset;
	else
		return -EINVAL;

	return file->f_pos;
}
/*
 * File operations attached to every McKernel procfs file: seekable and
 * readable, with no write handler.
 */
static const struct file_operations mckernel_procfs_file_operations = {
.llseek = mckernel_procfs_lseek,
.read = mckernel_procfs_read,
.write = NULL,
};
/**
* \brief Return specified procfs entry.
*
* \param p a name of the procfs file
* \param osnum os number
* \param mode if zero create a directory otherwise a file
*
* return value: NULL: Something wrong has occurred.
* otherwise: address of the proc_dir_entry structure of the procfs file
*
* p should not be NULL nor terminated by "/".
*
* We create a procfs entry if there is not already one.
* This process is recursive to the root of the procfs tree.
*/
/*
* XXX: Two or more entries which have same name can be created.
*
* get_procfs_list_entry() avoids creating an entry which has already been created.
* But, it allows creating an entry which is being created by another thread.
*
* This problem occurred when two requests which created files with a common
* ancestor directory which was not explicitly created were racing.
*/
static struct procfs_list_entry *get_procfs_list_entry(char *p, int osnum, int mode)
{
char *r;
struct proc_dir_entry *pde = NULL;
struct procfs_list_entry *e, *ret = NULL, *parent = NULL;
char name[PROCFS_NAME_MAX];
unsigned long irqflags;
dprintk("get_procfs_list_entry: %s for osnum %d mode %o\n", p, osnum, mode);
irqflags = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
list_for_each_entry(e, &procfs_file_list, list) {
if (e == NULL) {
kprintf("ERROR: The procfs_file_list has a null entry.\n");
return NULL;
}
if (strncmp(e->fname, p, PROCFS_NAME_MAX) == 0) {
/* We found the entry */
ret = e;
break;
}
}
ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflags);
if (ret != NULL) {
return ret;
}
r = strrchr(p, '/');
if (r != NULL) {
/* We have non-null parent dir. */
strncpy(name, p, r - p);
name[r - p] = '\0';
parent = get_procfs_list_entry(name, osnum, 0);
if (parent == NULL) {
/* We counld not get a parent procfs entry. Give up.*/
return NULL;
}
}
ret = kmalloc(sizeof(struct procfs_list_entry), GFP_KERNEL);
if (ret == NULL) {
kprintf("ERROR: not enough memory to create PROCFS entry.\n");
return NULL;
}
/* Fill the fname field of the entry */
strncpy(ret->fname, p, PROCFS_NAME_MAX);
if (r != NULL) {
strncpy(name, r + 1, p + PROCFS_NAME_MAX - r - 1);
} else {
strncpy(name, p, PROCFS_NAME_MAX);
}
if (mode == 0) {
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
pde = proc_mkdir(name, parent ? parent->entry : NULL);
#else
pde = proc_mkdir_data(name, 0555, parent ? parent->entry : NULL, ret);
#endif
} else {
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
pde = create_proc_entry(name, mode, parent->entry);
if (pde)
pde->proc_fops = &mckernel_procfs_file_operations;
#else
pde = proc_create_data(name, mode, parent->entry,
&mckernel_procfs_file_operations, ret);
#endif
}
if (pde == NULL) {
kprintf("ERROR: cannot create a PROCFS entry for %s.\n", p);
kfree(ret);
return NULL;
}
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
pde->data = ret;
#endif
ret->osnum = osnum;
ret->entry = pde;
ret->parent = parent;
irqflags = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
list_add(&(ret->list), &procfs_file_list);
ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflags);
dprintk("get_procfs_list_entry: %s done\n", p);
return ret;
}
/**
* \brief Create a procfs entry.
*
* \param __os (opaque) os variable
* \param ref cpuid of the requesting mckernel process
* \param osnum osnum of the requesting mckernel process
* \param pid pid of the requesting mckernel process
* \param arg sent argument
*/
/*
 * Handle a "create procfs entry" request.
 *
 * Maps the struct procfs_file located at @arg, copies its name and mode,
 * and creates (or finds) the matching list entry, recording @__os, @ref and
 * @pid in it.  Whatever the outcome, the request's status field is set to 1
 * before the mapping is torn down.
 */
void procfs_create(void *__os, int ref, int osnum, int pid, unsigned long arg)
{
	ihk_device_t dev = ihk_os_to_dev(__os);
	struct procfs_list_entry *entry;
	struct procfs_file *req;
	unsigned long phys;
	char name[PROCFS_NAME_MAX];
	int mode;

	dprintk("procfs_create: osnum: %d, cpu: %d, pid: %d\n", osnum, ref, pid);

	phys = ihk_device_map_memory(dev, arg, sizeof(struct procfs_file));
	req = ihk_device_map_virtual(dev, phys, sizeof(struct procfs_file), NULL, 0);

	dprintk("name: %s mode: %o\n", req->fname, req->mode);

	strncpy(name, req->fname, PROCFS_NAME_MAX);
	mode = req->mode;

	/* strncpy() zero-pads, so a non-NUL last byte means an overlong name. */
	if (name[PROCFS_NAME_MAX - 1] != '\0') {
		printk("ERROR: procfs_creat: file name not properly terminated.\n");
	} else {
		entry = get_procfs_list_entry(name, osnum, mode);
		if (entry == NULL) {
			printk("ERROR: could not create a procfs entry for %s.\n", name);
		} else {
			entry->os = __os;
			entry->cpu = ref;
			entry->pid = pid;
		}
	}

	req->status = 1; /* Now the peer can free the data. */
	ihk_device_unmap_virtual(dev, req, sizeof(struct procfs_file));
	ihk_device_unmap_memory(dev, phys, sizeof(struct procfs_file));

	dprintk("procfs_create: done\n");
}
/**
* \brief Delete a procfs entry.
*
* \param __os (opaque) os variable
* \param osnum os number
* \param arg sent argument
*/
void procfs_delete(void *__os, int osnum, unsigned long arg)
{
ihk_device_t dev = ihk_os_to_dev(__os);
unsigned long parg;
struct procfs_file *f;
struct procfs_list_entry *e;
struct procfs_list_entry *parent = NULL;
char name[PROCFS_NAME_MAX];
char *r;
unsigned long irqflags;
dprintk("procfs_delete: \n");
parg = ihk_device_map_memory(dev, arg, sizeof(struct procfs_file));
f = ihk_device_map_virtual(dev, parg, sizeof(struct procfs_file), NULL, 0);
dprintk("fname: %s.\n", f->fname);
irqflags = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
list_for_each_entry(e, &procfs_file_list, list) {
if ((strncmp(e->fname, f->fname, PROCFS_NAME_MAX) == 0) &&
(e->osnum == osnum)) {
list_del(&e->list);
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
e->entry->read_proc = NULL;
e->entry->data = NULL;
#endif
parent = e->parent;
kfree(e);
r = strrchr(f->fname, '/');
if (r == NULL) {
strncpy(name, f->fname, PROCFS_NAME_MAX);
} else {
strncpy(name, r + 1, PROCFS_NAME_MAX);
}
dprintk("found and remove %s from the list.\n", name);
remove_proc_entry(name, parent->entry);
break;
}
}
ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflags);
f->status = 1; /* Now the peer can free the data. */
ihk_device_unmap_virtual(dev, f, sizeof(struct procfs_file));
ihk_device_unmap_memory(dev, parg, sizeof(struct procfs_file));
dprintk("procfs_delete: done\n");
}
/**
* \brief Process SCD_MSG_PROCFS_ANSWER message.
*
* \param arg sent argument
* \param err error info (redundant)
*/
void procfs_answer(unsigned int arg, int err)
{
/* arg is unused; err only appears in the debug message. */
dprintk("procfs: received SCD_MSG_PROCFS_ANSWER message(err = %d).\n", err);
/* Wake the readers sleeping on procfsq; each re-checks its own status field. */
wake_up_interruptible(&procfsq);
}
/**
* \brief The callback function for McKernel procfs
*
* This function conforms to the 2) way of fs/proc/generic.c
* from linux-2.6.39.4.
*/
/*
 * read() handler for McKernel procfs files.
 *
 * Allocates a physically contiguous bounce buffer, sends an
 * SCD_MSG_PROCFS_REQUEST describing it to the CPU recorded in the entry,
 * waits up to one second for the request's status field to become non-zero,
 * and copies the returned bytes to user space.  When the reply carries
 * ret == 0 without eof, the request is retried on r->newcpu (bounded number
 * of retries).
 *
 * Returns the number of bytes read, 0 for an empty request or end of file,
 * or a negative error code.
 *
 * Changes relative to the previous version:
 *  - the bounce buffer is released when allocating the request fails;
 *  - the size computation uses an unsigned long shift with a bounded loop
 *    instead of shifting an int past its width;
 *  - a wait interrupted by a signal returns the wait error instead of
 *    being treated like a received reply;
 *  - the byte count reported by the peer is clamped to the requested count
 *    before copy_to_user();
 *  - debug format specifiers match their argument types.
 *
 * NOTE(review): on timeout or signal both buffers are freed although the
 * peer may still write to them later - needs a protocol-level fix.
 */
static ssize_t
mckernel_procfs_read(struct file *file, char __user *buf, size_t nbytes,
		     loff_t *ppos)
{
	struct inode * inode = file->f_path.dentry->d_inode;
	char *kern_buffer;
	int order = 0;
	volatile struct procfs_read *r;
	struct ikc_scd_packet isp;
	int ret, retrycount = 0;
	long waited;
	unsigned long pbuf;
	unsigned long count = nbytes;
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
	struct proc_dir_entry *dp = PDE(inode);
	struct procfs_list_entry *e = dp->data;
#else
	struct procfs_list_entry *e = PDE_DATA(inode);
#endif
	loff_t offset = *ppos;

	dprintk("mckernel_procfs_read: invoked for %s, offset: %lld, count: %lu\n",
		e->fname, (long long)offset, count);

	if (count == 0 || offset < 0) {
		return 0;
	}

	/* order = ceil(log2(count)), then converted to a page order below. */
	while (order < BITS_PER_LONG - 1 && (1UL << order) < count)
		++order;
	if (order > 12) {
		order -= 12;
	}
	else {
		order = 1;
	}

	/* NOTE: we need physically contiguous memory to pass through IKC */
	kern_buffer = (char *)__get_free_pages(GFP_KERNEL, order);
	if (!kern_buffer) {
		printk("mckernel_procfs_read(): ERROR: allocating kernel buffer\n");
		return -ENOMEM;
	}
	pbuf = virt_to_phys(kern_buffer);

	r = kmalloc(sizeof(struct procfs_read), GFP_KERNEL);
	if (r == NULL) {
		free_pages((uintptr_t)kern_buffer, order);
		return -ENOMEM;
	}

retry:
	dprintk("offset: %llx, count: %lu, cpu: %d\n",
		(unsigned long long)offset, count, e->cpu);
	r->pbuf = pbuf;
	r->eof = 0;
	r->ret = -EIO; /* default */
	r->status = 0;
	r->offset = offset;
	r->count = count;
	strncpy((char *)r->fname, e->fname, PROCFS_NAME_MAX);
	isp.msg = SCD_MSG_PROCFS_REQUEST;
	isp.ref = e->cpu;
	isp.arg = virt_to_phys(r);
	ret = mcctrl_ikc_send(e->os, e->cpu, &isp);
	if (ret < 0) {
		goto out; /* error */
	}

	/* Wait for a reply. */
	ret = -EIO; /* default exit code */
	dprintk("now wait for a relpy\n");

	/* Wait for the status field of the procfs_read structure set ready. */
	waited = wait_event_interruptible_timeout(procfsq, r->status != 0, HZ);
	if (waited == 0) {
		kprintf("ERROR: mckernel_procfs_read: timeout (1 sec).\n");
		goto out;
	}
	if (waited < 0) {
		/* Interrupted by a signal before any reply arrived. */
		ret = waited;
		goto out;
	}

	/* Wake up and check the result. */
	dprintk("mckernel_procfs_read: woke up. ret: %d, eof: %d\n", r->ret, r->eof);
	if ((r->ret == 0) && (r->eof != 1)) {
		/* A miss-hit caused by migration has occurred.
		 * We simply retry the query with a new CPU.
		 */
		if (retrycount++ > 10) {
			kprintf("ERROR: mckernel_procfs_read: excessive retry.\n");
			goto out;
		}
		e->cpu = r->newcpu;
		dprintk("retry\n");
		goto retry;
	}

	/* Snapshot once: r is shared with the peer. */
	ret = r->ret;
	if (ret > 0) {
		/* Never hand out more than the caller asked for. */
		if ((unsigned long)ret > count)
			ret = count;
		if (copy_to_user(buf, kern_buffer, ret)) {
			kprintf("ERROR: mckernel_procfs_read: copy_to_user failed.\n");
			ret = -EFAULT;
			goto out;
		}
		*ppos += ret;
	}

out:
	free_pages((uintptr_t)kern_buffer, order);
	kfree((void *)r);
	return ret;
}
/**
* \brief Initialization for procfs
*
* \param osnum os number
*/
/* Currently a no-op; osnum is not used. */
void procfs_init(int osnum) {
}
/**
* \brief Finalization for procfs
*
* \param osnum os number
*/
void procfs_exit(int osnum) {
char buf[20], *r;
int error;
mm_segment_t old_fs = get_fs();
struct kstat stat;
struct procfs_list_entry *parent;
struct procfs_list_entry *e, *temp = NULL;
unsigned long irqflags;
dprintk("remove remaining mckernel procfs files.\n");
irqflags = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
list_for_each_entry_safe(e, temp, &procfs_file_list, list) {
if (e->osnum == osnum) {
dprintk("found entry for %s.\n", e->fname);
list_del(&e->list);
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
e->entry->read_proc = NULL;
e->entry->data = NULL;
#endif
parent = e->parent;
r = strrchr(e->fname, '/');
if (r == NULL) {
r = e->fname;
} else {
r += 1;
}
if (parent) {
remove_proc_entry(r, parent->entry);
}
dprintk("free the entry\n");
kfree(e);
}
dprintk("iterate it.\n");
}
ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflags);
sprintf(buf, "/proc/mcos%d", osnum);
set_fs(KERNEL_DS);
error = vfs_stat (buf, &stat);
set_fs(old_fs);
if (error != 0) {
return;
}
printk("procfs_exit: We have to remove unexpectedly remaining %s.\n", buf);
/* remove remnant of previous mcos%d */
remove_proc_entry(buf + 6, NULL);
}

View File

@ -1,13 +1,19 @@
CC=@CC@ CC=@CC@
BINDIR=@BINDIR@ BINDIR=@BINDIR@
CFLAGS=-Wall -O -fPIE -pie KDIR ?= @KDIR@
CFLAGS=-Wall -O -I.
VPATH=@abs_srcdir@ VPATH=@abs_srcdir@
TARGET=mcexec TARGET=mcexec
@uncomment_if_ENABLE_MEMDUMP@TARGET+=eclair
LIBS=@LIBS@
all: $(TARGET) all: $(TARGET)
mcexec: mcexec.c mcexec: mcexec.c
$(CC) $(CFLAGS) $(EXTRA_CFLAGS) -pthread -o $@ $^ $(EXTRA_OBJS) $(CC) -I${KDIR} $(CFLAGS) $(EXTRA_CFLAGS) -fPIE -pie -lrt -pthread -o $@ $^ $(EXTRA_OBJS)
eclair: eclair.c
$(CC) $(CFLAGS) -o $@ $^ $(LIBS)
clean: clean:
$(RM) $(TARGET) *.o $(RM) $(TARGET) *.o
@ -17,4 +23,5 @@ clean:
install: install:
mkdir -p -m 755 $(BINDIR) mkdir -p -m 755 $(BINDIR)
install -m 755 mcexec $(BINDIR) install -m 755 mcexec $(BINDIR)
@uncomment_if_ENABLE_MEMDUMP@install -m 755 eclair $(BINDIR)

966
executer/user/eclair.c Normal file
View File

@ -0,0 +1,966 @@
/**
* \file eclair.c
* License details are found in the file LICENSE.
* \brief
* IHK os memory dump analyzer for McKernel
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com> \par
* Copyright (C) 2015 RIKEN AICS
*/
#include <bfd.h>
#include <fcntl.h>
#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#define CPU_TID_BASE 1000000
/* Run-time options of the dump analyzer. */
struct options {
uint8_t cpu; /* non-zero: setup_threads() also lists each active CPU as a pseudo thread */
uint8_t help; /* presumably set when usage output is requested - TODO confirm against the option parser */
char *kernel_path; /* presumably the kernel image used for symbols - TODO confirm */
char *dump_path; /* dump file path; read by setup_dump() */
char *log_path; /* presumably a log file path - not used in the visible code */
}; /* struct options */
/*
 * One thread as presented to the debugger, kept on the singly linked list
 * starting at tihead.  Filled in by setup_threads().
 */
struct thread_info {
struct thread_info *next; /* next list element, NULL at the tail */
int status; /* thread status word from the dump, or (cpu status << 16) for CPU pseudo threads */
/* PS_*: values of the thread status word */
#define PS_RUNNING 0x01
#define PS_INTERRUPTIBLE 0x02
#define PS_UNINTERRUPTIBLE 0x04
#define PS_STOPPED 0x20
#define PS_TRACED 0x40
/* CS_*: CPU status values, already shifted left by 16 */
#define CS_IDLE 0x010000
#define CS_RUNNING 0x020000
#define CS_RESERVED 0x030000
int pid; /* pid from the dump, or CPU_TID_BASE + cpu for CPU pseudo threads */
int tid; /* tid from the dump, or CPU_TID_BASE + cpu for CPU pseudo threads */
int cpu; /* CPU the thread is current on, or -1 if it is not current anywhere */
int lcpu; /* CPU whose run queue the thread was found on */
int padding;
uintptr_t process; /* dump address of the thread structure (or of "current" for CPU pseudo threads) */
uintptr_t clv; /* dump address of the owning CPU's cpu_local_var */
uintptr_t x86_clv; /* locals + span * cpu for the owning CPU */
}; /* struct thread_info */
/* File-scope state shared by the setup_*() helpers and command(). */
static struct options opt; /* run-time options */
static volatile int f_done = 0; /* set to 1 by command() when a "D" packet is handled */
static bfd *symbfd = NULL; /* BFD handle opened by setup_symbols() */
static bfd *dumpbfd = NULL; /* BFD handle opened by setup_dump() */
static asection *dumpscn = NULL; /* the "physmem" section of the dump */
static int num_processors = -1; /* read from the dump by setup_threads() */
static asymbol **symtab = NULL; /* canonicalized symbol table of symbfd */
static ssize_t nsyms; /* number of entries in symtab */
static uintptr_t kernel_base; /* dumpscn->vma + 0x200000, set by setup_dump() */
static struct thread_info *tihead = NULL; /* head of the thread list */
static struct thread_info **titailp = &tihead; /* where the next list element gets linked */
static struct thread_info *curr_thread = NULL; /* thread selected for register queries */
static uintptr_t ihk_mc_switch_context = -1; /* address of the symbol of the same name, or NOSYMBOL */
static uintptr_t lookup_symbol(char *name) {
int i;
for (i = 0; i < nsyms; ++i) {
if (!strcmp(symtab[i]->name, name)) {
return (symtab[i]->section->vma + symtab[i]->value);
}
}
#define NOSYMBOL ((uintptr_t)-1)
return NOSYMBOL;
} /* lookup_symbol() */
/*
 * Translate a kernel virtual address into a physical address.
 *
 * Addresses at or above MAP_KERNEL are offset against kernel_base; other
 * addresses at or above MAP_ST are offset against MAP_ST.  Anything lower
 * cannot be translated and yields NOPHYS.
 */
static uintptr_t virt_to_phys(uintptr_t va) {
#define MAP_KERNEL 0xFFFFFFFF80000000
#define MAP_ST 0xFFFF800000000000
#define NOPHYS ((uintptr_t)-1)
	uintptr_t pa = NOPHYS;

	if (va >= MAP_KERNEL) {
		pa = va - MAP_KERNEL + kernel_base;
	}
	else if (va >= MAP_ST) {
		pa = va - MAP_ST;
	}
	else {
		/* disabled debug output */
		if (0) printf("virt_to_phys(%lx): -1\n", va);
	}

	return pa;
} /* virt_to_phys() */
/*
 * Copy `size` bytes located at physical address `pa` out of the dump's
 * physmem section into `buf`.
 *
 * The range must lie completely inside the section.  Returns 0 on success
 * and 1 (after printing a diagnostic) on any failure.
 */
static int read_physmem(uintptr_t pa, void *buf, size_t size) {
	off_t offset;

	if (pa < dumpscn->vma) {
		printf("read_physmem(%lx,%p,%lx):too small pa. vma %lx\n", pa, buf, size, dumpscn->vma);
		return 1;
	}

	/* Position of pa relative to the start of the section. */
	offset = pa - dumpscn->vma;
	if (offset >= dumpscn->size) {
		printf("read_physmem(%lx,%p,%lx):too large pa. vma %lx size %lx\n", pa, buf, size, dumpscn->vma, dumpscn->size);
		return 1;
	}
	if ((dumpscn->size - offset) < size) {
		printf("read_physmem(%lx,%p,%lx):too large size. vma %lx size %lx\n", pa, buf, size, dumpscn->vma, dumpscn->size);
		return 1;
	}

	if (!bfd_get_section_contents(dumpbfd, dumpscn, buf, offset, size)) {
		bfd_perror("read_physmem:bfd_get_section_contents");
		return 1;
	}

	return 0;
} /* read_physmem() */
/*
 * Read `size` bytes at kernel virtual address `va` from the dump into
 * `buf`.  Returns 0 on success, 1 if the address cannot be translated or
 * the physical read fails.
 */
static int read_mem(uintptr_t va, void *buf, size_t size) {
	uintptr_t pa = virt_to_phys(va);

	if (pa == NOPHYS) {
		if (0) {
			/* NOPHYS is usual for 'bt' command */
			perror("read_mem:virt_to_phys");
		}
		return 1;
	}

	if (read_physmem(pa, buf, size)) {
		perror("read_mem:read_physmem");
		return 1;
	}

	return 0;
} /* read_mem() */
/* Read 8 bytes at virtual address va into buf; buf must have room for a uint64_t. Returns read_mem()'s result. */
static int read_64(uintptr_t va, void *buf) {
return read_mem(va, buf, sizeof(uint64_t));
} /* read_64() */
/* Read 4 bytes at virtual address va into buf; buf must have room for a uint32_t. Returns read_mem()'s result. */
static int read_32(uintptr_t va, void *buf) {
return read_mem(va, buf, sizeof(uint32_t));
} /* read_32() */
/*
 * Read the 64-bit value stored at the symbol `name` into `buf`.
 *
 * `buf` must point to at least 8 writable bytes.  Returns 0 on success and
 * 1 if the symbol is unknown or its memory cannot be read; a one-line
 * diagnostic is printed in either failure case.
 */
static int read_symbol_64(char *name, void *buf) {
	uintptr_t va;
	int error;

	va = lookup_symbol(name);
	if (va == NOSYMBOL) {
		printf("read_symbol_64(%s):lookup_symbol failed\n", name);
		return 1;
	}

	error = read_64(va, buf);
	if (error) {
		/* The newline was missing, gluing this to the next output line. */
		printf("read_symbol_64(%s):read_64(%#lx) failed\n", name, va);
		return 1;
	}

	return 0;
} /* read_symbol_64() */
/*
 * Indices into debug_constants[].  The array is filled by setup_constants()
 * from the dump's "debug_constants" symbol, so the order here has to match
 * the layout of that symbol in the dumped kernel.
 */
enum {
/* cpu_local_var */
CPU_LOCAL_VAR_SIZE = 0,
CURRENT_OFFSET,
RUNQ_OFFSET,
CPU_STATUS_OFFSET,
/* process */
CTX_OFFSET,
SCHED_LIST_OFFSET,
PROC_OFFSET,
/* fork_tree_node */
STATUS_OFFSET,
PID_OFFSET,
TID_OFFSET,
END_MARK,
}; /* enum */
/* One slot per enumerator above, including END_MARK itself. */
static uintptr_t debug_constants[END_MARK+1];
/* K(name): value of the constant with index `name`. */
#define K(name) (debug_constants[name])
/*
 * Fill debug_constants[] from the dump's "debug_constants" symbol.
 * Returns 0 on success, 1 if the symbol is missing or cannot be read.
 */
static int setup_constants(void) {
	uintptr_t va = lookup_symbol("debug_constants");

	/* Either failure is reported the same way. */
	if (va == NOSYMBOL
			|| read_mem(va, debug_constants, sizeof(debug_constants))) {
		perror("debug_constants");
		return 1;
	}

	/* disabled debug output */
	if (0) {
		printf("CPU_LOCAL_VAR_SIZE: %ld\n", K(CPU_LOCAL_VAR_SIZE));
		printf("CURRENT_OFFSET: %ld\n", K(CURRENT_OFFSET));
		printf("RUNQ_OFFSET: %ld\n", K(RUNQ_OFFSET));
		printf("CPU_STATUS_OFFSET: %ld\n", K(CPU_STATUS_OFFSET));
		printf("CTX_OFFSET: %ld\n", K(CTX_OFFSET));
		printf("SCHED_LIST_OFFSET: %ld\n", K(SCHED_LIST_OFFSET));
		printf("PROC_OFFSET: %ld\n", K(PROC_OFFSET));
		printf("STATUS_OFFSET: %ld\n", K(STATUS_OFFSET));
		printf("PID_OFFSET: %ld\n", K(PID_OFFSET));
		printf("TID_OFFSET: %ld\n", K(TID_OFFSET));
		printf("END_MARK: %ld\n", K(END_MARK));
	}

	return 0;
} /* setup_constants() */
static int setup_threads(void) {
int error;
uintptr_t clv;
int cpu;
uintptr_t current;
uintptr_t locals;
size_t locals_span;
error = read_symbol_64("num_processors", &num_processors);
if (error) {
perror("num_processors");
return 1;
}
error = read_symbol_64("locals", &locals);
if (error) {
perror("locals");
return 1;
}
error = read_symbol_64("x86_cpu_local_variables_span", &locals_span);
if (error) {
locals_span = 4096;
}
if (0) printf("locals 0x%lx span 0x%lx\n", locals, locals_span);
error = read_symbol_64("clv", &clv);
if (error) {
perror("clv");
return 1;
}
ihk_mc_switch_context = lookup_symbol("ihk_mc_switch_context");
if (0) printf("ihk_mc_switch_context: %lx\n", ihk_mc_switch_context);
for (cpu = 0; cpu < num_processors; ++cpu) {
uintptr_t v;
uintptr_t head;
uintptr_t entry;
v = clv + (cpu * K(CPU_LOCAL_VAR_SIZE));
error = read_64(v+K(CURRENT_OFFSET), &current);
if (error) {
perror("current");
return 1;
}
head = v + K(RUNQ_OFFSET);
error = read_64(head, &entry);
if (error) {
perror("runq head");
return 1;
}
while (entry != head) {
uintptr_t thread;
uintptr_t proc;
int pid;
int tid;
struct thread_info *ti;
int status;
ti = malloc(sizeof(*ti));
if (!ti) {
perror("malloc");
return 1;
}
thread = entry - K(SCHED_LIST_OFFSET);
error = read_64(thread+K(PROC_OFFSET), &proc);
if (error) {
perror("proc");
return 1;
}
error = read_32(thread+K(STATUS_OFFSET), &status);
if (error) {
perror("status");
return 1;
}
error = read_32(proc+K(PID_OFFSET), &pid);
if (error) {
perror("pid");
return 1;
}
error = read_32(thread+K(TID_OFFSET), &tid);
if (error) {
perror("tid");
return 1;
}
ti->next = NULL;
ti->status = status;
ti->pid = pid;
ti->tid = tid;
ti->cpu = (thread == current)? cpu: -1;
ti->lcpu = cpu;
ti->process = thread;
ti->clv = v;
ti->x86_clv = locals + locals_span*cpu;
*titailp = ti;
titailp = &ti->next;
error = read_64(entry, &entry);
if (error) {
perror("process2");
return 1;
}
}
}
if (!tihead) {
printf("thread not found. cpu mode forcibly\n");
opt.cpu = 1;
}
if (opt.cpu) {
for (cpu = 0; cpu < num_processors; ++cpu) {
uintptr_t v;
struct thread_info *ti;
int status;
uintptr_t current;
v = clv + K(CPU_LOCAL_VAR_SIZE)*cpu;
error = read_32(v+K(CPU_STATUS_OFFSET), &status);
if (error) {
perror("cpu.status");
return 1;
}
if (!status) {
continue;
}
error = read_64(v+K(CURRENT_OFFSET), &current);
if (error) {
perror("current");
return 1;
}
ti = malloc(sizeof(*ti));
if (!ti) {
perror("malloc");
return 1;
}
ti->next = NULL;
ti->status = status << 16;
ti->pid = CPU_TID_BASE + cpu;
ti->tid = CPU_TID_BASE + cpu;
ti->cpu = cpu;
ti->process = current;
ti->clv = v;
ti->x86_clv = locals + locals_span*cpu;
*titailp = ti;
titailp = &ti->next;
}
}
if (!tihead) {
printf("thread not found\n");
return 1;
}
curr_thread = tihead;
return 0;
} /* setup_threads() */
/*
 * Open the ELF object `fname` and load its symbol table into symtab/nsyms.
 * Returns 0 on success, 1 (after printing a diagnostic) on failure.
 */
static int setup_symbols(char *fname) {
	ssize_t bytes;

	symbfd = bfd_openr(fname, "elf64-x86-64");
	if (!symbfd) {
		bfd_perror("bfd_openr");
		return 1;
	}

	if (!bfd_check_format(symbfd, bfd_object)) {
		bfd_perror("bfd_check_format");
		return 1;
	}

	/* Storage needed for the symbol table: negative = error, 0 = none. */
	bytes = bfd_get_symtab_upper_bound(symbfd);
	if (bytes < 0) {
		bfd_perror("bfd_get_symtab_upper_bound");
		return 1;
	}
	if (bytes == 0) {
		printf("no symbols\n");
		return 1;
	}

	symtab = malloc(bytes);
	if (!symtab) {
		perror("malloc");
		return 1;
	}

	nsyms = bfd_canonicalize_symtab(symbfd, symtab);
	if (nsyms < 0) {
		bfd_perror("bfd_canonicalize_symtab");
		return 1;
	}

	return 0;
} /* setup_symbols() */
/*
 * Open the memory dump `fname` and locate its "physmem" section.
 *
 * On success dumpbfd and dumpscn are set and kernel_base is derived from
 * the section's vma (vma + 0x200000).  Returns 0 on success, 1 (after
 * printing a diagnostic) on failure.
 *
 * The `fname` argument used to be ignored in favour of the global
 * opt.dump_path; the file named by the caller is opened now.
 */
static int setup_dump(char *fname) {
	bfd_boolean ok;

	dumpbfd = bfd_fopen(fname, "elf64-x86-64", "r", -1);
	if (!dumpbfd) {
		bfd_perror("bfd_fopen");
		return 1;
	}

	ok = bfd_check_format(dumpbfd, bfd_object);
	if (!ok) {
		bfd_perror("bfd_check_format");
		return 1;
	}

	dumpscn = bfd_get_section_by_name(dumpbfd, "physmem");
	if (!dumpscn) {
		bfd_perror("bfd_get_section_by_name");
		return 1;
	}

	kernel_base = dumpscn->vma + 0x200000;

	return 0;
} /* setup_dump() */
/*
 * Write the bytes of the NUL-terminated string `str` into `buf` as
 * lowercase hex, two digits per byte, followed by a terminating NUL.
 *
 * `buf` must have room for 2 * strlen(str) + 1 bytes.
 * Returns the number of characters written, not counting the terminator.
 *
 * Each byte is converted to unsigned char first: a plain (signed) char
 * with the top bit set would be sign-extended and printed as eight hex
 * digits instead of two.
 */
static ssize_t print_hex(char *buf, char *str) {
	char *p;
	char *q;

	q = buf;
	for (p = str; *p != '\0'; ++p) {
		q += sprintf(q, "%02x", (unsigned char)*p);
	}
	*q = '\0';

	return (q - buf);
} /* print_hex() */
/*
 * Write `size` bytes starting at `data` into `buf` as lowercase hex, two
 * digits per byte in memory order, followed by a terminating NUL.
 *
 * `buf` must have room for 2 * size + 1 bytes.
 * Returns the number of characters written, not counting the terminator.
 */
static ssize_t print_bin(char *buf, void *data, size_t size) {
	const uint8_t *src = data;
	char *dst = buf;
	size_t remaining;

	for (remaining = size; remaining > 0; --remaining) {
		dst += sprintf(dst, "%02x", *src++);
	}
	*dst = '\0';

	return (dst - buf);
} /* print_bin() */
static void command(char *cmd, char *res) {
char *p;
char *rbp;
p = cmd;
rbp = res;
do {
if (!strncmp(p, "qSupported", 10)) {
rbp += sprintf(rbp, "PacketSize=1024");
rbp += sprintf(rbp, ";qXfer:features:read+");
}
else if (!strncmp(p, "Hg", 2)) {
int n;
int tid;
struct thread_info *ti;
p += 2;
n = sscanf(p, "%x", &tid);
if (n != 1) {
printf("cannot parse 'Hg' cmd: \"%s\"\n", p);
break;
}
if (tid) {
for (ti = tihead; ti; ti = ti->next) {
if (ti->tid == tid) {
break;
}
}
if (!ti) {
printf("invalid tid %#x\n", tid);
break;
}
curr_thread = ti;
}
rbp += sprintf(rbp, "OK");
}
else if (!strcmp(p, "Hc-1")) {
rbp += sprintf(rbp, "OK");
}
else if (!strcmp(p, "?")) {
rbp += sprintf(rbp, "S02");
}
else if (!strcmp(p, "qC")) {
rbp += sprintf(rbp, "QC%x", curr_thread->tid);
}
else if (!strcmp(p, "qAttached")) {
rbp += sprintf(rbp, "1");
}
else if (!strncmp(p, "qXfer:features:read:target.xml:", 31)) {
char *str =
"<target version=\"1.0\">"
"<architecture>i386:x86-64</architecture>"
"</target>";
rbp += sprintf(rbp, "l");
if (0)
rbp += print_hex(rbp, str);
rbp += sprintf(rbp, "%s", str);
}
else if (!strcmp(p, "D")) {
rbp += sprintf(rbp, "OK");
f_done = 1;
}
else if (!strcmp(p, "g")) {
if (curr_thread->cpu < 0) {
struct x86_kregs {
uintptr_t rsp, rbp, rbx, rsi;
uintptr_t rdi, r12, r13, r14;
uintptr_t r15, rflags, rsp0;
};
int error;
struct x86_kregs kregs;
error = read_mem(curr_thread->process+K(CTX_OFFSET),
&kregs, sizeof(kregs));
if (error) {
perror("read_mem");
break;
}
rbp += sprintf(rbp, "xxxxxxxxxxxxxxxx"); /* rax */
rbp += print_bin(rbp, &kregs.rbx, sizeof(uint64_t));
rbp += sprintf(rbp, "xxxxxxxxxxxxxxxx"); /* rcx */
rbp += sprintf(rbp, "xxxxxxxxxxxxxxxx"); /* rdx */
rbp += print_bin(rbp, &kregs.rsi, sizeof(uint64_t));
rbp += print_bin(rbp, &kregs.rdi, sizeof(uint64_t));
rbp += print_bin(rbp, &kregs.rbp, sizeof(uint64_t));
rbp += print_bin(rbp, &kregs.rsp, sizeof(uint64_t));
rbp += sprintf(rbp, "xxxxxxxxxxxxxxxx"); /* r8 */
rbp += sprintf(rbp, "xxxxxxxxxxxxxxxx"); /* r9 */
rbp += sprintf(rbp, "xxxxxxxxxxxxxxxx"); /* r10 */
rbp += sprintf(rbp, "xxxxxxxxxxxxxxxx"); /* r11 */
rbp += print_bin(rbp, &kregs.r12, sizeof(uint64_t));
rbp += print_bin(rbp, &kregs.r13, sizeof(uint64_t));
rbp += print_bin(rbp, &kregs.r14, sizeof(uint64_t));
rbp += print_bin(rbp, &kregs.r15, sizeof(uint64_t));
rbp += print_bin(rbp, &ihk_mc_switch_context,
sizeof(uint64_t)); /* rip */
rbp += print_bin(rbp, &kregs.rflags, sizeof(uint32_t));
rbp += sprintf(rbp, "xxxxxxxx"); /* cs */
rbp += sprintf(rbp, "xxxxxxxx"); /* ss */
rbp += sprintf(rbp, "xxxxxxxx"); /* ds */
rbp += sprintf(rbp, "xxxxxxxx"); /* es */
rbp += sprintf(rbp, "xxxxxxxx"); /* fs */
rbp += sprintf(rbp, "xxxxxxxx"); /* gs */
}
else {
int error;
uintptr_t regs[21];
uint8_t *pu8;
int i;
error = read_mem(curr_thread->x86_clv+240,
&regs, sizeof(regs));
if (error) {
perror("read_mem");
break;
}
pu8 = (void *)&regs;
for (i = 0; i < sizeof(regs)-4; ++i) {
rbp += sprintf(rbp, "%02x", pu8[i]);
}
}
}
else if (!strcmp(p, "mffffffff80018a82,1")) {
rbp += sprintf(rbp, "b8");
}
else if (!strcmp(p, "mffffffff80018a82,9")) {
rbp += sprintf(rbp, "b8f2ffffff41564155");
}
else if (!strncmp(p, "m", 1)) {
int n;
uintptr_t start;
size_t size;
uintptr_t addr;
int error;
uint8_t u8;
++p;
n = sscanf(p, "%lx,%lx", &start, &size);
if (n != 2) {
break;
}
for (addr = start; addr < (start + size); ++addr) {
error = read_mem(addr, &u8, sizeof(u8));
if (error) {
u8 = 0xE5;
}
rbp += sprintf(rbp, "%02x", u8);
}
}
else if (!strcmp(p, "qTStatus")) {
rbp += sprintf(rbp, "T0;tnotrun:0");
}
else if (!strncmp(p, "qXfer:memory-map:read::", 23)) {
char *str =
"<memory-map>"
"<memory type=\"rom\" start=\"0xffffffff80001000\" length=\"0x27000\"/>"
"</memory-map>";
rbp += sprintf(rbp, "l");
if (0)
rbp += print_hex(rbp, str);
rbp += sprintf(rbp, "%s", str);
}
else if (!strncmp(p, "T", 1)) {
int n;
int tid;
struct thread_info *ti;
p += 1;
n = sscanf(p, "%x", &tid);
if (n != 1) {
printf("cannot parse 'T' cmd: \"%s\"\n", p);
break;
}
for (ti = tihead; ti; ti = ti->next) {
if (ti->tid == tid) {
break;
}
}
if (!ti) {
printf("invalid tid %#x\n", tid);
break;
}
rbp += sprintf(rbp, "OK");
}
else if (!strcmp(p, "qfThreadInfo")) {
struct thread_info *ti;
for (ti = tihead; ti; ti = ti->next) {
if (ti == tihead) {
rbp += sprintf(rbp, "m%x", ti->tid);
}
else {
rbp += sprintf(rbp, ",%x", ti->tid);
}
}
}
else if (!strcmp(p, "qsThreadInfo")) {
rbp += sprintf(rbp, "l");
}
else if (!strncmp(p, "qThreadExtraInfo,", 17)) {
int n;
int tid;
struct thread_info *ti;
char buf[64];
char *q;
p += 17;
n = sscanf(p, "%x", &tid);
if (n != 1) {
printf("cannot parse 'qThreadExtraInfo' cmd: \"%s\"\n", p);
break;
}
for (ti = tihead; ti; ti = ti->next) {
if (ti->tid == tid) {
break;
}
}
if (!ti) {
printf("invalid tid %#x\n", tid);
break;
}
q = buf;
if (ti->status & PS_RUNNING) {
q += sprintf(q, "running on cpu%d", ti->cpu);
}
else if (ti->status & (PS_INTERRUPTIBLE | PS_UNINTERRUPTIBLE)) {
q += sprintf(q, "waiting on cpu%d", ti->lcpu);
}
else if (ti->status & PS_STOPPED) {
q += sprintf(q, "stopped on cpu%d", ti->lcpu);
}
else if (ti->status & PS_TRACED) {
q += sprintf(q, "traced on cpu%d", ti->lcpu);
}
else if (ti->status == CS_IDLE) {
q += sprintf(q, "cpu%d idle", ti->cpu);
}
else if (ti->status == CS_RUNNING) {
q += sprintf(q, "cpu%d running", ti->cpu);
}
else if (ti->status == CS_RESERVED) {
q += sprintf(q, "cpu%d reserved", ti->cpu);
}
else {
q += sprintf(q, "status=%#x", ti->status);
}
if (ti->tid != ti->pid) {
q += sprintf(q, ",pid=%d", ti->pid);
}
rbp += print_hex(rbp, buf);
}
} while (0);
*rbp = '\0';
return;
} /* command() */
/*
 * Parse eclair's command line into the file-level "opt" structure.
 *
 * Defaults are "./mckernel.img" for the kernel image and "./mcdump" for
 * the dump file.  -h, an unrecognized option, or any leftover non-option
 * argument sets opt.help so that the caller prints the usage message.
 */
static void options(int argc, char *argv[]) {
	int ch;

	memset(&opt, 0, sizeof(opt));
	opt.dump_path = "./mcdump";
	opt.kernel_path = "./mckernel.img";

	while ((ch = getopt(argc, argv, "cd:hk:")) >= 0) {
		if (ch == 'c') {
			opt.cpu = 1;
		}
		else if (ch == 'd') {
			opt.dump_path = optarg;
		}
		else if (ch == 'k') {
			opt.kernel_path = optarg;
		}
		else if ((ch == 'h') || (ch == '?')) {
			opt.help = 1;
		}
	}

	/* eclair takes no positional arguments */
	if (argc > optind) {
		opt.help = 1;
	}
} /* options() */
/* TCP socket created by start_gdb() that gdb connects to; -1 until then. */
static int sock = -1;
/* Read stream over the accepted gdb connection (set by start_gdb()). */
static FILE *ifp = NULL;
/* Write stream over the same accepted connection (set by start_gdb()). */
static FILE *ofp = NULL;
/*
 * Spawn gdb and connect it to this process over a local TCP socket.
 *
 * A listening socket is bound to an ephemeral port on the loopback
 * interface, gdb is forked with "target remote :<port>", and the
 * resulting connection is wrapped in the file-level streams ifp (read)
 * and ofp (write).
 *
 * Returns 0 on success, 1 on failure (after printing the failing call
 * with perror()).
 */
static int start_gdb(void) {
	struct sockaddr_in sin;
	socklen_t slen;
	int error;
	pid_t pid;
	int ss;

	sock = socket(PF_INET, SOCK_STREAM, 0);
	if (sock < 0) {
		perror("socket");
		return 1;
	}

	/*
	 * Bind explicitly: listen() on an unbound socket is not portable,
	 * and the stub only ever talks to the gdb spawned below, so there
	 * is no reason to accept connections on external interfaces.
	 */
	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
	sin.sin_port = htons(0);	/* let the kernel choose the port */
	error = bind(sock, (struct sockaddr *)&sin, sizeof(sin));
	if (error) {
		perror("bind");
		return 1;
	}

	error = listen(sock, SOMAXCONN);
	if (error) {
		perror("listen");
		return 1;
	}

	/* find out which port was assigned */
	slen = sizeof(sin);
	error = getsockname(sock, (struct sockaddr *)&sin, &slen);
	if (error) {
		perror("getsockname");
		return 1;
	}

	pid = fork();
	if (pid == (pid_t)-1) {
		perror("fork");
		return 1;
	}

	if (!pid) {
		char buf[32];

		snprintf(buf, sizeof(buf), "target remote :%d",
				ntohs(sin.sin_port));
		execlp("gdb", "eclair", "-q", "-ex", "set prompt (eclair) ",
				"-ex", buf, opt.kernel_path, (char *)NULL);
		perror("execlp");
		/*
		 * Never return into the parent's code path from the child:
		 * leave immediately without flushing inherited stdio buffers.
		 */
		_exit(3);
	}

	ss = accept(sock, NULL, NULL);
	if (ss < 0) {
		perror("accept");
		return 1;
	}

	ifp = fdopen(ss, "r");
	if (!ifp) {
		perror("fdopen(r)");
		close(ss);
		return 1;
	}

	ofp = fdopen(ss, "r+");
	if (!ofp) {
		perror("fdopen(r+)");
		fclose(ifp);	/* also closes ss */
		ifp = NULL;
		return 1;
	}

	return 0;
} /* start_gdb() */
/* Write the one-line command synopsis to stderr. */
static void print_usage(void) {
	static const char usage[] =
		"usage: eclair [-ch] [-d <mcdump>] [-k <kernel.img>]\n";

	fputs(usage, stderr);
} /* print_usage() */
/*
 * eclair entry point.
 *
 * Loads the kernel symbols and the dump, starts gdb, and then serves
 * GDB remote-protocol packets ("$<data>#<2 hex checksum digits>") read
 * from ifp until command() sets f_done or the connection ends.
 *
 * The receive loop is a small state machine:
 *   mode 0: waiting for '$'
 *   mode 1: collecting packet data up to '#'
 *   mode 2: first checksum digit
 *   mode 3: second checksum digit, then verify / dispatch / reply
 *
 * Returns 0 on normal termination, 1 on a setup failure, 2 when the
 * usage message was requested.
 */
int main(int argc, char *argv[]) {
	int c;
	int error;
	int mode;
	int overflow;
	uint8_t sum;
	uint8_t check;
	static char lbuf[1024];
	static char rbuf[1024];
	static char cbuf[3];
	char *lbp;
	char *p;

	printf("eclair 0.20160314\n");
	options(argc, argv);
	if (opt.help) {
		print_usage();
		return 2;
	}

	error = setup_symbols(opt.kernel_path);
	if (error) {
		perror("setup_symbols");
		print_usage();
		return 1;
	}

	error = setup_dump(opt.dump_path);
	if (error) {
		perror("setup_dump");
		print_usage();
		return 1;
	}

	error = setup_constants();
	if (error) {
		perror("setup_constants");
		return 1;
	}

	error = setup_threads();
	if (error) {
		perror("setup_threads");
		return 1;
	}

	error = start_gdb();
	if (error) {
		perror("start_gdb");
		return 1;
	}

	mode = 0;
	sum = 0;
	overflow = 0;
	lbp = NULL;
	while (!f_done) {
		c = fgetc(ifp);
		if (c < 0) {
			break;
		}

		switch (mode) {
		case 0:
			/* anything but a packet start (e.g. gdb's acks) is ignored */
			if (c == '$') {
				mode = 1;
				sum = 0;
				overflow = 0;
				lbp = lbuf;
			}
			break;

		case 1:
			if (c == '#') {
				mode = 2;
				*lbp = '\0';
				break;
			}
			sum += c;
			/*
			 * Keep one byte for the terminator.  Excess data is
			 * dropped (but still checksummed) so that an oversized
			 * packet cannot write past the end of lbuf.
			 */
			if (lbp < (lbuf + sizeof(lbuf) - 1)) {
				*lbp++ = c;
			}
			else {
				overflow = 1;
			}
			break;

		case 2:
			cbuf[0] = c;
			mode = 3;
			break;

		case 3:
			cbuf[1] = c;
			cbuf[2] = '\0';
			check = strtol(cbuf, NULL, 16);
			mode = 0;
			if (check != sum) {
				/*
				 * Flush the NAK right away; otherwise it sits in
				 * the stdio buffer and gdb never retransmits.
				 */
				fputc('-', ofp);
				fflush(ofp);
				break;
			}
			fputc('+', ofp);
			if (overflow) {
				/* truncated request: answer "unsupported" */
				rbuf[0] = '\0';
			}
			else {
				command(lbuf, rbuf);
			}
			sum = 0;
			for (p = rbuf; *p != '\0'; ++p) {
				sum += *p;
			}
			fprintf(ofp, "$%s#%02x", rbuf, sum);
			fflush(ofp);
			break;

		default:
			mode = 0;
			break;
		}
	}

	return 0;
} /* main() */

View File

@ -59,7 +59,12 @@
#include <semaphore.h> #include <semaphore.h>
#include <signal.h> #include <signal.h>
#include <sys/signalfd.h> #include <sys/signalfd.h>
#include <sys/mount.h>
#include <include/generated/uapi/linux/version.h>
#include <sys/user.h>
#include "../include/uprotocol.h" #include "../include/uprotocol.h"
#include <getopt.h>
#include "../config.h"
//#define DEBUG //#define DEBUG
@ -129,6 +134,7 @@ static char *exec_path = NULL;
static char *altroot; static char *altroot;
static const char rlimit_stack_envname[] = "MCKERNEL_RLIMIT_STACK"; static const char rlimit_stack_envname[] = "MCKERNEL_RLIMIT_STACK";
static int ischild; static int ischild;
static int enable_vdso = 1;
struct fork_sync { struct fork_sync {
pid_t pid; pid_t pid;
@ -183,6 +189,8 @@ struct program_load_desc *load_elf(FILE *fp, char **interp_pathp)
desc = malloc(sizeof(struct program_load_desc) desc = malloc(sizeof(struct program_load_desc)
+ sizeof(struct program_image_section) * nhdrs); + sizeof(struct program_image_section) * nhdrs);
memset(desc, '\0', sizeof(struct program_load_desc)
+ sizeof(struct program_image_section) * nhdrs);
desc->shell_path[0] = '\0'; desc->shell_path[0] = '\0';
fseek(fp, hdr.e_phoff, SEEK_SET); fseek(fp, hdr.e_phoff, SEEK_SET);
j = 0; j = 0;
@ -243,6 +251,8 @@ struct program_load_desc *load_elf(FILE *fp, char **interp_pathp)
} }
desc->pid = getpid(); desc->pid = getpid();
desc->pgid = getpgid(0); desc->pgid = getpgid(0);
if(*interp_pathp)
desc->reloc = hdr.e_type == ET_DYN;
desc->entry = hdr.e_entry; desc->entry = hdr.e_entry;
ioctl(fd, MCEXEC_UP_GET_CREDV, desc->cred); ioctl(fd, MCEXEC_UP_GET_CREDV, desc->cred);
desc->at_phdr = load_addr + hdr.e_phoff; desc->at_phdr = load_addr + hdr.e_phoff;
@ -478,7 +488,7 @@ retry:
} }
if ((sb.st_mode & S_IFMT) == S_IFLNK) { if ((sb.st_mode & S_IFMT) == S_IFLNK) {
char *link_path = malloc(max_len); link_path = malloc(max_len);
if (!link_path) { if (!link_path) {
fprintf(stderr, "lookup_exec_path(): error allocating\n"); fprintf(stderr, "lookup_exec_path(): error allocating\n");
return ENOMEM; return ENOMEM;
@ -489,9 +499,18 @@ retry:
fprintf(stderr, "lookup_exec_path(): error readlink\n"); fprintf(stderr, "lookup_exec_path(): error readlink\n");
return EINVAL; return EINVAL;
} }
link_path[error] = '\0';
__dprintf("lookup_exec_path(): %s is link -> %s\n", path, link_path); __dprintf("lookup_exec_path(): %s is link -> %s\n", path, link_path);
if(link_path[0] != '/'){
char *t = strrchr(path, '/');
if(t){
t++;
strcpy(t, link_path);
strcpy(link_path, path);
}
}
filename = link_path; filename = link_path;
goto retry; goto retry;
} }
@ -635,9 +654,6 @@ int load_elf_desc(char *filename, struct program_load_desc **desc_p,
return 0; return 0;
} }
#define PAGE_SIZE 4096
#define PAGE_MASK ~((unsigned long)PAGE_SIZE - 1)
void transfer_image(int fd, struct program_load_desc *desc) void transfer_image(int fd, struct program_load_desc *desc)
{ {
struct remote_transfer pt; struct remote_transfer pt;
@ -659,6 +675,7 @@ void transfer_image(int fd, struct program_load_desc *desc)
desc->sections[i].offset, flen); desc->sections[i].offset, flen);
while (s < e) { while (s < e) {
memset(&pt, '\0', sizeof pt);
pt.rphys = rpa; pt.rphys = rpa;
pt.userp = dma_buf; pt.userp = dma_buf;
pt.size = PAGE_SIZE; pt.size = PAGE_SIZE;
@ -762,7 +779,7 @@ int flatten_strings(int nr_strings, char *first, char **strings, char **flat)
} }
/* Count full length */ /* Count full length */
full_len = sizeof(int) + sizeof(char *); // Counter and terminating NULL full_len = sizeof(long) + sizeof(char *); // Counter and terminating NULL
if (first) { if (first) {
full_len += sizeof(char *) + strlen(first) + 1; full_len += sizeof(char *) + strlen(first) + 1;
} }
@ -772,6 +789,8 @@ int flatten_strings(int nr_strings, char *first, char **strings, char **flat)
full_len += sizeof(char *) + strlen(strings[string_i]) + 1; full_len += sizeof(char *) + strlen(strings[string_i]) + 1;
} }
full_len = (full_len + sizeof(long) - 1) & ~(sizeof(long) - 1);
_flat = (char *)malloc(full_len); _flat = (char *)malloc(full_len);
if (!_flat) { if (!_flat) {
return 0; return 0;
@ -780,14 +799,14 @@ int flatten_strings(int nr_strings, char *first, char **strings, char **flat)
memset(_flat, 0, full_len); memset(_flat, 0, full_len);
/* Number of strings */ /* Number of strings */
*((int*)_flat) = nr_strings + (first ? 1 : 0); *((long *)_flat) = nr_strings + (first ? 1 : 0);
// Actual offset // Actual offset
flat_offset = sizeof(int) + sizeof(char *) * (nr_strings + 1 + flat_offset = sizeof(long) + sizeof(char *) * (nr_strings + 1 +
(first ? 1 : 0)); (first ? 1 : 0));
if (first) { if (first) {
*((char **)(_flat + sizeof(int))) = (void *)flat_offset; *((char **)(_flat + sizeof(long))) = (void *)flat_offset;
memcpy(_flat + flat_offset, first, strlen(first) + 1); memcpy(_flat + flat_offset, first, strlen(first) + 1);
flat_offset += strlen(first) + 1; flat_offset += strlen(first) + 1;
} }
@ -795,7 +814,7 @@ int flatten_strings(int nr_strings, char *first, char **strings, char **flat)
for (string_i = 0; string_i < nr_strings; ++string_i) { for (string_i = 0; string_i < nr_strings; ++string_i) {
/* Fabricate the string */ /* Fabricate the string */
*((char **)(_flat + sizeof(int) + (string_i + (first ? 1 : 0)) *((char **)(_flat + sizeof(long) + (string_i + (first ? 1 : 0))
* sizeof(char *))) = (void *)flat_offset; * sizeof(char *))) = (void *)flat_offset;
memcpy(_flat + flat_offset, strings[string_i], strlen(strings[string_i]) + 1); memcpy(_flat + flat_offset, strings[string_i], strlen(strings[string_i]) + 1);
flat_offset += strlen(strings[string_i]) + 1; flat_offset += strlen(strings[string_i]) + 1;
@ -878,6 +897,7 @@ sendsig(int sig, siginfo_t *siginfo, void *context)
remote_tid = -1; remote_tid = -1;
} }
memset(&sigdesc, '\0', sizeof sigdesc);
sigdesc.cpu = cpu; sigdesc.cpu = cpu;
sigdesc.pid = (int)pid; sigdesc.pid = (int)pid;
sigdesc.tid = remote_tid; sigdesc.tid = remote_tid;
@ -904,6 +924,7 @@ act_signalfd4(struct syscall_wait_desc *w)
switch(mode){ switch(mode){
case 0: /* new signalfd */ case 0: /* new signalfd */
sfd = malloc(sizeof(struct sigfd)); sfd = malloc(sizeof(struct sigfd));
memset(sfd, '\0', sizeof(struct sigfd));
tmp = w->sr.args[1]; tmp = w->sr.args[1];
flags = 0; flags = 0;
if(tmp & SFD_NONBLOCK) if(tmp & SFD_NONBLOCK)
@ -1069,6 +1090,82 @@ void init_worker_threads(int fd)
pthread_barrier_wait(&init_ready); pthread_barrier_wait(&init_ready);
} }
#ifdef ENABLE_MCOVERLAYFS
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,0,0) && LINUX_VERSION_CODE < KERNEL_VERSION(4,1,0)
#define READ_BUFSIZE 1024
/*
 * Check whether this process already sees an mcoverlay mount on /proc.
 *
 * Scans /proc/<pid>/mounts for a line starting with "mcoverlay /proc ".
 * Only the beginning of each line is examined, so mount lines of any
 * length are handled without accumulating them in a fixed-size buffer.
 *
 * Returns 1 when such a line is found, 0 when it is not, and -1 when
 * the mounts file cannot be opened or read.
 */
static int isunshare(void)
{
	static const char prefix[] = "mcoverlay /proc ";
	int err = 0;
	int at_line_start = 1;
	FILE *fp;
	char proc_path[PATH_MAX];
	char buf_read[READ_BUFSIZE + 1];

	snprintf(proc_path, sizeof(proc_path), "/proc/%d/mounts", getpid());

	fp = fopen(proc_path, "r");
	if (!fp) {
		fprintf(stderr, "Error: Failed to open %s.\n", proc_path);
		return -1;
	}

	/*
	 * fgets() hands back at most one line per call.  A line longer
	 * than the buffer arrives in several pieces; only the first piece
	 * (at_line_start) can contain the prefix.
	 */
	while (fgets(buf_read, sizeof(buf_read), fp)) {
		size_t len = strlen(buf_read);

		if (at_line_start &&
				!strncmp(buf_read, prefix, sizeof(prefix) - 1)) {
			err = 1;
			break;
		}

		at_line_start = (len > 0 && buf_read[len - 1] == '\n');
	}

	if (err != 1 && ferror(fp)) {
		fprintf(stderr, "Error: Failed to read.\n");
		err = -1;
	}

	fclose(fp);

	__dprintf("err=%d\n", err);
	return err;
}
#endif
#endif // ENABLE_MCOVERLAYFS
#define MCK_RLIMIT_AS 0 #define MCK_RLIMIT_AS 0
#define MCK_RLIMIT_CORE 1 #define MCK_RLIMIT_CORE 1
#define MCK_RLIMIT_CPU 2 #define MCK_RLIMIT_CPU 2
@ -1139,6 +1236,24 @@ static int rlimits[] = {
char dev[64]; char dev[64];
/*
 * Long options passed to getopt_long().  Both entries use the "flag"
 * mechanism: getopt_long() stores .val into enable_vdso and returns 0,
 * so no per-option handling is needed in the option switch.
 */
static struct option mcexec_options[] = {
{
.name = "disable-vdso",
.has_arg = no_argument,
.flag = &enable_vdso,
.val = 0,
},
{
.name = "enable-vdso",
.has_arg = no_argument,
.flag = &enable_vdso,
.val = 1,
},
/* end */
{ NULL, 0, NULL, 0, },
};
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
// int fd; // int fd;
@ -1190,12 +1305,15 @@ int main(int argc, char **argv)
} }
/* Parse options ("+" denotes stop at the first non-option) */ /* Parse options ("+" denotes stop at the first non-option) */
while ((opt = getopt(argc, argv, "+c:")) != -1) { while ((opt = getopt_long(argc, argv, "+c:", mcexec_options, NULL)) != -1) {
switch (opt) { switch (opt) {
case 'c': case 'c':
target_core = atoi(optarg); target_core = atoi(optarg);
break; break;
case 0: /* long opt */
break;
default: /* '?' */ default: /* '?' */
print_usage(argv); print_usage(argv);
exit(EXIT_FAILURE); exit(EXIT_FAILURE);
@ -1234,6 +1352,60 @@ int main(int argc, char **argv)
return 1; return 1;
} }
#ifdef ENABLE_MCOVERLAYFS
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,0,0) && LINUX_VERSION_CODE < KERNEL_VERSION(4,1,0)
__dprintf("mcoverlay enable\n");
char mcos_procdir[PATH_MAX];
char mcos_sysdir[PATH_MAX];
error = isunshare();
if (error == 0) {
struct sys_unshare_desc unshare_desc;
struct sys_mount_desc mount_desc;
memset(&unshare_desc, '\0', sizeof unshare_desc);
memset(&mount_desc, '\0', sizeof mount_desc);
unshare_desc.unshare_flags = CLONE_NEWNS;
if (ioctl(fd, MCEXEC_UP_SYS_UNSHARE,
(unsigned long)&unshare_desc) != 0) {
fprintf(stderr, "Error: Failed to unshare. (%s)\n",
strerror(errno));
return 1;
}
sprintf(mcos_procdir, "/tmp/mcos/mcos%d_proc", mcosid);
mount_desc.dev_name = mcos_procdir;
mount_desc.dir_name = "/proc";
mount_desc.type = NULL;
mount_desc.flags = MS_BIND;
mount_desc.data = NULL;
if (ioctl(fd, MCEXEC_UP_SYS_MOUNT,
(unsigned long)&mount_desc) != 0) {
fprintf(stderr, "Error: Failed to mount /proc. (%s)\n",
strerror(errno));
return 1;
}
sprintf(mcos_sysdir, "/tmp/mcos/mcos%d_sys", mcosid);
mount_desc.dev_name = mcos_sysdir;
mount_desc.dir_name = "/sys";
mount_desc.type = NULL;
mount_desc.flags = MS_BIND;
mount_desc.data = NULL;
if (ioctl(fd, MCEXEC_UP_SYS_MOUNT,
(unsigned long)&mount_desc) != 0) {
fprintf(stderr, "Error: Failed to mount /sys. (%s)\n",
strerror(errno));
return 1;
}
} else if (error == -1) {
return 1;
}
#endif
#else
__dprintf("mcoverlay disable\n");
#endif // ENABLE_MCOVERLAYFS
if (lookup_exec_path(argv[optind], path, sizeof(path)) != 0) { if (lookup_exec_path(argv[optind], path, sizeof(path)) != 0) {
fprintf(stderr, "error: finding file: %s\n", argv[optind]); fprintf(stderr, "error: finding file: %s\n", argv[optind]);
return 1; return 1;
@ -1272,6 +1444,8 @@ int main(int argc, char **argv)
//print_flat(args); //print_flat(args);
desc->cpu = target_core; desc->cpu = target_core;
desc->enable_vdso = enable_vdso;
p = getenv(rlimit_stack_envname); p = getenv(rlimit_stack_envname);
if (p) { if (p) {
errno = 0; errno = 0;
@ -1401,6 +1575,7 @@ void do_syscall_return(int fd, int cpu,
{ {
struct syscall_ret_desc desc; struct syscall_ret_desc desc;
memset(&desc, '\0', sizeof desc);
desc.cpu = cpu; desc.cpu = cpu;
desc.ret = ret; desc.ret = ret;
desc.src = src; desc.src = src;
@ -1417,6 +1592,7 @@ void do_syscall_load(int fd, int cpu, unsigned long dest, unsigned long src,
{ {
struct syscall_load_desc desc; struct syscall_load_desc desc;
memset(&desc, '\0', sizeof desc);
desc.cpu = cpu; desc.cpu = cpu;
desc.src = src; desc.src = src;
desc.dest = dest; desc.dest = dest;
@ -1466,6 +1642,7 @@ static long do_strncpy_from_user(int fd, void *dest, void *src, unsigned long n)
struct strncpy_from_user_desc desc; struct strncpy_from_user_desc desc;
int ret; int ret;
memset(&desc, '\0', sizeof desc);
desc.dest = dest; desc.dest = dest;
desc.src = src; desc.src = src;
desc.n = n; desc.n = n;
@ -1560,6 +1737,11 @@ int close_cloexec_fds(int mcos_fd)
char * char *
chgpath(char *in, char *buf) chgpath(char *in, char *buf)
{ {
#ifdef ENABLE_MCOVERLAYFS
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,0,0) && LINUX_VERSION_CODE < KERNEL_VERSION(4,1,0)
return in;
#endif
#endif // ENABLE_MCOVERLAYFS
char *fn = in; char *fn = in;
struct stat sb; struct stat sb;
@ -1589,10 +1771,11 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock)
char *fn; char *fn;
int sig; int sig;
int term; int term;
struct timeval tv; struct timespec tv;
char pathbuf[PATH_MAX]; char pathbuf[PATH_MAX];
char tmpbuf[PATH_MAX]; char tmpbuf[PATH_MAX];
memset(&w, '\0', sizeof w);
w.cpu = cpu; w.cpu = cpu;
w.pid = getpid(); w.pid = getpid();
@ -1628,13 +1811,13 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock)
break; break;
case __NR_futex: case __NR_futex:
ret = gettimeofday(&tv, NULL); ret = clock_gettime(w.sr.args[1], &tv);
SET_ERR(ret); SET_ERR(ret);
__dprintf("gettimeofday=%016ld,%09ld\n", __dprintf("clock_gettime=%016ld,%09ld\n",
tv.tv_sec, tv.tv_sec,
tv.tv_usec); tv.tv_nsec);
do_syscall_return(fd, cpu, ret, 1, (unsigned long)&tv, do_syscall_return(fd, cpu, ret, 1, (unsigned long)&tv,
w.sr.args[0], sizeof(struct timeval)); w.sr.args[0], sizeof(struct timespec));
break; break;
case __NR_kill: // interrupt syscall case __NR_kill: // interrupt syscall
@ -1690,17 +1873,12 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock)
return w.sr.args[0]; return w.sr.args[0];
case __NR_mmap: case __NR_mmap:
case __NR_munmap:
case __NR_mprotect: case __NR_mprotect:
/* reserved for internal use */ /* reserved for internal use */
do_syscall_return(fd, cpu, -ENOSYS, 0, 0, 0, 0); do_syscall_return(fd, cpu, -ENOSYS, 0, 0, 0, 0);
break; break;
case __NR_munmap:
ret = madvise((void *)w.sr.args[0], w.sr.args[1], MADV_DONTNEED);
SET_ERR(ret);
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;
#ifdef USE_SYSCALL_MOD_CALL #ifdef USE_SYSCALL_MOD_CALL
case 303:{ case 303:{
__dprintf("mcexec.c,mod_cal,mod=%ld,cmd=%ld\n", w.sr.args[0], w.sr.args[1]); __dprintf("mcexec.c,mod_cal,mod=%ld,cmd=%ld\n", w.sr.args[0], w.sr.args[1]);
@ -1734,6 +1912,7 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock)
struct fork_sync_container *fsc; struct fork_sync_container *fsc;
struct fork_sync_container *fp; struct fork_sync_container *fp;
struct fork_sync_container *fb; struct fork_sync_container *fb;
int flag = w.sr.args[0];
int rc = -1; int rc = -1;
pid_t pid; pid_t pid;
@ -1753,7 +1932,41 @@ int main_loop(int fd, int cpu, pthread_mutex_t *lock)
memset(fs, '\0', sizeof(struct fork_sync)); memset(fs, '\0', sizeof(struct fork_sync));
sem_init(&fs->sem, 1, 0); sem_init(&fs->sem, 1, 0);
pid = fork(); if(flag){
int pipefds[2];
if(pipe(pipefds) == -1){
rc = -errno;
sem_destroy(&fs->sem);
goto fork_err;
}
pid = fork();
if(pid == 0){
close(pipefds[0]);
pid = fork();
if(pid != 0){
write(pipefds[1], &pid, sizeof pid);
exit(0);
}
}
else if(pid != -1){
int npid;
int st;
close(pipefds[1]);
read(pipefds[0], &npid, sizeof npid);
close(pipefds[0]);
waitpid(pid, &st, 0);
pid = npid;
}
else{
rc = -errno;
sem_destroy(&fs->sem);
goto fork_err;
}
}
else
pid = fork();
switch (pid) { switch (pid) {
/* Error */ /* Error */
@ -1896,12 +2109,13 @@ fork_err:
char path[1024]; char path[1024];
char *filename; char *filename;
int ret; int ret;
char *shell = NULL; char *shell;
char shell_path[1024]; char shell_path[1024];
/* Load descriptor phase */ /* Load descriptor phase */
case 1: case 1:
shell = NULL;
filename = (char *)w.sr.args[1]; filename = (char *)w.sr.args[1];
if ((ret = lookup_exec_path(filename, path, sizeof(path))) if ((ret = lookup_exec_path(filename, path, sizeof(path)))
@ -1979,6 +2193,7 @@ return_execve1:
fprintf(stderr, "execve(): error allocating desc\n"); fprintf(stderr, "execve(): error allocating desc\n");
goto return_execve2; goto return_execve2;
} }
memset(desc, '\0', w.sr.args[2]);
/* Copy descriptor from co-kernel side */ /* Copy descriptor from co-kernel side */
trans.userp = (void*)desc; trans.userp = (void*)desc;
@ -2021,6 +2236,11 @@ return_execve2:
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0); do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break; break;
case __NR_perf_event_open:
ret = open("/dev/null", O_RDONLY);
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;
case __NR_rt_sigaction: case __NR_rt_sigaction:
act_sigaction(&w); act_sigaction(&w);
do_syscall_return(fd, cpu, 0, 0, 0, 0, 0); do_syscall_return(fd, cpu, 0, 0, 0, 0, 0);
@ -2050,6 +2270,25 @@ return_execve2:
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0); do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break; break;
case __NR_readlink:
ret = do_strncpy_from_user(fd, pathbuf, (void *)w.sr.args[0], PATH_MAX);
if (ret >= PATH_MAX) {
ret = -ENAMETOOLONG;
}
if (ret < 0) {
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;
}
fn = chgpath(pathbuf, tmpbuf);
ret = readlink(fn, (char *)w.sr.args[1], w.sr.args[2]);
__dprintf("readlink: path=%s, buf=%s, ret=%ld\n",
fn, (char *)w.sr.args[1], ret);
SET_ERR(ret);
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);
break;
default: default:
ret = do_generic_syscall(&w); ret = do_generic_syscall(&w);
do_syscall_return(fd, cpu, ret, 0, 0, 0, 0); do_syscall_return(fd, cpu, ret, 0, 0, 0, 0);

View File

@ -3,7 +3,7 @@ OBJS = init.o mem.o debug.o mikc.o listeners.o ap.o syscall.o cls.o host.o
OBJS += process.o copy.o waitq.o futex.o timer.o plist.o fileobj.o OBJS += process.o copy.o waitq.o futex.o timer.o plist.o fileobj.o
DEPSRCS=$(wildcard $(SRC)/*.c) DEPSRCS=$(wildcard $(SRC)/*.c)
CFLAGS += -I$(SRC)/include -mcmodel=kernel -D__KERNEL__ CFLAGS += -I$(SRC)/include -D__KERNEL__
CFLAGS += -DKNC_MAP_MICPA $(EXTRA_CFLAGS) CFLAGS += -DKNC_MAP_MICPA $(EXTRA_CFLAGS)
ifeq ("$(DCFA_MODE)", "kmod") ifeq ("$(DCFA_MODE)", "kmod")

View File

@ -3,10 +3,10 @@ SRC=$(VPATH)
IHKDIR=$(IHKBASE)/$(TARGETDIR) IHKDIR=$(IHKBASE)/$(TARGETDIR)
OBJS = init.o mem.o debug.o mikc.o listeners.o ap.o syscall.o cls.o host.o OBJS = init.o mem.o debug.o mikc.o listeners.o ap.o syscall.o cls.o host.o
OBJS += process.o copy.o waitq.o futex.o timer.o plist.o fileobj.o shmobj.o OBJS += process.o copy.o waitq.o futex.o timer.o plist.o fileobj.o shmobj.o
OBJS += zeroobj.o procfs.o devobj.o OBJS += zeroobj.o procfs.o devobj.o sysfs.o
DEPSRCS=$(wildcard $(SRC)/*.c) DEPSRCS=$(wildcard $(SRC)/*.c)
CFLAGS += -I$(SRC)/include -mcmodel=kernel -D__KERNEL__ -g CFLAGS += -I$(SRC)/include -D__KERNEL__ -g
LDFLAGS += -e arch_start LDFLAGS += -e arch_start
IHKOBJ = ihk/ihk.o IHKOBJ = ihk/ihk.o

View File

@ -28,19 +28,19 @@
int num_processors = 1; int num_processors = 1;
static volatile int ap_stop = 1; static volatile int ap_stop = 1;
extern void zero_tsc(void);
static void ap_wait(void) static void ap_wait(void)
{ {
init_tick();
while (ap_stop) { while (ap_stop) {
barrier(); barrier();
cpu_pause(); cpu_pause();
} }
sync_tick();
zero_tsc();
kmalloc_init(); kmalloc_init();
sched_init(); sched_init();
arch_start_pvclock();
if (find_command_line("hidos")) { if (find_command_line("hidos")) {
init_host_syscall_channel(); init_host_syscall_channel();
@ -56,7 +56,9 @@ static void ap_wait(void)
void ap_start(void) void ap_start(void)
{ {
init_tick();
ap_stop = 0; ap_stop = 0;
sync_tick();
} }
void ap_init(void) void ap_init(void)
@ -66,6 +68,7 @@ void ap_init(void)
int bsp_hw_id; int bsp_hw_id;
ihk_mc_init_ap(); ihk_mc_init_ap();
init_delay();
cpu_info = ihk_mc_get_cpu_info(); cpu_info = ihk_mc_get_cpu_info();
bsp_hw_id = ihk_mc_get_hardware_processor_id(); bsp_hw_id = ihk_mc_get_hardware_processor_id();
@ -89,3 +92,140 @@ void ap_init(void)
kprintf("AP Booting: Done\n"); kprintf("AP Booting: Done\n");
} }
#include <sysfs.h>
#include <kmalloc.h>
#include <string.h>
#include <vsprintf.h>
/*
 * sysfs "show" handler for a plain int.
 *
 * "instance" points at the int to print; it is formatted as decimal
 * followed by a newline into buf (at most size bytes).  The snprintf()
 * result is returned unchanged.  "ops" is unused.
 */
static ssize_t
show_int(struct sysfs_ops *ops, void *instance, void *buf, size_t size)
{
	int value = *(int *)instance;

	return snprintf(buf, size, "%d\n", value);
}/* show_int() */
/* Read-only sysfs operations for an int instance; only .show is set. */
struct sysfs_ops show_int_ops = {
.show = &show_int,
};
/* Per-CPU sample data exposed through sysfs by cpu_sysfs_setup(). */
struct fake_cpu_info {
int online;
};
/* Table of fake_cpu_info entries; allocated in cpu_sysfs_setup(). */
static struct fake_cpu_info *fake_cpu_infos = NULL;
/* Identifies which fake_cpu_info field a sysfs entry refers to. */
enum fake_cpu_info_member {
ONLINE,
};
/*
 * sysfs_ops wrapper carrying the member selector.  The show/store
 * handlers receive a pointer to .ops and recover this structure with
 * container_of().
 */
struct fake_cpu_info_ops {
enum fake_cpu_info_member member;
struct sysfs_ops ops;
};
/*
 * sysfs "show" handler for a fake_cpu_info field.
 *
 * ops0 must be the .ops member of a struct fake_cpu_info_ops; its
 * .member selects the field of "instance" (a struct fake_cpu_info) to
 * format into buf.
 *
 * Returns the number of characters produced, -ENOSPC when the output
 * did not fit into size bytes, or -EINVAL for an unknown member.
 */
static ssize_t
show_fake_cpu_info(struct sysfs_ops *ops0, void *instance, void *buf,
		size_t size)
{
	struct fake_cpu_info_ops *ops
		= container_of(ops0, struct fake_cpu_info_ops, ops);
	struct fake_cpu_info *info = instance;
	ssize_t n;

	switch (ops->member) {
	case ONLINE:
		n = snprintf(buf, size, "%d\n", info->online);
		break;
	default:
		n = -EINVAL;
		break;
	}

	/*
	 * Only a non-negative length can indicate truncation.  Comparing a
	 * negative error code against the unsigned size would convert it
	 * to a huge value and wrongly replace it with -ENOSPC.
	 */
	if ((n >= 0) && ((size_t)n >= size)) {
		n = -ENOSPC;
	}

	return n;
} /* show_fake_cpu_info() */
/*
 * sysfs "store" handler for a fake_cpu_info field.
 *
 * Storing is not implemented: for the ONLINE member the request is only
 * logged and the whole input is reported as consumed (returns size).
 * Any other member yields -EIO.
 */
static ssize_t
store_fake_cpu_info(struct sysfs_ops *ops0, void *instance, void *buf,
		size_t size)
{
	struct fake_cpu_info_ops *wrapper
		= container_of(ops0, struct fake_cpu_info_ops, ops);
	struct fake_cpu_info *cpu_info = instance;

	if (wrapper->member != ONLINE) {
		return -EIO;
	}

	kprintf("NYI:store_fake_cpu_info(%p,%p,%p,%ld): "
			"online %d --> \"%.*s\"\n",
			ops0, instance, buf, size, cpu_info->online,
			(int)size, buf);

	return size;
} /* store_fake_cpu_info() */
/* sysfs operations for the "online" field of a fake_cpu_info entry. */
static struct fake_cpu_info_ops show_fci_online = {
.member = ONLINE,
.ops.show = &show_fake_cpu_info,
.ops.store = &store_fake_cpu_info,
};
/*
 * Populate the sample CPU entries in sysfs.
 *
 * Creates /sys/devices/system/cpu/num_processors, one
 * /sys/devices/system/cpu/cpu<N>/online file per processor backed by a
 * freshly allocated fake_cpu_info table, and a
 * /sys/bus/cpu/devices/cpu<N> symlink to each cpu<N> directory.
 *
 * Any failure is fatal (panic).
 */
void
cpu_sysfs_setup(void)
{
	int error;
	int cpu;
	sysfs_handle_t targeth;
	struct fake_cpu_info *info;

	/* sample of simple variable **********************************/
	error = sysfs_createf(&show_int_ops, &num_processors, 0444,
			"/sys/devices/system/cpu/num_processors");
	if (error) {
		panic("cpu_sysfs_setup:sysfs_createf(num_processors) failed\n");
	}

	/* sample of more complex variable ****************************/
	/* setup table */
	info = kmalloc(sizeof(*info) * num_processors, IHK_MC_AP_CRITICAL);
	if (!info) {
		/* the table is written right below; do not touch NULL */
		panic("cpu_sysfs_setup:kmalloc failed\n");
	}
	for (cpu = 0; cpu < num_processors; ++cpu) {
		info[cpu].online = 10+cpu;
	}
	fake_cpu_infos = info;

	/* setup sysfs tree */
	for (cpu = 0; cpu < num_processors; ++cpu) {
		/* online */
		error = sysfs_createf(&show_fci_online.ops,
				&fake_cpu_infos[cpu], 0644,
				"/sys/devices/system/cpu/cpu%d/online", cpu);
		if (error) {
			panic("cpu_sysfs_setup:sysfs_createf failed\n");
		}

		/* link to cpu%d */
		error = sysfs_lookupf(&targeth,
				"/sys/devices/system/cpu/cpu%d", cpu);
		if (error) {
			panic("cpu_sysfs_setup:sysfs_lookupf failed\n");
		}

		error = sysfs_symlinkf(targeth, "/sys/bus/cpu/devices/cpu%d",
				cpu);
		if (error) {
			panic("cpu_sysfs_setup:sysfs_symlinkf failed\n");
		}
	}

	return;
} /* cpu_sysfs_setup() */

View File

@ -1,6 +1,5 @@
CC = /usr/linux-k1om-4.7/bin/x86_64-k1om-linux-gcc CC = /usr/linux-k1om-4.7/bin/x86_64-k1om-linux-gcc
LD = /usr/linux-k1om-4.7/bin/x86_64-k1om-linux-ld LD = /usr/linux-k1om-4.7/bin/x86_64-k1om-linux-ld
CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow
LDFLAGS += -m elf_k1om -T $(SRC)/config/attached-mic.lds LDFLAGS += -m elf_k1om -T $(SRC)/config/attached-mic.lds
LDFLAGS_MKIMAGE = -m elf_k1om LDFLAGS_MKIMAGE = -m elf_k1om

View File

@ -3,6 +3,5 @@ LD = /usr/linux-k1om-4.7/bin/x86_64-k1om-linux-ld
OBJDUMP = /usr/linux-k1om-4.7/bin/x86_64-k1om-linux-objdump OBJDUMP = /usr/linux-k1om-4.7/bin/x86_64-k1om-linux-objdump
OBJCOPY = /usr/linux-k1om-4.7/bin/x86_64-k1om-linux-objcopy OBJCOPY = /usr/linux-k1om-4.7/bin/x86_64-k1om-linux-objcopy
CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow
LDFLAGS += -m elf_k1om -T $(SRC)/config/builtin-mic.lds LDFLAGS += -m elf_k1om -T $(SRC)/config/builtin-mic.lds
LDFLAGS_MKIMAGE = -m elf_k1om LDFLAGS_MKIMAGE = -m elf_k1om

View File

@ -1,2 +1 @@
CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow
LDFLAGS += -T $(SRC)/config/builtin-x86.lds LDFLAGS += -T $(SRC)/config/builtin-x86.lds

View File

@ -1,2 +1 @@
CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow
LDFLAGS += -T $(SRC)/config/smp-x86.lds LDFLAGS += -T $(SRC)/config/smp-x86.lds

View File

@ -22,13 +22,44 @@ extern int vsnprintf(char *buf, size_t size, const char *fmt, va_list args);
extern int sprintf(char * buf, const char *fmt, ...); extern int sprintf(char * buf, const char *fmt, ...);
static ihk_spinlock_t kmsg_lock; static ihk_spinlock_t kmsg_lock;
static unsigned long kprintf_lock_head(void);
static void kprintf_unlock_head(unsigned long irqflags);
/*
 * Check whether appending len bytes at kmsg_buf.tail would run into
 * kmsg_buf.head, and react according to kmsg_buf.mode.
 *
 * The caller must already hold the head lock (kprintf_lock_head());
 * *flags_head holds the value returned when it was taken and is updated
 * whenever the lock is dropped and re-taken here.
 *
 * On return either the write does not reach head, or *slide has been set
 * to 1 (mode != 1), in which case the callers move head past the new tail
 * after writing.
 */
static void kprintf_wait(int len, unsigned long *flags_head, int *slide) {
int head, tail, buf_len, mode, adj;
mode = kmsg_buf.mode;
while (1) {
adj = 0;
tail = kmsg_buf.tail;
buf_len = kmsg_buf.len;
head = kmsg_buf.head;
/* unwrap head so that it can be compared linearly against tail */
if (head < tail) head += buf_len;
/* the write would not fit before the end of the buffer: account for the unused rest */
if (tail + len > buf_len) adj = buf_len - tail;
if (head > tail && head <= tail + len + adj) {
if (mode != 1) {
*slide = 1;
break;
} else {
/* NOTE(review): presumably gives a reader the chance to advance head before re-checking -- confirm */
kprintf_unlock_head(*flags_head);
*flags_head = kprintf_lock_head();
}
} else {
break;
}
}
}
/* TODO: lock */ /* TODO: lock */
void kputs(char *buf) void kputs(char *buf)
{ {
int len = strlen(buf); int len = strlen(buf);
unsigned long flags; int slide = 0;
unsigned long flags_tail, flags_head;
flags = __ihk_mc_spinlock_lock(&kmsg_lock); flags_tail = kprintf_lock();
flags_head = kprintf_lock_head();
kprintf_wait(len, &flags_head, &slide);
if (len + kmsg_buf.tail > kmsg_buf.len) { if (len + kmsg_buf.tail > kmsg_buf.len) {
kmsg_buf.tail = 0; kmsg_buf.tail = 0;
@ -39,8 +70,12 @@ void kputs(char *buf)
memcpy(kmsg_buf.str + kmsg_buf.tail, buf, len); memcpy(kmsg_buf.str + kmsg_buf.tail, buf, len);
kmsg_buf.tail += len; kmsg_buf.tail += len;
if (slide == 1) {
__ihk_mc_spinlock_unlock(&kmsg_lock, flags); kmsg_buf.head = kmsg_buf.tail + 1;
if (kmsg_buf.head >= kmsg_buf.len) kmsg_buf.head = 0;
}
kprintf_unlock_head(flags_head);
kprintf_unlock(flags_tail);
} }
#define KPRINTF_LOCAL_BUF_LEN 1024 #define KPRINTF_LOCAL_BUF_LEN 1024
@ -55,11 +90,23 @@ void kprintf_unlock(unsigned long irqflags)
__ihk_mc_spinlock_unlock(&kmsg_lock, irqflags); __ihk_mc_spinlock_unlock(&kmsg_lock, irqflags);
} }
/*
 * Take kmsg_buf.lock and return the value produced by
 * __ihk_mc_spinlock_lock(), to be passed to kprintf_unlock_head().
 */
static unsigned long kprintf_lock_head(void)
{
return __ihk_mc_spinlock_lock(&kmsg_buf.lock);
}
/*
 * Release kmsg_buf.lock; irqflags is the value returned by the matching
 * kprintf_lock_head() call.
 */
static void kprintf_unlock_head(unsigned long irqflags)
{
__ihk_mc_spinlock_unlock(&kmsg_buf.lock, irqflags);
}
/* Caller must hold kmsg_lock! */ /* Caller must hold kmsg_lock! */
int __kprintf(const char *format, ...) int __kprintf(const char *format, ...)
{ {
int len = 0; int len = 0;
int slide = 0;
va_list va; va_list va;
unsigned long flags_head;
char buf[KPRINTF_LOCAL_BUF_LEN]; char buf[KPRINTF_LOCAL_BUF_LEN];
/* Copy into the local buf */ /* Copy into the local buf */
@ -67,6 +114,9 @@ int __kprintf(const char *format, ...)
len += vsnprintf(buf + len, KPRINTF_LOCAL_BUF_LEN - len - 2, format, va); len += vsnprintf(buf + len, KPRINTF_LOCAL_BUF_LEN - len - 2, format, va);
va_end(va); va_end(va);
flags_head = kprintf_lock_head();
kprintf_wait(len, &flags_head, &slide);
/* Append to kmsg buffer */ /* Append to kmsg buffer */
if (kmsg_buf.tail + len > kmsg_buf.len) { if (kmsg_buf.tail + len > kmsg_buf.len) {
kmsg_buf.tail = 0; kmsg_buf.tail = 0;
@ -74,25 +124,33 @@ int __kprintf(const char *format, ...)
memcpy(kmsg_buf.str + kmsg_buf.tail, buf, len); memcpy(kmsg_buf.str + kmsg_buf.tail, buf, len);
kmsg_buf.tail += len; kmsg_buf.tail += len;
if (slide == 1) {
kmsg_buf.head = kmsg_buf.tail + 1;
if (kmsg_buf.head >= kmsg_buf.len) kmsg_buf.head = 0;
}
kprintf_unlock_head(flags_head);
return len; return len;
} }
int kprintf(const char *format, ...) int kprintf(const char *format, ...)
{ {
int len = 0; int len = 0;
int slide = 0;
va_list va; va_list va;
unsigned long flags; unsigned long flags_tail, flags_head;
char buf[KPRINTF_LOCAL_BUF_LEN]; char buf[KPRINTF_LOCAL_BUF_LEN];
flags = __ihk_mc_spinlock_lock(&kmsg_lock);
/* Copy into the local buf */ /* Copy into the local buf */
len = sprintf(buf, "[%3d]: ", ihk_mc_get_processor_id()); len = sprintf(buf, "[%3d]: ", ihk_mc_get_processor_id());
va_start(va, format); va_start(va, format);
len += vsnprintf(buf + len, KPRINTF_LOCAL_BUF_LEN - len - 2, format, va); len += vsnprintf(buf + len, KPRINTF_LOCAL_BUF_LEN - len - 2, format, va);
va_end(va); va_end(va);
flags_tail = kprintf_lock();
flags_head = kprintf_lock_head();
kprintf_wait(len, &flags_head, &slide);
/* Append to kmsg buffer */ /* Append to kmsg buffer */
if (kmsg_buf.tail + len > kmsg_buf.len) { if (kmsg_buf.tail + len > kmsg_buf.len) {
kmsg_buf.tail = 0; kmsg_buf.tail = 0;
@ -100,16 +158,24 @@ int kprintf(const char *format, ...)
memcpy(kmsg_buf.str + kmsg_buf.tail, buf, len); memcpy(kmsg_buf.str + kmsg_buf.tail, buf, len);
kmsg_buf.tail += len; kmsg_buf.tail += len;
if (slide == 1) {
kmsg_buf.head = kmsg_buf.tail + 1;
if (kmsg_buf.head >= kmsg_buf.len) kmsg_buf.head = 0;
}
__ihk_mc_spinlock_unlock(&kmsg_lock, flags); kprintf_unlock_head(flags_head);
kprintf_unlock(flags_tail);
return len; return len;
} }
void kmsg_init(void) void kmsg_init(int mode)
{ {
ihk_mc_spinlock_init(&kmsg_lock); ihk_mc_spinlock_init(&kmsg_lock);
kmsg_buf.tail = 0; kmsg_buf.tail = 0;
kmsg_buf.len = sizeof(kmsg_buf.str); kmsg_buf.len = sizeof(kmsg_buf.str);
kmsg_buf.head = 0;
kmsg_buf.mode = mode;
ihk_mc_spinlock_init(&kmsg_buf.lock);
memset(kmsg_buf.str, 0, kmsg_buf.len); memset(kmsg_buf.str, 0, kmsg_buf.len);
} }

View File

@ -47,6 +47,7 @@ static memobj_get_page_func_t fileobj_get_page;
static memobj_copy_page_func_t fileobj_copy_page; static memobj_copy_page_func_t fileobj_copy_page;
static memobj_flush_page_func_t fileobj_flush_page; static memobj_flush_page_func_t fileobj_flush_page;
static memobj_invalidate_page_func_t fileobj_invalidate_page; static memobj_invalidate_page_func_t fileobj_invalidate_page;
static memobj_lookup_page_func_t fileobj_lookup_page;
static struct memobj_ops fileobj_ops = { static struct memobj_ops fileobj_ops = {
.release = &fileobj_release, .release = &fileobj_release,
@ -55,6 +56,7 @@ static struct memobj_ops fileobj_ops = {
.copy_page = &fileobj_copy_page, .copy_page = &fileobj_copy_page,
.flush_page = &fileobj_flush_page, .flush_page = &fileobj_flush_page,
.invalidate_page = &fileobj_invalidate_page, .invalidate_page = &fileobj_invalidate_page,
.lookup_page = &fileobj_lookup_page,
}; };
static struct fileobj *to_fileobj(struct memobj *memobj) static struct fileobj *to_fileobj(struct memobj *memobj)
@ -609,3 +611,37 @@ out:
memobj, phys, pgsize, error); memobj, phys, pgsize, error);
return error; return error;
} }
/*
 * Look up the physical address of a page that the file object already
 * tracks for offset 'off'.
 *
 * memobj:  memory object, converted with to_fileobj().
 * off:     offset passed to page_list_lookup().
 * p2align: must equal PAGE_P2ALIGN; any other value yields -ENOMEM.
 * physp:   if non-NULL, receives the physical address on success.
 * pflag:   not used by this implementation.
 *
 * Returns 0 on success, -ENOENT when page_list_lookup() finds no page,
 * or -ENOMEM for an unsupported p2align. The memobj lock is held for the
 * duration of the lookup.
 */
static int fileobj_lookup_page(struct memobj *memobj, off_t off, int p2align, uintptr_t *physp, unsigned long *pflag)
{
	struct fileobj *obj = to_fileobj(memobj);
	struct page *page;
	uintptr_t phys = -1;
	int error;

	dkprintf("fileobj_lookup_page(%p,%lx,%x,%p)\n", obj, off, p2align, physp);

	memobj_lock(&obj->memobj);
	if (p2align != PAGE_P2ALIGN) {
		/* Only the base page size is handled here. */
		error = -ENOMEM;
	} else if ((page = page_list_lookup(obj, off)) == NULL) {
		error = -ENOENT;
		dkprintf("fileobj_lookup_page(%p,%lx,%x,%p): page not found. %d\n", obj, off, p2align, physp, error);
	} else {
		phys = page_to_phys(page);
		error = 0;
		if (physp) {
			*physp = phys;
		}
	}
	memobj_unlock(&obj->memobj);

	dkprintf("fileobj_lookup_page(%p,%lx,%x,%p): %d %lx\n",
			obj, off, p2align, physp, error, phys);
	return error;
}

View File

@ -153,7 +153,7 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
*/ */
static void get_futex_key_refs(union futex_key *key) static void get_futex_key_refs(union futex_key *key)
{ {
/* RIKEN: only !fshared futexes... */ /* RIKEN: no swapping in McKernel */
return; return;
} }
@ -163,7 +163,7 @@ static void get_futex_key_refs(union futex_key *key)
*/ */
static void drop_futex_key_refs(union futex_key *key) static void drop_futex_key_refs(union futex_key *key)
{ {
/* RIKEN: only !fshared futexes... */ /* RIKEN: no swapping in McKernel */
return; return;
} }
/** /**
@ -185,6 +185,7 @@ static int
get_futex_key(uint32_t *uaddr, int fshared, union futex_key *key) get_futex_key(uint32_t *uaddr, int fshared, union futex_key *key)
{ {
unsigned long address = (unsigned long)uaddr; unsigned long address = (unsigned long)uaddr;
unsigned long phys;
struct process_vm *mm = cpu_local_var(current)->vm; struct process_vm *mm = cpu_local_var(current)->vm;
/* /*
@ -203,15 +204,31 @@ get_futex_key(uint32_t *uaddr, int fshared, union futex_key *key)
* but access_ok() should be faster than find_vma() * but access_ok() should be faster than find_vma()
*/ */
if (!fshared) { if (!fshared) {
key->private.mm = mm; key->private.mm = mm;
key->private.address = address; key->private.address = address;
get_futex_key_refs(key); get_futex_key_refs(key);
return 0; return 0;
} }
/* RIKEN: No shared futex support... */ key->both.offset |= FUT_OFF_MMSHARED;
return -EFAULT;
retry_v2p:
/* Just use physical address of page, McKernel does not do swapping */
if (ihk_mc_pt_virt_to_phys(mm->address_space->page_table,
(void *)uaddr, &phys)) {
/* Check if we can fault in page */
if (page_fault_process_vm(mm, uaddr, PF_POPULATE | PF_WRITE | PF_USER)) {
kprintf("error: get_futex_key() virt to phys translation failed\n");
return -EFAULT;
}
goto retry_v2p;
}
key->shared.phys = (void *)phys;
key->shared.pgoff = 0;
return 0;
} }
@ -234,7 +251,7 @@ static int cmpxchg_futex_value_locked(uint32_t __user *uaddr, uint32_t uval, uin
static int get_futex_value_locked(uint32_t *dest, uint32_t *from) static int get_futex_value_locked(uint32_t *dest, uint32_t *from)
{ {
/* RIKEN: futexes are always on not swappable pages */ /* RIKEN: futexes are always on not swappable pages */
*dest = *from; *dest = getint_user((int *)from);
return 0; return 0;
} }
@ -265,6 +282,7 @@ static void wake_futex(struct futex_q *q)
barrier(); barrier();
q->lock_ptr = NULL; q->lock_ptr = NULL;
dkprintf("wake_futex(): waking up tid %d\n", p->tid);
sched_wakeup_thread(p, PS_NORMAL); sched_wakeup_thread(p, PS_NORMAL);
} }
@ -667,12 +685,16 @@ static uint64_t futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q
/* RIKEN: use mcos timers */ /* RIKEN: use mcos timers */
if (timeout) { if (timeout) {
dkprintf("futex_wait_queue_me(): tid: %d schedule_timeout()\n", cpu_local_var(current)->tid);
time_remain = schedule_timeout(timeout); time_remain = schedule_timeout(timeout);
} }
else { else {
dkprintf("futex_wait_queue_me(): tid: %d schedule()\n", cpu_local_var(current)->tid);
schedule(); schedule();
time_remain = 0; time_remain = 0;
} }
dkprintf("futex_wait_queue_me(): tid: %d woken up\n", cpu_local_var(current)->tid);
} }
/* This does not need to be serialized */ /* This does not need to be serialized */
@ -777,10 +799,10 @@ retry:
if (timeout && !time_remain) if (timeout && !time_remain)
goto out_put_key; goto out_put_key;
if(hassigpending(cpu_local_var(current))){ if (hassigpending(cpu_local_var(current))) {
ret = -EINTR; ret = -EINTR;
goto out_put_key; goto out_put_key;
} }
/* RIKEN: no signals */ /* RIKEN: no signals */
put_futex_key(fshared, &q.key); put_futex_key(fshared, &q.key);
@ -793,17 +815,10 @@ out:
} }
int futex(uint32_t *uaddr, int op, uint32_t val, uint64_t timeout, int futex(uint32_t *uaddr, int op, uint32_t val, uint64_t timeout,
uint32_t *uaddr2, uint32_t val2, uint32_t val3) uint32_t *uaddr2, uint32_t val2, uint32_t val3, int fshared)
{ {
int clockrt, ret = -ENOSYS; int clockrt, ret = -ENOSYS;
int cmd = op & FUTEX_CMD_MASK; int cmd = op & FUTEX_CMD_MASK;
int fshared = 0;
/* RIKEN: Assume address space private futexes.
if (!(op & FUTEX_PRIVATE_FLAG)) {
fshared = 1;
}
*/
clockrt = op & FUTEX_CLOCK_REALTIME; clockrt = op & FUTEX_CLOCK_REALTIME;
if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
@ -824,8 +839,7 @@ int futex(uint32_t *uaddr, int op, uint32_t val, uint64_t timeout,
ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0); ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0);
break; break;
case FUTEX_CMP_REQUEUE: case FUTEX_CMP_REQUEUE:
ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, 0);
0);
break; break;
case FUTEX_WAKE_OP: case FUTEX_WAKE_OP:
ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3);

View File

@ -30,6 +30,7 @@
#include <mman.h> #include <mman.h>
#include <init.h> #include <init.h>
#include <kmalloc.h> #include <kmalloc.h>
#include <sysfs.h>
//#define DEBUG_PRINT_HOST //#define DEBUG_PRINT_HOST
@ -84,15 +85,17 @@ int prepare_process_ranges_args_envs(struct thread *thread,
struct process *proc = thread->proc; struct process *proc = thread->proc;
struct process_vm *vm = proc->vm; struct process_vm *vm = proc->vm;
struct address_space *as = vm->address_space; struct address_space *as = vm->address_space;
long aout_base;
int error;
n = p->num_sections; n = p->num_sections;
aout_base = (pn->reloc)? vm->region.map_end: 0;
for (i = 0; i < n; i++) { for (i = 0; i < n; i++) {
if (pn->sections[i].interp && (interp_nbase == (uintptr_t)-1)) { if (pn->sections[i].interp && (interp_nbase == (uintptr_t)-1)) {
interp_obase = pn->sections[i].vaddr; interp_obase = pn->sections[i].vaddr;
interp_obase -= (interp_obase % pn->interp_align); interp_obase -= (interp_obase % pn->interp_align);
interp_nbase = vm->region.map_start; interp_nbase = vm->region.map_end;
interp_nbase = (interp_nbase + pn->interp_align - 1) interp_nbase = (interp_nbase + pn->interp_align - 1)
& ~(pn->interp_align - 1); & ~(pn->interp_align - 1);
} }
@ -102,6 +105,10 @@ int prepare_process_ranges_args_envs(struct thread *thread,
pn->sections[i].vaddr += interp_nbase; pn->sections[i].vaddr += interp_nbase;
p->sections[i].vaddr = pn->sections[i].vaddr; p->sections[i].vaddr = pn->sections[i].vaddr;
} }
else{
pn->sections[i].vaddr += aout_base;
p->sections[i].vaddr = pn->sections[i].vaddr;
}
s = (pn->sections[i].vaddr) & PAGE_MASK; s = (pn->sections[i].vaddr) & PAGE_MASK;
e = (pn->sections[i].vaddr + pn->sections[i].len e = (pn->sections[i].vaddr + pn->sections[i].len
+ PAGE_SIZE - 1) & PAGE_MASK; + PAGE_SIZE - 1) & PAGE_MASK;
@ -117,7 +124,8 @@ int prepare_process_ranges_args_envs(struct thread *thread,
} }
up = virt_to_phys(up_v); up = virt_to_phys(up_v);
if (add_process_memory_range(vm, s, e, up, flags, NULL, 0) != 0) { if (add_process_memory_range(vm, s, e, up, flags, NULL, 0,
PAGE_SHIFT) != 0) {
ihk_mc_free_pages(up_v, range_npages); ihk_mc_free_pages(up_v, range_npages);
kprintf("ERROR: adding memory range for ELF section %i\n", i); kprintf("ERROR: adding memory range for ELF section %i\n", i);
goto err; goto err;
@ -170,6 +178,10 @@ int prepare_process_ranges_args_envs(struct thread *thread,
(e > vm->region.data_end ? (e > vm->region.data_end ?
e : vm->region.data_end); e : vm->region.data_end);
} }
if (aout_base) {
vm->region.map_end = e;
}
} }
if (interp_nbase != (uintptr_t)-1) { if (interp_nbase != (uintptr_t)-1) {
@ -181,6 +193,11 @@ int prepare_process_ranges_args_envs(struct thread *thread,
pn->entry); pn->entry);
} }
if (aout_base) {
pn->at_phdr += aout_base;
pn->at_entry += aout_base;
}
vm->region.brk_start = vm->region.brk_end = vm->region.data_end; vm->region.brk_start = vm->region.brk_end = vm->region.data_end;
/* Map, copy and update args and envs */ /* Map, copy and update args and envs */
@ -196,7 +213,7 @@ int prepare_process_ranges_args_envs(struct thread *thread,
args_envs_p = virt_to_phys(args_envs); args_envs_p = virt_to_phys(args_envs);
if(add_process_memory_range(vm, addr, e, args_envs_p, if(add_process_memory_range(vm, addr, e, args_envs_p,
flags, NULL, 0) != 0){ flags, NULL, 0, PAGE_SHIFT) != 0){
ihk_mc_free_pages(args_envs, ARGENV_PAGE_COUNT); ihk_mc_free_pages(args_envs, ARGENV_PAGE_COUNT);
kprintf("ERROR: adding memory range for args/envs\n"); kprintf("ERROR: adding memory range for args/envs\n");
goto err; goto err;
@ -227,9 +244,9 @@ int prepare_process_ranges_args_envs(struct thread *thread,
p->args_len = args_len; p->args_len = args_len;
} }
dkprintf("args copy, nr: %d\n", *((int*)args_envs_r)); dkprintf("args copy, nr: %d\n", *((long *)args_envs_r));
memcpy_long(args_envs, args_envs_r, p->args_len + 8); memcpy_long(args_envs, args_envs_r, p->args_len + sizeof(long) - 1);
/* Only unmap remote address if it wasn't specified as an argument */ /* Only unmap remote address if it wasn't specified as an argument */
if (!args) { if (!args) {
@ -262,9 +279,9 @@ int prepare_process_ranges_args_envs(struct thread *thread,
p->envs_len = envs_len; p->envs_len = envs_len;
} }
dkprintf("envs copy, nr: %d\n", *((int*)args_envs_r)); dkprintf("envs copy, nr: %d\n", *((long *)args_envs_r));
memcpy_long(args_envs + p->args_len, args_envs_r, p->envs_len + 8); memcpy_long(args_envs + p->args_len, args_envs_r, p->envs_len + sizeof(long) - 1);
/* Only map remote address if it wasn't specified as an argument */ /* Only map remote address if it wasn't specified as an argument */
if (!envs) { if (!envs) {
@ -274,10 +291,10 @@ int prepare_process_ranges_args_envs(struct thread *thread,
flush_tlb(); flush_tlb();
// Update variables // Update variables
argc = *((int*)(args_envs)); argc = *((long *)(args_envs));
dkprintf("argc: %d\n", argc); dkprintf("argc: %d\n", argc);
argv = (char **)(args_envs + (sizeof(int))); argv = (char **)(args_envs + (sizeof(long)));
if(proc->saved_cmdline){ if(proc->saved_cmdline){
kfree(proc->saved_cmdline); kfree(proc->saved_cmdline);
proc->saved_cmdline_len = 0; proc->saved_cmdline_len = 0;
@ -294,20 +311,28 @@ int prepare_process_ranges_args_envs(struct thread *thread,
*a = (char *)addr + (unsigned long)*a; // Process' address space! *a = (char *)addr + (unsigned long)*a; // Process' address space!
} }
envc = *((int*)(args_envs + p->args_len)); envc = *((long *)(args_envs + p->args_len));
dkprintf("envc: %d\n", envc); dkprintf("envc: %d\n", envc);
env = (char **)(args_envs + p->args_len + sizeof(int)); env = (char **)(args_envs + p->args_len + sizeof(long));
while (*env) { while (*env) {
char **_env = env; char **_env = env;
//dkprintf("%s\n", args_envs + p->args_len + (unsigned long)*env); //dkprintf("%s\n", args_envs + p->args_len + (unsigned long)*env);
*env = (char *)addr + p->args_len + (unsigned long)*env; *env = (char *)addr + p->args_len + (unsigned long)*env;
env = ++_env; env = ++_env;
} }
env = (char **)(args_envs + p->args_len + sizeof(int)); env = (char **)(args_envs + p->args_len + sizeof(long));
dkprintf("env OK\n"); dkprintf("env OK\n");
if (pn->enable_vdso) {
error = arch_map_vdso(vm);
if (error) {
kprintf("ERROR: mapping vdso pages. %d\n", error);
goto err;
}
}
p->rprocess = (unsigned long)thread; p->rprocess = (unsigned long)thread;
p->rpgtable = virt_to_phys(as->page_table); p->rpgtable = virt_to_phys(as->page_table);
@ -379,11 +404,23 @@ static int process_msg_prepare_process(unsigned long rphys)
proc->egid = pn->cred[5]; proc->egid = pn->cred[5];
proc->sgid = pn->cred[6]; proc->sgid = pn->cred[6];
proc->fsgid = pn->cred[7]; proc->fsgid = pn->cred[7];
proc->termsig = SIGCHLD;
vm->region.user_start = pn->user_start; vm->region.user_start = pn->user_start;
vm->region.user_end = pn->user_end; vm->region.user_end = pn->user_end;
vm->region.map_start = (USER_END / 3) & LARGE_PAGE_MASK; if(vm->region.user_end > USER_END)
vm->region.map_end = proc->vm->region.map_start; vm->region.user_end = USER_END;
if(vm->region.user_start != 0UL ||
vm->region.user_end < TASK_UNMAPPED_BASE){
vm->region.map_start =
(vm->region.user_start +
(vm->region.user_end - vm->region.user_start) / 3) &
LARGE_PAGE_MASK;
}
else{
vm->region.map_start = TASK_UNMAPPED_BASE;
}
vm->region.map_end = vm->region.map_start;
memcpy(proc->rlimit, pn->rlimit, sizeof(struct rlimit) * MCK_RLIM_MAX); memcpy(proc->rlimit, pn->rlimit, sizeof(struct rlimit) * MCK_RLIM_MAX);
/* TODO: Clear it at the proper timing */ /* TODO: Clear it at the proper timing */
@ -425,7 +462,7 @@ static void process_msg_init(struct ikc_scd_init_param *pcp, struct syscall_para
static void process_msg_init_acked(struct ihk_ikc_channel_desc *c, unsigned long pphys) static void process_msg_init_acked(struct ihk_ikc_channel_desc *c, unsigned long pphys)
{ {
struct ikc_scd_init_param *param = (void *)pphys; struct ikc_scd_init_param *param = phys_to_virt(pphys);
struct syscall_params *lparam; struct syscall_params *lparam;
enum ihk_mc_pt_attribute attr; enum ihk_mc_pt_attribute attr;
@ -493,6 +530,31 @@ extern int runcount;
extern void terminate_host(int pid); extern void terminate_host(int pid);
extern void debug_log(long); extern void debug_log(long);
static void req_get_cpu_mapping(long req_rpa)
{
size_t mapsize;
size_t size;
int npages;
long phys;
struct get_cpu_mapping_req *req;
struct cpu_mapping *buf;
size = sizeof(*req);
mapsize = size + (req_rpa & (PAGE_SIZE - 1));
npages = (mapsize + PAGE_SIZE - 1) >> PAGE_SHIFT;
phys = ihk_mc_map_memory(NULL, req_rpa, size);
req = ihk_mc_map_virtual(phys, npages, PTATTR_WRITABLE);
req->error = arch_get_cpu_mapping(&buf, &req->buf_elems);
if (!req->error) {
req->buf_rpa = virt_to_phys(buf);
}
ihk_mc_unmap_virtual(req, npages, 0);
ihk_mc_unmap_memory(NULL, phys, size);
return;
} /* req_get_cpu_mapping() */
static int syscall_packet_handler(struct ihk_ikc_channel_desc *c, static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
void *__packet, void *ihk_os) void *__packet, void *ihk_os)
{ {
@ -585,6 +647,30 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
dkprintf("SCD_MSG_DEBUG_LOG code=%lx\n", packet->arg); dkprintf("SCD_MSG_DEBUG_LOG code=%lx\n", packet->arg);
debug_log(packet->arg); debug_log(packet->arg);
return 0; return 0;
case SCD_MSG_SYSFS_REQ_SHOW:
case SCD_MSG_SYSFS_REQ_STORE:
case SCD_MSG_SYSFS_REQ_RELEASE:
sysfss_packet_handler(c, packet->msg, packet->err,
packet->sysfs_arg1, packet->sysfs_arg2,
packet->sysfs_arg3);
return 0;
case SCD_MSG_GET_CPU_MAPPING:
req_get_cpu_mapping(packet->arg);
pckt.msg = SCD_MSG_REPLY_GET_CPU_MAPPING;
pckt.arg = packet->arg;
syscall_channel_send(c, &pckt);
return 0;
default:
kprintf("syscall_pakcet_handler:unknown message "
"(%d.%d.%d.%d.%d.%#lx)\n",
packet->msg, packet->ref, packet->osnum,
packet->pid, packet->err, packet->arg);
return 0;
} }
return 0; return 0;
} }

View File

@ -1,6 +1,8 @@
#ifndef _LINUX_AUXVEC_H #ifndef _LINUX_AUXVEC_H
#define _LINUX_AUXVEC_H #define _LINUX_AUXVEC_H
#include <arch/auxvec.h>
/* Symbolic values for the entries in the auxiliary table /* Symbolic values for the entries in the auxiliary table
put on the initial stack */ put on the initial stack */
#define AT_NULL 0 /* end of vector */ #define AT_NULL 0 /* end of vector */

View File

@ -39,7 +39,7 @@ extern ihk_spinlock_t cpu_status_lock;
struct cpu_local_var { struct cpu_local_var {
/* malloc */ /* malloc */
struct malloc_header free_list; struct malloc_header free_list;
ihk_spinlock_t free_list_lock; struct malloc_header *remote_free_list;
struct thread idle; struct thread idle;
struct process idle_proc; struct process idle_proc;

View File

@ -99,6 +99,8 @@
#ifdef __KERNEL__ #ifdef __KERNEL__
#define __user
/* We don't deal with uaccess at the moment, because x86 can access /* We don't deal with uaccess at the moment, because x86 can access
* userspace directly, we rely on glibc and the app developers. * userspace directly, we rely on glibc and the app developers.
*/ */
@ -106,42 +108,14 @@
#include <arch/uaccess.h> #include <arch/uaccess.h>
#endif #endif
#include <asm.h>
#include <errno.h> #include <errno.h>
#include <arch-futex.h>
#define __user
#if 0 #if 0
#include <arch/processor.h> #include <arch/processor.h>
#include <arch/system.h> #include <arch/system.h>
#endif #endif
#define __futex_atomic_op1(insn, ret, oldval, uaddr, oparg) \
asm volatile("1:\t" insn "\n" \
"2:\t.section .fixup,\"ax\"\n" \
"3:\tmov\t%3, %1\n" \
"\tjmp\t2b\n" \
"\t.previous\n" \
_ASM_EXTABLE(1b, 3b) \
: "=r" (oldval), "=r" (ret), "+m" (*uaddr) \
: "i" (-EFAULT), "0" (oparg), "1" (0))
#define __futex_atomic_op2(insn, ret, oldval, uaddr, oparg) \
asm volatile("1:\tmovl %2, %0\n" \
"\tmovl\t%0, %3\n" \
"\t" insn "\n" \
"2:\tlock; cmpxchgl %3, %2\n" \
"\tjnz\t1b\n" \
"3:\t.section .fixup,\"ax\"\n" \
"4:\tmov\t%5, %1\n" \
"\tjmp\t3b\n" \
"\t.previous\n" \
_ASM_EXTABLE(1b, 4b) \
_ASM_EXTABLE(2b, 4b) \
: "=&a" (oldval), "=&r" (ret), \
"+m" (*uaddr), "=&r" (tem) \
: "r" (oparg), "i" (-EFAULT), "1" (0))
static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr) static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
{ {
int op = (encoded_op >> 28) & 7; int op = (encoded_op >> 28) & 7;
@ -206,28 +180,6 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
return ret; return ret;
} }
static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval,
int newval)
{
#ifdef __UACCESS__
if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
return -EFAULT;
#endif
asm volatile("1:\tlock; cmpxchgl %3, %1\n"
"2:\t.section .fixup, \"ax\"\n"
"3:\tmov %2, %0\n"
"\tjmp 2b\n"
"\t.previous\n"
_ASM_EXTABLE(1b, 3b)
: "=a" (oldval), "+m" (*uaddr)
: "i" (-EFAULT), "r" (newval), "0" (oldval)
: "memory"
);
return oldval;
}
#endif // __KERNEL__ #endif // __KERNEL__
#endif // _ASM_X86_FUTEX_H #endif // _ASM_X86_FUTEX_H
@ -241,13 +193,11 @@ static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval,
struct process_vm; struct process_vm;
union futex_key { union futex_key {
#if 0
struct { struct {
unsigned long pgoff; unsigned long pgoff;
struct inode *inode; void *phys;
int offset; int offset;
} shared; } shared;
#endif
struct { struct {
unsigned long address; unsigned long address;
struct process_vm *mm; struct process_vm *mm;
@ -261,6 +211,7 @@ union futex_key {
}; };
#define FUTEX_KEY_INIT (union futex_key) { .both = { .ptr = NULL } } #define FUTEX_KEY_INIT (union futex_key) { .both = { .ptr = NULL } }
#define FUT_OFF_MMSHARED 2
extern int futex_init(void); extern int futex_init(void);
@ -272,7 +223,8 @@ futex(
uint64_t timeout, uint64_t timeout,
uint32_t __user * uaddr2, uint32_t __user * uaddr2,
uint32_t val2, uint32_t val2,
uint32_t val3 uint32_t val3,
int fshared
); );

View File

@ -0,0 +1,23 @@
/**
* \file rlimit.h
* License details are found in the file LICENSE.
* \brief
* Kinds of resource limit
* \author Taku Shimosawa <shimosawa@is.s.u-tokyo.ac.jp> \par
* Copyright (C) 2011 - 2012 Taku Shimosawa
*/
/*
* HISTORY
*/
#ifndef __GENERIC_RLIMIT_H
#define __GENERIC_RLIMIT_H
typedef uint64_t rlim_t;
struct rlimit {
rlim_t rlim_cur; /* Soft limit */
rlim_t rlim_max; /* Hard limit (ceiling for rlim_cur) */
};
#endif

View File

@ -14,7 +14,7 @@
#define INIT_H #define INIT_H
extern void arch_init(void); extern void arch_init(void);
extern void kmsg_init(void); extern void kmsg_init(int);
extern void mem_init(void); extern void mem_init(void);
extern void ikc_master_init(void); extern void ikc_master_init(void);
extern void ap_init(void); extern void ap_init(void);
@ -28,6 +28,7 @@ extern void init_host_syscall_channel(void);
extern void init_host_syscall_channel2(void); extern void init_host_syscall_channel2(void);
extern void sched_init(void); extern void sched_init(void);
extern void pc_ap_init(void); extern void pc_ap_init(void);
extern void cpu_sysfs_setup(void);
extern char *find_command_line(char *name); extern char *find_command_line(char *name);

View File

@ -16,6 +16,6 @@
void kputs(char *buf); void kputs(char *buf);
int kprintf(const char *format, ...); int kprintf(const char *format, ...);
void kmsg_init(void); void kmsg_init(int);
#endif #endif

View File

@ -92,7 +92,8 @@ futex(
uint64_t timeout, uint64_t timeout,
uint32_t __user * uaddr2, uint32_t __user * uaddr2,
uint32_t val2, uint32_t val2,
uint32_t val3 uint32_t val3,
int fshared
); );
extern long extern long

View File

@ -47,6 +47,7 @@ typedef int memobj_get_page_func_t(struct memobj *obj, off_t off, int p2align, u
typedef uintptr_t memobj_copy_page_func_t(struct memobj *obj, uintptr_t orgphys, int p2align); typedef uintptr_t memobj_copy_page_func_t(struct memobj *obj, uintptr_t orgphys, int p2align);
typedef int memobj_flush_page_func_t(struct memobj *obj, uintptr_t phys, size_t pgsize); typedef int memobj_flush_page_func_t(struct memobj *obj, uintptr_t phys, size_t pgsize);
typedef int memobj_invalidate_page_func_t(struct memobj *obj, uintptr_t phys, size_t pgsize); typedef int memobj_invalidate_page_func_t(struct memobj *obj, uintptr_t phys, size_t pgsize);
typedef int memobj_lookup_page_func_t(struct memobj *obj, off_t off, int p2align, uintptr_t *physp, unsigned long *flag);
struct memobj_ops { struct memobj_ops {
memobj_release_func_t * release; memobj_release_func_t * release;
@ -55,6 +56,7 @@ struct memobj_ops {
memobj_copy_page_func_t * copy_page; memobj_copy_page_func_t * copy_page;
memobj_flush_page_func_t * flush_page; memobj_flush_page_func_t * flush_page;
memobj_invalidate_page_func_t * invalidate_page; memobj_invalidate_page_func_t * invalidate_page;
memobj_lookup_page_func_t * lookup_page;
}; };
static inline void memobj_release(struct memobj *obj) static inline void memobj_release(struct memobj *obj)
@ -106,6 +108,15 @@ static inline int memobj_invalidate_page(struct memobj *obj, uintptr_t phys,
return 0; return 0;
} }
/*
 * Dispatch to the memory object's lookup_page operation.
 * Returns the operation's result, or -ENXIO when the object's ops table
 * does not provide lookup_page.
 */
static inline int memobj_lookup_page(struct memobj *obj, off_t off,
		int p2align, uintptr_t *physp, unsigned long *pflag)
{
	memobj_lookup_page_func_t *lookup = obj->ops->lookup_page;

	if (!lookup) {
		return -ENXIO;
	}
	return lookup(obj, off, p2align, physp, pflag);
}
static inline void memobj_lock(struct memobj *obj) static inline void memobj_lock(struct memobj *obj)
{ {
ihk_mc_spinlock_lock_noirq(&obj->lock); ihk_mc_spinlock_lock_noirq(&obj->lock);

View File

@ -162,6 +162,8 @@
#define USER_STACK_NR_PAGES 8192 #define USER_STACK_NR_PAGES 8192
#define KERNEL_STACK_NR_PAGES 25 #define KERNEL_STACK_NR_PAGES 25
#define NOPHYS ((uintptr_t)-1)
#include <waitq.h> #include <waitq.h>
#include <futex.h> #include <futex.h>
@ -216,9 +218,11 @@ struct thread_hash {
struct address_space { struct address_space {
struct page_table *page_table; struct page_table *page_table;
int type; void *opt;
#define ADDRESS_SPACE_NORMAL 1 void (*free_cb)(struct address_space *, void *);
#define ADDRESS_SPACE_PVAS 2 ihk_atomic_t refcount;
cpu_set_t cpu_set;
ihk_spinlock_t cpu_set_lock;
int nslots; int nslots;
int pids[]; int pids[];
}; };
@ -288,7 +292,7 @@ struct user
unsigned long int u_debugreg [8]; unsigned long int u_debugreg [8];
}; };
#define AUXV_LEN 16 #define AUXV_LEN 18
struct vm_range { struct vm_range {
struct list_head list; struct list_head list;
@ -296,6 +300,8 @@ struct vm_range {
unsigned long flag; unsigned long flag;
struct memobj *memobj; struct memobj *memobj;
off_t objoff; off_t objoff;
int pgshift; /* page size. 0 means THP */
int padding;
}; };
struct vm_regions { struct vm_regions {
@ -310,18 +316,23 @@ struct vm_regions {
struct process_vm; struct process_vm;
struct sigfd { struct mckfd {
struct sigfd *next; struct mckfd *next;
int fd; int fd;
__sigset_t mask; long data;
void *opt;
long (*read_cb)(struct mckfd *, ihk_mc_user_context_t *);
int (*ioctl_cb)(struct mckfd *, ihk_mc_user_context_t *);
long (*mmap_cb)(struct mckfd *, ihk_mc_user_context_t *);
int (*close_cb)(struct mckfd *, ihk_mc_user_context_t *);
}; };
#define SFD_CLOEXEC 02000000 #define SFD_CLOEXEC 02000000
#define SFD_NONBLOCK 04000 #define SFD_NONBLOCK 04000
struct sig_common { struct sig_common {
ihk_spinlock_t lock; ihk_spinlock_t lock;
ihk_atomic_t use; ihk_atomic_t use;
struct sigfd *sigfd;
struct k_sigaction action[_NSIG]; struct k_sigaction action[_NSIG];
struct list_head sigpending; struct list_head sigpending;
}; };
@ -341,7 +352,7 @@ typedef void pgio_func_t(void *arg);
* special "init" process */ * special "init" process */
struct process { struct process {
struct list_head hash_list; struct list_head hash_list;
mcs_rwlock_lock_t update_lock; // lock for parent, status, ...? mcs_rwlock_lock_t update_lock; // lock for parent, status, cpu time...
// process vm // process vm
struct process_vm *vm; struct process_vm *vm;
@ -398,6 +409,7 @@ struct process {
int fsgid; int fsgid;
int execed; int execed;
int nohost; int nohost;
int nowait;
struct rlimit rlimit[MCK_RLIM_MAX]; struct rlimit rlimit[MCK_RLIM_MAX];
unsigned long saved_auxv[AUXV_LEN]; unsigned long saved_auxv[AUXV_LEN];
char *saved_cmdline; char *saved_cmdline;
@ -422,6 +434,27 @@ struct process {
/* Store signal sent to parent when the process terminates. */ /* Store signal sent to parent when the process terminates. */
int termsig; int termsig;
ihk_spinlock_t mckfd_lock;
struct mckfd *mckfd;
// cpu time (summary)
struct timespec stime;
struct timespec utime;
// cpu time (children)
struct timespec stime_children;
struct timespec utime_children;
long maxrss;
long maxrss_children;
// perf_event
int perf_status;
#define PP_NONE 0
#define PP_RESET 1
#define PP_COUNT 2
#define PP_STOP 3
struct mc_perf_event *monitoring_event;
}; };
void hold_thread(struct thread *ftn); void hold_thread(struct thread *ftn);
@ -509,6 +542,20 @@ struct thread {
unsigned long *ptrace_debugreg; /* debug registers for ptrace */ unsigned long *ptrace_debugreg; /* debug registers for ptrace */
struct sig_pending *ptrace_recvsig; struct sig_pending *ptrace_recvsig;
struct sig_pending *ptrace_sendsig; struct sig_pending *ptrace_sendsig;
// cpu time
struct timespec stime;
struct timespec utime;
struct timespec btime;
int times_update;
int in_kernel;
// interval timers
int itimer_enabled;
struct itimerval itimer_virtual;
struct itimerval itimer_prof;
struct timespec itimer_virtual_value;
struct timespec itimer_prof_value;
}; };
struct process_vm { struct process_vm {
@ -516,6 +563,10 @@ struct process_vm {
struct list_head vm_range_list; struct list_head vm_range_list;
struct vm_regions region; struct vm_regions region;
struct process *proc; /* process that reside on the same page */ struct process *proc; /* process that reside on the same page */
void *opt;
void (*free_cb)(struct process_vm *, void *);
void *vdso_addr;
void *vvar_addr;
ihk_spinlock_t page_table_lock; ihk_spinlock_t page_table_lock;
ihk_spinlock_t memory_range_lock; ihk_spinlock_t memory_range_lock;
@ -526,12 +577,25 @@ struct process_vm {
// is protected by its own lock (see ihk/manycore/generic/page_alloc.c) // is protected by its own lock (see ihk/manycore/generic/page_alloc.c)
ihk_atomic_t refcount; ihk_atomic_t refcount;
cpu_set_t cpu_set;
ihk_spinlock_t cpu_set_lock;
int exiting; int exiting;
long currss;
}; };
/*
 * CAP_IPC_LOCK (= 14) check.
 * Returns 1 when the effective uid of the thread's process is 0,
 * otherwise 0; no other credential is consulted.
 */
static inline int has_cap_ipc_lock(struct thread *th)
{
	return th->proc->euid == 0;
}
/*
 * CAP_SYS_ADMIN (= 21) check.
 * Returns 1 when the effective uid of the thread's process is 0,
 * otherwise 0; no other credential is consulted.
 */
static inline int has_cap_sys_admin(struct thread *th)
{
	return th->proc->euid == 0;
}
void hold_address_space(struct address_space *);
void release_address_space(struct address_space *);
struct thread *create_thread(unsigned long user_pc); struct thread *create_thread(unsigned long user_pc);
struct thread *clone_thread(struct thread *org, unsigned long pc, struct thread *clone_thread(struct thread *org, unsigned long pc,
unsigned long sp, int clone_flags); unsigned long sp, int clone_flags);
@ -549,7 +613,7 @@ int populate_process_memory(struct process_vm *vm, void *start, size_t len);
int add_process_memory_range(struct process_vm *vm, int add_process_memory_range(struct process_vm *vm,
unsigned long start, unsigned long end, unsigned long start, unsigned long end,
unsigned long phys, unsigned long flag, unsigned long phys, unsigned long flag,
struct memobj *memobj, off_t objoff); struct memobj *memobj, off_t objoff, int pgshift);
int remove_process_memory_range(struct process_vm *vm, unsigned long start, int remove_process_memory_range(struct process_vm *vm, unsigned long start,
unsigned long end, int *ro_freedp); unsigned long end, int *ro_freedp);
int split_process_memory_range(struct process_vm *vm, int split_process_memory_range(struct process_vm *vm,
@ -610,5 +674,6 @@ void process_unlock(struct process *proc, struct mcs_rwlock_node_irqsave *lock);
void chain_process(struct process *); void chain_process(struct process *);
void chain_thread(struct thread *); void chain_thread(struct thread *);
void proc_init(); void proc_init();
void set_timer();
#endif #endif

View File

@ -25,6 +25,7 @@ enum {
IPC_CREAT = 01000, IPC_CREAT = 01000,
IPC_EXCL = 02000, IPC_EXCL = 02000,
SHM_HUGETLB = 04000,
SHM_RDONLY = 010000, SHM_RDONLY = 010000,
SHM_RND = 020000, SHM_RND = 020000,
SHM_REMAP = 040000, SHM_REMAP = 040000,
@ -46,11 +47,14 @@ enum {
SHM_INFO = 14, SHM_INFO = 14,
}; };
struct shmlock_user;
struct shmobj { struct shmobj {
struct memobj memobj; /* must be first */ struct memobj memobj; /* must be first */
int index; int index;
uint8_t padding[4]; int pgshift;
size_t real_segsz; size_t real_segsz;
struct shmlock_user * user;
struct shmid_ds ds; struct shmid_ds ds;
struct list_head page_list; struct list_head page_list;
struct list_head chain; /* shmobj_list */ struct list_head chain; /* shmobj_list */
@ -75,9 +79,33 @@ struct shm_info {
uint64_t swap_successes; uint64_t swap_successes;
}; };
/*
 * Per-user accounting record for locked shared memory.
 * NOTE(review): presumably one record exists per real uid and is obtained
 * through shmlock_user_get(); the semantics of 'locked' (bytes vs. pages)
 * are not visible in this header -- confirm against shmobj.c.
 */
struct shmlock_user {
uid_t ruid;
/* explicit filler between the 4-byte uid and the following size_t */
int padding;
size_t locked;
/* list linkage; NOTE(review): the list head is not visible here */
struct list_head chain;
};
extern ihk_spinlock_t shmlock_users_lock_body;
/**
 * \brief Acquire shmlock_users_lock_body.
 *
 * Uses the "_noirq" spinlock variant. Pair with shmlock_users_unlock().
 */
static inline void shmlock_users_lock(void)
{
	ihk_mc_spinlock_lock_noirq(&shmlock_users_lock_body);
}
/**
 * \brief Release shmlock_users_lock_body.
 *
 * Counterpart of shmlock_users_lock(); uses the "_noirq" spinlock variant.
 */
static inline void shmlock_users_unlock(void)
{
	ihk_mc_spinlock_unlock_noirq(&shmlock_users_lock_body);
}
void shmobj_list_lock(void); void shmobj_list_lock(void);
void shmobj_list_unlock(void); void shmobj_list_unlock(void);
int shmobj_create_indexed(struct shmid_ds *ds, struct shmobj **objp); int shmobj_create_indexed(struct shmid_ds *ds, struct shmobj **objp);
void shmobj_destroy(struct shmobj *obj); void shmobj_destroy(struct shmobj *obj);
void shmlock_user_free(struct shmlock_user *user);
int shmlock_user_get(uid_t ruid, struct shmlock_user **userp);
#endif /* HEADER_SHM_H */ #endif /* HEADER_SHM_H */

View File

@ -38,6 +38,10 @@
#define SCD_MSG_SYSCALL_ONESIDE 0x4 #define SCD_MSG_SYSCALL_ONESIDE 0x4
#define SCD_MSG_SEND_SIGNAL 0x8 #define SCD_MSG_SEND_SIGNAL 0x8
#define SCD_MSG_CLEANUP_PROCESS 0x9 #define SCD_MSG_CLEANUP_PROCESS 0x9
#define SCD_MSG_GET_VDSO_INFO 0xa
#define SCD_MSG_GET_CPU_MAPPING 0xc
#define SCD_MSG_REPLY_GET_CPU_MAPPING 0xd
#define SCD_MSG_PROCFS_CREATE 0x10 #define SCD_MSG_PROCFS_CREATE 0x10
#define SCD_MSG_PROCFS_DELETE 0x11 #define SCD_MSG_PROCFS_DELETE 0x11
@ -46,10 +50,28 @@
#define SCD_MSG_DEBUG_LOG 0x20 #define SCD_MSG_DEBUG_LOG 0x20
#define ARCH_SET_GS 0x1001 #define SCD_MSG_SYSFS_REQ_CREATE 0x30
#define ARCH_SET_FS 0x1002 /* #define SCD_MSG_SYSFS_RESP_CREATE 0x31 */
#define ARCH_GET_FS 0x1003 #define SCD_MSG_SYSFS_REQ_MKDIR 0x32
#define ARCH_GET_GS 0x1004 /* #define SCD_MSG_SYSFS_RESP_MKDIR 0x33 */
#define SCD_MSG_SYSFS_REQ_SYMLINK 0x34
/* #define SCD_MSG_SYSFS_RESP_SYMLINK 0x35 */
#define SCD_MSG_SYSFS_REQ_LOOKUP 0x36
/* #define SCD_MSG_SYSFS_RESP_LOOKUP 0x37 */
#define SCD_MSG_SYSFS_REQ_UNLINK 0x38
/* #define SCD_MSG_SYSFS_RESP_UNLINK 0x39 */
#define SCD_MSG_SYSFS_REQ_SHOW 0x3a
#define SCD_MSG_SYSFS_RESP_SHOW 0x3b
#define SCD_MSG_SYSFS_REQ_STORE 0x3c
#define SCD_MSG_SYSFS_RESP_STORE 0x3d
#define SCD_MSG_SYSFS_REQ_RELEASE 0x3e
#define SCD_MSG_SYSFS_RESP_RELEASE 0x3f
#define SCD_MSG_SYSFS_REQ_SETUP 0x40
#define SCD_MSG_SYSFS_RESP_SETUP 0x41
/* #define SCD_MSG_SYSFS_REQ_CLEANUP 0x42 */
/* #define SCD_MSG_SYSFS_RESP_CLEANUP 0x43 */
#define SCD_MSG_PROCFS_TID_CREATE 0x44
#define SCD_MSG_PROCFS_TID_DELETE 0x45
/* Cloning flags. */ /* Cloning flags. */
# define CSIGNAL 0x000000ff /* Signal mask to be sent at exit. */ # define CSIGNAL 0x000000ff /* Signal mask to be sent at exit. */
@ -94,13 +116,27 @@ struct user_desc {
unsigned int useable:1; unsigned int useable:1;
unsigned int lm:1; unsigned int lm:1;
}; };
struct ikc_scd_packet { struct ikc_scd_packet {
int msg; int msg;
int ref;
int osnum;
int pid;
int err; int err;
unsigned long arg; union {
/* for traditional SCD_MSG_* */
struct {
int ref;
int osnum;
int pid;
int padding;
unsigned long arg;
};
/* for SCD_MSG_SYSFS_* */
struct {
long sysfs_arg1;
long sysfs_arg2;
long sysfs_arg3;
};
};
}; };
struct program_image_section { struct program_image_section {
@ -143,6 +179,9 @@ struct program_load_desc {
int stack_prot; int stack_prot;
int pgid; int pgid;
int cred[8]; int cred[8];
int reloc;
char enable_vdso;
char padding[7];
unsigned long entry; unsigned long entry;
unsigned long user_start; unsigned long user_start;
unsigned long user_end; unsigned long user_end;
@ -278,6 +317,7 @@ struct procfs_read {
int ret; /* read bytes (answer) */ int ret; /* read bytes (answer) */
int status; /* non-zero if done (answer) */ int status; /* non-zero if done (answer) */
int newcpu; /* migrated new cpu (answer) */ int newcpu; /* migrated new cpu (answer) */
int readwrite; /* 0:read, 1:write */
char fname[PROCFS_NAME_MAX]; /* procfs filename (request) */ char fname[PROCFS_NAME_MAX]; /* procfs filename (request) */
}; };
@ -287,6 +327,29 @@ struct procfs_file {
char fname[PROCFS_NAME_MAX]; /* procfs filename (request) */ char fname[PROCFS_NAME_MAX]; /* procfs filename (request) */
}; };
#define RUSAGE_SELF 0
#define RUSAGE_CHILDREN -1
#define RUSAGE_THREAD 1
/*
 * Resource usage record. Field names and order follow the Linux
 * getrusage(2) 'struct rusage'; do not reorder or resize members.
 * NOTE(review): which of these fields the kernel actually fills in is not
 * visible here -- confirm in the getrusage implementation.
 */
struct rusage {
struct timeval ru_utime;
struct timeval ru_stime;
long ru_maxrss;
long ru_ixrss;
long ru_idrss;
long ru_isrss;
long ru_minflt;
long ru_majflt;
long ru_nswap;
long ru_inblock;
long ru_oublock;
long ru_msgsnd;
long ru_msgrcv;
long ru_nsignals;
long ru_nvcsw;
long ru_nivcsw;
};
extern void terminate(int, int); extern void terminate(int, int);
struct tod_data_s { struct tod_data_s {
@ -298,4 +361,50 @@ struct tod_data_s {
}; };
extern struct tod_data_s tod_data; /* residing in arch-dependent file */ extern struct tod_data_s tod_data; /* residing in arch-dependent file */
void reset_cputime();
void set_cputime(int mode);
intptr_t do_mmap(intptr_t addr0, size_t len0, int prot, int flags, int fd,
off_t off0);
void clear_host_pte(uintptr_t addr, size_t len);
typedef int32_t key_t;
int do_shmget(key_t key, size_t size, int shmflg);
struct process_vm;
int arch_map_vdso(struct process_vm *vm); /* arch dependent */
int arch_setup_vdso(void);
#define VDSO_MAXPAGES 2
/*
 * Description of the vDSO-related pages: up to VDSO_MAXPAGES physical
 * addresses in vdso_physlist, plus a virtual/physical address pair for
 * each of the vvar, hpet and pvti areas.
 * NOTE(review): presumably filled in by the host in reply to
 * SCD_MSG_GET_VDSO_INFO, with 'busy' used as a completion flag -- confirm
 * against arch_setup_vdso() and the mcctrl side before changing the layout.
 */
struct vdso {
long busy;
/* number of valid entries in vdso_physlist (at most VDSO_MAXPAGES) */
int vdso_npages;
char vvar_is_global;
char hpet_is_global;
char pvti_is_global;
/* explicit filler so the following long stays 8-byte aligned */
char padding;
long vdso_physlist[VDSO_MAXPAGES];
void *vvar_virt;
long vvar_phys;
void *hpet_virt;
long hpet_phys;
void *pvti_virt;
long pvti_phys;
};
/*
 * One element of the buffer referenced by get_cpu_mapping_req.buf_rpa.
 * NOTE(review): presumably maps a logical CPU number to a hardware id
 * (e.g. APIC id on x86) -- confirm against the mcctrl implementation.
 */
struct cpu_mapping {
int cpu_number;
int hw_id;
};
/*
 * Request/response block for the CPU mapping query.
 * NOTE(review): presumably exchanged via SCD_MSG_GET_CPU_MAPPING and
 * SCD_MSG_REPLY_GET_CPU_MAPPING -- confirm against the sender.
 * The wait queue member is compiled out (#if 0) in this copy of the
 * structure; the comment below attributes it to mcctrl.
 */
struct get_cpu_mapping_req {
int busy; /* INOUT: */
int error; /* OUT: */
long buf_rpa; /* OUT: physical address of struct cpu_mapping */
int buf_elems; /* OUT: # of elements of buf */
int padding;
/* work for mcctrl */
#if 0
wait_queue_head_t wq;
#endif
};
#endif #endif

71
kernel/include/sysfs.h Normal file
View File

@ -0,0 +1,71 @@
/**
* \file sysfs.h
* License details are found in the file LICENSE.
* \brief
* sysfs framework API definitions
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com> \par
* Copyright (C) 2015 RIKEN AICS
*/
/*
* HISTORY:
*/
#ifndef MCKERNEL_SYSFS_H
#define MCKERNEL_SYSFS_H
#define SYSFS_PATH_MAX 1024
/* for sysfs_unlinkf() */
#define SYSFS_UNLINK_KEEP_ANCESTOR 0x01
/*
 * Callback table for a sysfs entry registered with sysfs_createf().
 * Each callback receives the ops table itself and the 'instance' pointer;
 * show/store additionally receive a buffer and its size and return ssize_t.
 * NOTE(review): presumably 'instance' is the pointer given to
 * sysfs_createf(), and the return value is a byte count or negative error
 * -- confirm in sysfs.c.
 */
struct sysfs_ops {
ssize_t (*show)(struct sysfs_ops *ops, void *instance, void *buf,
size_t bufsize);
ssize_t (*store)(struct sysfs_ops *ops, void *instance, void *buf,
size_t bufsize);
void (*release)(struct sysfs_ops *ops, void *instance);
};
/*
 * Handle type wrapping a single long. It is produced by sysfs_mkdirf() and
 * sysfs_lookupf() (via pointer) and consumed by sysfs_symlinkf() (by value).
 */
struct sysfs_handle {
long handle;
};
typedef struct sysfs_handle sysfs_handle_t;
/*
 * Instance descriptor for bitmap-style entries: a bit count plus a pointer
 * to the data.
 * NOTE(review): presumably used with SYSFS_SNOOPING_OPS_pbl and
 * SYSFS_SNOOPING_OPS_pb -- confirm in sysfs.c.
 */
struct sysfs_bitmap_param {
int nbits;
/* explicit filler so that 'ptr' is 8-byte aligned */
int padding;
void *ptr;
};
/*
 * Small integer values cast to pointers act as "special" ops selectors
 * instead of real struct sysfs_ops addresses. The valid selector range is
 * [SYSFS_SPECIAL_OPS_MIN, SYSFS_SPECIAL_OPS_MAX].
 */
#define SYSFS_SPECIAL_OPS_MIN ((void *)1)
#define SYSFS_SPECIAL_OPS_MAX ((void *)1000)
#define SYSFS_SNOOPING_OPS_d32 ((void *)1)
#define SYSFS_SNOOPING_OPS_d64 ((void *)2)
#define SYSFS_SNOOPING_OPS_u32 ((void *)3)
#define SYSFS_SNOOPING_OPS_u64 ((void *)4)
#define SYSFS_SNOOPING_OPS_s ((void *)5)
#define SYSFS_SNOOPING_OPS_pbl ((void *)6)
#define SYSFS_SNOOPING_OPS_pb ((void *)7)
#define SYSFS_SNOOPING_OPS_u32K ((void *)8)

/**
 * \brief Check whether an ops pointer is a special selector value.
 *
 * \param ops ops pointer as passed to sysfs_createf()
 * \return 1 when ops, taken as a long, lies within
 *         [SYSFS_SPECIAL_OPS_MIN, SYSFS_SPECIAL_OPS_MAX]; otherwise 0
 */
static inline int is_special_sysfs_ops(void *ops)
{
	const long value = (long)ops;

	if (value < (long)SYSFS_SPECIAL_OPS_MIN) {
		return 0;
	}
	return value <= (long)SYSFS_SPECIAL_OPS_MAX;
}
extern int sysfs_createf(struct sysfs_ops *ops, void *instance, int mode,
const char *fmt, ...);
extern int sysfs_mkdirf(sysfs_handle_t *dirhp, const char *fmt, ...);
extern int sysfs_symlinkf(sysfs_handle_t targeth, const char *fmt, ...);
extern int sysfs_lookupf(sysfs_handle_t *objhp, const char *fmt, ...);
extern int sysfs_unlinkf(int flags, const char *fmt, ...);
extern void sysfs_init(void);
struct ihk_ikc_channel_desc;
extern void sysfss_packet_handler(struct ihk_ikc_channel_desc *ch, int msg,
int error, long arg1, long arg2, long arg3);
#endif /* MCKERNEL_SYSFS_H */

View File

@ -0,0 +1,88 @@
/**
* \file sysfs_msg.h
* License details are found in the file LICENSE.
* \brief
* message declarations for sysfs framework
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com> \par
* Copyright (C) 2015 RIKEN AICS
*/
/*
* HISTORY:
*/
#ifndef MCKERNEL_SYSFS_MSG_H
#define MCKERNEL_SYSFS_MSG_H
#define SYSFS_PATH_MAX 1024
/*
 * Parameter block for a sysfs "create" request.
 * NOTE(review): presumably passed by physical address with
 * SCD_MSG_SYSFS_REQ_CREATE, with 'error' and 'busy' written back by the
 * receiver -- confirm in sysfs.c. The layout is a message format; keep
 * member order and sizes unchanged.
 */
struct sysfs_req_create_param {
int mode;
int error;
/* client-side ops pointer and instance pointer, carried as integers */
long client_ops;
long client_instance;
char path[SYSFS_PATH_MAX];
int padding;
int busy;
}; /* struct sysfs_req_create_param */
#define SYSFS_SPECIAL_OPS_MIN ((void *)1)
#define SYSFS_SPECIAL_OPS_MAX ((void *)1000)
#define SYSFS_SNOOPING_OPS_d32 ((void *)1)
#define SYSFS_SNOOPING_OPS_d64 ((void *)2)
#define SYSFS_SNOOPING_OPS_u32 ((void *)3)
#define SYSFS_SNOOPING_OPS_u64 ((void *)4)
#define SYSFS_SNOOPING_OPS_s ((void *)5)
#define SYSFS_SNOOPING_OPS_pbl ((void *)6)
#define SYSFS_SNOOPING_OPS_pb ((void *)7)
#define SYSFS_SNOOPING_OPS_u32K ((void *)8)
/*
 * Parameter block for a sysfs "mkdir" request.
 * NOTE(review): presumably used with SCD_MSG_SYSFS_REQ_MKDIR; 'handle'
 * looks like the result returned for the created directory -- confirm in
 * sysfs.c. Message layout: keep member order and sizes unchanged.
 */
struct sysfs_req_mkdir_param {
int error;
int padding;
long handle;
char path[SYSFS_PATH_MAX];
int padding2;
int busy;
}; /* struct sysfs_req_mkdir_param */
/*
 * Parameter block for a sysfs "symlink" request.
 * NOTE(review): presumably used with SCD_MSG_SYSFS_REQ_SYMLINK; 'target'
 * looks like the handle value of the link target -- confirm in sysfs.c.
 * Message layout: keep member order and sizes unchanged.
 */
struct sysfs_req_symlink_param {
int error;
int padding;
long target;
char path[SYSFS_PATH_MAX];
int padding2;
int busy;
}; /* struct sysfs_req_symlink_param */
/*
 * Parameter block for a sysfs "lookup" request.
 * NOTE(review): presumably used with SCD_MSG_SYSFS_REQ_LOOKUP; 'handle'
 * looks like the result for the entry found at 'path' -- confirm in
 * sysfs.c. Message layout: keep member order and sizes unchanged.
 */
struct sysfs_req_lookup_param {
int error;
int padding;
long handle;
char path[SYSFS_PATH_MAX];
int padding2;
int busy;
}; /* struct sysfs_req_lookup_param */
/* for sysfs_req_unlink_param.flags */
#define SYSFS_UNLINK_KEEP_ANCESTOR 0x01
/*
 * Parameter block for a sysfs "unlink" request. 'flags' takes the
 * SYSFS_UNLINK_* values defined for sysfs_req_unlink_param.flags.
 * NOTE(review): presumably used with SCD_MSG_SYSFS_REQ_UNLINK -- confirm
 * in sysfs.c. Message layout: keep member order and sizes unchanged.
 */
struct sysfs_req_unlink_param {
int flags;
int error;
char path[SYSFS_PATH_MAX];
int padding;
int busy;
}; /* struct sysfs_req_unlink_param */
/*
 * Parameter block for a sysfs "setup" request: carries a buffer address
 * (buf_rpa) and its size. The SYSFS_PATH_MAX-byte 'padding3' member
 * occupies the position where the other request blocks hold 'path'.
 * NOTE(review): presumably used with SCD_MSG_SYSFS_REQ_SETUP, with buf_rpa
 * being a physical address -- confirm in sysfs.c.
 */
struct sysfs_req_setup_param {
int error;
int padding;
long buf_rpa;
long bufsize;
char padding3[SYSFS_PATH_MAX];
int padding2;
int busy;
}; /* struct sysfs_req_setup_param */
#endif /* MCKERNEL_SYSFS_MSG_H */

View File

@ -20,6 +20,10 @@
#define __TIME_H #define __TIME_H
#define NS_PER_SEC 1000000000UL #define NS_PER_SEC 1000000000UL
#define CLOCK_REALTIME 0
#define CLOCK_MONOTONIC 1
#define CLOCK_PROCESS_CPUTIME_ID 2
#define CLOCK_THREAD_CPUTIME_ID 3
typedef long int __time_t; typedef long int __time_t;
@ -49,5 +53,72 @@ struct timezone
int tz_dsttime; /* Nonzero if DST is ever in effect. */ int tz_dsttime; /* Nonzero if DST is ever in effect. */
}; };
#define ITIMER_REAL 0
#define ITIMER_VIRTUAL 1
#define ITIMER_PROF 2
/*
 * Interval timer setting; member names follow the setitimer(2) structure.
 * NOTE(review): per setitimer(2), it_value is the time until next expiry
 * and it_interval the reload period -- confirm the kernel follows that.
 */
struct itimerval {
struct timeval it_interval;
struct timeval it_value;
};
/**
 * \brief Add one timespec to another in place: *ats += *bts.
 *
 * After the addition, whole seconds are carried out of tv_nsec until it is
 * below 1000000000.
 *
 * \param ats accumulator, updated in place
 * \param bts value to add
 */
static inline void
ts_add(struct timespec *ats, const struct timespec *bts)
{
	ats->tv_sec += bts->tv_sec;
	ats->tv_nsec += bts->tv_nsec;

	/* carry whole seconds out of the nanosecond field */
	for (; ats->tv_nsec >= 1000000000; ats->tv_nsec -= 1000000000) {
		ats->tv_sec++;
	}
}
/**
 * \brief Subtract one timespec from another in place: *ats -= *bts.
 *
 * After the subtraction, seconds are borrowed while tv_nsec is negative,
 * so tv_nsec ends up non-negative.
 *
 * \param ats accumulator, updated in place
 * \param bts value to subtract
 */
static inline void
ts_sub(struct timespec *ats, const struct timespec *bts)
{
	ats->tv_sec -= bts->tv_sec;
	ats->tv_nsec -= bts->tv_nsec;

	/* borrow from the seconds field until nanoseconds are non-negative */
	for (; ats->tv_nsec < 0; ats->tv_nsec += 1000000000) {
		ats->tv_sec--;
	}
}
/**
 * \brief Add one timeval to another in place: *ats += *bts.
 *
 * After the addition, whole seconds are carried out of tv_usec until it is
 * below 1000000.
 *
 * \param ats accumulator, updated in place
 * \param bts value to add
 */
static inline void
tv_add(struct timeval *ats, const struct timeval *bts)
{
	ats->tv_sec += bts->tv_sec;
	ats->tv_usec += bts->tv_usec;

	/* carry whole seconds out of the microsecond field */
	for (; ats->tv_usec >= 1000000; ats->tv_usec -= 1000000) {
		ats->tv_sec++;
	}
}
/**
 * \brief Subtract one timeval from another in place: *ats -= *bts.
 *
 * After the subtraction, seconds are borrowed while tv_usec is negative,
 * so tv_usec ends up non-negative.
 *
 * \param ats accumulator, updated in place
 * \param bts value to subtract
 */
static inline void
tv_sub(struct timeval *ats, const struct timeval *bts)
{
	ats->tv_sec -= bts->tv_sec;
	ats->tv_usec -= bts->tv_usec;

	/* borrow from the seconds field until microseconds are non-negative */
	for (; ats->tv_usec < 0; ats->tv_usec += 1000000) {
		ats->tv_sec--;
	}
}
/**
 * \brief Convert a timeval (seconds + microseconds) to a timespec.
 *
 * \param ats destination; tv_sec is copied, tv_nsec becomes tv_usec * 1000
 * \param bts source timeval
 */
static inline void
tv_to_ts(struct timespec *ats, const struct timeval *bts)
{
	enum { NSEC_PER_USEC = 1000 };

	ats->tv_nsec = bts->tv_usec * NSEC_PER_USEC;
	ats->tv_sec = bts->tv_sec;
}
/**
 * \brief Convert a timespec (seconds + nanoseconds) to a timeval.
 *
 * Sub-microsecond precision is dropped by the integer division.
 *
 * \param ats destination; tv_sec is copied, tv_usec becomes tv_nsec / 1000
 * \param bts source timespec
 */
static inline void
ts_to_tv(struct timeval *ats, const struct timespec *bts)
{
	enum { NSEC_PER_USEC = 1000 };

	ats->tv_usec = bts->tv_nsec / NSEC_PER_USEC;
	ats->tv_sec = bts->tv_sec;
}
#endif // __TIME_H #endif // __TIME_H

View File

@ -30,6 +30,7 @@
#include <init.h> #include <init.h>
#include <cls.h> #include <cls.h>
#include <syscall.h> #include <syscall.h>
#include <sysfs.h>
//#define IOCTL_FUNC_EXTENSION //#define IOCTL_FUNC_EXTENSION
#ifdef IOCTL_FUNC_EXTENSION #ifdef IOCTL_FUNC_EXTENSION
@ -207,6 +208,7 @@ static void time_init(void)
{ {
unsigned long tv_sec, tv_nsec; unsigned long tv_sec, tv_nsec;
unsigned long ns_per_kclock; unsigned long ns_per_kclock;
unsigned long tsc;
ihk_mc_get_boot_time(&tv_sec, &tv_nsec); ihk_mc_get_boot_time(&tv_sec, &tv_nsec);
ns_per_kclock = ihk_mc_get_ns_per_tsc(); ns_per_kclock = ihk_mc_get_ns_per_tsc();
@ -216,6 +218,15 @@ static void time_init(void)
if (ns_per_kclock) { if (ns_per_kclock) {
tod_data.clocks_per_sec = (1000L * NS_PER_SEC) / ns_per_kclock; tod_data.clocks_per_sec = (1000L * NS_PER_SEC) / ns_per_kclock;
tsc = rdtsc();
tod_data.origin.tv_sec -= tsc / tod_data.clocks_per_sec;
tod_data.origin.tv_nsec -= NS_PER_SEC * (tsc % tod_data.clocks_per_sec)
/ tod_data.clocks_per_sec;
if (tod_data.origin.tv_nsec < 0) {
--tod_data.origin.tv_sec;
tod_data.origin.tv_nsec += NS_PER_SEC;
}
} }
if (!ns_per_kclock) { if (!ns_per_kclock) {
@ -250,9 +261,70 @@ static void rest_init(void)
sched_init(); sched_init();
} }
/*
 * Create one sample entry with mode 0444 at 'path'; panic with 'failmsg'
 * if sysfs_createf() reports an error.
 */
static void create_remote_snooping_sample(void *ops, void *instance,
		const char *path, const char *failmsg)
{
	if (sysfs_createf(ops, instance, 0444, path)) {
		panic(failmsg);
	}
}

/**
 * \brief Register the sample entries under /sys/test/remote.
 *
 * One entry is created for each SYSFS_SNOOPING_OPS_* selector (d32, d64,
 * u32, u64, s, pbl, pb, u32K), in that order. The numeric entries point at
 * a static long, the string entry at a static string, and the two bitmap
 * entries at a sysfs_bitmap_param describing 40 bits of that same long.
 * Any creation failure is fatal (panic).
 */
static void setup_remote_snooping_samples(void)
{
	static long lvalue = 0xf123456789abcde0;
	static char *svalue = "string(remote)";
	struct sysfs_bitmap_param param;

	create_remote_snooping_sample(SYSFS_SNOOPING_OPS_d32, &lvalue,
			"/sys/test/remote/d32",
			"setup_remote_snooping_samples: d32");
	create_remote_snooping_sample(SYSFS_SNOOPING_OPS_d64, &lvalue,
			"/sys/test/remote/d64",
			"setup_remote_snooping_samples: d64");
	create_remote_snooping_sample(SYSFS_SNOOPING_OPS_u32, &lvalue,
			"/sys/test/remote/u32",
			"setup_remote_snooping_samples: u32");
	create_remote_snooping_sample(SYSFS_SNOOPING_OPS_u64, &lvalue,
			"/sys/test/remote/u64",
			"setup_remote_snooping_samples: u64");
	create_remote_snooping_sample(SYSFS_SNOOPING_OPS_s, svalue,
			"/sys/test/remote/s",
			"setup_remote_snooping_samples: s");

	/* bitmap entries: the descriptor is re-filled before each creation */
	param.nbits = 40;
	param.ptr = &lvalue;
	create_remote_snooping_sample(SYSFS_SNOOPING_OPS_pbl, &param,
			"/sys/test/remote/pbl",
			"setup_remote_snooping_samples: pbl");

	param.nbits = 40;
	param.ptr = &lvalue;
	create_remote_snooping_sample(SYSFS_SNOOPING_OPS_pb, &param,
			"/sys/test/remote/pb",
			"setup_remote_snooping_samples: pb");

	create_remote_snooping_sample(SYSFS_SNOOPING_OPS_u32K, &lvalue,
			"/sys/test/remote/u32K",
			"setup_remote_snooping_samples: u32K");
} /* setup_remote_snooping_samples() */
/**
 * \brief Register the initial set of sysfs entries.
 *
 * Calls cpu_sysfs_setup() first, then setup_remote_snooping_samples().
 */
static void populate_sysfs(void)
{
	cpu_sysfs_setup();
	setup_remote_snooping_samples();
} /* populate_sysfs() */
int host_ikc_inited = 0; int host_ikc_inited = 0;
extern int num_processors; extern int num_processors;
extern void zero_tsc(void);
static void post_init(void) static void post_init(void)
{ {
@ -271,13 +343,12 @@ static void post_init(void)
ihk_mc_spinlock_init(&syscall_lock); ihk_mc_spinlock_init(&syscall_lock);
} }
/* Zero TSC. arch_setup_vdso();
* All AP cores are wait spinning for ap_start() and they will zero arch_start_pvclock();
* their TSC immediatly. */
zero_tsc();
ap_start(); ap_start();
create_os_procfs_files(); sysfs_init();
populate_sysfs();
} }
#ifdef DCFA_RUN #ifdef DCFA_RUN
extern void user_main(); extern void user_main();
@ -290,7 +361,15 @@ extern void ibmic_cmd_init(void);
int main(void) int main(void)
{ {
kmsg_init(); char *ptr;
int mode = 0;
ptr = find_command_line("ksyslogd=");
if (ptr) {
mode = ptr[9] - 0x30;
if (mode < 0 || mode > 2) mode = 0;
}
kmsg_init(mode);
kputs("MCK started.\n"); kputs("MCK started.\n");

View File

@ -17,6 +17,7 @@
#include <ihk/debug.h> #include <ihk/debug.h>
#include <ihk/ikc.h> #include <ihk/ikc.h>
#include <ikc/master.h> #include <ikc/master.h>
#include <arch/cpu.h>
//#define DEBUG_LISTENERS //#define DEBUG_LISTENERS
@ -28,16 +29,6 @@
#define ekprintf(...) kprintf(__VA_ARGS__) #define ekprintf(...) kprintf(__VA_ARGS__)
#endif #endif
static unsigned long read_tsc(void)
{
unsigned int low, high;
asm volatile("rdtsc" : "=a"(low), "=d"(high));
return (low | ((unsigned long)high << 32));
}
void testmem(void *v, unsigned long size) void testmem(void *v, unsigned long size)
{ {
unsigned long i, st, ed, s = 0; unsigned long i, st, ed, s = 0;

View File

@ -52,7 +52,8 @@ static struct ihk_page_allocator_desc *pa_allocator;
static unsigned long pa_start, pa_end; static unsigned long pa_start, pa_end;
static struct page *pa_pages; static struct page *pa_pages;
extern int ihk_mc_pt_print_pte(struct page_table *pt, void *virt); extern void unhandled_page_fault(struct thread *, void *, void *);
extern int interrupt_from_user(void *);
struct tlb_flush_entry tlb_flush_vector[IHK_TLB_FLUSH_IRQ_VECTOR_SIZE]; struct tlb_flush_entry tlb_flush_vector[IHK_TLB_FLUSH_IRQ_VECTOR_SIZE];
@ -209,61 +210,6 @@ void coredump(struct thread *thread, void *regs)
freecore(&coretable); freecore(&coretable);
} }
static void unhandled_page_fault(struct thread *thread, void *fault_addr, void *regs)
{
const uintptr_t address = (uintptr_t)fault_addr;
struct process_vm *vm = thread->vm;
struct vm_range *range;
char found;
unsigned long irqflags;
unsigned long error = ((struct x86_user_context *)regs)->gpr.error;
irqflags = kprintf_lock();
dkprintf("[%d] Page fault for 0x%lX\n",
ihk_mc_get_processor_id(), address);
dkprintf("%s for %s access in %s mode (reserved bit %s set), "
"it %s an instruction fetch\n",
(error & PF_PROT ? "protection fault" : "no page found"),
(error & PF_WRITE ? "write" : "read"),
(error & PF_USER ? "user" : "kernel"),
(error & PF_RSVD ? "was" : "wasn't"),
(error & PF_INSTR ? "was" : "wasn't"));
found = 0;
list_for_each_entry(range, &vm->vm_range_list, list) {
if (range->start <= address && range->end > address) {
found = 1;
dkprintf("address is in range, flag: 0x%X! \n",
range->flag);
ihk_mc_pt_print_pte(vm->address_space->page_table, (void*)address);
break;
}
}
if (!found) {
dkprintf("address is out of range! \n");
}
kprintf_unlock(irqflags);
/* TODO */
ihk_mc_debug_show_interrupt_context(regs);
//dkprintf("now dump a core file\n");
//coredump(proc, regs);
#ifdef DEBUG_PRINT_MEM
{
uint64_t *sp = (void *)REGS_GET_STACK_POINTER(regs);
kprintf("*rsp:%lx,*rsp+8:%lx,*rsp+16:%lx,*rsp+24:%lx,\n",
sp[0], sp[1], sp[2], sp[3]);
}
#endif
return;
}
void remote_flush_tlb_cpumask(struct process_vm *vm, void remote_flush_tlb_cpumask(struct process_vm *vm,
unsigned long addr, int cpu_id) unsigned long addr, int cpu_id)
{ {
@ -285,9 +231,9 @@ void remote_flush_tlb_cpumask(struct process_vm *vm,
/* Take a copy of the cpu set so that we don't hold the lock /* Take a copy of the cpu set so that we don't hold the lock
* all the way while interrupting other cores */ * all the way while interrupting other cores */
ihk_mc_spinlock_lock_noirq(&vm->cpu_set_lock); ihk_mc_spinlock_lock_noirq(&vm->address_space->cpu_set_lock);
memcpy(&_cpu_set, &vm->cpu_set, sizeof(cpu_set_t)); memcpy(&_cpu_set, &vm->address_space->cpu_set, sizeof(cpu_set_t));
ihk_mc_spinlock_unlock_noirq(&vm->cpu_set_lock); ihk_mc_spinlock_unlock_noirq(&vm->address_space->cpu_set_lock);
dkprintf("trying to aquire flush_entry->lock flush_ind: %d\n", flush_ind); dkprintf("trying to aquire flush_entry->lock flush_ind: %d\n", flush_ind);
@ -369,6 +315,7 @@ static void page_fault_handler(void *fault_addr, uint64_t reason, void *regs)
struct thread *thread = cpu_local_var(current); struct thread *thread = cpu_local_var(current);
int error; int error;
set_cputime(interrupt_from_user(regs)? 1: 2);
dkprintf("[%d]page_fault_handler(%p,%lx,%p)\n", dkprintf("[%d]page_fault_handler(%p,%lx,%p)\n",
ihk_mc_get_processor_id(), fault_addr, reason, regs); ihk_mc_get_processor_id(), fault_addr, reason, regs);
@ -416,7 +363,10 @@ static void page_fault_handler(void *fault_addr, uint64_t reason, void *regs)
info._sifields._sigfault.si_addr = fault_addr; info._sifields._sigfault.si_addr = fault_addr;
set_signal(SIGSEGV, regs, &info); set_signal(SIGSEGV, regs, &info);
} }
check_signal(0, regs, 0); if(interrupt_from_user(regs)){
cpu_enable_interrupt();
check_signal(0, regs, 0);
}
goto out; goto out;
} }
@ -427,6 +377,7 @@ out:
ihk_mc_get_processor_id(), fault_addr, reason, ihk_mc_get_processor_id(), fault_addr, reason,
regs, error); regs, error);
check_need_resched(); check_need_resched();
set_cputime(0);
return; return;
} }
@ -931,7 +882,6 @@ void kmalloc_init(void)
{ {
struct cpu_local_var *v = get_this_cpu_local_var(); struct cpu_local_var *v = get_this_cpu_local_var();
struct malloc_header *h = &v->free_list; struct malloc_header *h = &v->free_list;
ihk_mc_spinlock_init(&v->free_list_lock);
int i; int i;
h->check = 0x5a5a5a5a; h->check = 0x5a5a5a5a;
@ -950,80 +900,11 @@ void kmalloc_init(void)
ihk_mc_spinlock_init(&alloclock); ihk_mc_spinlock_init(&alloclock);
} }
void ____kfree(struct cpu_local_var *v, struct malloc_header *p)
void *___kmalloc(int size, enum ihk_mc_ap_flag flag)
{ {
struct cpu_local_var *v = get_this_cpu_local_var();
struct malloc_header *h = &v->free_list, *prev, *p;
int u, req_page;
unsigned long flags;
if (size >= PAGE_SIZE * 4) {
return NULL;
}
u = (size + sizeof(*h) - 1) / sizeof(*h);
flags = ihk_mc_spinlock_lock(&v->free_list_lock);
prev = h;
h = h->next;
while (1) {
if (h == &v->free_list) {
req_page = ((u + 2) * sizeof(*h) + PAGE_SIZE - 1)
>> PAGE_SHIFT;
h = allocate_pages(req_page, flag);
if(h == NULL) {
kprintf("kmalloc(%#x,%#x): out of memory\n", size, flag);
ihk_mc_spinlock_unlock(&v->free_list_lock, flags);
return NULL;
}
h->check = 0x5a5a5a5a;
prev->next = h;
h->size = (req_page * PAGE_SIZE) / sizeof(*h) - 2;
/* Guard entry */
p = h + h->size + 1;
p->check = 0x5a5a5a5a;
p->next = &v->free_list;
p->size = 0;
h->next = p;
}
if (h->size >= u) {
if (h->size == u || h->size == u + 1) {
prev->next = h->next;
h->cpu_id = ihk_mc_get_processor_id();
ihk_mc_spinlock_unlock(&v->free_list_lock, flags);
return h + 1;
} else { /* Divide */
h->size -= u + 1;
p = h + h->size + 1;
p->check = 0x5a5a5a5a;
p->size = u;
p->cpu_id = ihk_mc_get_processor_id();
ihk_mc_spinlock_unlock(&v->free_list_lock, flags);
return p + 1;
}
}
prev = h;
h = h->next;
}
}
void ___kfree(void *ptr)
{
struct malloc_header *p = (struct malloc_header *)ptr;
struct cpu_local_var *v = get_cpu_local_var((--p)->cpu_id);
struct malloc_header *h = &v->free_list; struct malloc_header *h = &v->free_list;
int combined = 0; int combined = 0;
unsigned long flags;
flags = ihk_mc_spinlock_lock(&v->free_list_lock);
h = h->next; h = h->next;
while ((p < h || p > h->next) && h != &v->free_list) { while ((p < h || p > h->next) && h != &v->free_list) {
@ -1050,7 +931,94 @@ void ___kfree(void *ptr)
p->next = h->next; p->next = h->next;
h->next = p; h->next = p;
} }
ihk_mc_spinlock_unlock(&v->free_list_lock, flags); }
/**
 * \brief Allocate a block from the calling CPU's free list.
 *
 * The free list is a circular singly linked list of struct malloc_header
 * chunks anchored at v->free_list. Sizes are counted in units of
 * sizeof(struct malloc_header). No lock is taken in this function.
 *
 * \param size requested size in bytes; PAGE_SIZE * 4 or more is refused
 * \param flag passed through to allocate_pages() when the list has no fit
 * \return pointer just past the chunk header, or NULL on refusal or when
 *         allocate_pages() fails
 */
void *___kmalloc(int size, enum ihk_mc_ap_flag flag)
{
struct cpu_local_var *v = get_this_cpu_local_var();
struct malloc_header *h = &v->free_list, *prev, *p;
int u, req_page;
/*
 * Detach the whole remote_free_list (0 is stored in its place) and hand
 * every entry to ____kfree() for this CPU. ___kfree() pushes blocks onto
 * this list when called on a CPU other than the block's cpu_id.
 * NOTE(review): assumes xchg8() is an atomic exchange -- confirm.
 */
p = (struct malloc_header *)xchg8((unsigned long *)&v->remote_free_list, 0L);
while(p){
/* read the link before ____kfree() gets the entry */
struct malloc_header *n = p->next;
____kfree(v, p);
p = n;
}
if (size >= PAGE_SIZE * 4) {
return NULL;
}
/* request size in header-sized units, rounded up */
u = (size + sizeof(*h) - 1) / sizeof(*h);
prev = h;
h = h->next;
while (1) {
/* back at the list head: nothing fit, so obtain fresh pages */
if (h == &v->free_list) {
/* room for u units plus two extra header-sized units (see "- 2" below) */
req_page = ((u + 2) * sizeof(*h) + PAGE_SIZE - 1)
>> PAGE_SHIFT;
h = allocate_pages(req_page, flag);
if(h == NULL) {
kprintf("kmalloc(%#x,%#x): out of memory\n", size, flag);
return NULL;
}
h->check = 0x5a5a5a5a;
prev->next = h;
/* usable units: total minus this chunk's header and the guard entry */
h->size = (req_page * PAGE_SIZE) / sizeof(*h) - 2;
/* Guard entry */
p = h + h->size + 1;
p->check = 0x5a5a5a5a;
p->next = &v->free_list;
p->size = 0;
h->next = p;
}
if (h->size >= u) {
/* exact fit, or only one unit to spare: unlink and return the whole chunk */
if (h->size == u || h->size == u + 1) {
prev->next = h->next;
h->cpu_id = ihk_mc_get_processor_id();
return h + 1;
} else { /* Divide */
/* shrink the chunk (it stays linked) and return its tail as a new block */
h->size -= u + 1;
p = h + h->size + 1;
p->check = 0x5a5a5a5a;
p->size = u;
p->cpu_id = ihk_mc_get_processor_id();
return p + 1;
}
}
prev = h;
h = h->next;
}
}
/**
 * \brief Release a block obtained from ___kmalloc().
 *
 * The header just before ptr records the cpu_id of the CPU that allocated
 * the block. If that is the calling CPU, the block goes straight back to
 * the free list through ____kfree(). Otherwise it is pushed onto that
 * CPU's remote_free_list with a compare-and-exchange retry loop.
 *
 * \param ptr pointer previously returned by ___kmalloc()
 */
void ___kfree(void *ptr)
{
	struct malloc_header *p = (struct malloc_header *)ptr;
	struct cpu_local_var *v = get_cpu_local_var((--p)->cpu_id);

	if (p->cpu_id == ihk_mc_get_processor_id()) {
		____kfree(v, p);
		return;
	}

	/* other CPU's block: link in front of the current head, retry on a race */
	for (;;) {
		unsigned long expected;
		unsigned long previous;

		p->next = v->remote_free_list;
		expected = (unsigned long)p->next;
		previous = atomic_cmpxchg8(
				(unsigned long *)&v->remote_free_list,
				expected, (unsigned long)p);
		if (previous == expected) {
			break;
		}
	}
}
void print_free_list(void) void print_free_list(void)

File diff suppressed because it is too large Load Diff

View File

@ -38,214 +38,34 @@ extern int sscanf(const char * buf, const char * fmt, ...);
extern int osnum; extern int osnum;
void create_proc_procfs_files(int pid, int cpuid); static void
void delete_proc_procfs_files(int pid); procfs_thread_ctl(struct thread *thread, int msg)
void create_os_procfs_files(void);
void delete_os_procfs_files(void);
static void create_proc_procfs_file(int pid, char *fname, int mode, int cpuid);
static void delete_proc_procfs_file(int pid, char *fname);
static void operate_proc_procfs_file(int pid, char *fname, int msg, int mode, int cpuid);
int copy_from_user(void *dst, const void *src, size_t siz);
int copy_to_user(void *dst, const void *src, size_t siz);
/**
* \brief Create all procfs files for process.
*
* \param pid pid of the process
* \param cpuid cpuid of the process
*/
void create_proc_procfs_files(int pid, int cpuid)
{
char fname[PROCFS_NAME_MAX];
dprintf("create procfs files:\n");
snprintf(fname, PROCFS_NAME_MAX, "mcos%d/%d/auxv", osnum, pid);
create_proc_procfs_file(pid, fname, 0400, cpuid);
snprintf(fname, PROCFS_NAME_MAX, "mcos%d/%d/cmdline", osnum, pid);
create_proc_procfs_file(pid, fname, 0444, cpuid);
snprintf(fname, PROCFS_NAME_MAX, "mcos%d/%d/mem", osnum, pid);
create_proc_procfs_file(pid, fname, 0400, cpuid);
snprintf(fname, PROCFS_NAME_MAX, "mcos%d/%d/maps", osnum, pid);
create_proc_procfs_file(pid, fname, 0444, cpuid);
snprintf(fname, PROCFS_NAME_MAX, "mcos%d/%d/pagemap", osnum, pid);
create_proc_procfs_file(pid, fname, 0444, cpuid);
snprintf(fname, PROCFS_NAME_MAX, "mcos%d/%d/status", osnum, pid);
create_proc_procfs_file(pid, fname, 0444, cpuid);
snprintf(fname, PROCFS_NAME_MAX, "mcos%d/%d/task/%d/mem", osnum, pid, pid);
create_proc_procfs_file(pid, fname, 0400, cpuid);
snprintf(fname, PROCFS_NAME_MAX, "mcos%d/%d/task/%d/stat", osnum, pid, pid);
create_proc_procfs_file(pid, fname, 0444, cpuid);
dprintf("create procfs files: done\n");
}
/**
* \brief Create a procfs file for process.
*
* \param pid pid of the process
* \param fname file name of the procfs file
* \param mode file mode
* \param cpuid cpuid of the process
*/
static void create_proc_procfs_file(int pid, char *fname, int mode, int cpuid)
{
dprintf("create procfs file: %s, mode: %o, cpuid: %d\n", fname, mode, cpuid);
operate_proc_procfs_file(pid, fname, SCD_MSG_PROCFS_CREATE, mode, cpuid);
}
/**
* \brief Delete all procfs files for process.
*
* \param pid pid of the process
*/
void delete_proc_procfs_files(int pid)
{
char fname[PROCFS_NAME_MAX];
dprintf("delete procfs files for pid %d.\n", pid);
snprintf(fname, PROCFS_NAME_MAX, "mcos%d/%d/task/%d/mem", osnum, pid, pid);
delete_proc_procfs_file(pid, fname);
snprintf(fname, PROCFS_NAME_MAX, "mcos%d/%d/task/%d/stat", osnum, pid, pid);
delete_proc_procfs_file(pid, fname);
snprintf(fname, PROCFS_NAME_MAX, "mcos%d/%d/task/%d", osnum, pid, pid);
delete_proc_procfs_file(pid, fname);
snprintf(fname, PROCFS_NAME_MAX, "mcos%d/%d/task", osnum, pid);
delete_proc_procfs_file(pid, fname);
snprintf(fname, PROCFS_NAME_MAX, "mcos%d/%d/mem", osnum, pid);
delete_proc_procfs_file(pid, fname);
snprintf(fname, PROCFS_NAME_MAX, "mcos%d/%d/maps", osnum, pid);
delete_proc_procfs_file(pid, fname);
snprintf(fname, PROCFS_NAME_MAX, "mcos%d/%d/status", osnum, pid);
delete_proc_procfs_file(pid, fname);
snprintf(fname, PROCFS_NAME_MAX, "mcos%d/%d/pagemap", osnum, pid);
delete_proc_procfs_file(pid, fname);
snprintf(fname, PROCFS_NAME_MAX, "mcos%d/%d/cmdline", osnum, pid);
delete_proc_procfs_file(pid, fname);
snprintf(fname, PROCFS_NAME_MAX, "mcos%d/%d/auxv", osnum, pid);
delete_proc_procfs_file(pid, fname);
snprintf(fname, PROCFS_NAME_MAX, "mcos%d/%d", osnum, pid);
delete_proc_procfs_file(pid, fname);
dprintf("delete procfs files for pid %d: done\n", pid);
}
/**
* \brief Delete a procfs file for process.
*
* \param pid pid of the process
* \param fname file name of the procfs file
*/
static void delete_proc_procfs_file(int pid, char *fname)
{
dprintf("delete procfs file: %s\n", fname);
operate_proc_procfs_file(pid, fname, SCD_MSG_PROCFS_DELETE, 0, 0);
dprintf("delete procfs file: %s done\n", fname);
}
/**
* \brief create a procfs file for this operating system
* \param fname relative path name from "host:/proc".
* \param mode permissions of the file to be created
*
* Though operate_proc_procfs_file() is intended to create a process
* specific file, it is reused to create a OS specific file by
* specifying -1 as the pid parameter.
*/
static void create_os_procfs_file(char *fname, int mode)
{
const pid_t pid = -1;
const int msg = SCD_MSG_PROCFS_CREATE;
const int cpuid = ihk_mc_get_processor_id(); /* i.e. BSP */
operate_proc_procfs_file(pid, fname, msg, mode, cpuid);
return;
}
/**
* \brief create all procfs files for this operating system
*/
void create_os_procfs_files(void)
{
char *fname = NULL;
size_t n;
fname = kmalloc(PROCFS_NAME_MAX, IHK_MC_AP_CRITICAL);
n = snprintf(fname, PROCFS_NAME_MAX, "mcos%d/stat", osnum);
if (n >= PROCFS_NAME_MAX) panic("/proc/stat");
create_os_procfs_file(fname, 0444);
return;
}
/**
* \brief Create/delete a procfs file for process.
*
* \param pid pid of the process
* \param fname file name of the procfs file
* \param msg message (create/delete)
* \param mode file mode
* \param cpuid cpuid of the process
*/
static void operate_proc_procfs_file(int pid, char *fname, int msg, int mode, int cpuid)
{ {
struct ihk_ikc_channel_desc *syscall_channel; struct ihk_ikc_channel_desc *syscall_channel;
struct ikc_scd_packet pckt; struct ikc_scd_packet packet;
struct procfs_file *f;
int ret;
syscall_channel = cpu_local_var(syscall_channel); syscall_channel = cpu_local_var(syscall_channel);
memset(&packet, '\0', sizeof packet);
packet.arg = thread->tid;
packet.msg = msg;
packet.osnum = osnum;
packet.ref = thread->cpu_id;
packet.pid = thread->proc->pid;
packet.err = 0;
f = kmalloc(sizeof(struct procfs_file), IHK_MC_AP_NOWAIT); ihk_ikc_send(syscall_channel, &packet, 0);
if (!f) { }
kprintf("ERROR: not enough memory for dealing procfs file %s!",
fname);
return;
}
f->status = 0;
f->mode = mode;
strncpy(f->fname, fname, PROCFS_NAME_MAX);
pckt.arg = virt_to_phys(f);
pckt.msg = msg;
pckt.osnum = osnum;
pckt.ref = cpuid;
pckt.pid = pid;
pckt.err = 0;
ret = ihk_ikc_send(syscall_channel, &pckt, 0); void
if (ret < 0) { procfs_create_thread(struct thread *thread)
kprintf("ERROR: sending IKC msg, ret: %d\n", ret); {
} procfs_thread_ctl(thread, SCD_MSG_PROCFS_TID_CREATE);
}
while (f->status != 1) { void
cpu_pause(); procfs_delete_thread(struct thread *thread)
} {
kfree(f); procfs_thread_ctl(thread, SCD_MSG_PROCFS_TID_DELETE);
} }
/** /**
@ -253,12 +73,13 @@ static void operate_proc_procfs_file(int pid, char *fname, int msg, int mode, in
* *
* \param rarg returned argument * \param rarg returned argument
*/ */
void
void process_procfs_request(unsigned long rarg) process_procfs_request(unsigned long rarg)
{ {
unsigned long parg, pbuf; unsigned long parg, pbuf;
struct thread *thread = cpu_local_var(current); struct thread *thread = NULL;
struct process *proc = thread->proc; struct process *proc = NULL;
struct process_vm *vm = NULL;
struct procfs_read *r; struct procfs_read *r;
struct ikc_scd_packet packet; struct ikc_scd_packet packet;
int rosnum, ret, pid, tid, ans = -EIO, eof = 0; int rosnum, ret, pid, tid, ans = -EIO, eof = 0;
@ -268,7 +89,7 @@ void process_procfs_request(unsigned long rarg)
unsigned long offset; unsigned long offset;
int count; int count;
int npages; int npages;
int is_current = 1; /* is 'proc' same as 'current'? */ int readwrite = 0;
dprintf("process_procfs_request: invoked.\n"); dprintf("process_procfs_request: invoked.\n");
@ -298,6 +119,7 @@ void process_procfs_request(unsigned long rarg)
goto bufunavail; goto bufunavail;
} }
readwrite = r->readwrite;
count = r->count; count = r->count;
offset = r->offset; offset = r->offset;
dprintf("fname: %s, offset: %lx, count:%d.\n", r->fname, r->offset, r->count); dprintf("fname: %s, offset: %lx, count:%d.\n", r->fname, r->offset, r->count);
@ -336,32 +158,47 @@ void process_procfs_request(unsigned long rarg)
*/ */
ret = sscanf(p, "%d/", &pid); ret = sscanf(p, "%d/", &pid);
if (ret == 1) { if (ret == 1) {
if (pid != cpu_local_var(current)->proc->pid) { struct mcs_rwlock_node tlock;
/* We are not located in the proper cpu for some reason. */ int tids;
struct thread *thread1 = NULL;
dprintf("mismatched pid. We are %d, but requested pid is %d.\n", proc = find_process(pid, &lock);
pid, cpu_local_var(current)->pid); if(proc == NULL){
tid = pid; /* main thread */ kprintf("process_procfs_request: no such pid %d\n", pid);
thread = find_thread(pid, tid, &lock); goto end;
if (!thread) {
dprintf("We cannot find the proper cpu for requested pid.\n");
goto end;
}
else if (thread->cpu_id != ihk_mc_get_processor_id()) {
/* The target process has gone by migration. */
r->newcpu = thread->cpu_id;
dprintf("expected cpu id is %d.\n", thread->cpu_id);
thread_unlock(thread, &lock);
ans = 0;
goto end;
}
else {
thread_unlock(thread, &lock);
/* 'proc' is not 'current' */
is_current = 0;
}
proc = thread->proc;
} }
p = strchr(p, '/') + 1;
if((tids = sscanf(p, "task/%d/", &tid)) == 1){
p = strchr(p, '/') + 1;
p = strchr(p, '/') + 1;
}
else
tid = pid;
mcs_rwlock_reader_lock_noirq(&proc->threads_lock, &tlock);
list_for_each_entry(thread, &proc->threads_list, siblings_list){
if(thread->tid == tid)
break;
if(!thread1)
thread1 = thread;
}
if(thread == NULL){
kprintf("process_procfs_request: no such tid %d-%d\n", pid, tid);
if(tids){
process_unlock(proc, &lock);
mcs_rwlock_reader_unlock_noirq(&proc->threads_lock, &tlock);
goto end;
}
thread = thread1;
}
if(thread)
hold_thread(thread);
mcs_rwlock_reader_unlock_noirq(&proc->threads_lock, &tlock);
hold_process(proc);
vm = proc->vm;
if(vm)
hold_process_vm(vm);
process_unlock(proc, &lock);
} }
else if (!strcmp(p, "stat")) { /* "/proc/stat" */ else if (!strcmp(p, "stat")) { /* "/proc/stat" */
extern int num_processors; /* kernel/ap.c */ extern int num_processors; /* kernel/ap.c */
@ -392,10 +229,9 @@ void process_procfs_request(unsigned long rarg)
goto end; goto end;
} }
else { else {
kprintf("unsupported procfs entry: %s\n", p);
goto end; goto end;
} }
dprintf("matched PID: %d.\n", pid);
p = strchr(p, '/') + 1;
/* /*
* mcos%d/PID/mem * mcos%d/PID/mem
@ -404,74 +240,55 @@ void process_procfs_request(unsigned long rarg)
* of the process. The count is the length of the area. * of the process. The count is the length of the area.
*/ */
if (strcmp(p, "mem") == 0) { if (strcmp(p, "mem") == 0) {
struct vm_range *range; uint64_t reason = PF_POPULATE | PF_WRITE | PF_USER;
struct process_vm *vm = proc->vm; unsigned long offset = r->offset;
unsigned long left = r->count;
int ret;
struct page_table *pt = vm->address_space->page_table;
if (!is_current) { ans = 0;
uint64_t reason = PF_POPULATE | PF_WRITE | PF_USER; if(left == 0)
unsigned long offset = r->offset; goto end;
unsigned long left = r->count;
int ret;
ans = 0; #if 0
if(left == 0) if(!(proc->ptrace & PT_TRACED) ||
goto end; !(proc->status & (PS_STOPPED | PS_TRACED))){
ans = -EIO;
while(left){ goto end;
unsigned long pa;
char *va;
int pos = offset & (PAGE_SIZE - 1);
int size = PAGE_SIZE - pos;
if(size > left)
size = left;
ret = page_fault_process_vm(proc->vm,
(void *)offset, reason);
if(ret){
if(ans == 0)
ans = -EIO;
goto end;
}
ret = ihk_mc_pt_virt_to_phys(vm->address_space->page_table,
(void *)offset, &pa);
if(ret){
if(ans == 0)
ans = -EIO;
goto end;
}
va = phys_to_virt(pa);
memcpy(buf + ans, va, size);
offset += size;
left -= size;
ans += size;
}
} }
else{ #endif
unsigned long offset = r->offset;
unsigned long left = r->count; if(readwrite == 0)
unsigned long pos; reason = PF_POPULATE | PF_USER;
unsigned long l;
ans = 0; while(left){
list_for_each_entry(range, &vm->vm_range_list, list) { unsigned long pa;
dprintf("range: %lx - %lx\n", range->start, range->end); char *va;
while (left && int pos = offset & (PAGE_SIZE - 1);
(range->start <= offset) && int size = PAGE_SIZE - pos;
(offset < range->end)) {
pos = offset & (PAGE_SIZE - 1); if(size > left)
l = PAGE_SIZE - pos; size = left;
if(l > left) ret = page_fault_process_vm(vm, (void *)offset, reason);
l = left; if(ret){
if(copy_from_user(buf, (void *)offset, l)){ if(ans == 0)
if(ans == 0) ans = -EIO;
ans = -EIO; goto end;
goto end;
}
buf += l;
ans += l;
offset += l;
left -= l;
}
} }
ret = ihk_mc_pt_virt_to_phys(pt, (void *)offset, &pa);
if(ret){
if(ans == 0)
ans = -EIO;
goto end;
}
va = phys_to_virt(pa);
if(readwrite)
memcpy(va, buf + ans, size);
else
memcpy(buf + ans, va, size);
offset += size;
left -= size;
ans += size;
} }
goto end; goto end;
} }
@ -481,7 +298,6 @@ void process_procfs_request(unsigned long rarg)
*/ */
if (strcmp(p, "maps") == 0) { if (strcmp(p, "maps") == 0) {
struct vm_range *range; struct vm_range *range;
struct process_vm *vm = proc->vm;
int left = r->count - 1; /* extra 1 for terminating NULL */ int left = r->count - 1; /* extra 1 for terminating NULL */
int written = 0; int written = 0;
char *_buf = buf; char *_buf = buf;
@ -539,7 +355,6 @@ void process_procfs_request(unsigned long rarg)
* mcos%d/PID/pagemap * mcos%d/PID/pagemap
*/ */
if (strcmp(p, "pagemap") == 0) { if (strcmp(p, "pagemap") == 0) {
struct process_vm *vm = proc->vm;
uint64_t *_buf = (uint64_t *)buf; uint64_t *_buf = (uint64_t *)buf;
uint64_t start, end; uint64_t start, end;
@ -622,7 +437,7 @@ void process_procfs_request(unsigned long rarg)
* mcos%d/PID/auxv * mcos%d/PID/auxv
*/ */
if (strcmp(p, "auxv") == 0) { if (strcmp(p, "auxv") == 0) {
unsigned int limit = AUXV_LEN * sizeof(int); unsigned int limit = AUXV_LEN * sizeof(unsigned long);
unsigned int len = r->count; unsigned int len = r->count;
if (r->offset < limit) { if (r->offset < limit) {
if (limit < r->offset + r->count) { if (limit < r->offset + r->count) {
@ -675,112 +490,71 @@ void process_procfs_request(unsigned long rarg)
* The offset is treated as the beginning of the virtual address area * The offset is treated as the beginning of the virtual address area
* of the process. The count is the length of the area. * of the process. The count is the length of the area.
*/ */
tid = pid;
ret = sscanf(p, "task/%d/", &tid);
if (ret == 1) {
p = strchr(p, '/') + 1;
p = strchr(p, '/') + 1;
if (!strcmp(p, "mem")){ if (!strcmp(p, "stat")) {
struct vm_range *range; char tmp[1024];
struct process_vm *vm = proc->vm; int len;
if (!is_current) { /*
goto end; * pid (comm) state ppid
* pgrp session tty_nr tpgid
* flags minflt cminflt majflt
* cmajflt utime stime cutime
* cstime priority nice num_threads
* itrealvalue starttime vsize rss
* rsslim startcode endcode startstack
* kstkesp kstkeip signal blocked
* sigignore sigcatch wchan nswap
* cnswap exit_signal processor rt_priority
* policy delayacct_blkio_ticks guest_time cguest_time
*/
ans = sprintf(tmp,
"%d (%s) %c %d " // pid...
"%d %d %d %d " // pgrp...
"%u %lu %lu %lu " // flags...
"%lu %lu %lu %ld " // cmajflt...
"%ld %ld %ld %ld " // cstime...
"%ld %llu %lu %ld " // itrealvalue...
"%lu %lu %lu %lu " // rsslim...
"%lu %lu %lu %lu " // kstkesp...
"%lu %lu %lu %lu " // sigignore...
"%lu %d %d %u " // cnswap...
"%u %llu %lu %ld\n", // policy...
0, "exe", 'R', 0, // pid...
0, 0, 0, 0, // pgrp...
0, 0L, 0L, 0L, // flags...
0L, 0L, 0L, 0L, // cmajflt...
0L, 0L, 0L, 0L, // cstime...
0L, 0LL, 0L, 0L, // itrealvalue...
0L, 0L, 0L, 0L, // rsslim...
0L, 0L, 0L, 0L, // kstkesp...
0L, 0L, 0L, 0L, // sigignore...
0L, 0, thread->cpu_id, 0, // cnswap...
0, 0LL, 0L, 0L // policy...
);
dprintf("tmp=%s\n", tmp);
len = strlen(tmp);
if (r->offset < len) {
if (r->offset + r->count < len) {
ans = r->count;
} else {
eof = 1;
ans = len;
} }
if (pid != tid) { strncpy(buf, tmp + r->offset, ans);
/* We are not multithreaded yet. */ } else if (r->offset == len) {
goto end; ans = 0;
} eof = 1;
list_for_each_entry(range, &vm->vm_range_list, list) {
dprintf("range: %lx - %lx\n", range->start, range->end);
if ((range->start <= r->offset) &&
(r->offset < range->end)) {
unsigned int len = r->count;
if (range->end < r->offset + r->count) {
len = range->end - r->offset;
}
memcpy((void *)buf, (void *)range->start, len);
ans = len;
break;
}
}
goto end;
} }
if (!strcmp(p, "stat")) {
char tmp[1024];
int len;
if ((thread = find_thread(pid, tid, &lock))){
dprintf("thread found! pid=%d tid=%d\n", pid, tid);
/*
* pid (comm) state ppid
* pgrp session tty_nr tpgid
* flags minflt cminflt majflt
* cmajflt utime stime cutime
* cstime priority nice num_threads
* itrealvalue starttime vsize rss
* rsslim startcode endcode startstack
* kstkesp kstkeip signal blocked
* sigignore sigcatch wchan nswap
* cnswap exit_signal processor rt_priority
* policy delayacct_blkio_ticks guest_time cguest_time
*/
ans = sprintf(tmp,
"%d (%s) %c %d " // pid...
"%d %d %d %d " // pgrp...
"%u %lu %lu %lu " // flags...
"%lu %lu %lu %ld " // cmajflt...
"%ld %ld %ld %ld " // cstime...
"%ld %llu %lu %ld " // itrealvalue...
"%lu %lu %lu %lu " // rsslim...
"%lu %lu %lu %lu " // kstkesp...
"%lu %lu %lu %lu " // sigignore...
"%lu %d %d %u " // cnswap...
"%u %llu %lu %ld\n", // policy...
0, "exe", 'R', 0, // pid...
0, 0, 0, 0, // pgrp...
0, 0L, 0L, 0L, // flags...
0L, 0L, 0L, 0L, // cmajflt...
0L, 0L, 0L, 0L, // cstime...
0L, 0LL, 0L, 0L, // itrealvalue...
0L, 0L, 0L, 0L, // rsslim...
0L, 0L, 0L, 0L, // kstkesp...
0L, 0L, 0L, 0L, // sigignore...
0L, 0, thread->cpu_id, 0, // cnswap...
0, 0LL, 0L, 0L // policy...
);
thread_unlock(thread, &lock);
dprintf("tmp=%s\n", tmp);
len = strlen(tmp);
if (r->offset < len) {
if (r->offset + r->count < len) {
ans = r->count;
} else {
eof = 1;
ans = len;
}
strncpy(buf, tmp + r->offset, ans);
} else if (r->offset == len) {
ans = 0;
eof = 1;
}
goto end;
}
else{
dprintf("no thread found pid=%d tid=%d\n", pid, tid);
}
}
dprintf("could not find a matching entry for task/%d/%s.\n", tid, p);
goto end; goto end;
} }
/* if(thread)
* Processing for pattern "mcos%d/PID/xxx" files should be here. kprintf("unsupported procfs entry: %d/task/%d/%s\n", pid, tid, p);
*/ else
dprintf("could not find a matching entry for %s.\n", p); kprintf("unsupported procfs entry: %d/%s\n", pid, p);
end: end:
ihk_mc_unmap_virtual(buf, npages, 0); ihk_mc_unmap_virtual(buf, npages, 0);
dprintf("ret: %d, eof: %d\n", ans, eof); dprintf("ret: %d, eof: %d\n", ans, eof);
@ -801,6 +575,12 @@ dataunavail:
if (ret < 0) { if (ret < 0) {
kprintf("ERROR: sending IKC msg, ret: %d\n", ret); kprintf("ERROR: sending IKC msg, ret: %d\n", ret);
} }
if(proc)
release_process(proc);
if(thread)
release_thread(thread);
if(vm)
release_process_vm(vm);
return; return;
} }

View File

@ -34,12 +34,14 @@ static memobj_release_func_t shmobj_release;
static memobj_ref_func_t shmobj_ref; static memobj_ref_func_t shmobj_ref;
static memobj_get_page_func_t shmobj_get_page; static memobj_get_page_func_t shmobj_get_page;
static memobj_invalidate_page_func_t shmobj_invalidate_page; static memobj_invalidate_page_func_t shmobj_invalidate_page;
static memobj_lookup_page_func_t shmobj_lookup_page;
static struct memobj_ops shmobj_ops = { static struct memobj_ops shmobj_ops = {
.release = &shmobj_release, .release = &shmobj_release,
.ref = &shmobj_ref, .ref = &shmobj_ref,
.get_page = &shmobj_get_page, .get_page = &shmobj_get_page,
.invalidate_page = &shmobj_invalidate_page, .invalidate_page = &shmobj_invalidate_page,
.lookup_page = &shmobj_lookup_page,
}; };
static struct shmobj *to_shmobj(struct memobj *memobj) static struct shmobj *to_shmobj(struct memobj *memobj)
@ -112,6 +114,43 @@ void shmobj_list_unlock(void)
return; return;
} }
/***********************************************************************
* shmlock_users
*/
ihk_spinlock_t shmlock_users_lock_body = SPIN_LOCK_UNLOCKED;
static LIST_HEAD(shmlock_users);
/*
 * Unlink a shmlock_user record from the list it is chained on and free it.
 *
 * A record whose 'locked' amount is still non-zero must not be freed;
 * that situation is reported through panic().
 */
void shmlock_user_free(struct shmlock_user *user)
{
	if (user->locked != 0)
		panic("shmlock_user_free()");

	list_del(&user->chain);
	kfree(user);
}
/*
 * Look up the shmlock_user record for 'ruid' on the shmlock_users list.
 * When no record exists, a new one with locked == 0 is allocated and
 * added to the list.
 *
 * On success the record is stored in *userp and 0 is returned.
 * -ENOMEM is returned (and *userp is left untouched) when a new record
 * cannot be allocated.
 *
 * NOTE(review): no locking is done here; callers presumably hold the
 * shmlock_users lock -- confirm against the call sites.
 */
int shmlock_user_get(uid_t ruid, struct shmlock_user **userp)
{
	struct shmlock_user *cur;
	struct shmlock_user *found = NULL;

	list_for_each_entry(cur, &shmlock_users, chain) {
		if (cur->ruid == ruid) {
			found = cur;
			break;
		}
	}

	if (!found) {
		found = kmalloc(sizeof(*found), IHK_MC_AP_NOWAIT);
		if (!found)
			return -ENOMEM;

		found->ruid = ruid;
		found->locked = 0;
		list_add(&found->chain, &shmlock_users);
	}

	*userp = found;
	return 0;
}
/*********************************************************************** /***********************************************************************
* operations * operations
*/ */
@ -120,8 +159,16 @@ int shmobj_create(struct shmid_ds *ds, struct memobj **objp)
{ {
struct shmobj *obj = NULL; struct shmobj *obj = NULL;
int error; int error;
int pgshift;
size_t pgsize;
dkprintf("shmobj_create(%p %#lx,%p)\n", ds, ds->shm_segsz, objp); dkprintf("shmobj_create(%p %#lx,%p)\n", ds, ds->shm_segsz, objp);
pgshift = ds->init_pgshift;
if (!pgshift) {
pgshift = PAGE_SHIFT;
}
pgsize = (size_t)1 << pgshift;
obj = kmalloc(sizeof(*obj), IHK_MC_AP_NOWAIT); obj = kmalloc(sizeof(*obj), IHK_MC_AP_NOWAIT);
if (!obj) { if (!obj) {
error = -ENOMEM; error = -ENOMEM;
@ -135,8 +182,10 @@ int shmobj_create(struct shmid_ds *ds, struct memobj **objp)
obj->ds = *ds; obj->ds = *ds;
obj->ds.shm_perm.seq = the_seq++; obj->ds.shm_perm.seq = the_seq++;
obj->ds.shm_nattch = 1; obj->ds.shm_nattch = 1;
obj->ds.init_pgshift = 0;
obj->index = -1; obj->index = -1;
obj->real_segsz = (obj->ds.shm_segsz + PAGE_SIZE - 1) & PAGE_MASK; obj->pgshift = pgshift;
obj->real_segsz = (obj->ds.shm_segsz + pgsize - 1) & ~(pgsize - 1);
page_list_init(obj); page_list_init(obj);
ihk_mc_spinlock_init(&obj->memobj.lock); ihk_mc_spinlock_init(&obj->memobj.lock);
@ -171,9 +220,24 @@ void shmobj_destroy(struct shmobj *obj)
extern struct shm_info the_shm_info; extern struct shm_info the_shm_info;
extern struct list_head kds_free_list; extern struct list_head kds_free_list;
extern int the_maxi; extern int the_maxi;
struct shmlock_user *user;
size_t size;
int npages;
dkprintf("shmobj_destroy(%p [%d %o])\n", obj, obj->index, obj->ds.shm_perm.mode); dkprintf("shmobj_destroy(%p [%d %o])\n", obj, obj->index, obj->ds.shm_perm.mode);
if (obj->user) {
user = obj->user;
obj->user = NULL;
shmlock_users_lock();
size = obj->real_segsz;
user->locked -= size;
if (!user->locked) {
shmlock_user_free(user);
}
shmlock_users_unlock();
}
/* zap page_list */ /* zap page_list */
npages = (size_t)1 << (obj->pgshift - PAGE_SHIFT);
for (;;) { for (;;) {
struct page *page; struct page *page;
int count; int count;
@ -200,9 +264,8 @@ void shmobj_destroy(struct shmobj *obj)
panic("shmobj_release"); panic("shmobj_release");
} }
/* XXX:NYI: large pages */
page->mode = PM_NONE; page->mode = PM_NONE;
free_pages(phys_to_virt(page_to_phys(page)), 1); free_pages(phys_to_virt(page_to_phys(page)), npages);
} }
if (obj->index < 0) { if (obj->index < 0) {
kfree(obj); kfree(obj);
@ -235,16 +298,17 @@ void shmobj_destroy(struct shmobj *obj)
static void shmobj_release(struct memobj *memobj) static void shmobj_release(struct memobj *memobj)
{ {
struct shmobj *obj = to_shmobj(memobj); struct shmobj *obj = to_shmobj(memobj);
struct thread *thread = cpu_local_var(current);
struct process *proc = thread->proc;
struct shmobj *freeobj = NULL; struct shmobj *freeobj = NULL;
long newref; long newref;
extern time_t time(void); extern time_t time(void);
extern pid_t getpid(void);
dkprintf("shmobj_release(%p)\n", memobj); dkprintf("shmobj_release(%p)\n", memobj);
memobj_lock(&obj->memobj); memobj_lock(&obj->memobj);
if (obj->index >= 0) { if (obj->index >= 0) {
obj->ds.shm_dtime = time(); obj->ds.shm_dtime = time();
obj->ds.shm_lpid = getpid(); obj->ds.shm_lpid = proc->pid;
dkprintf("shmobj_release:drop shm_nattach %p %d\n", obj, obj->ds.shm_nattch); dkprintf("shmobj_release:drop shm_nattach %p %d\n", obj, obj->ds.shm_nattch);
} }
newref = --obj->ds.shm_nattch; newref = --obj->ds.shm_nattch;
@ -272,16 +336,17 @@ static void shmobj_release(struct memobj *memobj)
static void shmobj_ref(struct memobj *memobj) static void shmobj_ref(struct memobj *memobj)
{ {
struct shmobj *obj = to_shmobj(memobj); struct shmobj *obj = to_shmobj(memobj);
struct thread *thread = cpu_local_var(current);
struct process *proc = thread->proc;
long newref; long newref;
extern time_t time(void); extern time_t time(void);
extern pid_t getpid(void);
dkprintf("shmobj_ref(%p)\n", memobj); dkprintf("shmobj_ref(%p)\n", memobj);
memobj_lock(&obj->memobj); memobj_lock(&obj->memobj);
newref = ++obj->ds.shm_nattch; newref = ++obj->ds.shm_nattch;
if (obj->index >= 0) { if (obj->index >= 0) {
obj->ds.shm_atime = time(); obj->ds.shm_atime = time();
obj->ds.shm_lpid = getpid(); obj->ds.shm_lpid = proc->pid;
} }
memobj_unlock(&obj->memobj); memobj_unlock(&obj->memobj);
dkprintf("shmobj_ref(%p): newref %ld\n", memobj, newref); dkprintf("shmobj_ref(%p): newref %ld\n", memobj, newref);
@ -307,9 +372,9 @@ static int shmobj_get_page(struct memobj *memobj, off_t off, int p2align,
memobj, off, p2align, physp, error); memobj, off, p2align, physp, error);
goto out; goto out;
} }
if (p2align != PAGE_P2ALIGN) { /* XXX:NYI:large pages */ if (p2align != (obj->pgshift - PAGE_SHIFT)) {
error = -ENOMEM; error = -ENOMEM;
ekprintf("shmobj_get_page(%p,%#lx,%d,%p):large page. %d\n", ekprintf("shmobj_get_page(%p,%#lx,%d,%p):pgsize mismatch. %d\n",
memobj, off, p2align, physp, error); memobj, off, p2align, physp, error);
goto out; goto out;
} }
@ -329,7 +394,8 @@ static int shmobj_get_page(struct memobj *memobj, off_t off, int p2align,
page = page_list_lookup(obj, off); page = page_list_lookup(obj, off);
if (!page) { if (!page) {
npages = 1 << p2align; npages = 1 << p2align;
virt = ihk_mc_alloc_pages(npages, IHK_MC_AP_NOWAIT); virt = ihk_mc_alloc_aligned_pages(npages, p2align,
IHK_MC_AP_NOWAIT);
if (!virt) { if (!virt) {
error = -ENOMEM; error = -ENOMEM;
ekprintf("shmobj_get_page(%p,%#lx,%d,%p):" ekprintf("shmobj_get_page(%p,%#lx,%d,%p):"
@ -398,3 +464,60 @@ out:
dkprintf("shmobj_invalidate_page(%p,%#lx,%#lx):%d\n", memobj, phys, pgsize, error); dkprintf("shmobj_invalidate_page(%p,%#lx,%#lx):%d\n", memobj, phys, pgsize, error);
return error; return error;
} }
/*
 * Look up the page that already backs offset 'off' of a shm object,
 * without allocating anything.
 *
 * The checks are made with the memobj lock held, in this order:
 *   -EINVAL  'off' has bits set outside PAGE_MASK
 *   -ENOMEM  'p2align' differs from (obj->pgshift - PAGE_SHIFT)
 *   -ERANGE  'off' is at or beyond obj->real_segsz
 *   -ENOSPC  fewer than (PAGE_SIZE << p2align) bytes remain after 'off'
 *   -ENOENT  no page is registered in the object's page list for 'off'
 *
 * On success 0 is returned and, if 'physp' is non-NULL, the physical
 * address of the page is stored there.  'pflag' is not used.
 */
static int shmobj_lookup_page(struct memobj *memobj, off_t off, int p2align,
		uintptr_t *physp, unsigned long *pflag)
{
	struct shmobj *obj = to_shmobj(memobj);
	struct page *page = NULL;
	uintptr_t phys = NOPHYS;
	int error;

	dkprintf("shmobj_lookup_page(%p,%#lx,%d,%p)\n",
			memobj, off, p2align, physp);

	memobj_lock(&obj->memobj);

	if (off & ~PAGE_MASK) {
		error = -EINVAL;
		ekprintf("shmobj_lookup_page(%p,%#lx,%d,%p):invalid argument. %d\n",
				memobj, off, p2align, physp, error);
	}
	else if (p2align != (obj->pgshift - PAGE_SHIFT)) {
		error = -ENOMEM;
		ekprintf("shmobj_lookup_page(%p,%#lx,%d,%p):pgsize mismatch. %d\n",
				memobj, off, p2align, physp, error);
	}
	else if (obj->real_segsz <= off) {
		error = -ERANGE;
		ekprintf("shmobj_lookup_page(%p,%#lx,%d,%p):beyond the end. %d\n",
				memobj, off, p2align, physp, error);
	}
	else if ((obj->real_segsz - off) < (PAGE_SIZE << p2align)) {
		error = -ENOSPC;
		ekprintf("shmobj_lookup_page(%p,%#lx,%d,%p):too large. %d\n",
				memobj, off, p2align, physp, error);
	}
	else if (!(page = page_list_lookup(obj, off))) {
		/* Absence of a page is an expected outcome: debug log only. */
		error = -ENOENT;
		dkprintf("shmobj_lookup_page(%p,%#lx,%d,%p):page not found. %d\n",
				memobj, off, p2align, physp, error);
	}
	else {
		phys = page_to_phys(page);
		error = 0;
		if (physp) {
			*physp = phys;
		}
	}

	memobj_unlock(&obj->memobj);
	dkprintf("shmobj_lookup_page(%p,%#lx,%d,%p):%d %#lx\n",
			memobj, off, p2align, physp, error, phys);
	return error;
} /* shmobj_lookup_page() */

File diff suppressed because it is too large Load Diff

657
kernel/sysfs.c Normal file
View File

@ -0,0 +1,657 @@
/**
* \file sysfs.c
* License details are found in the file LICENSE.
* \brief
* sysfs framework, IHK-Slave side
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com> \par
* Copyright (C) 2015 RIKEN AICS
*/
/*
* HISTORY:
*/
#include <ihk/mm.h>
#include <ihk/types.h>
#include <ikc/queue.h>
#include <cls.h>
#include <kmsg.h>
#include <kmalloc.h>
#include <page.h>
#include <string.h>
#include <stdarg.h>
#include <arch/cpu.h>
#include <sysfs.h>
#include <sysfs_msg.h>
#include <vsprintf.h>
#define dkprintf(...) do { if (0) kprintf(__VA_ARGS__); } while (0)
#define ekprintf(...) do { if (1) kprintf(__VA_ARGS__); } while (0)
static size_t sysfs_data_bufsize;
static void *sysfs_data_buf;
/*
 * Prepare a create request whose ops is one of the SYSFS_SNOOPING_OPS_*
 * values: param->client_instance is replaced by a physical address.
 *
 *  - d32/d64/u32/u64/u32K: the physical address of the instance itself.
 *  - s: *pbp is filled with the bit length of the string (including the
 *    terminating NUL) and the physical address of the string, and the
 *    physical address of *pbp is passed.
 *  - pbl/pb: the instance is a struct sysfs_bitmap_param; it is copied to
 *    *pbp, its ptr is converted to a physical address, and the physical
 *    address of *pbp is passed.
 *
 * Returns 0 on success, -EINVAL for any other ops value.
 */
static int setup_special_create(struct sysfs_req_create_param *param, struct sysfs_bitmap_param *pbp)
{
	void *cinstance = (void *)param->client_instance;

	switch (param->client_ops) {
	case (long)SYSFS_SNOOPING_OPS_d32:
	case (long)SYSFS_SNOOPING_OPS_d64:
	case (long)SYSFS_SNOOPING_OPS_u32:
	case (long)SYSFS_SNOOPING_OPS_u64:
	case (long)SYSFS_SNOOPING_OPS_u32K:
		param->client_instance = virt_to_phys(cinstance);
		break;

	case (long)SYSFS_SNOOPING_OPS_s:
		pbp->nbits = 8 * (strlen(cinstance) + 1);
		pbp->ptr = (void *)virt_to_phys(cinstance);
		param->client_instance = virt_to_phys(pbp);
		break;

	case (long)SYSFS_SNOOPING_OPS_pbl:
	case (long)SYSFS_SNOOPING_OPS_pb:
		*pbp = *(struct sysfs_bitmap_param *)cinstance;
		pbp->ptr = (void *)virt_to_phys(pbp->ptr);
		param->client_instance = virt_to_phys(pbp);
		break;

	default:
		ekprintf("setup_special_create:unknown ops %#lx\n", param->client_ops);
		return -EINVAL;
	}

	return 0;
} /* setup_special_create() */
/*
 * Send a SCD_MSG_SYSFS_REQ_CREATE request for the file whose absolute
 * path is built from 'fmt' and the following arguments, then wait for
 * the request to be completed.
 *
 * ops, instance: stored in the request as client_ops/client_instance.
 *                When is_special_sysfs_ops(ops) is true they are first
 *                converted by setup_special_create().
 * mode:          stored in the request as the file mode.
 *
 * Returns 0 on success or a negative error:
 *   -ENOMEM        the request buffer could not be allocated
 *   -ENAMETOOLONG  the formatted path does not fit in param->path
 *   -ENOENT        the formatted path does not start with '/'
 *   otherwise      the error from setup_special_create(), ihk_ikc_send()
 *                  or the one reported back in param->error
 */
int
sysfs_createf(struct sysfs_ops *ops, void *instance, int mode,
		const char *fmt, ...)
{
	int error;
	va_list ap;
	ssize_t n;
	struct sysfs_req_create_param *param = NULL;
	struct ikc_scd_packet packet;
	/*
	 * The physical address of asbp may be placed in the request by
	 * setup_special_create(); it stays valid because this function does
	 * not return before the request has completed.
	 */
	struct sysfs_bitmap_param asbp;

	dkprintf("sysfs_createf(%p,%p,%#o,%s,...)\n",
			ops, instance, mode, fmt);

	param = allocate_pages(1, IHK_MC_AP_NOWAIT);
	if (!param) {
		error = -ENOMEM;
		ekprintf("sysfs_createf:allocate_pages failed. %d\n", error);
		goto out;
	}

	param->client_ops = (long)ops;
	param->client_instance = (long)instance;
	param->mode = mode;
	param->busy = 1;

	va_start(ap, fmt);
	n = vsnprintf(param->path, sizeof(param->path), fmt, ap);
	va_end(ap);
	/* A negative return is rejected explicitly, not via unsigned wrap. */
	if (n < 0 || (size_t)n >= sizeof(param->path)) {
		error = -ENAMETOOLONG;
		ekprintf("sysfs_createf:vsnprintf failed. %d\n", error);
		goto out;
	}
	dkprintf("sysfs_createf:path %s\n", param->path);
	if (param->path[0] != '/') {
		error = -ENOENT;
		ekprintf("sysfs_createf:not an absolute path. %d\n", error);
		goto out;
	}

	if (is_special_sysfs_ops(ops)) {
		error = setup_special_create(param, &asbp);
		if (error) {
			ekprintf("sysfs_createf:setup_special_create failed. %d\n", error);
			goto out;
		}
	}

	/* Do not send uninitialized stack bytes in the unused fields. */
	memset(&packet, 0, sizeof(packet));
	packet.msg = SCD_MSG_SYSFS_REQ_CREATE;
	packet.sysfs_arg1 = virt_to_phys(param);

	error = ihk_ikc_send(cpu_local_var(syscall_channel), &packet, 0);
	if (error) {
		ekprintf("sysfs_createf:ihk_ikc_send failed. %d\n", error);
		goto out;
	}

	/* Spin until the receiver of the request clears the busy flag. */
	while (param->busy) {
		cpu_pause();
	}
	rmb();

	error = param->error;
	if (error) {
		ekprintf("sysfs_createf:SCD_MSG_SYSFS_REQ_CREATE failed. %d\n",
				error);
		goto out;
	}

	error = 0;
out:
	if (param) {
		free_pages(param, 1);
	}
	if (error) {
		ekprintf("sysfs_createf(%p,%p,%#o,%s,...): %d\n",
				ops, instance, mode, fmt, error);
	}
	dkprintf("sysfs_createf(%p,%p,%#o,%s,...): %d\n",
			ops, instance, mode, fmt, error);
	return error;
} /* sysfs_createf() */
/*
 * Send a SCD_MSG_SYSFS_REQ_MKDIR request for the directory whose
 * absolute path is built from 'fmt' and the following arguments, then
 * wait for the request to be completed.
 *
 * dirhp: when non-NULL, receives the handle reported back in
 *        param->handle on success.
 *
 * Returns 0 on success or a negative error:
 *   -ENOMEM        the request buffer could not be allocated
 *   -ENAMETOOLONG  the formatted path does not fit in param->path
 *   -ENOENT        the formatted path does not start with '/'
 *   otherwise      the error from ihk_ikc_send() or the one reported
 *                  back in param->error
 */
int
sysfs_mkdirf(sysfs_handle_t *dirhp, const char *fmt, ...)
{
	int error;
	struct sysfs_req_mkdir_param *param = NULL;
	struct ikc_scd_packet packet;
	va_list ap;
	int n;

	dkprintf("sysfs_mkdirf(%p,%s,...)\n", dirhp, fmt);

	param = allocate_pages(1, IHK_MC_AP_NOWAIT);
	if (!param) {
		error = -ENOMEM;
		ekprintf("sysfs_mkdirf:allocate_pages failed. %d\n", error);
		goto out;
	}

	param->busy = 1;

	va_start(ap, fmt);
	n = vsnprintf(param->path, sizeof(param->path), fmt, ap);
	va_end(ap);
	/* A negative return is rejected explicitly, not via unsigned wrap. */
	if (n < 0 || (size_t)n >= sizeof(param->path)) {
		error = -ENAMETOOLONG;
		ekprintf("sysfs_mkdirf:vsnprintf failed. %d\n", error);
		goto out;
	}
	dkprintf("sysfs_mkdirf:path %s\n", param->path);
	if (param->path[0] != '/') {
		error = -ENOENT;
		ekprintf("sysfs_mkdirf:not an absolute path. %d\n", error);
		goto out;
	}

	/* Do not send uninitialized stack bytes in the unused fields. */
	memset(&packet, 0, sizeof(packet));
	packet.msg = SCD_MSG_SYSFS_REQ_MKDIR;
	packet.sysfs_arg1 = virt_to_phys(param);

	error = ihk_ikc_send(cpu_local_var(syscall_channel), &packet, 0);
	if (error) {
		ekprintf("sysfs_mkdirf:ihk_ikc_send failed. %d\n", error);
		goto out;
	}

	/* Spin until the receiver of the request clears the busy flag. */
	while (param->busy) {
		cpu_pause();
	}
	rmb();

	error = param->error;
	if (error) {
		ekprintf("sysfs_mkdirf:SCD_MSG_SYSFS_REQ_MKDIR failed. %d\n",
				error);
		goto out;
	}

	error = 0;
	if (dirhp) {
		dirhp->handle = param->handle;
	}

out:
	if (param) {
		free_pages(param, 1);
	}
	if (error) {
		ekprintf("sysfs_mkdirf(%p,%s,...): %d\n", dirhp, fmt, error);
	}
	dkprintf("sysfs_mkdirf(%p,%s,...): %d %#lx\n", dirhp, fmt, error,
			(dirhp)?dirhp->handle:0);
	return error;
} /* sysfs_mkdirf() */
/*
 * Send a SCD_MSG_SYSFS_REQ_SYMLINK request, then wait for the request
 * to be completed.  The link's absolute path is built from 'fmt' and
 * the following arguments; targeth.handle is stored in the request as
 * the target.
 *
 * Returns 0 on success or a negative error:
 *   -ENOMEM        the request buffer could not be allocated
 *   -ENAMETOOLONG  the formatted path does not fit in param->path
 *   -ENOENT        the formatted path does not start with '/'
 *   otherwise      the error from ihk_ikc_send() or the one reported
 *                  back in param->error
 */
int
sysfs_symlinkf(sysfs_handle_t targeth, const char *fmt, ...)
{
	int error;
	struct sysfs_req_symlink_param *param = NULL;
	struct ikc_scd_packet packet;
	va_list ap;
	int n;

	dkprintf("sysfs_symlinkf(%#lx,%s,...)\n", targeth.handle, fmt);

	param = allocate_pages(1, IHK_MC_AP_NOWAIT);
	if (!param) {
		error = -ENOMEM;
		ekprintf("sysfs_symlinkf:allocate_pages failed. %d\n", error);
		goto out;
	}

	param->target = targeth.handle;
	param->busy = 1;

	va_start(ap, fmt);
	n = vsnprintf(param->path, sizeof(param->path), fmt, ap);
	va_end(ap);
	/* A negative return is rejected explicitly, not via unsigned wrap. */
	if (n < 0 || (size_t)n >= sizeof(param->path)) {
		error = -ENAMETOOLONG;
		ekprintf("sysfs_symlinkf:vsnprintf failed. %d\n", error);
		goto out;
	}
	dkprintf("sysfs_symlinkf:path %s\n", param->path);
	if (param->path[0] != '/') {
		error = -ENOENT;
		ekprintf("sysfs_symlinkf:not an absolute path. %d\n", error);
		goto out;
	}

	/* Do not send uninitialized stack bytes in the unused fields. */
	memset(&packet, 0, sizeof(packet));
	packet.msg = SCD_MSG_SYSFS_REQ_SYMLINK;
	packet.sysfs_arg1 = virt_to_phys(param);

	error = ihk_ikc_send(cpu_local_var(syscall_channel), &packet, 0);
	if (error) {
		ekprintf("sysfs_symlinkf:ihk_ikc_send failed. %d\n", error);
		goto out;
	}

	/* Spin until the receiver of the request clears the busy flag. */
	while (param->busy) {
		cpu_pause();
	}
	rmb();

	error = param->error;
	if (error) {
		ekprintf("sysfs_symlinkf:"
				"SCD_MSG_SYSFS_REQ_SYMLINK failed. %d\n",
				error);
		goto out;
	}

	error = 0;
out:
	if (param) {
		free_pages(param, 1);
	}
	if (error) {
		ekprintf("sysfs_symlinkf(%#lx,%s,...): %d\n",
				targeth.handle, fmt, error);
	}
	dkprintf("sysfs_symlinkf(%#lx,%s,...): %d\n",
			targeth.handle, fmt, error);
	return error;
} /* sysfs_symlinkf() */
/*
 * Send a SCD_MSG_SYSFS_REQ_LOOKUP request for the absolute path built
 * from 'fmt' and the following arguments, then wait for the request to
 * be completed.
 *
 * objhp: when non-NULL, receives the handle reported back in
 *        param->handle on success.
 *
 * Returns 0 on success or a negative error:
 *   -ENOMEM        the request buffer could not be allocated
 *   -ENAMETOOLONG  the formatted path does not fit in param->path
 *   -ENOENT        the formatted path does not start with '/'
 *   otherwise      the error from ihk_ikc_send() or the one reported
 *                  back in param->error
 */
int
sysfs_lookupf(sysfs_handle_t *objhp, const char *fmt, ...)
{
	int error;
	struct sysfs_req_lookup_param *param = NULL;
	struct ikc_scd_packet packet;
	va_list ap;
	int n;

	dkprintf("sysfs_lookupf(%p,%s,...)\n", objhp, fmt);

	param = allocate_pages(1, IHK_MC_AP_NOWAIT);
	if (!param) {
		error = -ENOMEM;
		ekprintf("sysfs_lookupf:allocate_pages failed. %d\n", error);
		goto out;
	}

	param->busy = 1;

	va_start(ap, fmt);
	n = vsnprintf(param->path, sizeof(param->path), fmt, ap);
	va_end(ap);
	/* A negative return is rejected explicitly, not via unsigned wrap. */
	if (n < 0 || (size_t)n >= sizeof(param->path)) {
		error = -ENAMETOOLONG;
		ekprintf("sysfs_lookupf:vsnprintf failed. %d\n", error);
		goto out;
	}
	dkprintf("sysfs_lookupf:path %s\n", param->path);
	if (param->path[0] != '/') {
		error = -ENOENT;
		ekprintf("sysfs_lookupf:not an absolute path. %d\n", error);
		goto out;
	}

	/* Do not send uninitialized stack bytes in the unused fields. */
	memset(&packet, 0, sizeof(packet));
	packet.msg = SCD_MSG_SYSFS_REQ_LOOKUP;
	packet.sysfs_arg1 = virt_to_phys(param);

	error = ihk_ikc_send(cpu_local_var(syscall_channel), &packet, 0);
	if (error) {
		ekprintf("sysfs_lookupf:ihk_ikc_send failed. %d\n", error);
		goto out;
	}

	/* Spin until the receiver of the request clears the busy flag. */
	while (param->busy) {
		cpu_pause();
	}
	rmb();

	error = param->error;
	if (error) {
		ekprintf("sysfs_lookupf:SCD_MSG_SYSFS_REQ_LOOKUP failed. %d\n",
				error);
		goto out;
	}

	error = 0;
	if (objhp) {
		objhp->handle = param->handle;
	}

out:
	if (param) {
		free_pages(param, 1);
	}
	if (error) {
		ekprintf("sysfs_lookupf(%p,%s,...): %d\n", objhp, fmt, error);
	}
	dkprintf("sysfs_lookupf(%p,%s,...): %d %#lx\n", objhp, fmt, error,
			(objhp)?objhp->handle:0);
	return error;
} /* sysfs_lookupf() */
/*
 * Send a SCD_MSG_SYSFS_REQ_UNLINK request for the absolute path built
 * from 'fmt' and the following arguments, then wait for the request to
 * be completed.  'flags' is stored in the request unchanged.
 *
 * Returns 0 on success or a negative error:
 *   -ENOMEM        the request buffer could not be allocated
 *   -ENAMETOOLONG  the formatted path does not fit in param->path
 *   -ENOENT        the formatted path does not start with '/'
 *   otherwise      the error from ihk_ikc_send() or the one reported
 *                  back in param->error
 */
int
sysfs_unlinkf(int flags, const char *fmt, ...)
{
	int error;
	struct sysfs_req_unlink_param *param = NULL;
	struct ikc_scd_packet packet;
	va_list ap;
	int n;

	dkprintf("sysfs_unlinkf(%#x,%s,...)\n", flags, fmt);

	param = allocate_pages(1, IHK_MC_AP_NOWAIT);
	if (!param) {
		error = -ENOMEM;
		ekprintf("sysfs_unlinkf:allocate_pages failed. %d\n", error);
		goto out;
	}

	param->flags = flags;
	param->busy = 1;

	va_start(ap, fmt);
	n = vsnprintf(param->path, sizeof(param->path), fmt, ap);
	va_end(ap);
	/* A negative return is rejected explicitly, not via unsigned wrap. */
	if (n < 0 || (size_t)n >= sizeof(param->path)) {
		error = -ENAMETOOLONG;
		ekprintf("sysfs_unlinkf:vsnprintf failed. %d\n", error);
		goto out;
	}
	dkprintf("sysfs_unlinkf:path %s\n", param->path);
	if (param->path[0] != '/') {
		error = -ENOENT;
		ekprintf("sysfs_unlinkf:not an absolute path. %d\n", error);
		goto out;
	}

	/* Do not send uninitialized stack bytes in the unused fields. */
	memset(&packet, 0, sizeof(packet));
	packet.msg = SCD_MSG_SYSFS_REQ_UNLINK;
	packet.sysfs_arg1 = virt_to_phys(param);

	error = ihk_ikc_send(cpu_local_var(syscall_channel), &packet, 0);
	if (error) {
		ekprintf("sysfs_unlinkf:ihk_ikc_send failed. %d\n", error);
		goto out;
	}

	/* Spin until the receiver of the request clears the busy flag. */
	while (param->busy) {
		cpu_pause();
	}
	rmb();

	error = param->error;
	if (error) {
		ekprintf("sysfs_unlinkf:SCD_MSG_SYSFS_REQ_UNLINK failed. %d\n",
				error);
		goto out;
	}

	error = 0;
out:
	if (param) {
		free_pages(param, 1);
	}
	if (error) {
		ekprintf("sysfs_unlinkf(%#x,%s,...): %d\n", flags, fmt, error);
	}
	dkprintf("sysfs_unlinkf(%#x,%s,...): %d\n", flags, fmt, error);
	return error;
} /* sysfs_unlinkf() */
/*
 * Handle a "show" request for node 'nodeh': call ops->show() with the
 * shared sysfs data buffer and send the result back in a
 * SCD_MSG_SYSFS_RESP_SHOW packet.
 *
 * The reply carries nodeh in sysfs_arg1 and the value returned by
 * ->show() in sysfs_arg2.  packet.err is that value when it is negative
 * and 0 otherwise.  When ops has no ->show, -EIO is reported.
 */
static void
sysfss_req_show(long nodeh, struct sysfs_ops *ops, void *instance)
{
	int error;
	ssize_t ssize;
	struct ikc_scd_packet packet;

	dkprintf("sysfss_req_show(%#lx,%p,%p)\n", nodeh, ops, instance);

	ssize = -EIO;
	if (ops->show) {
		ssize = (*ops->show)(ops, instance, sysfs_data_buf,
				sysfs_data_bufsize);
		if (ssize < 0) {
			ekprintf("sysfss_req_show:->show failed. %ld\n",
					ssize);
			/* through */
		}
	}

	error = 0;
	if (ssize < 0) {
		error = ssize;
	}

	/* Do not send uninitialized stack bytes in the unused fields. */
	memset(&packet, 0, sizeof(packet));
	packet.msg = SCD_MSG_SYSFS_RESP_SHOW;
	packet.err = error;
	packet.sysfs_arg1 = nodeh;
	packet.sysfs_arg2 = ssize;

	/* From here on 'error' holds the send status, not the ->show result. */
	error = ihk_ikc_send(cpu_local_var(syscall_channel), &packet, 0);
	if (error) {
		ekprintf("sysfss_req_show:ihk_ikc_send failed. %d\n", error);
		/* through */
	}

	if (error || packet.err) {
		ekprintf("sysfss_req_show(%#lx,%p,%p): %d %d\n",
				nodeh, ops, instance, error, packet.err);
	}
	dkprintf("sysfss_req_show(%#lx,%p,%p): %d %d %ld\n",
			nodeh, ops, instance, error, packet.err, ssize);
	return;
} /* sysfss_req_show() */
/*
 * Serve a SCD_MSG_SYSFS_REQ_STORE request.
 *
 * Calls ops->store (when set) on the first `size` bytes of sysfs_data_buf,
 * then replies on the syscall channel with SCD_MSG_SYSFS_RESP_STORE.
 *
 * nodeh:    handle of the node, echoed back in sysfs_arg1 of the reply
 * ops:      operation table of the node; ->store may be NULL
 * instance: opaque pointer handed through to ->store
 * size:     number of bytes in sysfs_data_buf to pass to ->store
 *
 * The reply carries the result of ->store in sysfs_arg2 and, when that
 * result is negative, the same value in err.  -EIO is reported when
 * no ->store handler is installed.
 *
 * NOTE(review): `size` is not checked against sysfs_data_bufsize here;
 * presumably the sender guarantees it -- confirm.
 */
static void
sysfss_req_store(long nodeh, struct sysfs_ops *ops, void *instance,
		size_t size)
{
	int error;
	ssize_t ssize;
	struct ikc_scd_packet packet;

	/*
	 * `size` is a size_t: it must not be consumed by a "%d" conversion,
	 * so it is cast to long and printed with "%ld" in every message.
	 */
	dkprintf("sysfss_req_store(%#lx,%p,%p,%ld)\n",
			nodeh, ops, instance, (long)size);

	ssize = -EIO;	/* result when there is no ->store handler */
	if (ops->store) {
		ssize = (*ops->store)(ops, instance, sysfs_data_buf, size);
		if (ssize < 0) {
			ekprintf("sysfss_req_store:->store failed. %ld\n",
					ssize);
			/* keep going: the failure is reported in the reply */
		}
	}

	/* A negative result doubles as the error code of the reply. */
	error = 0;
	if (ssize < 0) {
		error = ssize;
	}

	packet.msg = SCD_MSG_SYSFS_RESP_STORE;
	packet.err = error;
	packet.sysfs_arg1 = nodeh;
	packet.sysfs_arg2 = ssize;

	error = ihk_ikc_send(cpu_local_var(syscall_channel), &packet, 0);
	if (error) {
		ekprintf("sysfss_req_store:ihk_ikc_send failed. %d\n", error);
		/* nothing more can be done than logging */
	}

	if (error || packet.err) {
		ekprintf("sysfss_req_store(%#lx,%p,%p,%ld): %d %d\n",
				nodeh, ops, instance, (long)size,
				error, packet.err);
	}

	dkprintf("sysfss_req_store(%#lx,%p,%p,%ld): %d %d %ld\n",
			nodeh, ops, instance, (long)size,
			error, packet.err, ssize);
	return;
} /* sysfss_req_store() */
/*
 * Serve a SCD_MSG_SYSFS_REQ_RELEASE request.
 *
 * Calls ops->release (when set) and then replies on the syscall channel
 * with SCD_MSG_SYSFS_RESP_RELEASE.  The reply always carries err == 0.
 *
 * nodeh:    handle of the node, echoed back in sysfs_arg1 of the reply
 * ops:      operation table of the node; ->release may be NULL
 * instance: opaque pointer handed through to ->release
 */
static void
sysfss_req_release(long nodeh, struct sysfs_ops *ops, void *instance)
{
	struct ikc_scd_packet packet;
	int error;

	dkprintf("sysfss_req_release(%#lx,%p,%p)\n", nodeh, ops, instance);

	if (ops->release) {
		(*ops->release)(ops, instance);
	}

	packet.sysfs_arg1 = nodeh;
	packet.err = 0;
	packet.msg = SCD_MSG_SYSFS_RESP_RELEASE;

	error = ihk_ikc_send(cpu_local_var(syscall_channel), &packet, 0);
	if (error) {
		ekprintf("sysfss_req_release:ihk_ikc_send failed. %d\n",
				error);
		/* nothing more can be done than logging */
	}

	if (error || packet.err) {
		ekprintf("sysfss_req_release(%#lx,%p,%p): %d %d\n",
				nodeh, ops, instance, error, packet.err);
	}
	dkprintf("sysfss_req_release(%#lx,%p,%p): %d %d\n",
			nodeh, ops, instance, error, packet.err);
} /* sysfss_req_release() */
void
sysfss_packet_handler(struct ihk_ikc_channel_desc *ch, int msg, int error,
long arg1, long arg2, long arg3)
{
switch (msg) {
case SCD_MSG_SYSFS_REQ_SHOW:
sysfss_req_show(arg1, (void *)arg2, (void *)arg3);
break;
case SCD_MSG_SYSFS_REQ_STORE:
sysfss_req_store(arg1, (void *)arg2, (void *)arg3, error);
break;
case SCD_MSG_SYSFS_REQ_RELEASE:
sysfss_req_release(arg1, (void *)arg2, (void *)arg3);
break;
default:
kprintf("sysfss_packet_handler:unknown message. msg %d"
" error %d arg1 %#lx arg2 %#lx arg3 %#lx\n",
msg, error, arg1, arg2, arg3);
break;
}
return;
} /* sysfss_packet_handler() */
/*
 * Set up the sysfs data buffer and announce it with a
 * SCD_MSG_SYSFS_REQ_SETUP request.
 *
 * Steps:
 *  1. panic if any sysfs request parameter structure is larger than a page
 *     (each one is carried in a single page);
 *  2. allocate one page as sysfs_data_buf and record its size;
 *  3. allocate a temporary parameter page, fill in the physical address
 *     and size of the data buffer, and send its physical address over the
 *     syscall channel;
 *  4. spin until param->busy becomes zero (presumably cleared by the
 *     receiver of the request -- confirm), then read param->error.
 *
 * The parameter page is always freed.  Any failure ends in panic().
 */
void
sysfs_init(void)
{
	struct ikc_scd_packet packet;
	struct sysfs_req_setup_param *param = NULL;
	int error;

	dkprintf("sysfs_init()\n");

	if ((sizeof(struct sysfs_req_create_param) > PAGE_SIZE)
			|| (sizeof(struct sysfs_req_mkdir_param) > PAGE_SIZE)
			|| (sizeof(struct sysfs_req_symlink_param) > PAGE_SIZE)
			|| (sizeof(struct sysfs_req_lookup_param) > PAGE_SIZE)
			|| (sizeof(struct sysfs_req_unlink_param) > PAGE_SIZE)
			|| (sizeof(struct sysfs_req_setup_param) > PAGE_SIZE)) {
		panic("struct sysfs_*_req_param too large");
	}

	/* the data buffer used by the show/store handlers */
	sysfs_data_bufsize = PAGE_SIZE;
	sysfs_data_buf = allocate_pages(1, IHK_MC_AP_NOWAIT);
	if (sysfs_data_buf == NULL) {
		error = -ENOMEM;
		ekprintf("sysfs_init:allocate_pages(buf) failed. %d\n", error);
		goto out;
	}

	/* temporary page describing the buffer */
	param = allocate_pages(1, IHK_MC_AP_NOWAIT);
	if (param == NULL) {
		error = -ENOMEM;
		ekprintf("sysfs_init:allocate_pages(param) failed. %d\n",
				error);
		goto out;
	}

	param->busy = 1;
	param->buf_rpa = virt_to_phys(sysfs_data_buf);
	param->bufsize = PAGE_SIZE;

	packet.msg = SCD_MSG_SYSFS_REQ_SETUP;
	packet.sysfs_arg1 = virt_to_phys(param);

	error = ihk_ikc_send(cpu_local_var(syscall_channel), &packet, 0);
	if (error) {
		ekprintf("sysfs_init:ihk_ikc_send failed. %d\n", error);
		goto out;
	}

	/* wait for the request to be completed */
	for (;;) {
		if (!param->busy) {
			break;
		}
		cpu_pause();
	}
	/* read barrier between observing busy == 0 and reading the result */
	rmb();

	error = param->error;
	if (error) {
		ekprintf("sysfs_init:SCD_MSG_SYSFS_REQ_SETUP failed. %d\n",
				error);
		goto out;
	}
	/* error is 0 here */

out:
	if (param != NULL) {
		free_pages(param, 1);
	}
	if (error) {
		ekprintf("sysfs_init(): %d\n", error);
		panic("sysfs_init");
	}
	dkprintf("sysfs_init():\n");
} /* sysfs_init() */
/**** End of File ****/

View File

@ -182,7 +182,7 @@ static int zeroobj_get_page(struct memobj *memobj, off_t off, int p2align,
} }
if (p2align != PAGE_P2ALIGN) { /* XXX:NYI:large pages */ if (p2align != PAGE_P2ALIGN) { /* XXX:NYI:large pages */
error = -ENOMEM; error = -ENOMEM;
ekprintf("zeroobj_get_page(%p,%#lx,%d,%p):large page. %d\n", dkprintf("zeroobj_get_page(%p,%#lx,%d,%p):large page. %d\n",
memobj, off, p2align, physp, error); memobj, off, p2align, physp, error);
goto out; goto out;
} }

View File

@ -1,109 +1,12 @@
/** /* bitops.c COPYRIGHT FUJITSU LIMITED 2014 */
* \file bitops.h #include <bitops.h>
* License details are found in the file LICENSE.
* \brief
* Find last set bit in word.
* \author Taku Shimosawa <shimosawa@is.s.u-tokyo.ac.jp> \par
* Copyright (C) 2011 - 2012 Taku Shimosawa
*/
/*
* HISTORY
*/
#ifndef HEADER_X86_COMMON_BITOPS_H
#define HEADER_X86_COMMON_BITOPS_H
static inline int fls(int x)
{
int r;
asm("bsrl %1,%0\n\t"
"jnz 1f\n\t"
"movl $-1,%0\n"
"1:" : "=r" (r) : "rm" (x));
return r + 1;
}
/**
* ffs - find first set bit in word
* @x: the word to search
*
* This is defined the same way as the libc and compiler builtin ffs
* routines, therefore differs in spirit from the other bitops.
*
* ffs(value) returns 0 if value is 0 or the position of the first
* set bit if value is nonzero. The first (least significant) bit
* is at position 1.
*/
static inline int ffs(int x)
{
int r;
asm("bsfl %1,%0\n\t"
"jnz 1f\n\t"
"movl $-1,%0\n"
"1:" : "=r" (r) : "rm" (x));
return r + 1;
}
/**
* __ffs - find first set bit in word
* @word: The word to search
*
* Undefined if no bit exists, so code should check against 0 first.
*/
static inline unsigned long __ffs(unsigned long word)
{
asm("bsf %1,%0"
: "=r" (word)
: "rm" (word));
return word;
}
/**
* ffz - find first zero bit in word
* @word: The word to search
*
* Undefined if no zero exists, so code should check against ~0UL first.
*/
static inline unsigned long ffz(unsigned long word)
{
asm("bsf %1,%0"
: "=r" (word)
: "r" (~word));
return word;
}
#define ADDR (*(volatile long *)addr)
static inline void set_bit(int nr, volatile unsigned long *addr)
{
asm volatile("lock; btsl %1,%0"
: "+m" (ADDR)
: "Ir" (nr)
: "memory");
}
static inline void clear_bit(int nr, volatile unsigned long *addr)
{
asm volatile("lock; btrl %1,%0"
: "+m" (ADDR)
: "Ir" (nr)
: "memory");
}
#define for_each_set_bit(bit, addr, size) \
for ((bit) = find_first_bit((addr), (size)); \
(bit) < (size); \
(bit) = find_next_bit((addr), (size), (bit) + 1))
#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) #define BITOP_WORD(nr) ((nr) / BITS_PER_LONG)
/* /*
* Find the next set bit in a memory region. * Find the next set bit in a memory region.
*/ */
static unsigned long find_next_bit(const unsigned long *addr, unsigned long size, unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
unsigned long offset) unsigned long offset)
{ {
const unsigned long *p = addr + BITOP_WORD(offset); const unsigned long *p = addr + BITOP_WORD(offset);
@ -146,7 +49,7 @@ found_middle:
* This implementation of find_{first,next}_zero_bit was stolen from * This implementation of find_{first,next}_zero_bit was stolen from
* Linus' asm-alpha/bitops.h. * Linus' asm-alpha/bitops.h.
*/ */
static unsigned long find_next_zero_bit(const unsigned long *addr, unsigned long find_next_zero_bit(const unsigned long *addr,
unsigned long size, unsigned long offset) unsigned long size, unsigned long offset)
{ {
const unsigned long *p = addr + BITOP_WORD(offset); const unsigned long *p = addr + BITOP_WORD(offset);
@ -188,7 +91,7 @@ found_middle:
/* /*
* Find the first set bit in a memory region. * Find the first set bit in a memory region.
*/ */
static unsigned long find_first_bit(const unsigned long *addr, unsigned long find_first_bit(const unsigned long *addr,
unsigned long size) unsigned long size)
{ {
const unsigned long *p = addr; const unsigned long *p = addr;
@ -214,7 +117,7 @@ found:
/* /*
* Find the first cleared bit in a memory region. * Find the first cleared bit in a memory region.
*/ */
static unsigned long find_first_zero_bit(const unsigned long *addr, unsigned long find_first_zero_bit(const unsigned long *addr,
unsigned long size) unsigned long size)
{ {
const unsigned long *p = addr; const unsigned long *p = addr;
@ -237,4 +140,3 @@ found:
return result + ffz(tmp); return result + ffz(tmp);
} }
#endif

View File

@ -0,0 +1,37 @@
/* bitops-__ffs.h COPYRIGHT FUJITSU LIMITED 2014 */
#ifndef INCLUDE_BITOPS___FFS_H
#define INCLUDE_BITOPS___FFS_H
/*
 * __ffs - index of the least significant set bit of @word (bit 0 is the
 * least significant bit).
 *
 * Portable C implementation: a binary search that repeatedly tests whether
 * the low half of the remaining window is all zero and, if so, skips it.
 *
 * The result is not meaningful for word == 0 (every test succeeds and the
 * accumulated sum of all step widths is returned), so callers should check
 * against 0 first.
 */
static inline unsigned long __ffs(unsigned long word)
{
	unsigned long pos = 0;
	unsigned int width;

	/* the 32-bit step only exists when long is 64 bits wide */
	if (BITS_PER_LONG == 64) {
		if ((word & 0xffffffff) == 0) {
			pos += 32;
			word >>= 32;
		}
	}

	/* remaining steps: 16, 8, 4, 2 and 1 bits */
	for (width = 16; width != 0; width >>= 1) {
		if ((word & ((1UL << width) - 1)) == 0) {
			pos += width;
			word >>= width;
		}
	}

	return pos;
}
#endif

View File

@ -0,0 +1,14 @@
/* bitops-clear_bit.h COPYRIGHT FUJITSU LIMITED 2014 */
#ifndef INCLUDE_BITOPS_CLEAR_BIT_H
#define INCLUDE_BITOPS_CLEAR_BIT_H
/*
 * clear_bit - clear bit @nr in the bitmap starting at @addr.
 *
 * Portable C implementation: the containing word is updated with a plain
 * read-modify-write; no lock or atomic instruction is used here.
 */
static inline void clear_bit(int nr, volatile unsigned long *addr)
{
	/* word that holds bit nr */
	unsigned long *word = ((unsigned long *)addr) + (nr / BITS_PER_LONG);

	*word &= ~(1UL << (nr % BITS_PER_LONG));
}
#endif

8
lib/include/bitops-ffz.h Normal file
View File

@ -0,0 +1,8 @@
/* bitops-ffz.h COPYRIGHT FUJITSU LIMITED 2014 */
#ifndef INCLUDE_BITOPS_FFZ_H
#define INCLUDE_BITOPS_FFZ_H
/*
 * ffz - find the first (least significant) zero bit in x.
 *
 * Implemented as __ffs() of the bitwise complement of x, so the result is
 * whatever __ffs() yields for ~x.  This header does not include a
 * definition of __ffs() itself; one must be visible where ffz() is used.
 */
#define ffz(x) __ffs(~(x))
#endif

36
lib/include/bitops-fls.h Normal file
View File

@ -0,0 +1,36 @@
/* bitops-fls.h COPYRIGHT FUJITSU LIMITED 2014 */
#ifndef INCLUDE_BITOPS_FLS_H
#define INCLUDE_BITOPS_FLS_H
/*
 * fls - find last (most significant) set bit in a 32-bit word.
 * @x: the word to search
 *
 * Returns 0 if x is 0, otherwise the 1-based position of the most
 * significant set bit: fls(1) == 1, fls(0x80000000) == 32.
 *
 * The search is done on an unsigned copy of x: left-shifting a signed int
 * so that a 1 reaches the sign bit (e.g. 0xffff << 16) is undefined
 * behaviour in C, whereas unsigned shifts are fully defined.
 */
static inline int fls(int x)
{
	unsigned int v = (unsigned int)x;
	int r = 32;

	if (!v) {
		return 0;
	}
	/*
	 * Binary search: whenever the upper part of the window is empty,
	 * move the lower part up and reduce the answer accordingly.
	 */
	if (!(v & 0xffff0000u)) {
		v <<= 16;
		r -= 16;
	}
	if (!(v & 0xff000000u)) {
		v <<= 8;
		r -= 8;
	}
	if (!(v & 0xf0000000u)) {
		v <<= 4;
		r -= 4;
	}
	if (!(v & 0xc0000000u)) {
		v <<= 2;
		r -= 2;
	}
	if (!(v & 0x80000000u)) {
		v <<= 1;
		r -= 1;
	}
	return r;
}
#endif

View File

@ -0,0 +1,14 @@
/* bitops-set_bit.h COPYRIGHT FUJITSU LIMITED 2014 */
#ifndef INCLUDE_BITOPS_SET_BIT_H
#define INCLUDE_BITOPS_SET_BIT_H
/*
 * set_bit - set bit @nr in the bitmap starting at @addr.
 *
 * Portable C implementation: the containing word is updated with a plain
 * read-modify-write; no lock or atomic instruction is used here.
 */
static inline void set_bit(int nr, volatile unsigned long *addr)
{
	/* word that holds bit nr */
	unsigned long *word = ((unsigned long *)addr) + (nr / BITS_PER_LONG);

	*word |= (1UL << (nr % BITS_PER_LONG));
}
#endif

35
lib/include/bitops.h Normal file
View File

@ -0,0 +1,35 @@
/* bitops.h COPYRIGHT FUJITSU LIMITED 2014 */
#ifndef INCLUDE_BITOPS_H
#define INCLUDE_BITOPS_H
#include <types.h>
/* __BITS_TO_LONGS(n,d): n divided by d, rounded up. */
#define __BITS_TO_LONGS(n,d) (((n) + (d) - 1) / (d))
/* BITS_TO_LONGS(nr): nr divided by BITS_PER_LONG, rounded up. */
#define BITS_TO_LONGS(nr) __BITS_TO_LONGS(nr, BITS_PER_LONG)
/* DECLARE_BITMAP(name,bits): declare `name` as an array of
 * BITS_TO_LONGS(bits) unsigned longs. */
#define DECLARE_BITMAP(name,bits) unsigned long name[BITS_TO_LONGS(bits)]
/*
 * for_each_set_bit(bit, addr, size): loop header that starts `bit` at
 * find_first_bit(addr, size), continues while bit < size, and advances
 * with find_next_bit(addr, size, bit + 1).
 * Note: addr and size are evaluated on every iteration, so they should be
 * free of side effects.
 */
#define for_each_set_bit(bit, addr, size) \
for ((bit) = find_first_bit((addr), (size)); \
(bit) < (size); \
(bit) = find_next_bit((addr), (size), (bit) + 1))
#ifndef __ASSEMBLY__
/*
 * Bitmap search functions; only the prototypes are given here.
 * NOTE(review): for_each_set_bit() stops once the returned value is no
 * longer below `size`, so a value >= size presumably means "not found" --
 * confirm against the implementation.
 */
/* Find the next set bit in the bitmap at addr; search starts at bit `offset`. */
unsigned long find_next_bit(const unsigned long *addr, unsigned long size,
unsigned long offset);
/* Find the next cleared bit in the bitmap at addr; search starts at bit `offset`. */
unsigned long find_next_zero_bit(const unsigned long *addr,
unsigned long size, unsigned long offset);
/* Find the first set bit in the bitmap at addr. */
unsigned long find_first_bit(const unsigned long *addr,
unsigned long size);
/* Find the first cleared bit in the bitmap at addr. */
unsigned long find_first_zero_bit(const unsigned long *addr,
unsigned long size);
#endif /*__ASSEMBLY__*/
#include <arch-bitops.h>
#endif /*INCLUDE_BITOPS_H*/

View File

@ -0,0 +1,61 @@
#ifndef _ASM_GENERIC_ERRNO_BASE_H
#define _ASM_GENERIC_ERRNO_BASE_H
#define EPERM 1 /* Operation not permitted */
#define ENOENT 2 /* No such file or directory */
#define ESRCH 3 /* No such process */
#define EINTR 4 /* Interrupted system call */
#define EIO 5 /* I/O error */
#define ENXIO 6 /* No such device or address */
#define E2BIG 7 /* Argument list too long */
#define ENOEXEC 8 /* Exec format error */
#define EBADF 9 /* Bad file number */
#define ECHILD 10 /* No child processes */
#define EAGAIN 11 /* Try again */
#define ENOMEM 12 /* Out of memory */
#define EACCES 13 /* Permission denied */
#define EFAULT 14 /* Bad address */
#define ENOTBLK 15 /* Block device required */
#define EBUSY 16 /* Device or resource busy */
#define EEXIST 17 /* File exists */
#define EXDEV 18 /* Cross-device link */
#define ENODEV 19 /* No such device */
#define ENOTDIR 20 /* Not a directory */
#define EISDIR 21 /* Is a directory */
#define EINVAL 22 /* Invalid argument */
#define ENFILE 23 /* File table overflow */
#define EMFILE 24 /* Too many open files */
#define ENOTTY 25 /* Not a typewriter */
#define ETXTBSY 26 /* Text file busy */
#define EFBIG 27 /* File too large */
#define ENOSPC 28 /* No space left on device */
#define ESPIPE 29 /* Illegal seek */
#define EROFS 30 /* Read-only file system */
#define EMLINK 31 /* Too many links */
#define EPIPE 32 /* Broken pipe */
#define EDOM 33 /* Math argument out of domain of func */
#define ERANGE 34 /* Math result not representable */
#ifdef __KERNEL__
/* Should never be seen by user programs */
#define ERESTARTSYS 512
#define ERESTARTNOINTR 513
#define ERESTARTNOHAND 514 /* restart if no handler.. */
#define ENOIOCTLCMD 515 /* No ioctl command */
#define ERESTART_RESTARTBLOCK 516 /* restart by calling sys_restart_syscall */
/* Defined for the NFSv3 protocol */
#define EBADHANDLE 521 /* Illegal NFS file handle */
#define ENOTSYNC 522 /* Update synchronization mismatch */
#define EBADCOOKIE 523 /* Cookie is stale */
#define ENOTSUPP 524 /* Operation is not supported */
#define ETOOSMALL 525 /* Buffer or request is too small */
#define ESERVERFAULT 526 /* An untranslatable error occurred */
#define EBADTYPE 527 /* Type not supported by server */
#define EJUKEBOX 528 /* Request initiated, but will not complete before timeout */
#define EIOCBQUEUED 529 /* iocb queued, will get completion event */
#define EIOCBRETRY 530 /* iocb queued, will trigger a retry */
#endif
#endif

View File

@ -76,6 +76,9 @@ void ihk_mc_init_user_process(ihk_mc_kernel_context_t *ctx,
void *stack_pointer, unsigned long user_pc, void *stack_pointer, unsigned long user_pc,
unsigned long user_sp); unsigned long user_sp);
void ihk_mc_init_user_tlsbase(ihk_mc_user_context_t *ctx,
unsigned long tls_base_addr);
enum ihk_mc_user_context_regtype { enum ihk_mc_user_context_regtype {
IHK_UCR_STACK_POINTER = 1, IHK_UCR_STACK_POINTER = 1,
IHK_UCR_PROGRAM_COUNTER = 2, IHK_UCR_PROGRAM_COUNTER = 2,
@ -104,4 +107,21 @@ extern unsigned int ihk_ikc_irq_apicid;
extern int gettime_local_support; extern int gettime_local_support;
void init_tick(void);
void init_delay(void);
void sync_tick(void);
struct pvclock_vsyscall_time_info {
long contents[64/sizeof(long)];
};
extern struct pvclock_vsyscall_time_info *pvti;
extern int pvti_npages;
int arch_setup_pvclock(void);
void arch_start_pvclock(void);
struct cpu_mapping;
int arch_get_cpu_mapping(struct cpu_mapping **buf, int *nelemsp);
#endif #endif

View File

@ -13,12 +13,16 @@
#ifndef IHK_DEBUG_H #ifndef IHK_DEBUG_H
#define IHK_DEBUG_H #define IHK_DEBUG_H
#include <arch-lock.h>
#include <ihk/memconst.h> #include <ihk/memconst.h>
struct ihk_kmsg_buf { struct ihk_kmsg_buf {
int tail; int tail;
int len; int len;
char str[IHK_KMSG_SIZE - sizeof(int) * 2]; int head;
int mode;
ihk_spinlock_t lock;
char str[IHK_KMSG_SIZE - sizeof(int) * 4 - sizeof(ihk_spinlock_t)];
}; };
extern int kprintf(const char *format, ...); extern int kprintf(const char *format, ...);

Some files were not shown because too many files have changed in this diff Show More