Compare commits

...

574 Commits
0.4.0 ... 1.2.2

Author SHA1 Message Date
ddc33821cf sched_yield(): avoid schedule for single thread 2016-12-05 18:10:20 +09:00
0ab7d02994 disable syscall tracker and eliminate interrupt_syscall debug msg 2016-12-05 18:10:20 +09:00
a8c4ab221b use MCS locks in signal handling code 2016-12-05 18:10:20 +09:00
87d36a7752 mcreboot-smp-x86: -t to enable turbo boost 2016-12-05 18:10:20 +09:00
998ded414c mcreboot-smp-x86: shorter sleep in waiting for /proc 2016-12-05 18:10:20 +09:00
f78d031e64 syscall and offload tracking (disabled by default) 2016-12-05 18:10:20 +09:00
4ab37dd34a schedule(): only load page table during context switch if it's different 2016-12-05 18:10:20 +09:00
8129dec2f7 Fix out-of-tree build
<build>/ihk/cokernel/Makefile.common is not found when
<build>/mckernel/kernel/Makfile tries to perform
"Make -C <build>/ihk/{cokernel,ikc}" from mckernel/kernel
2016-12-01 16:44:01 +09:00
a1035a1878 fix out of tree build 2016-12-01 12:55:34 +09:00
db169c5f90 add gcc options (-ffreestanding -fno-tree-loop-distribute-patterns)
refs #299
2016-11-29 16:28:18 +09:00
bbb55ef261 sched_setparam: thread lock is necessary when update other thread data 2016-11-28 14:04:44 +09:00
1130cafe41 ptrace: fixed for threads. 2016-11-28 11:19:30 +09:00
a1cf27e232 sched_getaffinity(): fix error code for special invalid input 2016-11-28 05:50:01 +09:00
5a1ce99d87 mcexec: fix number of threads not to exceed thread_data array 2016-11-27 07:31:52 +09:00
c7db296e1b getcpu(): expose correct NUMA id 2016-11-26 09:29:09 +09:00
f634a750c5 sched_{set/get}affinity(): fix error codes (also fixes KMP_AFFINITY behavior) 2016-11-24 21:25:16 +09:00
d07a196c8e mcexec: enable the same number of threads as CPU cores 2016-11-24 16:40:52 +09:00
8c56c75d2c process_vm_read_writev(): fix base address check for EFAULT 2016-11-24 10:40:41 +09:00
e54895efde set_mempolicy(): debug msg 2016-11-23 08:53:26 +09:00
2f8cca2d6d memcpy(): faster version using ASM rep; movsl 2016-11-23 08:51:22 +09:00
64607152ee VM: introduction of range lookup cache 2016-11-23 08:48:44 +09:00
20383ad3d0 do_process_vm_read_writev(): page size awareness optimization 2016-11-23 08:47:32 +09:00
787d34f650 introduction of ihk_mc_pt_virt_to_phys_size() 2016-11-23 08:40:33 +09:00
ae618a0c68 mcexec: remount /proc in mcexec's file NS after exec() 2016-11-22 13:22:59 +09:00
f480376153 mcoverlayfs: supported Linux kernel 4.6
add mcoverlayfs(linux-4.6.7 base)
2016-11-17 18:09:27 +09:00
e4b3a88fc6 mcexec_sys_umount(): remove debug print 2016-11-10 15:05:45 +09:00
69a5c53074 NUMA: hide non-existing nodes from /sys/devices/system/node listing 2016-11-05 16:12:08 +09:00
259583e936 mcreboot-smp-x86.sh: more white out of invalid NUMA info 2016-11-05 13:35:53 +09:00
0f826290d0 NUMA: get_mempolicy(), set_mempolicy() and mbind() implementation 2016-11-05 13:32:02 +09:00
e46f027894 mcexec/mcctrl: unmount cgroups (privately) which expose invalid NUMA info 2016-11-04 17:02:48 +09:00
3e093f6a40 sysfs: fix /sys/devices/system/node/online value 2016-11-03 16:10:29 +09:00
00996b551f mcreboot: white out non-existing NUMA information 2016-11-03 16:09:27 +09:00
24d8697cef mcexec: workaround for overlayed /sys FS directory lseek() bug
lseek() on directories under /sys filesystem that are part of an
overlayed filesystem behave differently than in the original /sys.
This causes segfault in libnuma when discovering topology
information. The patch fakes return value as it is supposed to be,
which also fixes the Intel MPI 2017 MPI_Init() crash.
2016-11-03 13:41:25 +09:00
be4f6741f9 sysfs: fix /sys/devices/system/cpu/cpuXX/online value 2016-11-03 13:39:21 +09:00
7a2f67f5f0 sysfs: eliminate unnecessary new line from /sys/devices/system/node/nodeX/distance 2016-11-03 13:37:53 +09:00
bba0425267 sysfs: fix /sys/devices/system/cpu/online value 2016-11-03 13:36:29 +09:00
beaf96b375 mcreboot/mcstop: proper error handling (revert previous state) 2016-10-28 14:29:10 +09:00
f1af1ffb8f NUMA: expose correct NUMA distances in sysfs 2016-10-27 14:29:15 +09:00
059fab2cc0 mcctrl: fix NULL pointer dereference for unbooted OS instance shutdown 2016-10-26 14:50:07 +09:00
f284a80656 Defrag memory in mcreboot.sh
Merge free physical pages to create large, physically contiguous
blocks with the following command.

    echo 1 > /proc/sys/vm/compact_memory
2016-10-25 16:35:43 +09:00
5f973ab51e IKC2: adjust master channel message queue size dynamically
Determine master channel's message queue size based on the number of
LWK CPUs so that all cores can communicate simultaneously during
syscall channel initialization.
2016-10-24 20:49:00 +09:00
60b6713957 IKC2: eliminate unused structures/fields of old IKC code 2016-10-24 15:41:27 +09:00
ebcf9a0d6d mcctrl: fix a bunch of -Wframe-larger-than warnings 2016-10-21 04:54:38 -04:00
942b7f8b78 mcreboot-smp-x86: eliminate unnecessary resource queries 2016-10-21 03:38:21 -04:00
0b0aa6c0e0 Start mcklogd before McKernel to avoid deadlock
McKernel blocks forever waiting for mcklogd to retrieve kmsg when
kmsg bufer is full with boot log and mcklogd isn't running.
2016-10-19 16:40:32 +09:00
9705a80c82 get/set_mempolicy(): support for query/set process level policy 2016-10-16 14:01:14 +09:00
99a02e2941 get_mempolicy(): store policy in per-process VM structure 2016-10-16 09:10:36 +09:00
b88d75720f __NR_gettid: use regular offloading channel (fixes unknown PID bug) 2016-10-15 11:46:01 +09:00
d2b677b6da get_mempolicy(): initial implementation 2016-10-14 21:34:32 +09:00
083645f203 mcreboot: purge Linux caches before reserving IHK resources 2016-10-14 21:34:32 +09:00
994b9a19ac NUMA: expose CPU and memory info in /proc/self/status 2016-10-14 21:34:32 +09:00
faa929e717 NUMA: add NUMA mask to process VM structure 2016-10-14 21:34:31 +09:00
3ee3a9df6d sysfs: fix bitmask and bitmask list-view display bug 2016-10-14 21:34:31 +09:00
73e1a4f1f9 NUMA: fill in /sys/devices/system/cpu/nodeX properly and sync with boot script 2016-10-14 21:34:31 +09:00
b068fde9cd NUMA: use IHK CPU and NUMA mappings for sysfs entries 2016-10-14 21:34:31 +09:00
167ea67dee NUMA: receive CPU info in array format 2016-10-14 21:34:31 +09:00
f33d85a27a eclair: support for multiple physical memory chunks 2016-10-14 21:34:31 +09:00
1e8239d72a kmalloc/pagealloc tracker: fix race condition bug 2016-10-14 21:34:31 +09:00
a51a0a6f13 page allocation tracker: support tracking partial deallocations 2016-10-14 21:34:31 +09:00
cc3f6e1a4f page_fault_process_memory_range(): fix double allocation leak 2016-10-14 21:34:31 +09:00
5db6c311f4 page alloc tracker: count freed pages in addr tracker objects 2016-10-14 21:34:31 +09:00
f4df713846 munmap(): fix memory leak in non page backed mappings 2016-10-14 21:34:31 +09:00
7176bb2a47 allow partial deallocation in page level allocation tracker 2016-10-14 21:34:30 +09:00
a6bd98cc02 MM: memory leak tracker for page level allocator 2016-10-14 21:34:30 +09:00
0f7462ae1c mm.h: eliminate global pa_allocator 2016-10-14 21:34:30 +09:00
0d8d915d82 fix KMALLOC_MIN_SIZE macro 2016-10-14 21:34:30 +09:00
8f4f68b877 eliminate arch_alloc_page() and move ihk_mc_alloc_pages() to arch independent code 2016-10-14 21:34:30 +09:00
8c0a5a5e61 page_hash_count_pages(): report page hash size in memory stat 2016-10-14 21:34:30 +09:00
ffd3f53785 page_unmap(): proper locking of hash table 2016-10-14 21:34:30 +09:00
f39fa54c39 NUMA: default policy: allocate from CPU's NUMA node 2016-10-14 21:34:30 +09:00
11125b0d68 fileobj and shmemobj: delete unused variables 2016-10-14 21:34:30 +09:00
3ae69d1290 NUMA: process CPU NUMA information 2016-10-14 21:34:30 +09:00
2929fbb803 NUMA: support multiple physical allocators 2016-10-14 21:34:30 +09:00
f4db8b96de fileobj/shmobj: release pages correctly according to dynamic page frame management 2016-10-14 21:34:30 +09:00
8eb3bf3559 physical page management: eliminate static page frame array and
maintain page structures dynamically covering only file mappings.
use hash table for address <-> page structure conversion.
2016-10-14 21:34:29 +09:00
326a4fcee4 mem_init(): parse NUMA information 2016-10-14 21:34:29 +09:00
9b82f1a52c use ihk_mc_alloc/free_pages() and eliminate direct calls to low level routines 2016-10-14 21:34:29 +09:00
f3da381752 ihk_mc_unmap_virtual: add flush_tlb_single
refs #778
2016-10-11 14:44:23 +09:00
8aa589a40c A signal may not sometimes arrive to a thread. 2016-10-04 14:35:25 +09:00
e03f377326 interrupt_syscall: interrupt valid thread 2016-10-03 00:49:56 +09:00
8d21846562 mcoverlayfs: supported Linux kernel 4.0 or rhel kernel 3.10.0-327
add mcoverlayfs(linux-3.10.0-327.36.1.el7 base)
2016-09-30 14:55:36 +09:00
3e1367caa1 mcoverlayfs: move mcoverlayfs(linux-4.0.9 base) to executer/kernel/mcoverlayfs/linux-4.0.9 2016-09-30 13:48:55 +09:00
02536b7724 Merge remote-tracking branch 'remotes/origin/ikc2'
Conflicts:
	executer/kernel/mcctrl/syscall.c
It is resolved.
2016-09-27 11:48:12 +09:00
e28725884f fix debug print 2016-09-19 17:29:41 +09:00
c2b3fb7236 Modify interrupt load balancing policy on reboot/stop
* Fix the timing of stopping irqbalance when booting McKernel
2016-09-16 19:07:07 +09:00
2f95f7cda8 Modify interrupt load balancing policy on reboot/stop
When rebooting:
1. Stop irqbalance
2. Modify /proc/irq/*/smp_affinity so that McKernel cores are not
   included
3. Start irqbalance with McKernel cores and IHK IRQ banned from
   load balancing

When stopping:
1. Stop irqbalance
2. Restore /proc/irq/*/smp_affinity
3. Restart irqbalance with the system default settings

refs #760
2016-09-16 13:04:24 +09:00
e551aa17ed execve: do not search command PATH 2016-09-14 22:22:18 +09:00
e6d4c160cd mcexec: fix how to look for command
refs #754
2016-09-13 15:56:58 +09:00
9390fe5d2c signal: send signal to thread using thread-id. not cpu-id 2016-09-12 15:43:29 +09:00
419f5e495b set*[ug]id: propagate credentials to thread pool 2016-09-12 15:40:33 +09:00
673deadf37 fix syscall return type 2016-09-12 15:40:06 +09:00
20ea65b38c fix some vDSO bugs.
- vDSO sometimes becomes invalid.
- vDSO is not succeeded for child process.
- vDSO becomes invalid when execve.
refs #744
2016-09-04 23:13:00 +09:00
84665ff699 do_page_fault_process_vm(): fix error msg format that could cause another PF 2016-09-04 10:59:50 +09:00
bfbc94dfb0 mcctrl+mcexec: fix per-proc data allocation for fork() 2016-09-02 15:08:00 +09:00
f74dcfc2a1 Modify mcreboot.sh for job scheduler
1. Don't complain when logname command doesn't exist
2016-09-01 19:27:18 +09:00
7c562d0539 support madvise(MADV_DONTFORK) 2016-09-01 11:22:53 +09:00
b5e4459a34 support AVX-512 registers 2016-08-30 18:39:33 +09:00
782122b681 mcctrl: fix to rus_vm_fault() call by kworker process 2016-08-22 13:00:28 +09:00
d550bced78 kmalloc(): use macros to define size alignment 2016-08-19 12:51:28 +09:00
a7ee3f531b sched_setaffinity(): error handling for invalid input 2016-08-19 11:52:44 +09:00
b9439947a7 kmalloc(): re-implementation of memory leak tracking 2016-08-19 11:52:00 +09:00
3b60a95f13 kmalloc()/kfree() re-implementation 2016-08-18 21:51:36 +09:00
82ae6d7458 query_free_mem_interrupt_handler(): report number of free pages as kmsg 2016-08-18 14:52:05 +09:00
7ebc34ddcc do_fork(): fix tids memory leak; additional sanity checks 2016-08-18 14:31:52 +09:00
bd6a2c2311 sys_mmap(): correct initial address check 2016-08-18 07:32:31 +09:00
5fd68eae54 PF handler: fix up various error msgs 2016-08-18 07:31:25 +09:00
f5857cfc9e MM: use ihk_mc_{alloc/free}_pages() everywhere and fix free_pages() on kmalloc()ed object bug 2016-08-17 18:02:05 +09:00
1ce1b17a85 Specify facility used by mcklogd via option
1. You can specify facility through -f option of mcreboot.sh.
   Example:
   mcreboot.sh -k 1 -f LOG_LOCAL6
   Note that you need to specify "-k 1" or "-k 2" to start mcklogd.
2. Kill mcklogd if needed in mcreboot.sh and mcstop+release.sh.
2016-08-17 17:52:44 +09:00
a2456c3ed2 Modify mcstop+release.sh for job scheduler
1. Remove ihk.ko
2. Output message to stderr and return one on error
2016-08-17 17:32:06 +09:00
01d2ea1605 do_munmap(): do TLB flush per address in remote_tlb_flush_cpu_mask() 2016-08-17 15:08:30 +09:00
15783f09a0 Modify mcreboot.sh for job scheduler
1. Add an option to specify owner of device files
2. Output message to stderr and return one on error
2016-08-17 15:07:13 +09:00
9efd568e07 do_mmap(): simplify demand paging flags; avoid zeroobj and allocate pages directly 2016-08-17 14:00:05 +09:00
1a207e19c2 clean up a couple of debug messages 2016-08-17 13:55:36 +09:00
73cf93727b clone(): use CAS for TID allocation 2016-08-16 14:18:58 +09:00
4410e702d9 devobj: fix memory leak for device file mapping 2016-08-16 14:17:59 +09:00
f584e2ec25 increase kernel stack size and eliminate unused waitq declaration in do_syscall() 2016-08-16 09:20:55 +09:00
3aa06444f4 do_syscall(): allow descheduling threads in offloaded syscalls if CPU core oversubscribed 2016-08-16 08:58:22 +09:00
c897a56c34 __notify_syscall_requester(): use CAS or IKC to notify syscall completion 2016-08-16 08:56:05 +09:00
5e9957da0f syscall_response: introduction of req_thread_status field 2016-08-16 08:53:41 +09:00
6ff2d4abe7 mcctrl: store per-process data in hash table 2016-08-15 13:47:57 +09:00
e4239f1885 mcexec: use 16 threads initially in offload handler pool 2016-08-14 14:29:10 +09:00
fbbaaf5b54 mcctrl: use GFP_ATOMIC in atomic context 2016-08-14 14:28:21 +09:00
3fa3920bb3 fix a couple of debug msgs 2016-08-14 11:30:17 +09:00
45e51fcc07 mcctrl: fix padding for 128bytes SCD message 2016-08-14 11:29:02 +09:00
0884e3d543 IHK-IKC: map queue in McKernel as cacheable 2016-08-14 11:16:40 +09:00
e3c7c9b890 mcctrl: separate waiting threads and pending requests 2016-08-12 21:52:13 +09:00
f4155cc9e8 mcstop+release-smp-x86.sh: fix OS instance discovery bug 2016-08-12 12:27:04 +09:00
a01ae91051 mcctrl: use IKC packet pools 2016-08-12 12:26:14 +09:00
daca522d25 mcctrl: move kmalloc/kfree of wait queue head out of fast path 2016-08-12 10:18:58 +09:00
ec521feb15 do_syscall(): remove invalid reference 2016-08-09 17:16:47 +09:00
d7bc947a02 mcctrl: redesign mcctrl_channels for IKC packet based syscall offloading 2016-08-09 16:49:42 +09:00
fb84d4ef11 mcctrl: thread pool based system call offload handling 2016-08-08 19:43:05 +09:00
5fbeee953a mcctrl: clean up syscall offload wait code 2016-08-07 20:55:36 +09:00
4cefb4333f mcctrl: use atomic malloc in IRQ context 2016-08-06 08:54:55 +09:00
689da07ac6 ihk_mc_ikc_init_first_local(): hold ref to master channel 2016-08-06 08:52:14 +09:00
76981bcc18 mcctrl: move procfs TID processing into dedicated work queue 2016-08-04 15:22:40 +09:00
6aae35cb3d process: transfer TIDs in bulk and reuse them locally 2016-08-02 16:59:04 +09:00
dac6f2883e mcctrl procfs: use semaphores instead of spinlocks to avoid sleeping in GFP_KERNEL kmalloc() in atomic context 2016-08-01 20:33:51 +09:00
c484f766fa schedule(): schedule a sleeping processes if it has pending signals 2016-07-28 11:42:00 +09:00
57690479bd read/patch_process_vm(): map non-LWK physical addresses properly 2016-07-22 20:48:54 +09:00
d0539a9cac eclair: make idle threads visible 2016-07-22 18:06:11 +09:00
4c8f583c0c split_large_page(): avoid panic when splitting "non-mapped" large pages 2016-07-14 17:11:52 +09:00
6118faffa9 pager_req_pfn(): use FAULT_FLAG_USER only if defined 2016-07-13 18:05:31 +09:00
dad6470c60 clone_thread: fork(2) copy sigstack infos from parent 2016-07-13 16:15:01 +09:00
46c37fc8f3 setfsgid: fix to didn't change fsgid 2016-07-13 15:54:52 +09:00
f6908f21a8 do_kill: wake PS_INTERRUPTIBLE process when send SIGKILL
sched_wakeup_thread: don't change process status if process status is PS_EXITED
2016-07-13 14:06:32 +09:00
01d9d9a5ba devobj: allow arbitrary size device file mappings 2016-07-12 17:02:19 +09:00
c43d993a4d mcstop+release-smp-x86.sh.in: unload mcctrl after OS shutdown 2016-07-11 16:40:06 +09:00
7d9bbecd7a mcctrl: use IHK OS notifiers to establish/tear down syscall channels
This patch eliminates the need for rmmod/insmod the mcctrl module
every time an OS instance is rebooted.
2016-07-11 16:22:50 +09:00
d135731398 do_syscall(): allow schedule for another thread (Intel MPI+OpenMP issue) 2016-07-05 18:54:51 +09:00
5c190beb04 save fpregs when to call sighandler
refs #50
2016-07-05 15:26:00 +09:00
fc66556f9f mcexec: error handling and propagation 2016-06-24 15:35:38 -07:00
648bacc90f device file mappings: communicate map flags and fault missing translations 2016-06-24 12:44:59 -07:00
dd37443fc7 PAPI support: performance counter's overflow.
and support mckfd fcntl.
2016-06-24 13:50:12 +09:00
e34322702a x86_init_perfctr: discover perf counters dynamically from MSRs 2016-06-22 10:47:57 -07:00
e12997e6a9 mcreboot: support for CPU cores (-c) and memory (-m) arguments 2016-06-21 09:10:06 -07:00
fabaa806d3 Revert "Make executor code include executer/config.h": breaks out-of-tree compile
This reverts commit d90900b6e6.
2016-06-21 08:51:45 +09:00
a83ad620c8 devobj: allow read only device file mappings (OFED 3.3 support) 2016-06-21 06:57:59 +09:00
d90900b6e6 Make executor code include executer/config.h
Make the code "executer/kernel/mcctrl/arch/x86_64/archdeps.c"
to include "executer/config.h" instead of
non-existent "executer/kernel/mcctrl/config.h".
2016-06-09 18:40:39 +09:00
6d9a88e9f4 binfmt_mcexec: support post-K specification 2016-06-08 09:53:39 +09:00
d0ee60f9e3 mcoverlayfs: supported only Linux kernel 4.0 2016-06-03 18:36:55 +09:00
14ec92518e KVM support: detect KVM and avoid touching unimplemented MSRs 2016-05-26 01:11:08 +09:00
435e2bdeb4 support for Linux 4.6: use get_user_pages_remote() 2016-05-24 09:39:04 +09:00
f06d8041e3 don't send SIGCONT when sending SIGSTOP derived from PTRACE_ATTACH
refs #747
2016-05-19 10:54:12 +09:00
9b35eaca42 remote_flush_tlb_cpumask() dead locking
refs #728
2016-05-10 14:02:25 +09:00
130b1f4327 update PAPI support. other process and child process monitoring. 2016-04-26 19:01:47 +09:00
921280f85c Docker support: use task_XX_vnr() functions for accessing correct namespace 2016-04-21 09:59:49 -07:00
d4a0b32f06 support large pages 2016-04-21 23:22:55 +09:00
b3bec32e99 update_process_page_table: refactor 2016-04-21 23:22:55 +09:00
2048980820 remove ihk_mc_pt_alloc_range() 2016-04-21 23:22:54 +09:00
176f6d23a9 ihk_mc_pt_virt_to_pagemap: refactor 2016-04-21 23:22:54 +09:00
328175547f Revert "fix REQ-37: remap_one_page: remove to check page size"
This reverts commit 6790126a23.

- reverted commit should remove a 'pgsize' check in remap_one_page()
  instead of a 'pgsize' check in pte_make_fileoff().
- In IA-32e, PTE format varies with page size. Therefore 'pgsize'
  parameter of pte_make_fileoff() is preferable.
2016-04-21 23:22:54 +09:00
e2e0fad849 arch_clear_host_user_space: set zero to args[2]
to avoid duplicated per_proc_list entry.
2016-04-21 23:22:54 +09:00
397bf3f4a6 wait_zombie: don't wait attached process
refs #726
2016-04-21 20:28:36 +09:00
aa77228453 resupport ptrace(PTRACE_ATTACH)
refs #733
2016-04-21 20:13:27 +09:00
82cb8f95ed update PAPI support. 2016-04-18 13:07:45 +09:00
3f2b4e7282 do_wait: unlink child from children_list if child terminated
refs #724
2016-04-14 10:25:12 +09:00
d6784bb4a5 update auto-generated files 2016-04-11 22:25:53 +09:00
1bb948f43b hwloc support 2016-04-11 22:25:27 +09:00
2a1823d52c vdso: set enable bit of pvti_msr 2016-04-11 22:20:39 +09:00
89943dc5ba vdso: set physical address at pvti_msr 2016-04-11 22:20:39 +09:00
fceb02a44a vdso: add zero clear for pvti 2016-04-11 22:20:38 +09:00
7298d8e179 vdso: correct pvti array element type
struct pvclock_vsyscall_time_info <-- struct pvclock_vcpu_time_info
2016-04-11 22:20:38 +09:00
6f32544dde vdso: add static cast 2016-04-11 22:20:38 +09:00
10d248b3cc mcexec: include config.h 2016-04-11 22:20:38 +09:00
fb32120659 make mcoverlayfs optional (default: enabled) 2016-04-02 15:43:35 -04:00
73de203c16 update auto-generated files 2016-03-28 22:57:45 +09:00
41bb2ab5e6 support vdso which borrows clocksource from linux 2016-03-28 22:57:44 +09:00
a587c8f5e5 x86: encode cpu# in IA32_TSC_AUX and size of GDTe#15 2016-03-28 22:57:44 +09:00
0c53a5ca35 add NOPHYS which means no physical memory 2016-03-28 22:57:44 +09:00
c760a01a79 add pte_get_attr() 2016-03-28 22:57:44 +09:00
a2c29e8abf correct the value of tod_data.origin
tod_data.origin should hold a time when TSC is zero.
2016-03-28 22:57:39 +09:00
18add6a9bd shmctl(IPC_RMID): fix wrong owner/creator checking (revised)
Don't check owner/creator of the segment in case of superuser.
2016-03-28 16:02:24 +09:00
a083e6c2bf Revert "shmctl(IPC_RMID): fix wrong owner/creator checking"
This reverts commit 8b5b075f4c.

The reverted commit modifies IPC_SET instead of IPC_RMID.
2016-03-28 16:00:39 +09:00
a2548f5421 Revert "fix REQ-42"
This reverts commit 4a0682bbc1.

The reverted commit appears to be wrong, for example:
- arch_range_check()'s arguments and parameters are mismatch.
- arch_range_check() implementation is not checking range.

Conflicts:
	kernel/syscall.c
2016-03-28 13:51:57 +09:00
6790126a23 fix REQ-37: remap_one_page: remove to check page size 2016-03-27 14:05:00 +09:00
1195549f41 fix REQ-19: some syscalls change how to access user space 2016-03-27 11:43:53 +09:00
b0096a2740 fix REQ-51 2016-03-26 12:23:51 +09:00
a11479eba8 fix REQ-48 2016-03-25 13:05:53 +09:00
12eaea401e fix REQ-46 2016-03-25 12:59:18 +09:00
31595b7409 fix REQ-43 2016-03-25 12:57:31 +09:00
4a0682bbc1 fix REQ-42 2016-03-24 19:14:50 +09:00
932a287437 fix REQ-40 2016-03-24 13:46:13 +09:00
670741ae40 fix REQ-39 2016-03-24 13:45:15 +09:00
70b27e06ff eclair: change default kernel to ./mckernel.img 2016-03-23 20:00:57 +09:00
4c38ddb623 update auto-generated files 2016-03-23 20:00:57 +09:00
6f00ddced6 move eclair from ihk repository 2016-03-23 20:00:57 +09:00
c0eecd63c9 update auto-generated files 2016-03-23 20:00:57 +09:00
1fd0b03e78 move config.h.in
from executer/kernel/mcctrl/config.h.in
to   executer/config.h.in
2016-03-23 20:00:57 +09:00
6c59de9300 expand AC_PROT_CC only once 2016-03-23 20:00:57 +09:00
b1309a5d53 map PIE at map_end instead of at user_start 2016-03-23 19:14:28 +09:00
489cd6d1a2 refactor prepare_process_ranges_args_envs() 2016-03-23 19:14:28 +09:00
c9cc4330c8 mincore: take into account pages cached in memobj 2016-03-23 19:14:28 +09:00
604f846cd2 mincore: check [start..start+len) is in user region 2016-03-23 19:14:28 +09:00
e939cf6862 mincore: cosmetic changes 2016-03-23 19:14:28 +09:00
72f2e5ebe0 shmobj: implement lookup_page method 2016-03-23 19:14:28 +09:00
bd7dddd415 fileobj: implement lookup_page method 2016-03-23 19:14:28 +09:00
fbd9dc878b memobj: add lookup_page method 2016-03-23 19:14:28 +09:00
d6c51ff997 treat memory devices as regular files,
to enable processes to mmap() /dev/zero
2016-03-23 19:14:27 +09:00
86ac51157c add error checks to shmctl(SHM_UNLOCK) 2016-03-23 19:14:27 +09:00
b73fa2b972 add error checks to shmctl(SHM_LOCK) 2016-03-23 19:14:27 +09:00
798f69bceb add has_cap_ipc_lock() 2016-03-23 19:14:27 +09:00
e8be52a1ff shm: trace the amount of locked segment per user 2016-03-23 19:14:27 +09:00
8b5b075f4c shmctl(IPC_RMID): fix wrong owner/creator checking
Don't check owner/creator of the segment in case of superuser.
2016-03-23 19:14:27 +09:00
b214fc278a add has_cap_sys_admin() 2016-03-23 19:14:27 +09:00
b3ae7f46bd add rlim_t (a type of rlim_cur and rlim_max) 2016-03-23 19:14:27 +09:00
48167d3223 shmget: add "shmflg" checks for SHM_HUGE* 2016-03-23 19:14:27 +09:00
d65135c040 move sys_shmget() into arch-dependent code 2016-03-23 19:14:27 +09:00
1761acc4c3 eliminate geteuid(), getegid() and getpid() 2016-03-23 19:04:32 +09:00
d4d93df032 mmap: add "flags" checks for MAP_HUGE* 2016-03-23 19:04:32 +09:00
261bddb999 add a member pgshift into struct vm_range
pgshift indicates a page size in the range.
2016-03-23 19:04:32 +09:00
1a3bc851af mprotect: return -ENOMEM if speicified range is out of range 2016-03-23 19:04:32 +09:00
15f572ef9c mmap: return -ENOMEM if speicified range is out of range 2016-03-23 19:04:32 +09:00
81690c5b5a mmap: cosmetic changes 2016-03-23 19:04:32 +09:00
832c0f9afd refactor copy_user_ranges() 2016-03-23 19:04:32 +09:00
f92cac7751 add type casting to the argument of getlong_user() 2016-03-23 19:04:32 +09:00
e74eb1dd51 add some prototypes to <memory.h> 2016-03-23 19:04:32 +09:00
8f7b9072ea refactor some copyin/copyout functions
- copy_from_user()
- getlong_user()
- getint_user()
- copy_to_user()
- setlong_user()
- setint_user()
2016-03-23 19:04:32 +09:00
4595aa3079 pte_visitor_t(): change "pgsize" into "pgshift" 2016-03-23 19:04:32 +09:00
807d294ac4 signalfd4: fix initialize 2016-06-03 20:58:02 +09:00
c947dd0d49 sysfs: support /sys/devices/system/cpu/online 2016-03-22 20:25:34 +09:00
d192e6c0fe modify PAPI support 2016-03-22 15:52:59 +09:00
7dbbcb362f add PAPI support 2016-03-22 15:27:19 +09:00
593cf98015 add ACSL annotation 2016-03-16 15:42:32 +09:00
8dd9f5ef3f support profil 2016-03-12 16:47:19 +09:00
0eaf058a4f mcexec: -lrt to Makefile.in for supporting clock_gettime() on SUSE 2016-03-12 05:24:14 +09:00
1aac2c8e23 add CPU timer initialization (refs #402)
There is no actual initialization in x86 now.
The initialization rely on hardware reset and Linux initialization.
2016-03-11 19:20:37 +09:00
70e8dd7979 remove initialization of TSC (refs #362) 2016-03-11 19:17:29 +09:00
eb0700359b fix REQ-36 2016-03-10 10:33:38 +09:00
3f16a9443e ptrace_report_signal: save debug regs before to send SIGCHLD to tracer 2016-03-09 22:29:51 +09:00
bf0cf0a346 fix REQ-31 2016-03-08 15:19:03 +09:00
14b868907b fix REQ-27 2016-03-07 18:52:08 +09:00
dbc778e4fa support getrusage (work in progress) 2016-03-07 17:06:44 +09:00
7fac03d4de sysfs: support /sys/devices/system/cpu/offline,online,possible,present 2016-03-04 13:48:06 +09:00
26c0180374 rwlock_reader_lock: fix lock list jammed up 2016-03-03 22:47:48 +09:00
8ebb3a4231 schedule: migration free last thread if terminated 2016-03-03 22:44:44 +09:00
f1f1ba9c8c mcs_rwlock_reader_lock: temporary fix 2016-03-01 19:11:42 +09:00
6ce00b5f0f sysfs: samples of snooping ops 2016-02-29 19:59:04 +09:00
4ec0e02a89 sysfs: add snooping ops 2016-02-29 19:23:01 +09:00
8f9192ac36 mcctrl: workaround for out-of-tree build (2/2)
- update auto-generated file
2016-02-29 19:18:08 +09:00
80ce123ab6 mcctrl: workaround for out-of-tree build (1/2) 2016-02-29 19:18:08 +09:00
1dc8513cd3 fix REQ-20 2016-02-26 16:18:30 +09:00
b0054643c0 REQ-18 2016-02-26 16:17:23 +09:00
972ff73ecf mcexec: fix readlink
refs #692
2016-02-25 16:08:42 +09:00
1f8a859b47 mcctrl: update auto-generated files 2016-02-24 21:34:48 +09:00
2601d8a36f mcctrl: use zap_page_range() instead of madvise() 2016-02-24 21:34:48 +09:00
a713c2fcaa fix REQ-16 2016-02-24 20:58:07 +09:00
c4c5e435cc fix REQ-12 2016-02-24 20:57:45 +09:00
853b56c784 mcreboot-smp-x86.sh: add mount to ceate /tmp/mcos/linux_proc from /proc 2016-02-24 19:24:37 +09:00
863a5c5e5f fix REQ-2, REQ-6, REQ-8 2016-02-23 16:32:17 +09:00
ebce1cb031 Merge branch 'master' of postpeta.pccluster.org:mckernel 2016-02-22 13:34:00 +09:00
fff7744907 mcklogd support 2016-02-22 13:32:20 +09:00
27c3ed7e96 remove debug print 2016-02-21 15:17:42 +09:00
e2b28da32f signal handler support gdb stepi command 2016-02-21 14:55:34 +09:00
2c50b716fd support setitimer/getitimer 2016-02-19 15:25:05 +09:00
307b2b8da5 clock_gettime: support clock_id CLOCK_PROCESS_CPUTIME_ID and CLOCK_THREAD_CPUTIME_ID 2016-02-18 17:43:13 +09:00
eba2be8a35 support times 2016-02-18 13:14:18 +09:00
a997af71be support tkill
refs #664
2016-02-17 12:48:12 +09:00
e7c37b8000 mcreboot-smp-x86.sh: fix Failed to mount /sys/devices/virtual/mcos/mcos0/sys 2016-02-16 16:05:40 +09:00
8c40f94aa8 /proc/<PID>/mem: support read/write 2016-02-16 13:21:29 +09:00
da13bd408a mcexec: add to initialize some structures (REQ-56)
refs #718
2016-02-15 18:20:58 +09:00
c328d26b8d procfs(/proc/<PID>/task/<TID>/stat): fix memory corruption
refs #722
2016-02-15 15:10:00 +09:00
6cda6792a9 process_msg_init_acked: don't use PA 2016-02-14 22:47:52 +09:00
2d3fda1d0b flatten_strings: fix align (REQ-1) 2016-02-14 22:36:58 +09:00
5d43c135db procfs: (temporary fix) unsupported files are closed 2016-02-10 17:10:54 +09:00
a866192db7 refactoring /proc 2016-02-10 08:11:02 +09:00
c0cc6ac6db Add skeleton for perf_event_open. 2016-02-09 14:54:53 +09:00
14c5bc08c2 mcexec: check Linux version from actual kernel tree instead of system wide include 2016-02-09 14:07:08 +09:00
7f01d273d0 mcctrl: fix out-of-tree build (not finding config.h) 2016-02-09 12:45:58 +09:00
137e0a799c mcexec: unshare and mount request through mcctrl 2016-02-08 16:27:03 +09:00
f214ff1b57 mcctrl: add MCEXEC_UP_SYS_MOUNT, MCEXEC_UP_SYS_UNSHARE 2016-02-08 16:00:52 +09:00
0ce698eb1f mcexec: support for /sys mounted by mcoverlayfs 2016-02-08 11:36:03 +09:00
e601248bdc procfs: fix mcos%d/PID/auxv size 2016-02-08 09:38:27 +09:00
d8eeab9b89 mcoverlayfs: enable out of tree compilation 2016-02-01 00:35:53 +09:00
fdf031ac16 procfs: chown procfs entries (temporary hack)
refs #651
refs #699
2016-01-28 16:29:46 +09:00
1ffe740153 sysfs sample 2016-01-26 18:08:25 +09:00
72968d613e support sysfs interface for mcctrl 2016-01-26 18:08:25 +09:00
2e98f875c3 sysfs: attempt to remove empty directories only 2016-01-26 18:08:25 +09:00
a6cb9a6b93 sysfs: lookup_i(): refactoring 2016-01-26 18:08:25 +09:00
da0a91b9f7 mcctrl: denote full path in /proc/PID/exe 2016-01-26 16:21:52 +09:00
f093786bec x86: populating PML4e and PDPTe is now lock-free 2016-01-25 09:17:06 +09:00
368f155328 sigaction: support SA_NODEFER
refs #698
2016-01-21 18:48:10 +09:00
425f920013 mcctrl: delete procfs entries recursively to avoid leaking 2016-01-21 18:15:59 +09:00
dbddf37579 set termsig to mcexec spawned process 2016-01-21 12:08:47 +09:00
fa7a5ccd11 support /proc/self/exe (needed for GDB to attach to an existing process) 2016-01-19 18:23:02 +09:00
172bf0a389 sched_setaffinity: add permission check 2016-01-15 12:05:18 +09:00
9bafd166e3 futex: support FUTEX_CLOCK_REALTIME 2016-01-14 16:18:49 +09:00
2e31b8abd1 clock_gettime: clock_id != CLOCK_REALTIME -> offload to linux 2016-01-13 14:04:06 +09:00
a42ee00101 NR_execve: initialize local variable 'shell'
refs #696
2016-01-13 11:16:19 +09:00
f6935b0869 ptrace_setsiginfo: update recieved siginfo 2016-01-11 17:37:29 +09:00
03a7763a5e ptrace_conf: set received siginfo to default siginfo 2016-01-11 17:10:30 +09:00
3a2f7b0106 clone: support CLONE_PARENT 2016-01-11 16:49:02 +09:00
2819ec2197 fix extra copy which might cause page faults 2016-01-06 21:12:57 +09:00
f7d81a9281 fix typo 2016-01-06 21:12:57 +09:00
914faf042d add missing kfree() for channel lookup table 2016-01-06 21:12:57 +09:00
75c6a94839 delete struct member 'type' from address_space structure 2016-01-06 20:17:00 +09:00
f7b5b48266 support x2apic 2016-01-06 13:53:02 +09:00
f9bd83c726 ptrace: fix PTRACE_GETREGSET, PTRACE_SETREGSET bug
refs #608
2015-12-28 19:45:50 +09:00
edc275ce4f delete free_list_lock 2015-12-28 11:31:42 +09:00
d00ea61d1a ptrace_wakeup_sig: fix thread lock 2015-12-28 10:33:07 +09:00
01117e92c9 append file path to symlink if link path is absolute
refs #643
2015-12-25 15:50:39 +09:00
d477096cb0 getrlimit, setrlimit: offload to linux when an unknown parameter was specified
refs #660
2015-12-25 15:35:33 +09:00
f44ddfa3b3 support sigtimedwait 2015-12-24 12:35:45 +09:00
e0acd254b1 do_process_vm_read_writev: use process hash for remote process search 2015-12-22 09:47:00 +09:00
d0507f7e9f process_read/write_vm(): fix LTP bugs 2015-12-18 15:58:51 +09:00
0f8b2aba22 reset signal handlers when execve called 2015-12-18 12:46:53 +09:00
7e5c7445e2 fix ptrace_detach bug
refs #662
2015-12-16 17:41:57 +09:00
a055fb525d sysfs sample 2015-12-16 13:42:30 +09:00
8cb72df663 support McKernel's sysfs tree 2015-12-16 13:42:30 +09:00
e805249651 add strrchr() 2015-12-16 13:42:30 +09:00
06a7889e1f chown root mcexec 2015-12-15 16:22:14 +09:00
20deed09f0 mcexec: support for /proc mounted by mcoverlayfs 2015-12-14 14:47:05 +09:00
bb81f84709 support PIE executable for PVAS 2015-12-14 11:05:28 +09:00
5c1dad1660 GDB: async-shell.exp
refs #650
2015-11-26 17:07:13 +09:00
7f2220b8e9 set '\0' termination to readlink result.
refs #643
2015-11-26 16:58:15 +09:00
65dda3f24e mcoverlayfs: support mount options(nocopyupw, nofscheck) 2015-11-25 15:34:58 +09:00
544971d665 modify for PVAS 2015-11-25 14:27:20 +09:00
dbddab4356 mcoverlayfs: add overlayfs of the original(kernel 4.0.9) 2015-11-25 13:23:49 +09:00
12eb8a9bb0 mcctrl: move mcctrl to executer/kernel/mcctrl 2015-11-24 15:42:04 +09:00
828a3ea57a futex(): support for cross address-space futexes 2015-11-24 14:58:04 +09:00
eb6de9d1de delete debug code 2015-11-13 15:10:14 +09:00
42c8ef6539 do_fork(): fix CLONE_PARENT_SETTID bug 2015-11-13 12:46:09 +09:00
780d4fc29b futex_wait(): support for FUTEX_CLOCK_REALTIME 2015-11-13 12:46:02 +09:00
94fcc5bb9a futex_wait: add to check signal 2015-11-12 09:38:36 +09:00
e822fc47dd fix dead locking when kill subthreads 2015-11-11 23:03:43 +09:00
26492a2895 vsyscall_gettimeofday: make timeval from TSC 2015-11-11 19:45:14 +09:00
1a5ff7f535 gettimeofday: gather variables into new struct 2015-11-11 18:31:33 +09:00
4c181d7fc0 smp-x86: add supports for dump analyzer 2015-11-09 16:06:55 +09:00
be78eb752e time_init: fix zero divide on KVM 2015-11-06 19:31:42 +09:00
0ad7c8ac50 nanosleep: fix arguments to be delegated 2015-11-06 19:31:42 +09:00
e9458a6cd3 fix ptrace02 failed 2015-10-30 16:59:03 +09:00
9e3b0b5866 bug fix 'GDB: missing parent-child relationship'
refs #641
2015-10-30 15:06:27 +09:00
0eaa27291a thread: move clear_child_tid, etc. to main structure 2015-10-29 11:01:27 +09:00
0b07dd1b79 support madvise(MADV_REMOVE) partially
This MADV_REMOVE works with a mapping which is
- created with shmat() and
- not sharing memobj with other mappings.
2015-10-28 18:41:28 +09:00
c25f8c7a39 support settimeofday() 2015-10-27 19:21:50 +09:00
9e53ae20d4 add memory barriers
- rmb()
- wmb()
2015-10-27 19:21:50 +09:00
09c9ee58d1 add 64bit atomic operations
- ihk_atomic64_t
- IHK_ATOMIC64_INIT()
- ihk_atomic64_read()
- ihk_atomic64_inc()
2015-10-27 19:21:50 +09:00
153a59a6f4 gettimeofday: avoid per-cpu data in calculation
Because it is difficult to safely update per-cpu data of other cpus in
settimeofday().
2015-10-27 19:21:50 +09:00
cad72a8562 when SIGXCPU or SIGXFSZ, set coredump bit to exit status 2015-10-22 20:57:37 +09:00
343bfbd30a rename back status field 2015-10-22 20:26:50 +09:00
4e4f1208f7 delete unused member 2015-10-19 20:12:26 +09:00
a325a78866 refactoring to send signal 2015-10-15 17:10:02 +09:00
6ae99454da delete debug print 2015-10-15 06:51:41 +09:00
04e193de13 refactoring process structures 2015-10-13 23:04:08 +09:00
2ca46fabfd support reader/writer lock 2015-10-02 14:05:10 +09:00
5b737b499d fix cmpxchgq operand 2015-10-02 14:04:05 +09:00
cb4f3a4d65 take into account args/envs' offset in page
- prepare_process_ranges_args_envs()
2015-10-01 21:08:42 +09:00
51789fcd38 initialize idle_vm for page faluts 2015-10-01 21:08:35 +09:00
9f50c5dc3a mcexec_wait_syscall: handle request even if signaled (reworked) 2015-09-29 19:53:40 +09:00
cd905f7ad1 Revert "mcexec_wait_syscall: handle request even if signaled"
This reverts commit d862f345be.
2015-09-29 19:52:36 +09:00
79266f6b97 x86_issue_ipi: keep interrupt disabled while issuing IPI 2015-09-29 19:10:01 +09:00
a666b69c2c make x86_issue_ipi() call wait_icr_idle() 2015-09-29 19:10:01 +09:00
47e8552eba move wait_icr_idle() before x86_issue_ipi() 2015-09-29 19:10:00 +09:00
8dd9175411 schedule: fix null pointer dereference 2015-09-29 19:10:00 +09:00
f08e0c0054 guess whether MSR_PLATFORM_INFO exists or not 2015-09-29 19:10:00 +09:00
d862f345be mcexec_wait_syscall: handle request even if signaled 2015-09-24 21:35:30 +09:00
a14768c49a kmalloc: fix missing unlock on out-of-memory path 2015-09-18 21:26:15 +09:00
56e57775e7 clone: fix error message 2015-09-18 21:26:15 +09:00
b3b752ba41 nanosleep: use copy_from_user instead of direct access 2015-09-17 21:46:32 +09:00
7b32f2f73b nanosleep: fix tscs_rem underflow issue 2015-09-17 21:46:26 +09:00
ea5a1a8693 nanosleep: update *rem whenever signaled 2015-09-17 21:44:49 +09:00
92f8fb2b2b nanosleep: use copy_to_user instead of direct access 2015-09-17 21:44:49 +09:00
a3e440414d nanosleep: cosmetic change 2015-09-17 21:44:49 +09:00
10ba03ccea mcreboot-smp-x86.sh: fix querying free irq 2015-09-17 13:19:07 +09:00
ccb7c30a05 page_fault_handler(): reenable preempt after failed PF when process is exiting 2015-09-17 10:05:32 +09:00
7dfeb8e7ce create demand-paging mapping in case of MAP_SHARED
On current McKernel, only mappings for demand paging can be shared.
Therefore, if MAP_SHARED and MAP_ANONYMOUS are specified and
anon_on_demand is disabled, then mmap(2) should create a mapping which
is for demand paging and is entirely populated with physical pages.
2015-09-16 21:38:00 +09:00
b1b706453f vsyscall: send SIGSEGV to the caller if syscall fails
On CentOS 7 (RHEL 7?), "errno" isn't set when vsyscall_gettimeofday
fails. So, in such case, vsyscall_gettimeofday send SIGSEGV to the
caller to report failure of gettimeofday operation.
2015-09-16 21:37:11 +09:00
bd5708286d make sys_gettimeofday() use copy_to_user() 2015-09-16 21:26:32 +09:00
c8a13cf213 make gettimeofday ignore NULL parameter 2015-09-16 21:26:24 +09:00
5ad0a03d18 make gettimeofday handle second parameter (timezone) 2015-09-16 21:25:29 +09:00
3819eec03f cosmetic changes
- sys_gettimeofday()
2015-09-16 21:13:12 +09:00
40b8587a8a schedule(): sync CPU_FLAG_NEED_RESCHED flag with clone and migrate 2015-09-16 19:22:40 +09:00
e7b1115572 mcreboot-smp-x86.sh: introduction of ihk_ikc_irq_core argument 2015-09-14 17:30:25 +09:00
e1a01803d0 disable demand paging on ANONYMOUS mappings unless anon_on_demand kernel argument is passed 2015-09-14 17:26:37 +09:00
69f4b0e1ad gettimeofday()/nanosleep(): check arguments, return on pending signal 2015-09-14 17:05:30 +09:00
0909a5bed5 tracee context is broken when tracee call execve 2015-09-03 10:05:25 +09:00
9dd224385e When SIGSEGV occurred on a tracee process, a tracee process freezes. 2015-09-01 17:37:56 +09:00
4176c59fd3 using d_path for solution to file path. 2015-08-28 13:01:34 +09:00
afeee5432f When envp is NULL, execve is delayed. 2015-08-28 13:00:45 +09:00
9ae5bcf46e gettimeofday(): an implementation based on CPU invariant TSC support 2015-08-24 23:53:56 +02:00
b8f166e608 mcreboot-smp-x86.sh: handle resource allocation after unloading; mcstop+release-smp-x86.sh 2015-08-22 18:55:53 +09:00
c85a9b99e1 a couple of cosmetic changes of debug messages 2015-08-22 18:53:14 +09:00
7c816a6b73 an implementation of the Mellor-Crummey Scott (MCS) lock 2015-08-20 15:26:52 +09:00
5a0cd3f53f ptrace_detach when exiting
refs #590
2015-08-18 18:03:09 +09:00
9fa62adfe7 execve(): stay compliant with locked context switching 2015-08-10 14:18:11 +09:00
f0ab8ec89a sched_request_migrate(): change CPU flags atomically 2015-08-10 12:45:59 +09:00
f4cc82578d check_need_resched(): no thread migration in IRQ context 2015-08-10 12:43:35 +09:00
9ba40dc0ff schedule(): hold runq lock for the entire duration of context switching
releasing the runq lock after loading page tables but before the actual
context switch can leave execution in an inconsistent if the current
process is descheduled from an IRQ between these two steps.
this patch holds the runq lock with IRQs disabled and makes the context
switch a single atomic operation.
2015-08-10 12:37:12 +09:00
8d6c97ea5c schedule(): disable auto thread migration 2015-08-07 16:07:31 +09:00
386f59000a mcreboot-smp-x86.sh.in: grant real user rw permission on /dev/mcos* 2015-08-07 13:33:44 +09:00
215cd370a1 ap_init(): clean up AP boot kernel messages 2015-08-07 10:57:59 +09:00
0a0e2c04a0 support for dynamically toggling time sharing when CPU is oversubscribed 2015-08-07 08:51:50 +09:00
aa191b87d3 schedule(): use XSAVE/XRSTOR and swap floating point registers in context switch 2015-08-07 08:41:00 +09:00
d5c243571f cpu_clear_and_set(): atomic CPU mask update in migration code 2015-08-06 10:49:55 +09:00
328e69a335 schedule(): do not preempt while holding spinlocks or while in offloaded syscall 2015-08-06 10:36:13 +09:00
b77755d0f7 obtain_clone_cpuid(): always start from CPU 0 and fill in cores linearily 2015-07-28 20:20:47 +09:00
d7bae14707 TEMPORARY: schedule(): move threads when core is explicitly oversubscribed 2015-07-28 20:12:58 +09:00
4e58d08f5c schedule_timeout(): give a chance to other process in spin sleep if CPU core is oversubscribed 2015-07-28 20:06:56 +09:00
9b1e691588 fix thread migration code (i.e., sched_setaffinity())
- moved migration code into idle() process and updated schedule() to detect
  when a thread has moved to another CPU in order to avoid doing housekeeping
  on behalf of the original one
- start CPU head from core 0
- keeps track of nested interrupts
2015-07-24 20:09:17 +09:00
3988b0fc61 keep track of IRQ context and don't do thread migration there 2015-07-23 16:56:58 +09:00
54eb345847 settid(): prevent modifying tid after thread migration 2015-07-23 16:51:24 +09:00
bbe7aef95b fix calling do_signal (argument lacked) 2015-07-17 10:18:43 +09:00
1ff4cf68c2 support SA_RESTART flag and restart syscall 2015-07-16 16:33:14 +09:00
1bc84d3feb modify to copy credentials 2015-07-13 15:29:26 +09:00
f7d78c8b7d sched_getaffinity(): return EINVAL for 0 lenght request (fixes LTP sched_getaffinity01) 2015-07-10 11:00:43 +09:00
7647c99cc2 do_migrate(): disable IRQ while holding migq_lock to avoid deadlocking with reschedule interrupts 2015-07-09 15:23:28 +09:00
43a774fbfc sched_setaffinity(): undo target core change, avoid abort on length mismatch 2015-07-09 11:00:26 +09:00
a029bcac37 mcreboot-smp-x86: find unused IRQ line and pass start vector to ihk_smp_x86.ko 2015-07-07 09:07:16 +09:00
bd913c503b sched_setaffinity(): find an actual target core 2015-07-03 11:59:52 +09:00
e838affde8 fix to compile error on CentOS 7 2015-07-02 17:08:35 +09:00
59ee251e1c fix /proc/pid/mem, /proc/pid/status, /proc/pid/cmdline 2015-07-02 00:22:35 +09:00
fa79db3bcc fix out of tree build 2015-07-01 23:58:50 +09:00
b7c5cba361 fix to compile on CentOS 6 2015-07-01 23:57:40 +09:00
382614ddae pstate: use MSR_NHM_TURBO_RATIO_LIMIT as maximum single-core turbo ratio 2015-07-01 22:18:38 +09:00
aa959c6b34 temporary fix for CentOS 6.x 2015-06-30 18:19:53 +09:00
aabc3d386d support a function to execute mcexec automatically. 2015-06-30 17:47:01 +09:00
4ebe778ede vm->exiting: deal with exit_group() and concurrent page faults 2015-06-25 16:04:04 +09:00
fbb776e4fb cpu init: support for no_turbo kernel argument 2015-06-25 12:18:27 +09:00
41b85281a4 mcctrl: introduction of RUS page hash to handle page refcounts properly 2015-05-31 15:42:39 +09:00
5532e3c663 mcreboot script for new IHK SMP-x86 I/F 2015-05-26 14:41:28 +09:00
2af2b1205f temporary fix for setfsuid/setfsgid 2015-05-19 06:27:59 +09:00
7d5a68be1b add PID and GID to /proc/pid/status
add /proc/pid/cmdline

refs #445
refs #447
2015-05-18 17:45:37 +09:00
f4162dff52 some signals set siginfo.si_code 2015-04-14 15:11:36 +09:00
a0d909af75 add supports for dump analyzer 2015-03-31 12:59:53 +09:00
63669b7f71 support /proc/pid/status for LTP mmap14 2015-03-28 14:20:07 +09:00
4946964ed0 update copyright notices 2015-03-27 14:50:09 +09:00
5f19842a6a support for process_vm_readv()/process_vm_writev() 2015-03-25 19:44:56 +09:00
9271d5346d add ACSL annotation to cpu.c 2015-03-25 15:54:08 +09:00
7bba05cfa4 Revise use of iov_base in ptrace_read_regset() and ptrace_write_regset(). 2015-03-20 20:33:40 +09:00
c2a1f933e8 Set tid (instead of pid) for ptrace event message of
PTRACE_EVENT_{FORK,VFORK,CLONE,VFORKDONE}.
Specify 2nd argument as pid (instead of -1) of function findthread_and_lock(),
to find tracee process in ptrace subroutines.
(gdb testsuite gdb.base/watch_thread_num.exp)
2015-03-20 13:22:00 +09:00
055769254d implement mlockall()/munlockall() for LTP syscall 2015-03-19 16:46:31 +09:00
786ae83380 add arch-dependent mman.h 2015-03-19 16:36:57 +09:00
8c662c83be implement mincore(2) for LTP 2015-03-19 16:32:03 +09:00
4698bc40c2 implement System V shared memory for LTP syscalls 2015-03-19 16:21:18 +09:00
f5d935b703 support signalfd4 step1 2015-03-18 17:35:43 +09:00
d53865ac5f change to check sequence of kill syscall, check sig num zero after uid checking 2015-03-18 12:59:05 +09:00
8934eb91a4 kill syscall check uid 2015-03-17 15:04:36 +09:00
ed6d94a358 syscall slowdown when repeat fork/exit/wait (LTP fork13) 2015-03-11 16:09:59 +09:00
fa923da0e3 add host PTE cleaning to execve(). refs #377
This removes a cause of LTP gethostid01's wrong behavior.
2015-03-10 18:23:50 +09:00
1f8265efbc check _PAGE_PWT and _PAGE_PCD directly instead of _PAGE_CACHE_WC 2015-03-07 02:12:48 +09:00
b553de7435 supports PTRACE_GETREGSET, PTRACE_SETREGSET.
supports PTRACE_GETFPREGS, PTRACE_SETFPREGS.

refs #421
2015-03-06 19:18:32 +09:00
6a82412d64 modify procfs to read inactive thread's files
However, the following files can be read only if the corresponding
thread is in active.
- /proc/<PID>/mem
- /proc/<PID>/task/<TID>/mem

refs #371
2015-03-05 21:41:24 +09:00
fa29c34995 expand the size of kstack 12 KiB
When a procfs file belonging to a process which was in PS_TRACED status
was accessed, calling kprintf() from process_procfs_request() caused
stack overrun, and x86_cpu_local_variables was destroyed.
2015-03-05 20:30:33 +09:00
f84b5acf79 map entire buffer to read procfs
Reading data from procfs file more than 4096 byte caused a buffer
overrun in McKernel because the buffer was always mapped in McKernel
4096 byte regardless of actual buffer size.
2015-03-05 20:30:33 +09:00
8b24f60861 Combine range and memobj flags before arch_vrflag_to_ptattr() 2015-03-05 16:40:14 +09:00
f82bb284bb Make pager and devobj debug messages optional 2015-03-05 16:03:21 +09:00
bf12a5c45e Introduction of write-combined memory type mappings.
Introduction of VR_WRITE_COMBINED, PTATTR_WRITE_COMBINED and modification
to the memobj's get_page() interface so that Linux communicates back mapping
flags (such as write-combined).
2015-03-05 16:03:21 +09:00
ea5681232e x86 Page Attribute Table (PAT) MSR support.
Reconfigure PAT to permit write-combining memory type to be assigned
on a page-by-page basis. Changes PWT and PCD bit combinations in page
table entries so that they correspond to the following format:

  PAT
  |PCD
  ||PWT
  |||
  000 WB  Write Back (WB)
  001 WC  Write Combining (WC)
  010 UC- Uncached (UC-)
  011 UC  Uncacheable (UC)
2015-03-05 16:03:20 +09:00
e6011be1af create area for to save fp regs
refs #421
2015-03-05 12:18:46 +09:00
9946ccd6b1 pipe free fork is implemented (LTP fork09) 2015-03-04 17:40:58 +09:00
daec7de828 implement /proc/stat
only for sysconf(_SC_NPROCESSORS_ONLN).  This enables Intel OpenMP
runtime to arrange threads with regard for CPU topology.

refs #291
2015-03-04 15:46:53 +09:00
9ad48083aa make PTRACE_POKETEXT use patch_process_vm() 2015-03-04 12:04:54 +09:00
2eac58aab3 add patch_process_vm(). (in progress)
This function patches specified range of specified user space even if
the range is not writable.

refs #401
2015-03-04 12:00:51 +09:00
22d8d169b6 change copy-out routines
- restrict copy_to_user() to only current process.
- add write_process_vm() to write specified process space.
2015-03-04 11:29:16 +09:00
8db54c2637 make GPE on CPL0 cause panic 2015-03-04 11:29:16 +09:00
063fa963c3 change copy-in routines
- restrict copy_from_user() to only current process.
- add read_process_vm() to read specified process space.
2015-03-04 11:29:15 +09:00
a6488adcc1 change parameter type of ihk_mc_pt_virt_to_phys()
- add type qualifier 'const' to virtual address parameter.
  that is, change parameter 'virt' from       'void *'
                                     to 'const void *'
2015-03-04 11:29:15 +09:00
2239a6b09b modify page_fault_process()
- change its argument from 'struct process *'
                        to 'struct process_vm *'.
- change its name from 'page_fault_process()'
                    to 'page_fault_process_vm()'.
- allow to resolve a fault on the process_vm of another process.
2015-03-04 11:29:15 +09:00
8c179d506a support PTRACE_ARCH_PRCTL.
refs #420
2015-03-03 14:22:57 +09:00
377341ce5f change debug output in debug/int3 handler, for struct x86_user_context. 2015-03-03 14:06:30 +09:00
8caeba7cba support PTRACE_GETSIGINFO and PTRACE_SETSIGINFO
refs #422
2015-03-03 09:54:57 +09:00
1d2f5d9893 set is_gpr_valid to initial user context 2015-02-27 14:47:43 +09:00
e4f47df3c3 initialize pstate, turbo mode and power/performace bias MSR registers
MSR_IA32_MISC_ENABLE, MSR_IA32_PERF_CTL and MSR_IA32_ENERGY_PERF_BIAS
are responsible for performance settings, this change enables McKernel
to perform on par with Linux when running the fwq benchmark.
2015-02-27 11:29:11 +09:00
4751055ee4 make ptrace(2) use lookup_user_context() 2015-02-26 17:43:10 +09:00
305ebfed0e add lookup_user_context(). refs #420 2015-02-26 17:43:10 +09:00
b66b950129 add x86_sregs into x86_user_context
x86_sregs contains the registers which are included in user_regs_struct
but not included in x86_basic_regs.
2015-02-26 17:43:10 +09:00
4aa8ba2eef sort x86_basic_regs into user_regs_struct's order 2015-02-26 17:43:10 +09:00
fab2c2aa97 wrap x86_regs with x86_user_context
and, rename x86_regs to x86_basic_regs.
2015-02-26 17:43:10 +09:00
026164eda4 fix PTRACE_ATTACH, PTRACE_DETACH, detach at tracer process terminated.
tracee process may have no parent, increment/decrement refcount.

refs #374
refs #280
2015-02-25 21:09:44 +09:00
e91d1e5b7b stack of signal handler is not 16 byte align
refs #429
2015-02-24 17:20:52 +09:00
73743eeeb0 temporary fix for waiting tracee blocked 2015-02-24 15:20:32 +09:00
c1c1fd578a modify file path of /proc files
LTP getsid02 mount06 msgctl08 msgget03 pause02 pipe07 readhead02
    swapon03 sysconf01 wait402
2015-02-24 11:33:49 +09:00
f35cc66d18 delete unused argument "ctx" from do_syscall
support waitid option "WNOWAIT"
2015-02-23 17:14:14 +09:00
d9cf1d49b1 support waitid
send SIGCHLD to parent when SIGSTOP or SIGCONT received

refs #425
refs #283
2015-02-22 20:05:30 +09:00
3d426ada01 use remap_pfn_range() in rus_vm_fault() for kernel versions newer than 3.0 2015-02-19 13:52:55 -08:00
0307f6a6cc impementation of sched_{setparam, getparam, setscheduler, getscheduler, get_priority_min, get_priority_max, rr_get_interval} system calls 2015-02-19 11:46:03 -08:00
0dee04f16b move parse_args() to after arch_init()
In attached-mic, bootparam is not mapped until arch_init() is finished.
In builtin-mic and builtin-x86, virtual address of bootparam is changed
in arch_init().
2015-02-18 20:49:46 +09:00
0e98e87b95 change type of kprintf_lock() to "unsigned long"
to match type of ihk_mc_spinlock_lock().
2015-02-18 20:49:46 +09:00
d35e60c1a3 add init_boot_processor_local() for arch_start() 2015-02-18 20:49:46 +09:00
037e17c4ed fix parsing of "osnum=" kargs 2015-02-18 16:44:14 +09:00
2baf274dac fix PTRACE_O_TRACEFORK, PTRACE_O_TRACEVFORK and PTRACE_O_TRACECLONE.
allocate debug registers area, for new process.
(gdb testsuite gdb.base/inferior-died.exp)

refs #266
refs #372
2015-02-18 16:20:23 +09:00
3b04043f2a change to throw signal SIGILL to SIGSEGV when GPE 2015-02-18 14:54:49 +09:00
c0edb6fe6f add new cpu state CPU_STATUS_RESERVED 2015-02-18 13:46:08 +09:00
bb137bc9bb make brk region just follow data region
This effectively reverts commit d70dd2338c.
2015-02-18 11:52:15 +09:00
16af976a71 support msync() system call. refs #382
Msync(2) of this version writes only the pages which the calling process
modified. Modifications of the other processes are not written.
2015-02-18 11:52:15 +09:00
6485578a7f sched_yield implementation 2015-02-17 16:20:51 -08:00
d2d0fc6721 The mcexec command became executable from a command-line at the same time 2015-02-17 18:33:38 +09:00
9574a28a5f The same CPU is assigned to a different process.
refs #423
2015-02-17 18:27:46 +09:00
dbe4ec3247 support PTRACE_O_TRACECLONE and PTRACE_O_TRACEEXEC. 2015-02-17 17:00:48 +09:00
99debc548f detach traced process, when tracer process terminate.
some fixes on PTRACE_DETACH.

refs #374
refs #280
2015-02-17 16:58:29 +09:00
fa15f6b106 support PTRACE_SYSCALL.
support PTRACE_O_TRACESYSGOOD.
ptrace_report_exec() calls ptrace_report_signal().

refs #265
2015-02-17 16:56:27 +09:00
8568a73f33 traced process should stop by any signal except for SIGKILL,
even if SIG_IGN.  (LTP ptrace01)
2015-02-17 16:51:29 +09:00
8b57b2ee57 change signal handling at mcexec 2015-02-15 17:54:11 +09:00
9a36e7b84a restart waitpid if it returns with EINTR. 2015-02-13 16:00:40 +09:00
d998691425 fix setpgid(0, 0) 2015-02-13 13:51:00 +09:00
8cdf70c500 Enable AVX extensions for processors that support it. 2015-02-12 17:51:50 -08:00
0e0bc548f6 fix mcexec SIG_IGN 2015-02-12 19:02:58 +09:00
d21ae28843 add dummy NUMA system calls. refs #405
ENOSYS system call handlers for the following.
- get_mempolicy()
- mbind()
- migrate_pages()
- move_pages()
- set_mempolicy()
2015-02-10 21:16:19 +09:00
a4a806bef7 support vsyscall_getcpu() vsyscall. refs #385
This version simply calls getcpu() system call, so that it's not fast.
2015-02-10 18:35:48 +09:00
d30d8fe71c support getcpu() system call. refs #385
It appeared on Linux(x86) in kernel 3.1.
2015-02-10 18:35:41 +09:00
a5bdd41c3d procfs: check parent entry to avoid page fault in procfs_exit() 2015-01-31 22:27:13 -08:00
5f5ab34559 support PTRACE_ATTACH.
fix PTRACE_TRACEME, PTRACE_DETACH.
2015-01-30 21:02:01 +09:00
b26fa4e87c wrong send signal to sender process when kill other process group (LTP kill10)
refs #404
2015-01-29 16:14:31 +09:00
bd5f43b119 support PTRACE_SINGLESTEP.
support debug/int3 exception.
2015-01-29 15:48:05 +09:00
f97f8dbab3 support PTRACE_PEEKTEXT and PTRACE_PEEKDATA.
support PTRACE_POKETEXT and PTRACE_POKEDATA.
  now, force write anywhere.
  read-only page must copy-on-write.
2015-01-29 15:02:15 +09:00
e30946f1f0 fix PTRACE_CONT may cause error.
refs #369
2015-01-29 14:10:31 +09:00
c3ade864d9 fix PTRACE_PEEKUSER, PTRACE_POKEUSER, PTRACE_GETREGS.
support PTRACE_SETREGS.
  In struct process, add 'unsigned long *ptrace_debugreg', instead of 'struct user *userp'.
  debug registers are read/written from/to ptrace_debugreg, save/restore in schedule().
  most general registers are proc->uctx.
  fs_base is proc->thread.tlsblock_base.
  gs_base,ds,es,fs,gs and orig_rax are uncompleted.
  other members in 'struct user' are ignored, same as Linux implementation.

refs #257
refs #373
refs #263
2015-01-29 14:08:38 +09:00
9c35935671 mcexec: fix memory allocation bug that crashes CentOS7 glibc 2015-01-27 16:55:30 +09:00
ed33ee65b2 CentOS7 spinlock, procfs and vm_munmap support (i.e., Linux kernel 3.10) 2015-01-27 16:55:28 +09:00
d04b5a09bd PTRACE_KILL omit sched_wakeup_process return
refs #369
2015-01-27 10:55:49 +09:00
08cc31f9bf support setrlimits/getrlimits, however this fix is these syscalls only.
checking resource process must implement it separately.

refs #330
2015-01-27 10:35:58 +09:00
cf2166f830 function enter_user_mode calls check_signal.
refs #392
2015-01-16 14:28:28 +09:00
765de119dc support PTRACE_O_TRACEFORK, PTRACE_O_TRACEVFORK, PTRACE_O_TRACEVFORKDONE.
to start with a SIGSTOP, do not set proc->ftn->status to PS_RUNNING in __runq_add_proc().
  change vfork() set CLONE_VFORK.

refs #266
refs #267
refs #372

support PTRACE_GETEVENTMSG.
  to store ptrace event, add 'unsigned long ptrace_eventmsg;' member in struct fork_tree_node.

refs #273
2015-01-14 10:43:18 +09:00
d46110b4d9 support PTRACE_DETACH.
change getppid() to use proc->ftn->ppid_parent->pid, for ptraced process.

refs #280
2015-01-08 12:39:52 +09:00
74f0aec478 skip copy_to_user() when r->ret is negative error number in mckernel_procfs_read().
refs #370
2015-01-08 12:38:06 +09:00
e3eb7e68bc Fix need to modify ihk/cokernel/Makefile when a file has been added under mckernel/arch (Bug#365) 2014-12-26 16:05:23 +09:00
912b8a886c do_kill distinguish PTRACE_CONT from kill. 2014-12-26 15:23:11 +09:00
e25d35a191 ihk_mc_init_ap(): cosmetics for reporting IKC, trampoline info 2014-12-25 11:05:52 +09:00
a9aad67541 IHK-SMP: boot scripts placeholder 2014-12-25 11:03:07 +09:00
cd6e663f48 handle VM_RESERVED (non-existing since Linux 3.7.0) and do_mmap_pgoff() (unexported since Linux 3.5.0) in mcctrl's syscall.c 2014-12-25 11:03:05 +09:00
5f095b3952 McKernel IHK SMP-x86 support (build system and config files) 2014-12-25 11:03:04 +09:00
811a275176 build scripts: support for separate build and source directories 2014-12-25 11:03:03 +09:00
b388f59ebd ihk_ikc_irq and ihk_ikc_irq_apicid 2014-12-25 11:03:01 +09:00
ff47261337 receive trampoline addr via parameter of arch_start() 2014-12-25 11:03:00 +09:00
a91bf9a13d ptrace: Make PTRACE_CONT/KILL debug print separated. 2014-12-24 12:39:29 +09:00
fcfa94cea1 ptrace: Add PTRACE_O_TRACEFORK (fake) support. 2014-12-24 12:39:13 +09:00
55f7ee1526 fix a warning
| mckernel/kernel/../arch/x86/kernel/memory.c: In function '__set_pt_page':
| mckernel/kernel/../arch/x86/kernel/memory.c:367:
|     warning: 'init_pt_lock_flags' may be used uninitialized in this function
2014-12-22 17:03:32 +09:00
b1b6fab7b8 fix a warning
| mckernel/kernel/host.c: In function 'syscall_packet_handler':
| mckernel/kernel/host.c:504:
|     warning: implicit declaration of function 'find_command_line'
2014-12-22 16:58:08 +09:00
391886a6f1 fix a warning
| mckernel/kernel/syscall.c: In function 'do_syscall':
| mckernel/kernel/syscall.c:187:
|     warning: 'irqstate' may be used uninitialized in this function
2014-12-22 16:58:07 +09:00
c810afe224 fix a warning
| mckernel/kernel/syscall.c: In function 'sys_madvise':
| mckernel/kernel/syscall.c:2108:
|     warning: 'range' may be used uninitialized in this function
2014-12-22 16:58:06 +09:00
5566ed1a63 fix a warning
| mckernel/executer/kernel/control.c: In function ‘release_handler’:
| mckernel/executer/kernel/control.c:264: warning: unused variable ‘c’
2014-12-22 16:58:05 +09:00
f0f91d2246 fix a warning
| mckernel/executer/kernel/control.c: In function ‘mcexec_debug_log’:
| mckernel/executer/kernel/control.c:252: warning: unused variable ‘c’
2014-12-22 16:58:04 +09:00
0942bf0ce0 make dkprintf() evaluate its parameters always
Parameters of dkprintf() should be evaluated even if dkprintf() is
disabled.  Because this enables to find expression of parameter obsolete
and to avoid unnecessary compiler warnings such as "unused variable".
2014-12-22 16:58:03 +09:00
9c94e90007 use ftn->tid instead of proc->tid 2014-12-22 16:58:02 +09:00
a6ac906105 use ftn->pid instead of proc->pid 2014-12-22 16:58:01 +09:00
d4ba4dc8b3 introduction of mckernel_procfs_file_operations; fix /proc/self path resolution;
implementation of /proc/self/pagemap (LTP mmap12)
2014-12-15 12:46:05 +09:00
815d907ca4 setpgid return -EACCES when the child process had already performed an execve (LTP setpgid03) 2014-12-09 14:01:20 +09:00
3c24315f91 support for /proc/mcos%d/PID/maps (without file info) (LTP mlock03) 2014-12-05 16:29:20 +09:00
25f108bf78 mckernel_procfs_read(): fix buffer allocation, offset check and return code 2014-12-05 16:27:48 +09:00
cc9d30efbf do_signal(): support for SIGSYS
as of POSIX.1-2001:
Signal  Value       Action  Comment
---------------------------------------------------
SIGSYS  12,31,12    Core    Bad argument to routine
2014-12-04 18:10:10 +09:00
af83f1be64 rlimit(RLIMIT_NOFILE): return one less to make sure sync pipe can be created (LTP fork09) 2014-12-04 17:40:00 +09:00
b2cab453f1 clone(): do not allow setting CLONE_THREAD and CLONE_VM separately
XXX: When CLONE_VM is set but CLONE_THREAD is not the new thread is
meant to have its own thread group, i.e., when calling exit_group()
the cloner thread wouldn't be killed. However, this is a problem on
the Linux side because we do not invoke clone in mcexec when threads
are created. Thus, currently no support for this combination is
provided.
2014-12-04 16:55:18 +09:00
8909597499 clone(): support for handling CLONE_SIGHAND and CLONE_VM flags separately 2014-12-04 16:55:17 +09:00
86f2a9067b getppid() implementation 2014-12-04 16:55:17 +09:00
a5889fb5df sigaction check signal number (LTP sigaction02) 2014-12-04 11:31:50 +09:00
f1a86cfbd3 when host mcexec down, syscall is hung up 2014-12-04 11:17:29 +09:00
c1cf630a94 mcexec: store full path to executable
required so that a forked process can obtain exec reference in the
Linux kernel even if executable was specified with relative path
and fork was called after changing the current working directory
2014-12-03 15:14:26 +09:00
8f30e16976 when mcexec is killed by SIGKILL, terminate mckernel process (BUG#259) 2014-11-27 16:13:52 +09:00
58e2e0a246 Use pidof in mcreboot script 2014-11-23 17:54:14 +09:00
ea02628f2b Add reboot and shutdown script for builtin-x86
It decides the number of cores for McKernel by looking into the
"SHIMOS: CPU Status:" line of dmesg. It sets the amount of memory for
McKernel to one urth of the total memory obtained by "free -g".
2014-11-13 20:06:29 +09:00
89acf5c5d6 support for AT_RANDOM auxiliary entry on the process stack (needed for _dl_random in glibc) 2014-11-11 08:48:27 +09:00
ac8e2a0c40 handle VM_RESERVED (non-existing since Linux 3.7.0) and do_mmap_pgoff() (unexported since Linux 3.5.0) in mcctrl's syscall.c 2014-11-11 08:42:07 +09:00
ab7aa3354f repair signal implementation.
- Don't intrrupt syscall with the ignored signal.
2014-11-07 07:55:30 +09:00
c4e0b84792 repair signal implementation.
- can not interrupt syscall
- can not recieve SIGKILL
2014-10-31 16:34:59 +09:00
158 changed files with 45158 additions and 9144 deletions

View File

@ -1,12 +1,14 @@
TARGET = @TARGET@
SBINDIR = @SBINDIR@
ETCDIR = @ETCDIR@
MANDIR = @MANDIR@
all::
@(cd executer/kernel; make modules)
@(cd executer/kernel/mcctrl; make modules)
@(cd executer/kernel/mcoverlayfs; make modules)
@(cd executer/user; make)
@case "$(TARGET)" in \
attached-mic | builtin-x86 | builtin-mic) \
attached-mic | builtin-x86 | builtin-mic | smp-x86) \
(cd kernel; make) \
;; \
*) \
@ -16,10 +18,11 @@ all::
esac
install::
@(cd executer/kernel; make install)
@(cd executer/kernel/mcctrl; make install)
@(cd executer/kernel/mcoverlayfs; make install)
@(cd executer/user; make install)
@case "$(TARGET)" in \
attached-mic | builtin-x86 | builtin-mic) \
attached-mic | builtin-x86 | builtin-mic | smp-x86) \
(cd kernel; make install) \
;; \
*) \
@ -27,19 +30,43 @@ install::
exit 1 \
;; \
esac
if [ "$(TARGET)" = attached-mic ]; then \
@case "$(TARGET)" in \
attached-mic) \
mkdir -p -m 755 $(SBINDIR); \
install -m 755 arch/x86/tools/mcreboot-attached-mic.sh $(SBINDIR)/mcreboot; \
install -m 755 arch/x86/tools/mcshutdown-attached-mic.sh $(SBINDIR)/mcshutdown; \
mkdir -p -m 755 $(MANDIR)/man1; \
install -m 644 arch/x86/tools/mcreboot.1 $(MANDIR)/man1/mcreboot.1; \
fi
;; \
builtin-x86) \
mkdir -p -m 755 $(SBINDIR); \
install -m 755 arch/x86/tools/mcreboot-builtin-x86.sh $(SBINDIR)/mcreboot; \
install -m 755 arch/x86/tools/mcshutdown-builtin-x86.sh $(SBINDIR)/mcshutdown; \
mkdir -p -m 755 $(MANDIR)/man1; \
install -m 644 arch/x86/tools/mcreboot.1 $(MANDIR)/man1/mcreboot.1; \
;; \
smp-x86) \
mkdir -p -m 755 $(SBINDIR); \
install -m 755 arch/x86/tools/mcreboot-smp-x86.sh $(SBINDIR)/mcreboot.sh; \
install -m 755 arch/x86/tools/mcstop+release-smp-x86.sh $(SBINDIR)/mcstop+release.sh; \
mkdir -p -m 755 $(ETCDIR); \
install -m 644 arch/x86/tools/irqbalance_mck.service $(ETCDIR)/irqbalance_mck.service; \
install -m 644 arch/x86/tools/irqbalance_mck.in $(ETCDIR)/irqbalance_mck.in; \
mkdir -p -m 755 $(MANDIR)/man1; \
install -m 644 arch/x86/tools/mcreboot.1 $(MANDIR)/man1/mcreboot.1; \
;; \
*) \
echo "unknown target $(TARGET)" >&2 \
exit 1 \
;; \
esac
clean::
@(cd executer/kernel; make clean)
@(cd executer/kernel/mcctrl; make clean)
@(cd executer/kernel/mcoverlayfs; make clean)
@(cd executer/user; make clean)
@case "$(TARGET)" in \
attached-mic | builtin-x86 | builtin-mic) \
attached-mic | builtin-x86 | builtin-mic | smp-x86) \
(cd kernel; make clean) \
;; \
*) \

View File

@ -0,0 +1,2 @@
IHK_OBJS += cpu.o interrupt.o memory.o trampoline.o local.o context.o
IHK_OBJS += perfctr.o syscall.o vsyscall.o

View File

@ -10,7 +10,7 @@
* HISTORY
*/
#define X86_CPU_LOCAL_OFFSET_TSS 128
#define X86_CPU_LOCAL_OFFSET_TSS 176
#define X86_TSS_OFFSET_SP0 4
#define X86_CPU_LOCAL_OFFSET_SP0 \
(X86_CPU_LOCAL_OFFSET_TSS + X86_TSS_OFFSET_SP0)

File diff suppressed because it is too large Load Diff

View File

@ -15,7 +15,7 @@
#define dkprintf(...) kprintf(__VA_ARGS__)
#define ekprintf(...) kprintf(__VA_ARGS__)
#else
#define dkprintf(...)
#define dkprintf(...) do { if (0) kprintf(__VA_ARGS__); } while (0)
#define ekprintf(...) kprintf(__VA_ARGS__)
#endif
@ -78,15 +78,16 @@ int get_prstatus_size(void)
* \brief Fill a prstatus structure.
*
* \param head A pointer to a note structure.
* \param proc A pointer to the current process structure.
* \param thread A pointer to the current thread structure.
* \param regs0 A pointer to a x86_regs structure.
*/
void fill_prstatus(struct note *head, struct process *proc, void *regs0)
void fill_prstatus(struct note *head, struct thread *thread, void *regs0)
{
void *name;
struct elf_prstatus64 *prstatus;
struct x86_regs *regs = regs0;
struct x86_user_context *uctx = regs0;
struct x86_basic_regs *regs = &uctx->gpr;
register unsigned long _r12 asm("r12");
register unsigned long _r13 asm("r13");
register unsigned long _r14 asm("r14");
@ -159,11 +160,11 @@ int get_prpsinfo_size(void)
* \brief Fill a prpsinfo structure.
*
* \param head A pointer to a note structure.
* \param proc A pointer to the current process structure.
* \param thread A pointer to the current thread structure.
* \param regs A pointer to a x86_regs structure.
*/
void fill_prpsinfo(struct note *head, struct process *proc, void *regs)
void fill_prpsinfo(struct note *head, struct thread *thread, void *regs)
{
void *name;
struct elf_prpsinfo64 *prpsinfo;
@ -175,8 +176,8 @@ void fill_prpsinfo(struct note *head, struct process *proc, void *regs)
memcpy(name, "CORE", sizeof("CORE"));
prpsinfo = (struct elf_prpsinfo64 *)(name + align32(sizeof("CORE")));
prpsinfo->pr_state = proc->ftn->status;
prpsinfo->pr_pid = proc->ftn->pid;
prpsinfo->pr_state = thread->status;
prpsinfo->pr_pid = thread->proc->pid;
/*
We leave most of the fields unfilled.
@ -209,11 +210,11 @@ int get_auxv_size(void)
* \brief Fill an AUXV structure.
*
* \param head A pointer to a note structure.
* \param proc A pointer to the current process structure.
* \param thread A pointer to the current thread structure.
* \param regs A pointer to a x86_regs structure.
*/
void fill_auxv(struct note *head, struct process *proc, void *regs)
void fill_auxv(struct note *head, struct thread *thread, void *regs)
{
void *name;
void *auxv;
@ -224,7 +225,7 @@ void fill_auxv(struct note *head, struct process *proc, void *regs)
name = (void *) (head + 1);
memcpy(name, "CORE", sizeof("CORE"));
auxv = name + align32(sizeof("CORE"));
memcpy(auxv, proc->saved_auxv, sizeof(unsigned long) * AUXV_LEN);
memcpy(auxv, thread->proc->saved_auxv, sizeof(unsigned long) * AUXV_LEN);
}
/**
@ -242,23 +243,23 @@ int get_note_size(void)
* \brief Fill the NOTE segment.
*
* \param head A pointer to a note structure.
* \param proc A pointer to the current process structure.
* \param thread A pointer to the current thread structure.
* \param regs A pointer to a x86_regs structure.
*/
void fill_note(void *note, struct process *proc, void *regs)
void fill_note(void *note, struct thread *thread, void *regs)
{
fill_prstatus(note, proc, regs);
fill_prstatus(note, thread, regs);
note += get_prstatus_size();
fill_prpsinfo(note, proc, regs);
fill_prpsinfo(note, thread, regs);
note += get_prpsinfo_size();
fill_auxv(note, proc, regs);
fill_auxv(note, thread, regs);
}
/**
* \brief Generate an image of the core file.
*
* \param proc A pointer to the current process structure.
* \param thread A pointer to the current thread structure.
* \param regs A pointer to a x86_regs structure.
* \param coretable(out) An array of core chunks.
* \param chunks(out) Number of the entires of coretable.
@ -270,7 +271,18 @@ void fill_note(void *note, struct process *proc, void *regs)
* should be zero.
*/
int gencore(struct process *proc, void *regs,
/*@
@ requires \valid(thread);
@ requires \valid(regs);
@ requires \valid(coretable);
@ requires \valid(chunks);
@ behavior success:
@ ensures \result == 0;
@ assigns coretable;
@ behavior failure:
@ ensures \result == -1;
@*/
int gencore(struct thread *thread, void *regs,
struct coretable **coretable, int *chunks)
{
struct coretable *ct = NULL;
@ -278,7 +290,7 @@ int gencore(struct process *proc, void *regs,
Elf64_Phdr *ph = NULL;
void *note = NULL;
struct vm_range *range;
struct process_vm *vm = proc->vm;
struct process_vm *vm = thread->vm;
int segs = 1; /* the first one is for NOTE */
int notesize, phsize, alignednotesize;
unsigned int offset = 0;
@ -305,7 +317,7 @@ int gencore(struct process *proc, void *regs,
unsigned long p, phys;
int prevzero = 0;
for (p = range->start; p < range->end; p += PAGE_SIZE) {
if (ihk_mc_pt_virt_to_phys(proc->vm->page_table,
if (ihk_mc_pt_virt_to_phys(thread->vm->address_space->page_table,
(void *)p, &phys) != 0) {
prevzero = 1;
} else {
@ -325,7 +337,7 @@ int gencore(struct process *proc, void *regs,
dkprintf("we have %d segs and %d chunks.\n\n", segs, *chunks);
{
struct vm_regions region = proc->vm->region;
struct vm_regions region = thread->vm->region;
dkprintf("text: %lx-%lx\n", region.text_start, region.text_end);
dkprintf("data: %lx-%lx\n", region.data_start, region.data_end);
@ -363,7 +375,7 @@ int gencore(struct process *proc, void *regs,
goto fail;
}
memset(note, 0, alignednotesize);
fill_note(note, proc, regs);
fill_note(note, thread, regs);
/* prgram header for NOTE segment is exceptional */
ph[0].p_type = PT_NOTE;
@ -433,7 +445,7 @@ int gencore(struct process *proc, void *regs,
for (start = p = range->start;
p < range->end; p += PAGE_SIZE) {
if (ihk_mc_pt_virt_to_phys(proc->vm->page_table,
if (ihk_mc_pt_virt_to_phys(thread->vm->address_space->page_table,
(void *)p, &phys) != 0) {
if (prevzero == 0) {
/* We begin a new chunk */
@ -471,9 +483,9 @@ int gencore(struct process *proc, void *regs,
i++;
}
} else {
if ((proc->vm->region.user_start <= range->start) &&
(range->end <= proc->vm->region.user_end)) {
if (ihk_mc_pt_virt_to_phys(proc->vm->page_table,
if ((thread->vm->region.user_start <= range->start) &&
(range->end <= thread->vm->region.user_end)) {
if (ihk_mc_pt_virt_to_phys(thread->vm->address_space->page_table,
(void *)range->start, &phys) != 0) {
dkprintf("could not convert user virtual address %lx"
"to physical address", range->start);
@ -509,6 +521,10 @@ int gencore(struct process *proc, void *regs,
* \param coretable An array of core chunks.
*/
/*@
@ requires \valid(coretable);
@ assigns \nothing;
@*/
void freecore(struct coretable **coretable)
{
struct coretable *ct = *coretable;

View File

@ -0,0 +1,98 @@
/**
* \file arch-bitops.h
* License details are found in the file LICENSE.
* \brief
* Find last set bit in word.
* \author Taku Shimosawa <shimosawa@is.s.u-tokyo.ac.jp> \par
* Copyright (C) 2011 - 2012 Taku Shimosawa
*/
/*
* HISTORY
*/
#ifndef HEADER_X86_COMMON_ARCH_BITOPS_H
#define HEADER_X86_COMMON_ARCH_BITOPS_H
#define ARCH_HAS_FAST_MULTIPLIER 1
static inline int fls(int x)
{
int r;
asm("bsrl %1,%0\n\t"
"jnz 1f\n\t"
"movl $-1,%0\n"
"1:" : "=r" (r) : "rm" (x));
return r + 1;
}
/**
* ffs - find first set bit in word
* @x: the word to search
*
* This is defined the same way as the libc and compiler builtin ffs
* routines, therefore differs in spirit from the other bitops.
*
* ffs(value) returns 0 if value is 0 or the position of the first
* set bit if value is nonzero. The first (least significant) bit
* is at position 1.
*/
static inline int ffs(int x)
{
int r;
asm("bsfl %1,%0\n\t"
"jnz 1f\n\t"
"movl $-1,%0\n"
"1:" : "=r" (r) : "rm" (x));
return r + 1;
}
/**
* __ffs - find first set bit in word
* @word: The word to search
*
* Undefined if no bit exists, so code should check against 0 first.
*/
static inline unsigned long __ffs(unsigned long word)
{
asm("bsf %1,%0"
: "=r" (word)
: "rm" (word));
return word;
}
/**
* ffz - find first zero bit in word
* @word: The word to search
*
* Undefined if no zero exists, so code should check against ~0UL first.
*/
static inline unsigned long ffz(unsigned long word)
{
asm("bsf %1,%0"
: "=r" (word)
: "r" (~word));
return word;
}
#define ADDR (*(volatile long *)addr)
static inline void set_bit(int nr, volatile unsigned long *addr)
{
asm volatile("lock; btsl %1,%0"
: "+m" (ADDR)
: "Ir" (nr)
: "memory");
}
static inline void clear_bit(int nr, volatile unsigned long *addr)
{
asm volatile("lock; btrl %1,%0"
: "+m" (ADDR)
: "Ir" (nr)
: "memory");
}
#endif

View File

@ -0,0 +1,67 @@
/**
* \file futex.h
* Licence details are found in the file LICENSE.
*
* \brief
* Futex adaptation to McKernel
*
* \author Balazs Gerofi <bgerofi@riken.jp> \par
* Copyright (C) 2012 RIKEN AICS
*
*
* HISTORY:
*
*/
#ifndef _ARCH_FUTEX_H
#define _ARCH_FUTEX_H
#include <asm.h>
#define __futex_atomic_op1(insn, ret, oldval, uaddr, oparg) \
asm volatile("1:\t" insn "\n" \
"2:\t.section .fixup,\"ax\"\n" \
"3:\tmov\t%3, %1\n" \
"\tjmp\t2b\n" \
"\t.previous\n" \
_ASM_EXTABLE(1b, 3b) \
: "=r" (oldval), "=r" (ret), "+m" (*uaddr) \
: "i" (-EFAULT), "0" (oparg), "1" (0))
#define __futex_atomic_op2(insn, ret, oldval, uaddr, oparg) \
asm volatile("1:\tmovl %2, %0\n" \
"\tmovl\t%0, %3\n" \
"\t" insn "\n" \
"2:\tlock; cmpxchgl %3, %2\n" \
"\tjnz\t1b\n" \
"3:\t.section .fixup,\"ax\"\n" \
"4:\tmov\t%5, %1\n" \
"\tjmp\t3b\n" \
"\t.previous\n" \
_ASM_EXTABLE(1b, 4b) \
_ASM_EXTABLE(2b, 4b) \
: "=&a" (oldval), "=&r" (ret), \
"+m" (*uaddr), "=&r" (tem) \
: "r" (oparg), "i" (-EFAULT), "1" (0))
static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval,
int newval)
{
#ifdef __UACCESS__
if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
return -EFAULT;
#endif
asm volatile("1:\tlock; cmpxchgl %3, %1\n"
"2:\t.section .fixup, \"ax\"\n"
"3:\tmov %2, %0\n"
"\tjmp 2b\n"
"\t.previous\n"
_ASM_EXTABLE(1b, 3b)
: "=a" (oldval), "+m" (*uaddr)
: "i" (-EFAULT), "r" (newval), "0" (oldval)
: "memory"
);
return oldval;
}
#endif

View File

@ -5,15 +5,20 @@
#define __HEADER_X86_COMMON_ARCH_LOCK
#include <ihk/cpu.h>
#include <ihk/atomic.h>
//#define DEBUG_SPINLOCK
//#define DEBUG_MCS_RWLOCK
#ifdef DEBUG_SPINLOCK
#if defined(DEBUG_SPINLOCK) || defined(DEBUG_MCS_RWLOCK)
int __kprintf(const char *format, ...);
#endif
typedef int ihk_spinlock_t;
extern void preempt_enable(void);
extern void preempt_disable(void);
#define IHK_STATIC_SPINLOCK_FUNCS
static void ihk_mc_spinlock_init(ihk_spinlock_t *lock)
@ -22,7 +27,17 @@ static void ihk_mc_spinlock_init(ihk_spinlock_t *lock)
}
#define SPIN_LOCK_UNLOCKED 0
static void ihk_mc_spinlock_lock_noirq(ihk_spinlock_t *lock)
#ifdef DEBUG_SPINLOCK
#define ihk_mc_spinlock_lock_noirq(l) { \
__kprintf("[%d] call ihk_mc_spinlock_lock_noirq %p %s:%d\n", ihk_mc_get_processor_id(), (l), __FILE__, __LINE__); \
__ihk_mc_spinlock_lock_noirq(l); \
__kprintf("[%d] ret ihk_mc_spinlock_lock_noirq\n", ihk_mc_get_processor_id()); \
}
#else
#define ihk_mc_spinlock_lock_noirq __ihk_mc_spinlock_lock_noirq
#endif
static void __ihk_mc_spinlock_lock_noirq(ihk_spinlock_t *lock)
{
int inc = 0x00010000;
int tmp;
@ -41,10 +56,8 @@ static void ihk_mc_spinlock_lock_noirq(ihk_spinlock_t *lock)
: "+Q" (inc), "+m" (*lock), "=r" (tmp) : : "memory", "cc");
#endif
#ifdef DEBUG_SPINLOCK
__kprintf("[%d] trying to grab lock: 0x%lX\n",
ihk_mc_get_processor_id(), lock);
#endif
preempt_disable();
asm volatile("lock; xaddl %0, %1\n"
"movzwl %w0, %2\n\t"
"shrl $16, %0\n\t"
@ -60,36 +73,455 @@ static void ihk_mc_spinlock_lock_noirq(ihk_spinlock_t *lock)
:
: "memory", "cc");
#ifdef DEBUG_SPINLOCK
__kprintf("[%d] holding lock: 0x%lX\n", ihk_mc_get_processor_id(), lock);
#endif
}
static unsigned long ihk_mc_spinlock_lock(ihk_spinlock_t *lock)
#ifdef DEBUG_SPINLOCK
#define ihk_mc_spinlock_lock(l) ({ unsigned long rc;\
__kprintf("[%d] call ihk_mc_spinlock_lock %p %s:%d\n", ihk_mc_get_processor_id(), (l), __FILE__, __LINE__); \
rc = __ihk_mc_spinlock_lock(l);\
__kprintf("[%d] ret ihk_mc_spinlock_lock\n", ihk_mc_get_processor_id()); rc;\
})
#else
#define ihk_mc_spinlock_lock __ihk_mc_spinlock_lock
#endif
static unsigned long __ihk_mc_spinlock_lock(ihk_spinlock_t *lock)
{
unsigned long flags;
flags = cpu_disable_interrupt_save();
ihk_mc_spinlock_lock_noirq(lock);
__ihk_mc_spinlock_lock_noirq(lock);
return flags;
}
static void ihk_mc_spinlock_unlock_noirq(ihk_spinlock_t *lock)
#ifdef DEBUG_SPINLOCK
#define ihk_mc_spinlock_unlock_noirq(l) { \
__kprintf("[%d] call ihk_mc_spinlock_unlock_noirq %p %s:%d\n", ihk_mc_get_processor_id(), (l), __FILE__, __LINE__); \
__ihk_mc_spinlock_unlock_noirq(l); \
__kprintf("[%d] ret ihk_mc_spinlock_unlock_noirq\n", ihk_mc_get_processor_id()); \
}
#else
#define ihk_mc_spinlock_unlock_noirq __ihk_mc_spinlock_unlock_noirq
#endif
static void __ihk_mc_spinlock_unlock_noirq(ihk_spinlock_t *lock)
{
asm volatile ("lock incw %0" : "+m"(*lock) : : "memory", "cc");
preempt_enable();
}
static void ihk_mc_spinlock_unlock(ihk_spinlock_t *lock, unsigned long flags)
#ifdef DEBUG_SPINLOCK
#define ihk_mc_spinlock_unlock(l, f) { \
__kprintf("[%d] call ihk_mc_spinlock_unlock %p %s:%d\n", ihk_mc_get_processor_id(), (l), __FILE__, __LINE__); \
__ihk_mc_spinlock_unlock((l), (f)); \
__kprintf("[%d] ret ihk_mc_spinlock_unlock\n", ihk_mc_get_processor_id()); \
}
#else
#define ihk_mc_spinlock_unlock __ihk_mc_spinlock_unlock
#endif
static void __ihk_mc_spinlock_unlock(ihk_spinlock_t *lock, unsigned long flags)
{
ihk_mc_spinlock_unlock_noirq(lock);
__ihk_mc_spinlock_unlock_noirq(lock);
cpu_restore_interrupt(flags);
#ifdef DEBUG_SPINLOCK
__kprintf("[%d] released lock: 0x%lX\n", ihk_mc_get_processor_id(), lock);
}
/* An implementation of the Mellor-Crummey Scott (MCS) lock */
typedef struct mcs_lock_node {
unsigned long locked;
struct mcs_lock_node *next;
} __attribute__((aligned(64))) mcs_lock_node_t;
static void mcs_lock_init(struct mcs_lock_node *node)
{
node->locked = 0;
node->next = NULL;
}
static void mcs_lock_lock(struct mcs_lock_node *lock,
struct mcs_lock_node *node)
{
struct mcs_lock_node *pred;
node->next = NULL;
node->locked = 0;
pred = (struct mcs_lock_node *)xchg8((unsigned long *)&lock->next,
(unsigned long)node);
if (pred) {
node->locked = 1;
pred->next = node;
while (node->locked != 0) {
cpu_pause();
}
}
}
static void mcs_lock_unlock(struct mcs_lock_node *lock,
struct mcs_lock_node *node)
{
if (node->next == NULL) {
struct mcs_lock_node *old = (struct mcs_lock_node *)
atomic_cmpxchg8((unsigned long *)&lock->next,
(unsigned long)node, (unsigned long)0);
if (old == node) {
return;
}
while (node->next == NULL) {
cpu_pause();
}
}
node->next->locked = 0;
}
// reader/writer lock
typedef struct mcs_rwlock_node {
ihk_atomic_t count; // num of readers (use only common reader)
char type; // lock type
#define MCS_RWLOCK_TYPE_COMMON_READER 0
#define MCS_RWLOCK_TYPE_READER 1
#define MCS_RWLOCK_TYPE_WRITER 2
char locked; // lock
#define MCS_RWLOCK_LOCKED 1
#define MCS_RWLOCK_UNLOCKED 0
char dmy1; // unused
char dmy2; // unused
struct mcs_rwlock_node *next;
} __attribute__((aligned(64))) mcs_rwlock_node_t;
typedef struct mcs_rwlock_node_irqsave {
struct mcs_rwlock_node node;
unsigned long irqsave;
} __attribute__((aligned(64))) mcs_rwlock_node_irqsave_t;
typedef struct mcs_rwlock_lock {
struct mcs_rwlock_node reader; /* common reader lock */
struct mcs_rwlock_node *node; /* base */
} __attribute__((aligned(64))) mcs_rwlock_lock_t;
static void
mcs_rwlock_init(struct mcs_rwlock_lock *lock)
{
ihk_atomic_set(&lock->reader.count, 0);
lock->reader.type = MCS_RWLOCK_TYPE_COMMON_READER;
lock->node = NULL;
}
#ifdef DEBUG_MCS_RWLOCK
#define mcs_rwlock_writer_lock_noirq(l, n) { \
__kprintf("[%d] call mcs_rwlock_writer_lock_noirq %p %s:%d\n", ihk_mc_get_processor_id(), (l), __FILE__, __LINE__); \
__mcs_rwlock_writer_lock_noirq((l), (n)); \
__kprintf("[%d] ret mcs_rwlock_writer_lock_noirq\n", ihk_mc_get_processor_id()); \
}
#else
#define mcs_rwlock_writer_lock_noirq __mcs_rwlock_writer_lock_noirq
#endif
static void
__mcs_rwlock_writer_lock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node *node)
{
struct mcs_rwlock_node *pred;
preempt_disable();
node->type = MCS_RWLOCK_TYPE_WRITER;
node->next = NULL;
pred = (struct mcs_rwlock_node *)xchg8((unsigned long *)&lock->node,
(unsigned long)node);
if (pred) {
node->locked = MCS_RWLOCK_LOCKED;
pred->next = node;
while (node->locked != MCS_RWLOCK_UNLOCKED) {
cpu_pause();
}
}
}
static void
mcs_rwlock_unlock_readers(struct mcs_rwlock_lock *lock)
{
struct mcs_rwlock_node *p;
struct mcs_rwlock_node *f = NULL;
struct mcs_rwlock_node *n;
int breakf = 0;
ihk_atomic_inc(&lock->reader.count); // protect to unlock reader
for(p = &lock->reader; p->next; p = n){
n = p->next;
if(p->next->type == MCS_RWLOCK_TYPE_READER){
p->next = n->next;
if(lock->node == n){
struct mcs_rwlock_node *old;
old = (struct mcs_rwlock_node *)atomic_cmpxchg8(
(unsigned long *)&lock->node,
(unsigned long)n,
(unsigned long)p);
if(old != n){ // couldn't change
while (n->next == NULL) {
cpu_pause();
}
p->next = n->next;
}
else{
breakf = 1;
}
}
else if(p->next == NULL){
while (n->next == NULL) {
cpu_pause();
}
p->next = n->next;
}
if(f){
ihk_atomic_inc(&lock->reader.count);
n->locked = MCS_RWLOCK_UNLOCKED;
}
else
f = n;
n = p;
if(breakf)
break;
}
if(n->next == NULL && lock->node != n){
while (n->next == NULL && lock->node != n) {
cpu_pause();
}
}
}
f->locked = MCS_RWLOCK_UNLOCKED;
}
#ifdef DEBUG_MCS_RWLOCK
#define mcs_rwlock_writer_unlock_noirq(l, n) { \
__kprintf("[%d] call mcs_rwlock_writer_unlock_noirq %p %s:%d\n", ihk_mc_get_processor_id(), (l), __FILE__, __LINE__); \
__mcs_rwlock_writer_unlock_noirq((l), (n)); \
__kprintf("[%d] ret mcs_rwlock_writer_unlock_noirq\n", ihk_mc_get_processor_id()); \
}
#else
#define mcs_rwlock_writer_unlock_noirq __mcs_rwlock_writer_unlock_noirq
#endif
static void
__mcs_rwlock_writer_unlock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node *node)
{
if (node->next == NULL) {
struct mcs_rwlock_node *old = (struct mcs_rwlock_node *)
atomic_cmpxchg8((unsigned long *)&lock->node,
(unsigned long)node, (unsigned long)0);
if (old == node) {
goto out;
}
while (node->next == NULL) {
cpu_pause();
}
}
if(node->next->type == MCS_RWLOCK_TYPE_READER){
lock->reader.next = node->next;
mcs_rwlock_unlock_readers(lock);
}
else{
node->next->locked = MCS_RWLOCK_UNLOCKED;
}
out:
preempt_enable();
}
#ifdef DEBUG_MCS_RWLOCK
#define mcs_rwlock_reader_lock_noirq(l, n) { \
__kprintf("[%d] call mcs_rwlock_reader_lock_noirq %p %s:%d\n", ihk_mc_get_processor_id(), (l), __FILE__, __LINE__); \
__mcs_rwlock_reader_lock_noirq((l), (n)); \
__kprintf("[%d] ret mcs_rwlock_reader_lock_noirq\n", ihk_mc_get_processor_id()); \
}
#else
#define mcs_rwlock_reader_lock_noirq __mcs_rwlock_reader_lock_noirq
#endif
static inline unsigned int
atomic_inc_ifnot0(ihk_atomic_t *v)
{
unsigned int *p = (unsigned int *)(&(v)->counter);
unsigned int old;
unsigned int new;
unsigned int val;
do{
if(!(old = *p))
break;
new = old + 1;
val = atomic_cmpxchg4(p, old, new);
}while(val != old);
return old;
}
static void
__mcs_rwlock_reader_lock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node *node)
{
struct mcs_rwlock_node *pred;
preempt_disable();
node->type = MCS_RWLOCK_TYPE_READER;
node->next = NULL;
node->dmy1 = ihk_mc_get_processor_id();
pred = (struct mcs_rwlock_node *)xchg8((unsigned long *)&lock->node,
(unsigned long)node);
if (pred) {
if(pred == &lock->reader){
if(atomic_inc_ifnot0(&pred->count)){
struct mcs_rwlock_node *old;
old = (struct mcs_rwlock_node *)atomic_cmpxchg8(
(unsigned long *)&lock->node,
(unsigned long)node,
(unsigned long)pred);
if (old == node) {
goto out;
}
while (node->next == NULL) {
cpu_pause();
}
node->locked = MCS_RWLOCK_LOCKED;
lock->reader.next = node;
mcs_rwlock_unlock_readers(lock);
ihk_atomic_dec(&pred->count);
goto out;
}
}
node->locked = MCS_RWLOCK_LOCKED;
pred->next = node;
while (node->locked != MCS_RWLOCK_UNLOCKED) {
cpu_pause();
}
}
else {
lock->reader.next = node;
mcs_rwlock_unlock_readers(lock);
}
out:
return;
}
#ifdef DEBUG_MCS_RWLOCK
#define mcs_rwlock_reader_unlock_noirq(l, n) { \
__kprintf("[%d] call mcs_rwlock_reader_unlock_noirq %p %s:%d\n", ihk_mc_get_processor_id(), (l), __FILE__, __LINE__); \
__mcs_rwlock_reader_unlock_noirq((l), (n)); \
__kprintf("[%d] ret mcs_rwlock_reader_unlock_noirq\n", ihk_mc_get_processor_id()); \
}
#else
#define mcs_rwlock_reader_unlock_noirq __mcs_rwlock_reader_unlock_noirq
#endif
static void
__mcs_rwlock_reader_unlock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node *node)
{
if(ihk_atomic_dec_return(&lock->reader.count))
goto out;
if (lock->reader.next == NULL) {
struct mcs_rwlock_node *old;
old = (struct mcs_rwlock_node *)atomic_cmpxchg8(
(unsigned long *)&lock->node,
(unsigned long)&lock->reader,
(unsigned long)0);
if (old == &lock->reader) {
goto out;
}
while (lock->reader.next == NULL) {
cpu_pause();
}
}
if(lock->reader.next->type == MCS_RWLOCK_TYPE_READER){
mcs_rwlock_unlock_readers(lock);
}
else{
lock->reader.next->locked = MCS_RWLOCK_UNLOCKED;
}
out:
preempt_enable();
}
#ifdef DEBUG_MCS_RWLOCK
#define mcs_rwlock_writer_lock(l, n) { \
__kprintf("[%d] call mcs_rwlock_writer_lock %p %s:%d\n", ihk_mc_get_processor_id(), (l), __FILE__, __LINE__); \
__mcs_rwlock_writer_lock((l), (n)); \
__kprintf("[%d] ret mcs_rwlock_writer_lock\n", ihk_mc_get_processor_id()); \
}
#else
#define mcs_rwlock_writer_lock __mcs_rwlock_writer_lock
#endif
static void
__mcs_rwlock_writer_lock(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node_irqsave *node)
{
node->irqsave = cpu_disable_interrupt_save();
__mcs_rwlock_writer_lock_noirq(lock, &node->node);
}
#ifdef DEBUG_MCS_RWLOCK
#define mcs_rwlock_writer_unlock(l, n) { \
__kprintf("[%d] call mcs_rwlock_writer_unlock %p %s:%d\n", ihk_mc_get_processor_id(), (l), __FILE__, __LINE__); \
__mcs_rwlock_writer_unlock((l), (n)); \
__kprintf("[%d] ret mcs_rwlock_writer_unlock\n", ihk_mc_get_processor_id()); \
}
#else
#define mcs_rwlock_writer_unlock __mcs_rwlock_writer_unlock
#endif
static void
__mcs_rwlock_writer_unlock(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node_irqsave *node)
{
__mcs_rwlock_writer_unlock_noirq(lock, &node->node);
cpu_restore_interrupt(node->irqsave);
}
#ifdef DEBUG_MCS_RWLOCK
#define mcs_rwlock_reader_lock(l, n) { \
__kprintf("[%d] call mcs_rwlock_reader_lock %p %s:%d\n", ihk_mc_get_processor_id(), (l), __FILE__, __LINE__); \
__mcs_rwlock_reader_lock((l), (n)); \
__kprintf("[%d] ret mcs_rwlock_reader_lock\n", ihk_mc_get_processor_id()); \
}
#else
#define mcs_rwlock_reader_lock __mcs_rwlock_reader_lock
#endif
static void
__mcs_rwlock_reader_lock(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node_irqsave *node)
{
node->irqsave = cpu_disable_interrupt_save();
__mcs_rwlock_reader_lock_noirq(lock, &node->node);
}
#ifdef DEBUG_MCS_RWLOCK
#define mcs_rwlock_reader_unlock(l, n) { \
__kprintf("[%d] call mcs_rwlock_reader_unlock %p %s:%d\n", ihk_mc_get_processor_id(), (l), __FILE__, __LINE__); \
__mcs_rwlock_reader_unlock((l), (n)); \
__kprintf("[%d] ret mcs_rwlock_reader_unlock\n", ihk_mc_get_processor_id()); \
}
#else
#define mcs_rwlock_reader_unlock __mcs_rwlock_reader_unlock
#endif
static void
__mcs_rwlock_reader_unlock(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node_irqsave *node)
{
__mcs_rwlock_reader_unlock_noirq(lock, &node->node);
cpu_restore_interrupt(node->irqsave);
}
#endif

View File

@ -5,6 +5,8 @@
* Define and declare memory management macros and functions
* \author Taku Shimosawa <shimosawa@is.s.u-tokyo.ac.jp> \par
* Copyright (C) 2011 - 2012 Taku Shimosawa
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com> \par
* Copyright (C) 2015 RIKEN AICS
*/
/*
* HISTORY
@ -20,6 +22,7 @@
#define USER_CS_ENTRY 6
#define USER_DS_ENTRY 7
#define GLOBAL_TSS_ENTRY 8
#define GETCPU_ENTRY 15
#define KERNEL_CS (KERNEL_CS_ENTRY * 8)
#define KERNEL_DS (KERNEL_DS_ENTRY * 8)
@ -38,10 +41,12 @@
#define LARGE_PAGE_P2ALIGN (LARGE_PAGE_SHIFT - PAGE_SHIFT)
#define USER_END 0x0000800000000000UL
#define TASK_UNMAPPED_BASE 0x00002AAAAAA00000UL
#define MAP_ST_START 0xffff800000000000UL
#define MAP_VMAP_START 0xfffff00000000000UL
#define MAP_FIXED_START 0xffffffff70000000UL
#define MAP_KERNEL_START 0xffffffff80000000UL
#define STACK_TOP(region) ((region)->user_end)
#define MAP_VMAP_SIZE 0x0000000100000000UL
@ -63,6 +68,8 @@
#define PF_PRESENT ((pte_t)0x01) /* entry is valid */
#define PF_WRITABLE ((pte_t)0x02)
#define PFLX_PWT ((pte_t)0x08)
#define PFLX_PCD ((pte_t)0x10)
#define PF_SIZE ((pte_t)0x80) /* entry points large page */
#define PFL4_PRESENT ((pte_t)0x01)
@ -72,8 +79,8 @@
#define PFL3_PRESENT ((pte_t)0x01)
#define PFL3_WRITABLE ((pte_t)0x02)
#define PFL3_USER ((pte_t)0x04)
#define PFL3_PWT ((pte_t)0x08)
#define PFL3_PCD ((pte_t)0x10)
#define PFL3_PWT PFLX_PWT
#define PFL3_PCD PFLX_PCD
#define PFL3_ACCESSED ((pte_t)0x20)
#define PFL3_DIRTY ((pte_t)0x40)
#define PFL3_SIZE ((pte_t)0x80) /* Used in 1G page */
@ -84,8 +91,8 @@
#define PFL2_PRESENT ((pte_t)0x01)
#define PFL2_WRITABLE ((pte_t)0x02)
#define PFL2_USER ((pte_t)0x04)
#define PFL2_PWT ((pte_t)0x08)
#define PFL2_PCD ((pte_t)0x10)
#define PFL2_PWT PFLX_PWT
#define PFL2_PCD PFLX_PCD
#define PFL2_ACCESSED ((pte_t)0x20)
#define PFL2_DIRTY ((pte_t)0x40)
#define PFL2_SIZE ((pte_t)0x80) /* Used in 2M page */
@ -96,8 +103,8 @@
#define PFL1_PRESENT ((pte_t)0x01)
#define PFL1_WRITABLE ((pte_t)0x02)
#define PFL1_USER ((pte_t)0x04)
#define PFL1_PWT ((pte_t)0x08)
#define PFL1_PCD ((pte_t)0x10)
#define PFL1_PWT PFLX_PWT
#define PFL1_PCD PFLX_PCD
#define PFL1_ACCESSED ((pte_t)0x20)
#define PFL1_DIRTY ((pte_t)0x40)
#define PFL1_IGNORED_11 ((pte_t)1 << 11)
@ -117,6 +124,25 @@
#define PTE_NULL ((pte_t)0)
typedef unsigned long pte_t;
/*
* pagemap kernel ABI bits
*/
#define PM_ENTRY_BYTES sizeof(uint64_t)
#define PM_STATUS_BITS 3
#define PM_STATUS_OFFSET (64 - PM_STATUS_BITS)
#define PM_STATUS_MASK (((1LL << PM_STATUS_BITS) - 1) << PM_STATUS_OFFSET)
#define PM_STATUS(nr) (((nr) << PM_STATUS_OFFSET) & PM_STATUS_MASK)
#define PM_PSHIFT_BITS 6
#define PM_PSHIFT_OFFSET (PM_STATUS_OFFSET - PM_PSHIFT_BITS)
#define PM_PSHIFT_MASK (((1LL << PM_PSHIFT_BITS) - 1) << PM_PSHIFT_OFFSET)
#define PM_PSHIFT(x) (((uint64_t) (x) << PM_PSHIFT_OFFSET) & PM_PSHIFT_MASK)
#define PM_PFRAME_MASK ((1LL << PM_PSHIFT_OFFSET) - 1)
#define PM_PFRAME(x) ((x) & PM_PFRAME_MASK)
#define PM_PRESENT PM_STATUS(4LL)
#define PM_SWAP PM_STATUS(2LL)
/* For easy conversion, it is better to be the same as architecture's ones */
enum ihk_mc_pt_attribute {
PTATTR_ACTIVE = 0x01,
@ -128,8 +154,11 @@ enum ihk_mc_pt_attribute {
PTATTR_NO_EXECUTE = 0x8000000000000000,
PTATTR_UNCACHABLE = 0x10000,
PTATTR_FOR_USER = 0x20000,
PTATTR_WRITE_COMBINED = 0x40000,
};
enum ihk_mc_pt_attribute attr_mask;
static inline int pte_is_null(pte_t *ptep)
{
return (*ptep == PTE_NULL);
@ -185,6 +214,33 @@ static inline off_t pte_get_off(pte_t *ptep, size_t pgsize)
return (off_t)(*ptep & PAGE_MASK);
}
static inline enum ihk_mc_pt_attribute pte_get_attr(pte_t *ptep, size_t pgsize)
{
enum ihk_mc_pt_attribute attr;
attr = *ptep & attr_mask;
if (*ptep & PFLX_PWT) {
if (*ptep & PFLX_PCD) {
attr |= PTATTR_UNCACHABLE;
}
else {
attr |= PTATTR_WRITE_COMBINED;
}
}
if (((pgsize == PTL2_SIZE) && (*ptep & PFL2_SIZE))
|| ((pgsize == PTL3_SIZE) && (*ptep & PFL3_SIZE))) {
attr |= PTATTR_LARGEPAGE;
}
return attr;
} /* pte_get_attr() */
static inline void pte_make_null(pte_t *ptep, size_t pgsize)
{
*ptep = PTE_NULL;
return;
}
static inline void pte_make_fileoff(off_t off,
enum ihk_mc_pt_attribute ptattr, size_t pgsize, pte_t *ptep)
{
@ -216,20 +272,51 @@ static inline void pte_xchg(pte_t *ptep, pte_t *valp)
#define pte_xchg(p,vp) do { *(vp) = xchg((p), *(vp)); } while (0)
#endif
static inline void pte_clear_dirty(pte_t *ptep, size_t pgsize)
{
uint64_t mask;
switch (pgsize) {
default: /* through */
case PTL1_SIZE: mask = ~PFL1_DIRTY; break;
case PTL2_SIZE: mask = ~PFL2_DIRTY; break;
case PTL3_SIZE: mask = ~PFL3_DIRTY; break;
}
asm volatile ("lock andq %0,%1" :: "r"(mask), "m"(*ptep));
return;
}
static inline void pte_set_dirty(pte_t *ptep, size_t pgsize)
{
uint64_t mask;
switch (pgsize) {
default: /* through */
case PTL1_SIZE: mask = PFL1_DIRTY; break;
case PTL2_SIZE: mask = PFL2_DIRTY; break;
case PTL3_SIZE: mask = PFL3_DIRTY; break;
}
asm volatile ("lock orq %0,%1" :: "r"(mask), "m"(*ptep));
return;
}
struct page_table;
void set_pte(pte_t *ppte, unsigned long phys, enum ihk_mc_pt_attribute attr);
pte_t *get_pte(struct page_table *pt, void *virt, enum ihk_mc_pt_attribute attr);
void *early_alloc_page(void);
void *early_alloc_pages(int nr_pages);
void *get_last_early_heap(void);
void flush_tlb(void);
void flush_tlb_single(unsigned long addr);
void *map_fixed_area(unsigned long phys, unsigned long size, int uncachable);
#define AP_TRAMPOLINE 0x10000
#define AP_TRAMPOLINE_SIZE 0x4000
extern unsigned long ap_trampoline;
//#define AP_TRAMPOLINE 0x10000
#define AP_TRAMPOLINE_SIZE 0x2000
/* Local is cachable */
#define IHK_IKC_QUEUE_PT_ATTR (PTATTR_NO_EXECUTE | PTATTR_WRITABLE | PTATTR_UNCACHABLE)
#define IHK_IKC_QUEUE_PT_ATTR (PTATTR_NO_EXECUTE | PTATTR_WRITABLE)
#endif

View File

@ -0,0 +1,23 @@
#ifndef _ASM_X86_STRING_H
#define _ASM_X86_STRING_H
#define ARCH_FAST_MEMCPY
static inline void *__inline_memcpy(void *to, const void *from, size_t n)
{
unsigned long d0, d1, d2;
asm volatile("rep ; movsl\n\t"
"testb $2,%b4\n\t"
"je 1f\n\t"
"movsw\n"
"1:\ttestb $1,%b4\n\t"
"je 2f\n\t"
"movsb\n"
"2:"
: "=&c" (d0), "=&D" (d1), "=&S" (d2)
: "0" (n / 4), "q" (n), "1" ((long)to), "2" ((long)from)
: "memory");
return to;
}
#endif

View File

@ -0,0 +1,18 @@
/**
* \file auxvec.h
* License details are found in the file LICENSE.
* \brief
* Declare architecture-dependent constants for auxiliary vector
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com>
* Copyright (C) 2016 RIKEN AICS
*/
/*
* HISTORY
*/
#ifndef ARCH_AUXVEC_H
#define ARCH_AUXVEC_H
#define AT_SYSINFO_EHDR 33
#endif

View File

@ -0,0 +1,37 @@
/**
* \file cpu.h
* License details are found in the file LICENSE.
* \brief
* Declare architecture-dependent types and functions to control CPU.
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com>
* Copyright (C) 2015 RIKEN AICS
*/
/*
* HISTORY
*/
#ifndef ARCH_CPU_H
#define ARCH_CPU_H
#include <ihk/cpu.h>
static inline void rmb(void)
{
barrier();
}
static inline void wmb(void)
{
barrier();
}
static unsigned long read_tsc(void)
{
unsigned int low, high;
asm volatile("rdtsc" : "=a"(low), "=d"(high));
return (low | ((unsigned long)high << 32));
}
#endif /* ARCH_CPU_H */

View File

@ -0,0 +1,16 @@
#ifndef __ARCH_MM_H
#define __ARCH_MM_H
struct process_vm;
static inline void
flush_nfo_tlb()
{
}
static inline void
flush_nfo_tlb_mm(struct process_vm *vm)
{
}
#endif

View File

@ -0,0 +1,40 @@
/**
* \file mman.h
* License details are found in the file LICENSE.
* \brief
* memory management declarations
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com> \par
* Copyright (C) 2015 RIKEN AICS
*/
/*
* HISTORY:
*/
#ifndef HEADER_ARCH_MMAN_H
#define HEADER_ARCH_MMAN_H
/*
* mapping flags
*/
#define MAP_32BIT 0x40
#define MAP_GROWSDOWN 0x0100
#define MAP_DENYWRITE 0x0800
#define MAP_EXECUTABLE 0x1000
#define MAP_LOCKED 0x2000
#define MAP_NORESERVE 0x4000
#define MAP_POPULATE 0x8000
#define MAP_NONBLOCK 0x00010000
#define MAP_STACK 0x00020000
#define MAP_HUGETLB 0x00040000
#define MAP_HUGE_SHIFT 26
#define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT)
#define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT)
/*
* for mlockall()
*/
#define MCL_CURRENT 0x01
#define MCL_FUTURE 0x02
#endif /* HEADER_ARCH_MMAN_H */

View File

@ -0,0 +1,46 @@
/**
* \file shm.h
* License details are found in the file LICENSE.
* \brief
* header file for System V shared memory
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com> \par
* Copyright (C) 2015 RIKEN AICS
*/
/*
* HISTORY:
*/
#ifndef HEADER_ARCH_SHM_H
#define HEADER_ARCH_SHM_H
/* shmflg */
#define SHM_HUGE_SHIFT 26
#define SHM_HUGE_2MB (21 << SHM_HUGE_SHIFT)
#define SHM_HUGE_1GB (30 << SHM_HUGE_SHIFT)
struct ipc_perm {
key_t key;
uid_t uid;
gid_t gid;
uid_t cuid;
gid_t cgid;
uint16_t mode;
uint8_t padding[2];
uint16_t seq;
uint8_t padding2[22];
};
struct shmid_ds {
struct ipc_perm shm_perm;
size_t shm_segsz;
time_t shm_atime;
time_t shm_dtime;
time_t shm_ctime;
pid_t shm_cpid;
pid_t shm_lpid;
uint64_t shm_nattch;
uint8_t padding[12];
int init_pgshift;
};
#endif /* HEADER_ARCH_SHM_H */

View File

@ -22,7 +22,7 @@
* - 4096 : kernel stack
*/
#define X86_CPU_LOCAL_OFFSET_TSS 128
#define X86_CPU_LOCAL_OFFSET_TSS 176
#define X86_CPU_LOCAL_OFFSET_KSTACK 16
#define X86_CPU_LOCAL_OFFSET_USTACK 24
@ -39,10 +39,13 @@ struct x86_cpu_local_variables {
struct x86_desc_ptr gdt_ptr;
unsigned short pad[3];
/* 48 */
uint64_t gdt[10];
/* 128 */
uint64_t gdt[16];
/* 176 */
struct tss64 tss;
/* 280 */
unsigned long paniced;
uint64_t panic_regs[21];
/* 456 */
} __attribute__((packed));
struct x86_cpu_local_variables *get_x86_cpu_local_variable(int id);

View File

@ -1,40 +1,7 @@
#ifndef _ASM_GENERIC_ERRNO_BASE_H
#define _ASM_GENERIC_ERRNO_BASE_H
#ifndef _ERRNO_BASE_H
#define _ERRNO_BASE_H
#define EPERM 1 /* Operation not permitted */
#define ENOENT 2 /* No such file or directory */
#define ESRCH 3 /* No such process */
#define EINTR 4 /* Interrupted system call */
#define EIO 5 /* I/O error */
#define ENXIO 6 /* No such device or address */
#define E2BIG 7 /* Argument list too long */
#define ENOEXEC 8 /* Exec format error */
#define EBADF 9 /* Bad file number */
#define ECHILD 10 /* No child processes */
#define EAGAIN 11 /* Try again */
#define ENOMEM 12 /* Out of memory */
#define EACCES 13 /* Permission denied */
#define EFAULT 14 /* Bad address */
#define ENOTBLK 15 /* Block device required */
#define EBUSY 16 /* Device or resource busy */
#define EEXIST 17 /* File exists */
#define EXDEV 18 /* Cross-device link */
#define ENODEV 19 /* No such device */
#define ENOTDIR 20 /* Not a directory */
#define EISDIR 21 /* Is a directory */
#define EINVAL 22 /* Invalid argument */
#define ENFILE 23 /* File table overflow */
#define EMFILE 24 /* Too many open files */
#define ENOTTY 25 /* Not a typewriter */
#define ETXTBSY 26 /* Text file busy */
#define EFBIG 27 /* File too large */
#define ENOSPC 28 /* No space left on device */
#define ESPIPE 29 /* Illegal seek */
#define EROFS 30 /* Read-only file system */
#define EMLINK 31 /* Too many links */
#define EPIPE 32 /* Broken pipe */
#define EDOM 33 /* Math argument out of domain of func */
#define ERANGE 34 /* Math result not representable */
#include <generic-errno.h>
#define EDEADLK 35 /* Resource deadlock would occur */
#define ENAMETOOLONG 36 /* File name too long */
@ -141,29 +108,4 @@
#define ERFKILL 132 /* Operation not possible due to RF-kill */
#ifdef __KERNEL__
/* Should never be seen by user programs */
#define ERESTARTSYS 512
#define ERESTARTNOINTR 513
#define ERESTARTNOHAND 514 /* restart if no handler.. */
#define ENOIOCTLCMD 515 /* No ioctl command */
#define ERESTART_RESTARTBLOCK 516 /* restart by calling sys_restart_syscall */
/* Defined for the NFSv3 protocol */
#define EBADHANDLE 521 /* Illegal NFS file handle */
#define ENOTSYNC 522 /* Update synchronization mismatch */
#define EBADCOOKIE 523 /* Cookie is stale */
#define ENOTSUPP 524 /* Operation is not supported */
#define ETOOSMALL 525 /* Buffer or request is too small */
#define ESERVERFAULT 526 /* An untranslatable error occurred */
#define EBADTYPE 527 /* Type not supported by server */
#define EJUKEBOX 528 /* Request initiated, but will not complete before timeout */
#define EIOCBQUEUED 529 /* iocb queued, will get completion event */
#define EIOCBRETRY 530 /* iocb queued, will trigger a retry */
#endif
#endif

View File

@ -13,6 +13,10 @@
#ifndef HEADER_X86_COMMON_IHK_ATOMIC_H
#define HEADER_X86_COMMON_IHK_ATOMIC_H
/***********************************************************************
* ihk_atomic_t
*/
typedef struct {
int counter;
} ihk_atomic_t;
@ -95,6 +99,30 @@ static inline int ihk_atomic_sub_return(int i, ihk_atomic_t *v)
#define ihk_atomic_inc_return(v) (ihk_atomic_add_return(1, v))
#define ihk_atomic_dec_return(v) (ihk_atomic_sub_return(1, v))
/***********************************************************************
* ihk_atomic64_t
*/
typedef struct {
long counter64;
} ihk_atomic64_t;
#define IHK_ATOMIC64_INIT(i) { .counter64 = (i) }
static inline long ihk_atomic64_read(const ihk_atomic64_t *v)
{
return *(volatile long *)&(v)->counter64;
}
static inline void ihk_atomic64_inc(ihk_atomic64_t *v)
{
asm volatile ("lock incq %0" : "+m"(v->counter64));
}
/***********************************************************************
* others
*/
/*
* Note: no "lock" prefix even on SMP: xchg always implies lock anyway
* Note 2: xchg has side effect, so that attribute volatile is necessary,
@ -112,6 +140,17 @@ static inline int ihk_atomic_sub_return(int i, ihk_atomic_t *v)
__x; \
})
static inline unsigned long xchg8(unsigned long *ptr, unsigned long x)
{
unsigned long __x = (x);
asm volatile("xchgq %0,%1"
: "=r" (__x)
: "m" (*(volatile unsigned long*)(ptr)), "0" (__x)
: "memory");
return __x;
}
#define __xchg(x, ptr, size) \
({ \
__typeof(*(ptr)) __x = (x); \
@ -150,5 +189,30 @@ static inline int ihk_atomic_sub_return(int i, ihk_atomic_t *v)
#define xchg(ptr, v) \
__xchg((v), (ptr), sizeof(*ptr))
static inline unsigned long atomic_cmpxchg8(unsigned long *addr,
unsigned long oldval,
unsigned long newval)
{
asm volatile("lock; cmpxchgq %2, %1\n"
: "=a" (oldval), "+m" (*addr)
: "r" (newval), "0" (oldval)
: "memory"
);
return oldval;
}
static inline unsigned long atomic_cmpxchg4(unsigned int *addr,
unsigned int oldval,
unsigned int newval)
{
asm volatile("lock; cmpxchgl %2, %1\n"
: "=a" (oldval), "+m" (*addr)
: "r" (newval), "0" (oldval)
: "memory"
);
return oldval;
}
#endif

View File

@ -22,19 +22,35 @@ struct x86_kregs {
};
typedef struct x86_kregs ihk_mc_kernel_context_t;
/* XXX: User context should contain floating point registers */
typedef struct x86_regs ihk_mc_user_context_t;
struct x86_user_context {
struct x86_sregs sr;
#define ihk_mc_syscall_arg0(uc) (uc)->rdi
#define ihk_mc_syscall_arg1(uc) (uc)->rsi
#define ihk_mc_syscall_arg2(uc) (uc)->rdx
#define ihk_mc_syscall_arg3(uc) (uc)->r10
#define ihk_mc_syscall_arg4(uc) (uc)->r8
#define ihk_mc_syscall_arg5(uc) (uc)->r9
/* 16-byte boundary here */
uint8_t is_gpr_valid;
uint8_t is_sr_valid;
uint8_t spare_flags6;
uint8_t spare_flags5;
uint8_t spare_flags4;
uint8_t spare_flags3;
uint8_t spare_flags2;
uint8_t spare_flags1;
struct x86_basic_regs gpr; /* must be last */
/* 16-byte boundary here */
};
typedef struct x86_user_context ihk_mc_user_context_t;
#define ihk_mc_syscall_ret(uc) (uc)->rax
#define ihk_mc_syscall_arg0(uc) (uc)->gpr.rdi
#define ihk_mc_syscall_arg1(uc) (uc)->gpr.rsi
#define ihk_mc_syscall_arg2(uc) (uc)->gpr.rdx
#define ihk_mc_syscall_arg3(uc) (uc)->gpr.r10
#define ihk_mc_syscall_arg4(uc) (uc)->gpr.r8
#define ihk_mc_syscall_arg5(uc) (uc)->gpr.r9
#define ihk_mc_syscall_pc(uc) (uc)->rip
#define ihk_mc_syscall_sp(uc) (uc)->rsp
#define ihk_mc_syscall_ret(uc) (uc)->gpr.rax
#define ihk_mc_syscall_pc(uc) (uc)->gpr.rip
#define ihk_mc_syscall_sp(uc) (uc)->gpr.rsp
#endif

View File

@ -31,9 +31,5 @@ typedef int64_t off_t;
#define NULL ((void *)0)
#define BITS_PER_LONG_SHIFT 6
#define BITS_PER_LONG (1 << BITS_PER_LONG_SHIFT)
#endif

View File

@ -0,0 +1,17 @@
/**
* \file prctl.h
* License details are found in the file LICENSE.
*/
/*
* HISTORY
*/
#ifndef __ARCH_PRCTL_H
#define __ARCH_PRCTL_H
#define ARCH_SET_GS 0x1001
#define ARCH_SET_FS 0x1002
#define ARCH_GET_FS 0x1003
#define ARCH_GET_GS 0x1004
#endif

View File

@ -6,6 +6,8 @@
* Machine Specific Registers (MSR)
* \author Taku Shimosawa <shimosawa@is.s.u-tokyo.ac.jp> \par
* Copyright (C) 2011 - 2012 Taku Shimosawa
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com> \par
* Copyright (C) 2015 RIKEN AICS
*/
/*
* HISTORY
@ -16,7 +18,31 @@
#include <types.h>
#define RFLAGS_CF (1 << 0)
#define RFLAGS_PF (1 << 2)
#define RFLAGS_AF (1 << 4)
#define RFLAGS_ZF (1 << 6)
#define RFLAGS_SF (1 << 7)
#define RFLAGS_TF (1 << 8)
#define RFLAGS_IF (1 << 9)
#define RFLAGS_DF (1 << 10)
#define RFLAGS_OF (1 << 11)
#define RFLAGS_IOPL (3 << 12)
#define RFLAGS_NT (1 << 14)
#define RFLAGS_RF (1 << 16)
#define RFLAGS_VM (1 << 17)
#define RFLAGS_AC (1 << 18)
#define RFLAGS_VIF (1 << 19)
#define RFLAGS_VIP (1 << 20)
#define RFLAGS_ID (1 << 21)
#define DB6_B0 (1 << 0)
#define DB6_B1 (1 << 1)
#define DB6_B2 (1 << 2)
#define DB6_B3 (1 << 3)
#define DB6_BD (1 << 13)
#define DB6_BS (1 << 14)
#define DB6_BT (1 << 15)
#define MSR_EFER 0xc0000080
#define MSR_STAR 0xc0000081
@ -26,6 +52,14 @@
#define MSR_GS_BASE 0xc0000101
#define MSR_IA32_APIC_BASE 0x000000001b
#define MSR_PLATFORM_INFO 0x000000ce
#define MSR_IA32_PERF_CTL 0x00000199
#define MSR_IA32_MISC_ENABLE 0x000001a0
#define MSR_IA32_ENERGY_PERF_BIAS 0x000001b0
#define MSR_NHM_TURBO_RATIO_LIMIT 0x000001ad
#define MSR_IA32_CR_PAT 0x00000277
#define MSR_IA32_XSS 0xda0
#define CVAL(event, mask) \
((((event) & 0xf00) << 24) | ((mask) << 8) | ((event) & 0xff))
@ -37,6 +71,25 @@
#define MSR_PERF_CTL_0 0xc0010000
#define MSR_PERF_CTR_0 0xc0010004
static unsigned long xgetbv(unsigned int index)
{
unsigned int low, high;
asm volatile("xgetbv" : "=a" (low), "=d" (high) : "c" (index));
return low | ((unsigned long)high << 32);
}
static void xsetbv(unsigned int index, unsigned long val)
{
unsigned int low, high;
low = val;
high = val >> 32;
asm volatile("xsetbv" : : "a" (low), "d" (high), "c" (index));
}
static void wrmsr(unsigned int idx, unsigned long value){
unsigned int high, low;
@ -135,10 +188,19 @@ struct tss64 {
unsigned short iomap_address;
} __attribute__((packed));
struct x86_regs {
unsigned long r15, r14, r13, r12, r11, r10, r9, r8;
unsigned long rdi, rsi, rdx, rcx, rbx, rax, rbp;
unsigned long error, rip, cs, rflags, rsp, ss;
struct x86_basic_regs {
unsigned long r15, r14, r13, r12, rbp, rbx, r11, r10;
unsigned long r9, r8, rax, rcx, rdx, rsi, rdi, error;
unsigned long rip, cs, rflags, rsp, ss;
};
struct x86_sregs {
unsigned long fs_base;
unsigned long gs_base;
unsigned long ds;
unsigned long es;
unsigned long fs;
unsigned long gs;
};
#define REGS_GET_STACK_POINTER(regs) (((struct x86_regs *)regs)->rsp)
@ -162,7 +224,72 @@ enum x86_pf_error_code {
PF_RSVD = 1 << 3,
PF_INSTR = 1 << 4,
PF_PATCH = 1 << 29,
PF_POPULATE = 1 << 30,
};
struct i387_fxsave_struct {
unsigned short cwd;
unsigned short swd;
unsigned short twd;
unsigned short fop;
union {
struct {
unsigned long rip;
unsigned long rdp;
};
struct {
unsigned int fip;
unsigned int fcs;
unsigned int foo;
unsigned int fos;
};
};
unsigned int mxcsr;
unsigned int mxcsr_mask;
unsigned int st_space[32];
unsigned int xmm_space[64];
unsigned int padding[12];
union {
unsigned int padding1[12];
unsigned int sw_reserved[12];
};
} __attribute__((aligned(16)));
struct ymmh_struct {
unsigned int ymmh_space[64];
};
struct lwp_struct {
unsigned char reserved[128];
};
struct bndreg {
unsigned long lower_bound;
unsigned long upper_bound;
} __attribute__((packed));
struct bndcsr {
unsigned long bndcfgu;
unsigned long bndstatus;
} __attribute__((packed));
struct xsave_hdr_struct {
unsigned long xstate_bv;
unsigned long xcomp_bv;
unsigned long reserved[6];
} __attribute__((packed));
struct xsave_struct {
struct i387_fxsave_struct i387;
struct xsave_hdr_struct xsave_hdr;
struct ymmh_struct ymmh;
struct lwp_struct lwp;
struct bndreg bndreg[4];
struct bndcsr bndcsr;
} __attribute__ ((packed, aligned (64)));
typedef struct xsave_struct fp_regs_struct;
#endif

View File

@ -90,10 +90,6 @@ enum __rlimit_resource
#define RLIM_NLIMITS __RLIM_NLIMITS
};
struct rlimit {
uint64_t rlim_cur; /* Soft limit */
uint64_t rlim_max; /* Hard limit (ceiling for rlim_cur) */
};
#include <generic-rlimit.h>
#endif

View File

@ -20,12 +20,13 @@
* syscall_name[] only, no handler exists.
*/
SYSCALL_DELEGATED(0, read)
SYSCALL_HANDLED(0, read)
SYSCALL_DELEGATED(1, write)
SYSCALL_DELEGATED(2, open)
SYSCALL_DELEGATED(3, close)
SYSCALL_HANDLED(3, close)
SYSCALL_DELEGATED(4, stat)
SYSCALL_DELEGATED(5, fstat)
SYSCALL_DELEGATED(7, poll)
SYSCALL_DELEGATED(8, lseek)
SYSCALL_HANDLED(9, mmap)
SYSCALL_HANDLED(10, mprotect)
@ -34,15 +35,24 @@ SYSCALL_HANDLED(12, brk)
SYSCALL_HANDLED(13, rt_sigaction)
SYSCALL_HANDLED(14, rt_sigprocmask)
SYSCALL_HANDLED(15, rt_sigreturn)
SYSCALL_DELEGATED(16, ioctl)
SYSCALL_HANDLED(16, ioctl)
SYSCALL_DELEGATED(17, pread64)
SYSCALL_DELEGATED(18, pwrite64)
SYSCALL_DELEGATED(20, writev)
SYSCALL_DELEGATED(21, access)
SYSCALL_DELEGATED(23, select)
SYSCALL_HANDLED(24, sched_yield)
SYSCALL_HANDLED(25, mremap)
SYSCALL_HANDLED(26, msync)
SYSCALL_HANDLED(27, mincore)
SYSCALL_HANDLED(28, madvise)
SYSCALL_HANDLED(29, shmget)
SYSCALL_HANDLED(30, shmat)
SYSCALL_HANDLED(31, shmctl)
SYSCALL_HANDLED(34, pause)
SYSCALL_HANDLED(35, nanosleep)
SYSCALL_HANDLED(36, getitimer)
SYSCALL_HANDLED(38, setitimer)
SYSCALL_HANDLED(39, getpid)
SYSCALL_HANDLED(56, clone)
SYSCALL_DELEGATED(57, fork)
@ -52,47 +62,96 @@ SYSCALL_HANDLED(60, exit)
SYSCALL_HANDLED(61, wait4)
SYSCALL_HANDLED(62, kill)
SYSCALL_DELEGATED(63, uname)
SYSCALL_DELEGATED(72, fcntl)
SYSCALL_DELEGATED(65, semop)
SYSCALL_HANDLED(67, shmdt)
SYSCALL_DELEGATED(69, msgsnd)
SYSCALL_DELEGATED(70, msgrcv)
SYSCALL_HANDLED(72, fcntl)
SYSCALL_DELEGATED(79, getcwd)
SYSCALL_DELEGATED(89, readlink)
SYSCALL_DELEGATED(96, gettimeofday)
SYSCALL_HANDLED(96, gettimeofday)
SYSCALL_HANDLED(97, getrlimit)
SYSCALL_HANDLED(98, getrusage)
SYSCALL_HANDLED(100, times)
SYSCALL_HANDLED(101, ptrace)
SYSCALL_DELEGATED(102, getuid)
SYSCALL_DELEGATED(104, getgid)
SYSCALL_DELEGATED(107, geteuid)
SYSCALL_DELEGATED(108, getegid)
SYSCALL_HANDLED(102, getuid)
SYSCALL_HANDLED(104, getgid)
SYSCALL_HANDLED(105, setuid)
SYSCALL_HANDLED(106, setgid)
SYSCALL_HANDLED(107, geteuid)
SYSCALL_HANDLED(108, getegid)
SYSCALL_HANDLED(109, setpgid)
SYSCALL_DELEGATED(110, getppid)
SYSCALL_HANDLED(110, getppid)
SYSCALL_DELEGATED(111, getpgrp)
SYSCALL_HANDLED(113, setreuid)
SYSCALL_HANDLED(114, setregid)
SYSCALL_HANDLED(117, setresuid)
SYSCALL_HANDLED(118, getresuid)
SYSCALL_HANDLED(119, setresgid)
SYSCALL_HANDLED(120, getresgid)
SYSCALL_HANDLED(122, setfsuid)
SYSCALL_HANDLED(123, setfsgid)
SYSCALL_HANDLED(127, rt_sigpending)
SYSCALL_HANDLED(128, rt_sigtimedwait)
SYSCALL_HANDLED(129, rt_sigqueueinfo)
SYSCALL_HANDLED(130, rt_sigsuspend)
SYSCALL_HANDLED(131, sigaltstack)
SYSCALL_HANDLED(142, sched_setparam)
SYSCALL_HANDLED(143, sched_getparam)
SYSCALL_HANDLED(144, sched_setscheduler)
SYSCALL_HANDLED(145, sched_getscheduler)
SYSCALL_HANDLED(146, sched_get_priority_max)
SYSCALL_HANDLED(147, sched_get_priority_min)
SYSCALL_HANDLED(148, sched_rr_get_interval)
SYSCALL_HANDLED(149, mlock)
SYSCALL_HANDLED(150, munlock)
SYSCALL_HANDLED(151, mlockall)
SYSCALL_HANDLED(152, munlockall)
SYSCALL_HANDLED(158, arch_prctl)
SYSCALL_HANDLED(160, setrlimit)
SYSCALL_HANDLED(164, settimeofday)
SYSCALL_HANDLED(186, gettid)
SYSCALL_HANDLED(200, tkill)
SYSCALL_DELEGATED(201, time)
SYSCALL_HANDLED(202, futex)
SYSCALL_HANDLED(203, sched_setaffinity)
SYSCALL_HANDLED(204, sched_getaffinity)
SYSCALL_DELEGATED(208, io_getevents)
SYSCALL_HANDLED(216, remap_file_pages)
SYSCALL_DELEGATED(217, getdents64)
SYSCALL_HANDLED(218, set_tid_address)
SYSCALL_DELEGATED(220, semtimedop)
SYSCALL_HANDLED(228, clock_gettime)
SYSCALL_DELEGATED(230, clock_nanosleep)
SYSCALL_HANDLED(231, exit_group)
SYSCALL_DELEGATED(232, epoll_wait)
SYSCALL_HANDLED(234, tgkill)
SYSCALL_HANDLED(237, mbind)
SYSCALL_HANDLED(238, set_mempolicy)
SYSCALL_HANDLED(239, get_mempolicy)
SYSCALL_HANDLED(247, waitid)
SYSCALL_HANDLED(256, migrate_pages)
SYSCALL_DELEGATED(270, pselect6)
SYSCALL_DELEGATED(271, ppoll)
SYSCALL_HANDLED(273, set_robust_list)
SYSCALL_HANDLED(279, move_pages)
SYSCALL_DELEGATED(281, epoll_pwait)
SYSCALL_HANDLED(282, signalfd)
SYSCALL_HANDLED(289, signalfd4)
SYSCALL_HANDLED(298, perf_event_open)
#ifdef DCFA_KMOD
SYSCALL_HANDLED(303, mod_call)
#endif
SYSCALL_HANDLED(309, getcpu)
SYSCALL_HANDLED(310, process_vm_readv)
SYSCALL_HANDLED(311, process_vm_writev)
SYSCALL_HANDLED(601, pmc_init)
SYSCALL_HANDLED(602, pmc_start)
SYSCALL_HANDLED(603, pmc_stop)
SYSCALL_HANDLED(604, pmc_reset)
SYSCALL_HANDLED(700, get_cpu_id)
#ifdef TRACK_SYSCALLS
SYSCALL_HANDLED(701, syscall_offload_clr_cntrs)
#endif // TRACK_SYSCALLS
/**** End of File ****/

View File

@ -13,7 +13,7 @@
* 2013/?? - bgerofi + shimosawa: handle rsp correctly for nested interrupts
*/
#define X86_CPU_LOCAL_OFFSET_TSS 128
#define X86_CPU_LOCAL_OFFSET_TSS 176
#define X86_TSS_OFFSET_SP0 4
#define X86_CPU_LOCAL_OFFSET_SP0 \
(X86_CPU_LOCAL_OFFSET_TSS + X86_TSS_OFFSET_SP0)
@ -24,39 +24,56 @@
#define USER_CS (48 + 3)
#define USER_DS (56 + 3)
#define PUSH_ALL_REGS \
pushq %rbp; \
pushq %rax; \
pushq %rbx; \
pushq %rcx; \
pushq %rdx; \
pushq %rsi; \
pushq %rdi; \
pushq %r8; \
pushq %r9; \
pushq %r10; \
pushq %r11; \
pushq %r12; \
pushq %r13; \
pushq %r14; \
pushq %r15;
#define POP_ALL_REGS \
popq %r15; \
popq %r14; \
popq %r13; \
popq %r12; \
popq %r11; \
popq %r10; \
popq %r9; \
popq %r8; \
popq %rdi; \
popq %rsi; \
popq %rdx; \
popq %rcx; \
popq %rbx; \
popq %rax; \
popq %rbp
/* struct x86_user_context */
#define X86_SREGS_BASE (0)
#define X86_SREGS_SIZE 48
#define X86_FLAGS_BASE (X86_SREGS_BASE + X86_SREGS_SIZE)
#define X86_FLAGS_SIZE 8
#define X86_REGS_BASE (X86_FLAGS_BASE + X86_FLAGS_SIZE)
#define RAX_OFFSET (X86_REGS_BASE + 80)
#define ERROR_OFFSET (X86_REGS_BASE + 120)
#define RSP_OFFSET (X86_REGS_BASE + 152)
#define PUSH_ALL_REGS \
pushq %rdi; \
pushq %rsi; \
pushq %rdx; \
pushq %rcx; \
pushq %rax; \
pushq %r8; \
pushq %r9; \
pushq %r10; \
pushq %r11; \
pushq %rbx; \
pushq %rbp; \
pushq %r12; \
pushq %r13; \
pushq %r14; \
pushq %r15; \
pushq $1; /* is_gpr_valid is set, and others are cleared */ \
subq $X86_FLAGS_BASE,%rsp /* for x86_sregs, etc. */
#define POP_ALL_REGS \
movq $0,X86_FLAGS_BASE(%rsp); /* clear all flags */ \
addq $X86_REGS_BASE,%rsp; /* discard x86_sregs, flags, etc. */ \
popq %r15; \
popq %r14; \
popq %r13; \
popq %r12; \
popq %rbp; \
popq %rbx; \
popq %r11; \
popq %r10; \
popq %r9; \
popq %r8; \
popq %rax; \
popq %rcx; \
popq %rdx; \
popq %rsi; \
popq %rdi
.data
.globl generic_common_handlers
generic_common_handlers:
@ -75,7 +92,7 @@ vector=vector+1
common_interrupt:
PUSH_ALL_REGS
movq 120(%rsp), %rdi
movq ERROR_OFFSET(%rsp), %rdi
movq %rsp, %rsi
call handle_interrupt /* Enter C code */
POP_ALL_REGS
@ -91,7 +108,7 @@ page_fault:
cld
PUSH_ALL_REGS
movq %cr2, %rdi
movq 120(%rsp),%rsi
movq ERROR_OFFSET(%rsp),%rsi
movq %rsp,%rdx
movq __page_fault_handler_address(%rip), %rax
andq %rax, %rax
@ -113,10 +130,53 @@ general_protection_exception:
addq $8, %rsp
iretq
.globl nmi
nmi:
#define PANICED 232
#define PANIC_REGS 240
movq %rax,%gs:PANIC_REGS+0x00
movq %rbx,%gs:PANIC_REGS+0x08
movq %rcx,%gs:PANIC_REGS+0x10
movq %rdx,%gs:PANIC_REGS+0x18
movq %rsi,%gs:PANIC_REGS+0x20
movq %rdi,%gs:PANIC_REGS+0x28
movq %rbp,%gs:PANIC_REGS+0x30
movq 0x18(%rsp),%rax /* rsp */
movq %rax,%gs:PANIC_REGS+0x38
movq %r8, %gs:PANIC_REGS+0x40
movq %r9, %gs:PANIC_REGS+0x48
movq %r10,%gs:PANIC_REGS+0x50
movq %r11,%gs:PANIC_REGS+0x58
movq %r12,%gs:PANIC_REGS+0x60
movq %r13,%gs:PANIC_REGS+0x68
movq %r14,%gs:PANIC_REGS+0x70
movq %r15,%gs:PANIC_REGS+0x78
movq 0x00(%rsp),%rax /* rip */
movq %rax,%gs:PANIC_REGS+0x80
movq 0x10(%rsp),%rax /* rflags */
movl %eax,%gs:PANIC_REGS+0x88
movq 0x08(%rsp),%rax /* cs */
movl %eax,%gs:PANIC_REGS+0x8C
movq 0x20(%rsp),%rax /* ss */
movl %eax,%gs:PANIC_REGS+0x90
xorq %rax,%rax
movw %ds,%ax
movl %eax,%gs:PANIC_REGS+0x94
movw %es,%ax
movl %eax,%gs:PANIC_REGS+0x98
movw %fs,%ax
movl %eax,%gs:PANIC_REGS+0x9C
movw %gs,%ax
movl %eax,%gs:PANIC_REGS+0xA0
movq $1,%gs:PANICED
1:
hlt
jmp 1b
.globl x86_syscall
x86_syscall:
cld
movq %rsp, %gs:24
movq %rsp, %gs:X86_CPU_LOCAL_OFFSET_USTACK
movq %gs:(X86_CPU_LOCAL_OFFSET_SP0), %rsp
pushq $(USER_DS)
@ -124,21 +184,19 @@ x86_syscall:
pushq %r11
pushq $(USER_CS)
pushq %rcx
pushq $0
movq %gs:24, %rcx
movq %rcx, 32(%rsp)
pushq %rax /* error code (= system call number) */
PUSH_ALL_REGS
movq 104(%rsp), %rdi
movq %gs:X86_CPU_LOCAL_OFFSET_USTACK, %rcx
movq %rcx, RSP_OFFSET(%rsp)
movq RAX_OFFSET(%rsp), %rdi
movw %ss, %ax
movw %ax, %ds
movq %rsp, %rsi
callq *__x86_syscall_handler(%rip)
1:
movq %rax, 104(%rsp)
movq %rax, RAX_OFFSET(%rsp)
POP_ALL_REGS
#ifdef USE_SYSRET
movq 8(%rsp), %rcx
movq 24(%rsp), %r11
movq 32(%rsp), %rsp
sysretq
#else
@ -147,7 +205,35 @@ x86_syscall:
#endif
.globl enter_user_mode
enter_user_mode:
enter_user_mode:
callq release_runq_lock
movq $0, %rdi
movq %rsp, %rsi
call check_signal
movq $0, %rdi
call set_cputime
POP_ALL_REGS
addq $8, %rsp
iretq
.globl debug_exception
debug_exception:
cld
pushq $0 /* error */
PUSH_ALL_REGS
movq %rsp, %rdi
call debug_handler
POP_ALL_REGS
addq $8, %rsp
iretq
.globl int3_exception
int3_exception:
cld
pushq $0 /* error */
PUSH_ALL_REGS
movq %rsp, %rdi
call int3_handler
POP_ALL_REGS
addq $8, %rsp
iretq

View File

@ -6,6 +6,8 @@
* resides in memory.
* \author Taku Shimosawa <shimosawa@is.s.u-tokyo.ac.jp> \par
* Copyright (C) 2011 - 2012 Taku Shimosawa
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com> \par
* Copyright (C) 2015 RIKEN AICS
*/
/*
* HISTORY
@ -19,26 +21,37 @@
#include <registers.h>
#include <string.h>
#define LOCALS_SPAN (4 * PAGE_SIZE)
struct x86_cpu_local_variables *locals;
size_t x86_cpu_local_variables_span = LOCALS_SPAN; /* for debugger */
void init_processors_local(int max_id)
{
size_t size;
size = LOCALS_SPAN * max_id;
/* Is contiguous allocating adequate?? */
locals = ihk_mc_alloc_pages(max_id, IHK_MC_AP_CRITICAL);
memset(locals, 0, PAGE_SIZE * max_id);
locals = ihk_mc_alloc_pages(size/PAGE_SIZE, IHK_MC_AP_CRITICAL);
memset(locals, 0, size);
kprintf("locals = %p\n", locals);
}
/*@
@ requires \valid(id);
@ ensures \result == locals + (LOCALS_SPAN * id);
@ assigns \nothing;
@*/
struct x86_cpu_local_variables *get_x86_cpu_local_variable(int id)
{
return (struct x86_cpu_local_variables *)
((char *)locals + (id << PAGE_SHIFT));
((char *)locals + (LOCALS_SPAN * id));
}
static void *get_x86_cpu_local_kstack(int id)
{
return ((char *)locals + ((id + 1) << PAGE_SHIFT));
return ((char *)locals + (LOCALS_SPAN * (id + 1)));
}
struct x86_cpu_local_variables *get_x86_this_cpu_local(void)
@ -80,7 +93,20 @@ void assign_processor_id(void)
v->processor_id = id;
}
void init_boot_processor_local(void)
{
static struct x86_cpu_local_variables avar;
memset(&avar, -1, sizeof(avar));
set_gs_base(&avar);
return;
}
/** IHK **/
/*@
@ ensures \result == %gs;
@ assigns \nothing;
*/
int ihk_mc_get_processor_id(void)
{
int id;
@ -90,6 +116,10 @@ int ihk_mc_get_processor_id(void)
return id;
}
/*@
@ ensures \result == (locals + (LOCALS_SPAN * %gs))->apic_id;
@ assigns \nothing;
*/
int ihk_mc_get_hardware_processor_id(void)
{
struct x86_cpu_local_variables *v = get_x86_this_cpu_local();

File diff suppressed because it is too large Load Diff

View File

@ -16,6 +16,7 @@
#include <memory.h>
#include <string.h>
extern int num_processors;
extern void arch_set_mikc_queue(void *r, void *w);
ihk_ikc_ph_t arch_master_channel_packet_handler;
@ -23,22 +24,28 @@ int ihk_mc_ikc_init_first_local(struct ihk_ikc_channel_desc *channel,
ihk_ikc_ph_t packet_handler)
{
struct ihk_ikc_queue_head *rq, *wq;
size_t mikc_queue_pages;
ihk_ikc_system_init(NULL);
memset(channel, 0, sizeof(struct ihk_ikc_channel_desc));
/* Place both sides in this side */
rq = arch_alloc_page(IHK_MC_AP_CRITICAL);
wq = arch_alloc_page(IHK_MC_AP_CRITICAL);
mikc_queue_pages = ((num_processors * MASTER_IKCQ_PKTSIZE)
+ (PAGE_SIZE - 1)) / PAGE_SIZE;
ihk_ikc_init_queue(rq, 0, 0, PAGE_SIZE, MASTER_IKCQ_PKTSIZE);
ihk_ikc_init_queue(wq, 0, 0, PAGE_SIZE, MASTER_IKCQ_PKTSIZE);
/* Place both sides in this side */
rq = ihk_mc_alloc_pages(mikc_queue_pages, IHK_MC_AP_CRITICAL);
wq = ihk_mc_alloc_pages(mikc_queue_pages, IHK_MC_AP_CRITICAL);
ihk_ikc_init_queue(rq, 0, 0,
mikc_queue_pages * PAGE_SIZE, MASTER_IKCQ_PKTSIZE);
ihk_ikc_init_queue(wq, 0, 0,
mikc_queue_pages * PAGE_SIZE, MASTER_IKCQ_PKTSIZE);
arch_master_channel_packet_handler = packet_handler;
ihk_ikc_init_desc(channel, IKC_OS_HOST, 0, rq, wq,
ihk_ikc_master_channel_packet_handler);
ihk_ikc_master_channel_packet_handler, channel);
ihk_ikc_enable_channel(channel);
/* Set boot parameter */

View File

@ -12,19 +12,72 @@
#include <errno.h>
#include <ihk/debug.h>
#include <registers.h>
#include <mc_perf_event.h>
extern unsigned int *x86_march_perfmap;
extern int running_on_kvm(void);
#define X86_CR4_PCE 0x00000100
int perf_counters_discovered = 0;
int X86_IA32_NUM_PERF_COUNTERS = 0;
unsigned long X86_IA32_PERF_COUNTERS_MASK = 0;
int X86_IA32_NUM_FIXED_PERF_COUNTERS = 0;
unsigned long X86_IA32_FIXED_PERF_COUNTERS_MASK = 0;
void x86_init_perfctr(void)
{
int i = 0;
unsigned long reg;
unsigned long value = 0;
uint64_t op;
uint64_t eax;
uint64_t ebx;
uint64_t ecx;
uint64_t edx;
/* Do not do it on KVM */
if (running_on_kvm()) return;
/* Allow PMC to be read from user space */
asm volatile("movq %%cr4, %0" : "=r"(reg));
reg |= X86_CR4_PCE;
asm volatile("movq %0, %%cr4" : : "r"(reg));
/* Detect number of supported performance counters */
if (!perf_counters_discovered) {
/* See Table 35.2 - Architectural MSRs in Vol 3C */
op = 0x0a;
asm volatile("cpuid" : "=a"(eax),"=b"(ebx),"=c"(ecx),"=d"(edx):"a"(op));
X86_IA32_NUM_PERF_COUNTERS = ((eax & 0xFF00) >> 8);
X86_IA32_PERF_COUNTERS_MASK = (1 << X86_IA32_NUM_PERF_COUNTERS) - 1;
X86_IA32_NUM_FIXED_PERF_COUNTERS = (edx & 0x0F);
X86_IA32_FIXED_PERF_COUNTERS_MASK =
((1UL << X86_IA32_NUM_FIXED_PERF_COUNTERS) - 1) <<
X86_IA32_BASE_FIXED_PERF_COUNTERS;
perf_counters_discovered = 1;
kprintf("X86_IA32_NUM_PERF_COUNTERS: %d, X86_IA32_NUM_FIXED_PERF_COUNTERS: %d\n",
X86_IA32_NUM_PERF_COUNTERS, X86_IA32_NUM_FIXED_PERF_COUNTERS);
}
/* Clear Fixed Counter Control */
value = rdmsr(MSR_PERF_FIXED_CTRL);
value &= 0xfffffffffffff000L;
wrmsr(MSR_PERF_FIXED_CTRL, value);
/* Clear Generic Counter Control */
for(i = 0; i < X86_IA32_NUM_PERF_COUNTERS; i++) {
wrmsr(MSR_IA32_PERFEVTSEL0 + i, 0);
}
/* Enable PMC Control */
value = rdmsr(MSR_PERF_GLOBAL_CTRL);
value |= X86_IA32_PERF_COUNTERS_MASK;
value |= X86_IA32_FIXED_PERF_COUNTERS_MASK;
wrmsr(MSR_PERF_GLOBAL_CTRL, value);
}
static int set_perfctr_x86_direct(int counter, int mode, unsigned int value)
@ -33,20 +86,53 @@ static int set_perfctr_x86_direct(int counter, int mode, unsigned int value)
return -EINVAL;
}
if (mode & PERFCTR_USER_MODE) {
// clear mode flags
value &= ~(3 << 16);
// set mode flags
if(mode & PERFCTR_USER_MODE) {
value |= 1 << 16;
}
if (mode & PERFCTR_KERNEL_MODE) {
}
if(mode & PERFCTR_KERNEL_MODE) {
value |= 1 << 17;
}
}
// wrmsr(MSR_PERF_GLOBAL_CTRL, 0);
value |= (1 << 22) | (1 << 18); /* EN */
value |= (1 << 20); /* Enable overflow interrupt */
wrmsr(MSR_IA32_PERFEVTSEL0 + counter, value);
kprintf("wrmsr: %d <= %x\n", MSR_PERF_GLOBAL_CTRL, 0);
kprintf("wrmsr: %d <= %x\n", MSR_IA32_PERFEVTSEL0 + counter, value);
//kprintf("wrmsr: %d <= %x\n", MSR_PERF_GLOBAL_CTRL, 0);
//kprintf("wrmsr: %d <= %x\n", MSR_IA32_PERFEVTSEL0 + counter, value);
return 0;
}
static int set_pmc_x86_direct(int counter, long val)
{
unsigned long cnt_bit = 0;
if (counter < 0) {
return -EINVAL;
}
val &= 0x000000ffffffffff; // 40bit Mask
cnt_bit = 1UL << counter;
if ( cnt_bit & X86_IA32_PERF_COUNTERS_MASK ) {
// set generic pmc
wrmsr(MSR_IA32_PMC0 + counter, val);
}
else if ( cnt_bit & X86_IA32_FIXED_PERF_COUNTERS_MASK ) {
// set fixed pmc
wrmsr(MSR_IA32_FIXED_CTR0 + counter - X86_IA32_BASE_FIXED_PERF_COUNTERS, val);
}
else {
return -EINVAL;
}
return 0;
}
@ -57,6 +143,45 @@ static int set_perfctr_x86(int counter, int event, int mask, int inv, int count,
CVAL2(event, mask, inv, count));
}
static int set_fixed_counter(int counter, int mode)
{
unsigned long value = 0;
unsigned int ctr_mask = 0xf;
int counter_idx = counter - X86_IA32_BASE_FIXED_PERF_COUNTERS ;
unsigned int set_val = 0;
if (counter_idx < 0 || counter_idx >= X86_IA32_NUM_FIXED_PERF_COUNTERS) {
return -EINVAL;
}
// clear specified fixed counter info
value = rdmsr(MSR_PERF_FIXED_CTRL);
ctr_mask <<= counter_idx * 4;
value &= ~ctr_mask;
if (mode & PERFCTR_USER_MODE) {
set_val |= 1 << 1;
}
if (mode & PERFCTR_KERNEL_MODE) {
set_val |= 1;
}
set_val <<= counter_idx * 4;
value |= set_val;
wrmsr(MSR_PERF_FIXED_CTRL, value);
return 0;
}
int ihk_mc_perfctr_init_raw(int counter, unsigned int code, int mode)
{
if (counter < 0 || counter >= X86_IA32_NUM_PERF_COUNTERS) {
return -EINVAL;
}
return set_perfctr_x86_direct(counter, mode, code);
}
int ihk_mc_perfctr_init(int counter, enum ihk_perfctr_type type, int mode)
{
if (counter < 0 || counter >= X86_IA32_NUM_PERF_COUNTERS) {
@ -78,14 +203,15 @@ extern void x86_march_perfctr_start(unsigned long counter_mask);
int ihk_mc_perfctr_start(unsigned long counter_mask)
{
unsigned int value = 0;
unsigned long value = 0;
unsigned long mask = X86_IA32_PERF_COUNTERS_MASK | X86_IA32_FIXED_PERF_COUNTERS_MASK;
#ifdef HAVE_MARCH_PERFCTR_START
x86_march_perfctr_start(counter_mask);
#endif
counter_mask &= ((1 << X86_IA32_NUM_PERF_COUNTERS) - 1);
counter_mask &= mask;
value = rdmsr(MSR_PERF_GLOBAL_CTRL);
value |= counter_mask;
value |= counter_mask;
wrmsr(MSR_PERF_GLOBAL_CTRL, value);
return 0;
@ -93,25 +219,78 @@ int ihk_mc_perfctr_start(unsigned long counter_mask)
int ihk_mc_perfctr_stop(unsigned long counter_mask)
{
unsigned int value;
unsigned long value;
unsigned long mask = X86_IA32_PERF_COUNTERS_MASK | X86_IA32_FIXED_PERF_COUNTERS_MASK;
counter_mask &= ((1 << X86_IA32_NUM_PERF_COUNTERS) - 1);
counter_mask &= mask;
value = rdmsr(MSR_PERF_GLOBAL_CTRL);
value &= ~counter_mask;
wrmsr(MSR_PERF_GLOBAL_CTRL, value);
if(counter_mask >> 32 & 0x1) {
value = rdmsr(MSR_PERF_FIXED_CTRL);
value &= ~(0xf);
wrmsr(MSR_PERF_FIXED_CTRL, value);
}
if(counter_mask >> 32 & 0x2) {
value = rdmsr(MSR_PERF_FIXED_CTRL);
value &= ~(0xf << 4);
wrmsr(MSR_PERF_FIXED_CTRL, value);
}
if(counter_mask >> 32 & 0x4) {
value = rdmsr(MSR_PERF_FIXED_CTRL);
value &= ~(0xf << 8);
wrmsr(MSR_PERF_FIXED_CTRL, value);
}
return 0;
}
// init for fixed counter
int ihk_mc_perfctr_fixed_init(int counter, int mode)
{
unsigned long value = 0;
unsigned int ctr_mask = 0xf;
int counter_idx = counter - X86_IA32_BASE_FIXED_PERF_COUNTERS ;
unsigned int set_val = 0;
if (counter_idx < 0 || counter_idx >= X86_IA32_NUM_FIXED_PERF_COUNTERS) {
return -EINVAL;
}
// clear specified fixed counter info
value = rdmsr(MSR_PERF_FIXED_CTRL);
ctr_mask <<= counter_idx * 4;
value &= ~ctr_mask;
if (mode & PERFCTR_USER_MODE) {
set_val |= 1 << 1;
}
if (mode & PERFCTR_KERNEL_MODE) {
set_val |= 1;
}
// enable PMI on overflow
set_val |= 1 << 3;
set_val <<= counter_idx * 4;
value |= set_val;
wrmsr(MSR_PERF_FIXED_CTRL, value);
return 0;
}
int ihk_mc_perfctr_reset(int counter)
{
if (counter < 0 || counter >= X86_IA32_NUM_PERF_COUNTERS) {
return -EINVAL;
}
return set_pmc_x86_direct(counter, 0);
}
wrmsr(MSR_IA32_PMC0 + counter, 0);
return 0;
int ihk_mc_perfctr_set(int counter, long val)
{
return set_pmc_x86_direct(counter, val);
}
int ihk_mc_perfctr_read_mask(unsigned long counter_mask, unsigned long *value)
@ -129,10 +308,87 @@ int ihk_mc_perfctr_read_mask(unsigned long counter_mask, unsigned long *value)
unsigned long ihk_mc_perfctr_read(int counter)
{
if (counter < 0 || counter >= X86_IA32_NUM_PERF_COUNTERS) {
unsigned long retval = 0;
unsigned long cnt_bit = 0;
if (counter < 0) {
return -EINVAL;
}
return rdpmc(counter);
cnt_bit = 1UL << counter;
if ( cnt_bit & X86_IA32_PERF_COUNTERS_MASK ) {
// read generic pmc
retval = rdpmc(counter);
}
else if ( cnt_bit & X86_IA32_FIXED_PERF_COUNTERS_MASK ) {
// read fixed pmc
retval = rdpmc((1 << 30) + (counter - X86_IA32_BASE_FIXED_PERF_COUNTERS));
}
else {
retval = -EINVAL;
}
return retval;
}
// read by rdmsr
unsigned long ihk_mc_perfctr_read_msr(int counter)
{
unsigned int idx = 0;
unsigned long retval = 0;
unsigned long cnt_bit = 0;
if (counter < 0) {
return -EINVAL;
}
cnt_bit = 1UL << counter;
if ( cnt_bit & X86_IA32_PERF_COUNTERS_MASK ) {
// read generic pmc
idx = MSR_IA32_PMC0 + counter;
retval = (unsigned long) rdmsr(idx);
}
else if ( cnt_bit & X86_IA32_FIXED_PERF_COUNTERS_MASK ) {
// read fixed pmc
idx = MSR_IA32_FIXED_CTR0 + counter;
retval = (unsigned long) rdmsr(idx);
}
else {
retval = -EINVAL;
}
return retval;
}
int ihk_mc_perfctr_alloc_counter(unsigned int *type, unsigned long *config, unsigned long pmc_status)
{
int ret = -1;
int i = 0;
if(*type == PERF_TYPE_HARDWARE) {
switch(*config){
case PERF_COUNT_HW_INSTRUCTIONS :
*type = PERF_TYPE_RAW;
*config = 0x5300c0;
break;
default :
// Unexpected config
return -1;
}
}
else if(*type != PERF_TYPE_RAW) {
return -1;
}
// find avail generic counter
for(i = 0; i < X86_IA32_NUM_PERF_COUNTERS; i++) {
if(!(pmc_status & (1 << i))) {
ret = i;
break;
}
}
return ret;
}

File diff suppressed because it is too large Load Diff

View File

@ -5,6 +5,8 @@
* implements x86's vsyscall
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com> \par
* Copyright (C) 2014 Hitachi, Ltd.
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com> \par
* Copyright (C) 2015 RIKEN AICS
*/
/*
* HISTORY:
@ -16,20 +18,93 @@
*/
#include <syscall.h>
#include <ihk/atomic.h>
#include <arch/cpu.h>
extern int vsyscall_gettimeofday(void *tv, void *tz)
extern int vsyscall_gettimeofday(struct timeval *tv, void *tz)
__attribute__ ((section (".vsyscall.gettimeofday")));
int vsyscall_gettimeofday(void *tv, void *tz)
struct tod_data_s tod_data
__attribute__ ((section(".vsyscall.gettimeofday.data"))) = {
.do_local = 0,
.version = IHK_ATOMIC64_INIT(0),
};
static inline void cpu_pause_for_vsyscall(void)
{
asm volatile ("pause" ::: "memory");
return;
} /* cpu_pause_for_vsyscall() */
static inline void calculate_time_from_tsc(struct timespec *ts)
{
long ver;
unsigned long current_tsc;
__time_t sec_delta;
long ns_delta;
for (;;) {
while ((ver = ihk_atomic64_read(&tod_data.version)) & 1) {
/* settimeofday() is in progress */
cpu_pause_for_vsyscall();
}
rmb();
*ts = tod_data.origin;
rmb();
if (ver == ihk_atomic64_read(&tod_data.version)) {
break;
}
/* settimeofday() has intervened */
cpu_pause_for_vsyscall();
}
current_tsc = rdtsc();
sec_delta = current_tsc / tod_data.clocks_per_sec;
ns_delta = NS_PER_SEC * (current_tsc % tod_data.clocks_per_sec)
/ tod_data.clocks_per_sec;
/* calc. of ns_delta overflows if clocks_per_sec exceeds 18.44 GHz */
ts->tv_sec += sec_delta;
ts->tv_nsec += ns_delta;
if (ts->tv_nsec >= NS_PER_SEC) {
ts->tv_nsec -= NS_PER_SEC;
++ts->tv_sec;
}
return;
} /* calculate_time_from_tsc() */
int vsyscall_gettimeofday(struct timeval *tv, void *tz)
{
int error;
struct timespec ats;
if (!tv && !tz) {
/* nothing to do */
return 0;
}
/* Do it locally if supported */
if (!tz && tod_data.do_local) {
calculate_time_from_tsc(&ats);
tv->tv_sec = ats.tv_sec;
tv->tv_usec = ats.tv_nsec / 1000;
return 0;
}
/* Otherwise syscall */
asm ("syscall" : "=a" (error)
: "a" (__NR_gettimeofday), "D" (tv), "S" (tz)
: "%rcx", "%r11", "memory");
if (error) {
*(int *)0 = 0; /* i.e. raise(SIGSEGV) */
}
return error;
}
} /* vsyscall_gettimeofday() */
extern long vsyscall_time(void *tp)
__attribute__ ((section (".vsyscall.time")));
@ -58,3 +133,17 @@ long vsyscall_time(void *tp)
return t;
}
extern int vsyscall_getcpu(unsigned *cpup, unsigned *nodep, void *tcachep)
__attribute__ ((section (".vsyscall.getcpu")));
int vsyscall_getcpu(unsigned *cpup, unsigned *nodep, void *tcachep)
{
int error;
asm ("syscall" : "=a" (error)
: "a" (__NR_getcpu), "D" (cpup), "S" (nodep), "d" (tcachep)
: "%rcx", "%r11", "memory");
return error;
}

View File

@ -0,0 +1,28 @@
# irqbalance is a daemon process that distributes interrupts across
# CPUS on SMP systems. The default is to rebalance once every 10
# seconds. This is the environment file that is specified to systemd via the
# EnvironmentFile key in the service unit file (or via whatever method the init
# system you're using has.
#
# ONESHOT=yes
# after starting, wait for a minute, then look at the interrupt
# load and balance it once; after balancing exit and do not change
# it again.
#IRQBALANCE_ONESHOT=
#
# IRQBALANCE_BANNED_CPUS
# 64 bit bitmask which allows you to indicate which cpu's should
# be skipped when reblancing irqs. Cpu numbers which have their
# corresponding bits set to one in this mask will not have any
# irq's assigned to them on rebalance
#
IRQBALANCE_BANNED_CPUS=%mask%
#
# IRQBALANCE_ARGS
# append any args here to the irqbalance daemon as documented in the man page
#
IRQBALANCE_ARGS=--banirq=%banirq%

View File

@ -0,0 +1,10 @@
[Unit]
Description=irqbalance daemon
After=syslog.target
[Service]
EnvironmentFile=@ETCDIR@/irqbalance_mck
ExecStart=/usr/sbin/irqbalance --foreground $IRQBALANCE_ARGS
[Install]
WantedBy=multi-user.target

View File

@ -0,0 +1,46 @@
#!/bin/bash -x
# \file arch/x86/tools/mcreboot-builtin-x86.sh.in
# License details are found in the file LICENSE.
# \brief
# mckernel boot script
# \author Masamichi Takagi <masamichi.takagi@riken.jp> \par
# Copyright (C) 2014 RIKEN AICS
# HISTORY:
#
prefix="@prefix@"
BINDIR="@BINDIR@"
SBINDIR="@SBINDIR@"
KMODDIR="@KMODDIR@"
KERNDIR="@KERNDIR@"
kill -9 `pidof mcexec`
if lsmod | grep mcctrl > /dev/null 2>&1; then
rmmod mcctrl || exit 1
fi
if lsmod | grep dcfa > /dev/null 2>&1; then
rmmod dcfa || exit 1
fi
if lsmod | grep ihk_builtin > /dev/null 2>&1; then
rmmod ihk_builtin || exit 1
fi
if lsmod | grep ihk > /dev/null 2>&1; then
rmmod ihk || exit 1
fi
insmod "$KMODDIR/ihk.ko" &&
insmod "$KMODDIR/ihk_builtin.ko" &&
"$SBINDIR/ihkconfig" 0 create &&
NCORE=`dmesg | grep -E 'SHIMOS: CPU Status:'|awk '{split($0,a," "); for (i = 1; i <= length(a); i++) { if(a[i] ~ /2/) {count++}} print count;}'`
MEM=`free -g | grep -E 'Mem:' | awk '{print int($2/4)}'`
"$SBINDIR/ihkosctl" 0 alloc "$NCORE" "$MEM"g &&
"$SBINDIR/ihkosctl" 0 load "$KERNDIR/mckernel.img" &&
"$SBINDIR/ihkosctl" 0 kargs hidos osnum=0 &&
"$SBINDIR/ihkosctl" 0 boot &&
sleep 1 &&
"$SBINDIR/ihkosctl" 0 kmsg &&
insmod "$KMODDIR/mcctrl.ko" &&
sleep 1 &&
"$SBINDIR/ihkosctl" 0 kmsg &&
exit 0

View File

@ -0,0 +1,475 @@
#!/bin/bash
# IHK SMP-x86 example boot script.
# author: Balazs Gerofi <bgerofi@riken.jp>
# Copyright (C) 2014 RIKEN AICS
#
# This is an example script for loading IHK, configuring a partition and
# booting McKernel on it. Unless specific CPUs and memory are requested,
# the script reserves half of the CPU cores and 512MB of RAM from
# NUMA node 0 when IHK is loaded for the first time.
# Otherwise, it destroys the current McKernel instance and reboots it using
# the same set of resources as it used previously.
# Note that the script does not output anything unless an error occurs.
prefix="@prefix@"
BINDIR="${prefix}/bin"
SBINDIR="${prefix}/sbin"
ETCDIR=@ETCDIR@
KMODDIR="${prefix}/kmod"
KERNDIR="${prefix}/@TARGET@/kernel"
ENABLE_MCOVERLAYFS="@ENABLE_MCOVERLAYFS@"
mem="512M@0"
cpus=""
if [ "${BASH_VERSINFO[0]}" -lt 4 ]; then
echo "You need at least bash-4.0 to run this script." >&2
exit 1
fi
INTERVAL=1
LOGMODE=0
facility="LOG_LOCAL6"
chown_option=`logname 2> /dev/null`
if [ "`systemctl status irqbalance_mck.service 2> /dev/null |grep -E 'Active: active'`" != "" -o "`systemctl status irqbalance.service 2> /dev/null |grep -E 'Active: active'`" != "" ]; then
irqbalance_used="yes"
else
irqbalance_used="no"
fi
turbo=""
while getopts :ti:k:c:m:o:f: OPT
do
case ${OPT} in
f) facility=${OPTARG}
;;
o) chown_option=${OPTARG}
;;
i) INTERVAL=${OPTARG}
expr "${INTERVAL}" + 1 > /dev/null 2>&1
if [ $? -ge 2 ]
then
echo "invalid -i value" >&2
exit 1
fi
if [ ${INTERVAL} -le 0 ]
then
echo "invalid -i value" >&2
exit 1
fi
;;
k) LOGMODE=${OPTARG}
expr "${LOGMODE}" + 1 > /dev/null 2>&1
if [ $? -ge 2 ]
then
echo "invalid -k value" >&2
exit 1
fi
if [ ${LOGMODE} -lt 0 -o ${LOGMODE} -gt 2 ]
then
echo "invalid -k value" >&2
exit 1
fi
;;
c) cpus=${OPTARG}
;;
m) mem=${OPTARG}
;;
t) turbo="turbo"
;;
*) echo "invalid option -${OPT}" >&2
exit 1
esac
done
#
# Revert any state that has been initialized before the error occured.
#
error_exit() {
local status=$1
case $status in
mcos_sys_mounted)
if [ "$enable_mcoverlay" == "yes" ]; then
umount /tmp/mcos/mcos0_sys
fi
;&
mcos_proc_mounted)
if [ "$enable_mcoverlay" == "yes" ]; then
umount /tmp/mcos/mcos0_proc
fi
;&
mcoverlayfs_loaded)
if [ "$enable_mcoverlay" == "yes" ]; then
rmmod mcoverlay
fi
;&
linux_proc_bind_mounted)
if [ "$enable_mcoverlay" == "yes" ]; then
umount /tmp/mcos/linux_proc
fi
;&
tmp_mcos_mounted)
if [ "$enable_mcoverlay" == "yes" ]; then
umount /tmp/mcos
fi
;&
tmp_mcos_created)
if [ "$enable_mcoverlay" == "yes" ]; then
rm -rf /tmp/mcos
fi
;&
os_created)
# Destroy all LWK instances
if ls /dev/mcos* 1>/dev/null 2>&1; then
for i in /dev/mcos*; do
ind=`echo $i|cut -c10-`;
if ! ${SBINDIR}/ihkconfig 0 destroy $ind; then
echo "warning: failed to destroy LWK instance $ind" >&2
fi
done
fi
;&
mcctrl_loaded)
rmmod mcctrl || echo "warning: failed to remove mcctrl" >&2
;&
mem_reserved)
mem=`${SBINDIR}/ihkconfig 0 query mem`
if [ "${mem}" != "" ]; then
if ! ${SBINDIR}/ihkconfig 0 release mem $mem > /dev/null; then
echo "warning: failed to release memory" >&2
fi
fi
;&
cpus_reserved)
cpus=`${SBINDIR}/ihkconfig 0 query cpu`
if [ "${cpus}" != "" ]; then
if ! ${SBINDIR}/ihkconfig 0 release cpu $cpus > /dev/null; then
echo "warning: failed to release CPUs" >&2
fi
fi
;&
ihk_smp_loaded)
rmmod ihk_smp_x86 || echo "warning: failed to remove ihk_smp_x86" >&2
;&
ihk_loaded)
rmmod ihk || echo "warning: failed to remove ihk" >&2
;&
irqbalance_stopped)
if [ "`systemctl status irqbalance_mck.service 2> /dev/null |grep -E 'Active: active'`" != "" ]; then
if ! systemctl stop irqbalance_mck.service 2>/dev/null; then
echo "warning: failed to stop irqbalance_mck" >&2
fi
if ! systemctl disable irqbalance_mck.service >/dev/null 2>/dev/null; then
echo "warning: failed to disable irqbalance_mck" >&2
fi
if ! etcdir=@ETCDIR@ perl -e '$etcdir=$ENV{'etcdir'}; @files = grep { -f } glob "$etcdir/proc/irq/*/smp_affinity"; foreach $file (@files) { $dest = substr($file, length($etcdir)); if(0) {print "cp $file $dest\n";} system("cp $file $dest 2>/dev/null"); }'; then
echo "warning: failed to restore /proc/irq/*/smp_affinity" >&2
fi
if ! systemctl start irqbalance.service; then
echo "warning: failed to start irqbalance" >&2;
fi
fi
;&
initial)
# Nothing more to revert
;;
esac
exit 1
}
ihk_ikc_irq_core=0
release=`uname -r`
major=`echo ${release} | sed -e 's/^\([0-9]*\).*/\1/'`
minor=`echo ${release} | sed -e 's/^[0-9]*.\([0-9]*\).*/\1/'`
patch=`echo ${release} | sed -e 's/^[0-9]*.[0-9]*.\([0-9]*\).*/\1/'`
linux_version_code=`expr \( ${major} \* 65536 \) + \( ${minor} \* 256 \) + ${patch}`
rhel_release=`echo ${release} | sed -e 's/^[0-9]*.[0-9]*.[0-9]*-\([0-9]*\).*/\1/'`
if [ "${release}" == "${rhel_release}" ]; then
rhel_release="";
fi
enable_mcoverlay="no"
if [ "${ENABLE_MCOVERLAYFS}" == "yes" ]; then
if [ "${rhel_release}" == "" ]; then
if [ ${linux_version_code} -ge 262144 -a ${linux_version_code} -lt 262400 ]; then
enable_mcoverlay="yes"
fi
if [ ${linux_version_code} -ge 263680 -a ${linux_version_code} -lt 263936 ]; then
enable_mcoverlay="yes"
fi
else
if [ ${linux_version_code} -eq 199168 -a ${rhel_release} -ge 327 ]; then
enable_mcoverlay="yes"
fi
fi
fi
# Figure out CPUs if not requested by user
if [ "$cpus" == "" ]; then
# Get the number of CPUs on NUMA node 0
nr_cpus=`lscpu --parse | awk -F"," '{if ($4 == 0) print $4}' | wc -l`
# Use the second half of the cores
let nr_cpus="$nr_cpus / 2"
cpus=`lscpu --parse | awk -F"," '{if ($4 == 0) print $1}' | tail -n $nr_cpus | xargs echo -n | sed 's/ /,/g'`
if [ "$cpus" == "" ]; then
echo "error: no available CPUs on NUMA node 0?" >&2
exit 1
fi
fi
# Remove mcoverlay if loaded
if [ "$enable_mcoverlay" == "yes" ]; then
if [ "`lsmod | grep mcoverlay`" != "" ]; then
if [ "`cat /proc/mounts | grep /tmp/mcos/mcos0_sys`" != "" ]; then umount -l /tmp/mcos/mcos0_sys; fi
if [ "`cat /proc/mounts | grep /tmp/mcos/mcos0_proc`" != "" ]; then umount -l /tmp/mcos/mcos0_proc; fi
if [ "`cat /proc/mounts | grep /tmp/mcos/linux_proc`" != "" ]; then umount -l /tmp/mcos/linux_proc; fi
if [ "`cat /proc/mounts | grep /tmp/mcos`" != "" ]; then umount -l /tmp/mcos; fi
if [ -e /tmp/mcos ]; then rm -rf /tmp/mcos; fi
if ! rmmod mcoverlay; then
echo "error: removing mcoverlay" >&2
error_exit "initial"
fi
fi
fi
# Stop irqbalance
if [ "${irqbalance_used}" == "yes" ]; then
systemctl stop irqbalance_mck.service 2>/dev/null
if ! systemctl stop irqbalance.service 2>/dev/null ; then
echo "error: stopping irqbalance" >&2
error_exit "initial"
fi;
fi
# Start mcklogd. Note that McKernel blocks when kmsg buffer is full
# with '-k 1' until mcklogd unblocks it so starting mcklogd must preceed
# booting McKernel
if [ ${LOGMODE} -ne 0 ]; then
# Stop mcklogd which has survived McKernel shutdown because
# mcstop+release.sh is not used
pkill mcklogd
SBINDIR=${SBINDIR} ${SBINDIR}/mcklogd -i ${INTERVAL} -f ${facility}
fi
# Load IHK if not loaded
if [ "`lsmod | grep ihk`" == "" ]; then
if ! insmod ${KMODDIR}/ihk.ko; then
echo "error: loading ihk" >&2
error_exit "irqbalance_stopped"
fi
fi
# Drop Linux caches to free memory
sync && echo 3 > /proc/sys/vm/drop_caches
# Merge free memory areas into large, physically contigous ones
echo 1 > /proc/sys/vm/compact_memory 2>/dev/null
# Load IHK-SMP if not loaded and reserve CPUs and memory
if [ "`lsmod | grep ihk_smp_x86`" == "" ]; then
ihk_irq=""
for i in `seq 64 255`; do
if [ ! -d /proc/irq/$i ] && [ "`cat /proc/interrupts | grep ":" | awk '{print $1}' | grep -o '[0-9]*' | grep -e '^$i$'`" == "" ]; then
ihk_irq=$i
break
fi
done
if [ "$ihk_irq" == "" ]; then
echo "error: no IRQ available" >&2
error_exit "ihk_loaded"
fi
if ! insmod ${KMODDIR}/ihk-smp-x86.ko ihk_start_irq=$ihk_irq ihk_ikc_irq_core=$ihk_ikc_irq_core; then
echo "error: loading ihk-smp-x86" >&2
error_exit "ihk_loaded"
fi
if ! ${SBINDIR}/ihkconfig 0 reserve cpu ${cpus}; then
echo "error: reserving CPUs" >&2;
error_exit "ihk_smp_loaded"
fi
if ! ${SBINDIR}/ihkconfig 0 reserve mem ${mem}; then
echo "error: reserving memory" >&2
error_exit "cpus_reserved"
fi
fi
# Load mcctrl if not loaded
if [ "`lsmod | grep mcctrl`" == "" ]; then
if ! insmod ${KMODDIR}/mcctrl.ko; then
echo "error: inserting mcctrl.ko" >&2
error_exit "mem_reserved"
fi
fi
# Destroy all LWK instances
if ls /dev/mcos* 1>/dev/null 2>&1; then
for i in /dev/mcos*; do
ind=`echo $i|cut -c10-`;
if ! ${SBINDIR}/ihkconfig 0 destroy $ind; then
echo "error: destroying LWK instance $ind failed" >&2
error_exit "mcctrl_loaded"
fi
done
fi
# Create OS instance
if ! ${SBINDIR}/ihkconfig 0 create; then
echo "error: creating OS instance" >&2
error_exit "mcctrl_loaded"
fi
# Assign CPUs
if ! ${SBINDIR}/ihkosctl 0 assign cpu ${cpus}; then
echo "error: assign CPUs" >&2
error_exit "os_created"
fi
# Assign memory
if ! ${SBINDIR}/ihkosctl 0 assign mem ${mem}; then
echo "error: assign memory" >&2
error_exit "os_created"
fi
# Load kernel image
if ! ${SBINDIR}/ihkosctl 0 load ${KERNDIR}/mckernel.img; then
echo "error: loading kernel image: ${KERNDIR}/mckernel.img" >&2
error_exit "os_created"
fi
# Set kernel arguments
if ! ${SBINDIR}/ihkosctl 0 kargs "hidos ksyslogd=${LOGMODE} $turbo"; then
echo "error: setting kernel arguments" >&2
error_exit "os_created"
fi
# Boot OS instance
if ! ${SBINDIR}/ihkosctl 0 boot; then
echo "error: booting" >&2
error_exit "os_created"
fi
# Set device file ownership
if ! chown ${chown_option} /dev/mcd* /dev/mcos*; then
echo "warning: failed to chown device files" >&2
fi
# Overlay /proc, /sys with McKernel specific contents
if [ "$enable_mcoverlay" == "yes" ]; then
if [ ! -e /tmp/mcos ]; then mkdir -p /tmp/mcos; fi
if ! mount -t tmpfs tmpfs /tmp/mcos; then
echo "error: mount /tmp/mcos" >&2
error_exit "tmp_mcos_created"
fi
if [ ! -e /tmp/mcos/linux_proc ]; then mkdir -p /tmp/mcos/linux_proc; fi
if ! mount --bind /proc /tmp/mcos/linux_proc; then
echo "error: mount /tmp/mcos/linux_proc" >&2
error_exit "tmp_mcos_mounted"
fi
if ! insmod ${KMODDIR}/mcoverlay.ko; then
echo "error: inserting mcoverlay.ko" >&2
error_exit "linux_proc_bind_mounted"
fi
while [ ! -e /proc/mcos0 ]
do
sleep 0.1
done
if [ ! -e /tmp/mcos/mcos0_proc ]; then mkdir -p /tmp/mcos/mcos0_proc; fi
if [ ! -e /tmp/mcos/mcos0_proc_upper ]; then mkdir -p /tmp/mcos/mcos0_proc_upper; fi
if [ ! -e /tmp/mcos/mcos0_proc_work ]; then mkdir -p /tmp/mcos/mcos0_proc_work; fi
if ! mount -t mcoverlay mcoverlay -o lowerdir=/proc/mcos0:/proc,upperdir=/tmp/mcos/mcos0_proc_upper,workdir=/tmp/mcos/mcos0_proc_work,nocopyupw,nofscheck /tmp/mcos/mcos0_proc; then
echo "error: mounting /tmp/mcos/mcos0_proc" >&2
error_exit "mcoverlayfs_loaded"
fi
# TODO: How de we revert this in case of failure??
mount --make-rprivate /proc
while [ ! -e /sys/devices/virtual/mcos/mcos0/sys/setup_complete ]
do
sleep 0.1
done
if [ ! -e /tmp/mcos/mcos0_sys ]; then mkdir -p /tmp/mcos/mcos0_sys; fi
if [ ! -e /tmp/mcos/mcos0_sys_upper ]; then mkdir -p /tmp/mcos/mcos0_sys_upper; fi
if [ ! -e /tmp/mcos/mcos0_sys_work ]; then mkdir -p /tmp/mcos/mcos0_sys_work; fi
if ! mount -t mcoverlay mcoverlay -o lowerdir=/sys/devices/virtual/mcos/mcos0/sys:/sys,upperdir=/tmp/mcos/mcos0_sys_upper,workdir=/tmp/mcos/mcos0_sys_work,nocopyupw,nofscheck /tmp/mcos/mcos0_sys; then
echo "error: mount /tmp/mcos/mcos0_sys" >&2
error_exit "mcos_proc_mounted"
fi
# TODO: How de we revert this in case of failure??
mount --make-rprivate /sys
rm -rf /tmp/mcos/mcos0_sys/setup_complete
# Hide NUMA related files which are outside the LWK partition
for cpuid in `find /sys/devices/system/cpu/* -maxdepth 0 -name "cpu[0123456789]*" -printf "%f "`; do
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/devices/system/cpu/$cpuid" ]; then
rm -rf /tmp/mcos/mcos0_sys/devices/system/cpu/$cpuid
rm -rf /tmp/mcos/mcos0_sys/bus/cpu/devices/$cpuid
rm -rf /tmp/mcos/mcos0_sys/bus/cpu/drivers/processor/$cpuid
else
for nodeid in `find /sys/devices/system/cpu/$cpuid/* -maxdepth 0 -name "node[0123456789]*" -printf "%f "`; do
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/devices/system/cpu/$cpuid/$nodeid" ]; then
rm -f /tmp/mcos/mcos0_sys/devices/system/cpu/$cpuid/$nodeid
fi
done
fi
done
for nodeid in `find /sys/devices/system/node/* -maxdepth 0 -name "node[0123456789]*" -printf "%f "`; do
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/devices/system/node/$nodeid" ]; then
rm -rf /tmp/mcos/mcos0_sys/devices/system/node/$nodeid/*
rm -rf /tmp/mcos/mcos0_sys/bus/node/devices/$nodeid
else
# Delete non-existent symlinks
for cpuid in `find /sys/devices/system/node/$nodeid/* -maxdepth 0 -name "cpu[0123456789]*" -printf "%f "`; do
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/devices/system/node/$nodeid/$cpuid" ]; then
rm -f /tmp/mcos/mcos0_sys/devices/system/node/$nodeid/$cpuid
fi
done
rm -f /tmp/mcos/mcos0_sys/devices/system/node/$nodeid/memory*
fi
done
rm -f /tmp/mcos/mcos0_sys/devices/system/node/has_*
for cpuid in `find /sys/bus/cpu/devices/* -maxdepth 0 -name "cpu[0123456789]*" -printf "%f "`; do
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/bus/cpu/devices/$cpuid" ]; then
rm -rf /tmp/mcos/mcos0_sys/bus/cpu/devices/$cpuid
fi
done
fi
# Start irqbalance with CPUs and IRQ for McKernel banned
if [ "${irqbalance_used}" == "yes" ]; then
if ! etcdir=@ETCDIR@ perl -e 'use File::Copy qw(copy); $etcdir=$ENV{'etcdir'}; @files = grep { -f } glob "/proc/irq/*/smp_affinity"; foreach $file (@files) { $rel = substr($file, 1); $dir=substr($rel, 0, length($rel)-length("/smp_affinity")); if(0) { print "cp $file $etcdir/$rel\n";} if(system("mkdir -p $etcdir/$dir")){ exit 1;} if(!copy($file,"$etcdir/$rel")){ exit 1;} }'; then
echo "error: saving /proc/irq/*/smp_affinity" >&2
error_exit "mcos_sys_mounted"
fi;
ncpus=`lscpu | grep -E '^CPU\(s\):' | awk '{print $2}'`
smp_affinity_mask=`echo $cpus | ncpus=$ncpus perl -e 'while(<>){@tokens = split /,/;foreach $token (@tokens) {@nums = split /-/,$token; for($num = $nums[0]; $num <= $nums[$#nums]; $num++) {$ndx=int($num/32); $mask[$ndx] |= (1<<($num % 32))}}} $nint32s = int(($ENV{'ncpus'}+31)/32); for($j = $nint32s - 1; $j >= 0; $j--) { if($j != $nint32s - 1){print ",";} $nblks = $j == $nint32s - 1 ? int(($ENV{'ncpus'} % 32)/4) : 8; for($i = $nblks - 1;$i >= 0;$i--){ printf("%01x",($mask[$j] >> ($i*4)) & 0xf);}}'`
if ! ncpus=$ncpus smp_affinity_mask=$smp_affinity_mask perl -e '@dirs = grep { -d } glob "/proc/irq/*"; foreach $dir (@dirs) { $hit = 0; $affinity_str = `cat $dir/smp_affinity`; chomp $affinity_str; @int32strs = split /,/, $affinity_str; @int32strs_mask=split /,/, $ENV{'smp_affinity_mask'}; for($i=0;$i <= $#int32strs_mask; $i++) { $int32strs_inv[$i] = sprintf("%08x",hex($int32strs_mask[$i])^0xffffffff); if($i == 0) { $len = int((($ENV{'ncpus'}%32)+3)/4); $int32strs_inv[$i] = substr($int32strs_inv[$i], -$len, $len); } } $inv = join(",", @int32strs_inv); $nint32s = int(($ENV{'ncpus'}+31)/32); for($j = $nint32s - 1; $j >= 0; $j--) { if(hex($int32strs[$nint32s - 1 - $j]) & hex($int32strs_mask[$nint32s - 1 - $j])) { $hit = 1; }} if($hit == 1) { $cmd = "echo $inv > $dir/smp_affinity 2>/dev/null"; system $cmd;}}'; then
echo "error: modifying /proc/irq/*/smp_affinity" >&2
error_exit "mcos_sys_mounted"
fi
banirq=`cat /proc/interrupts| perl -e 'while(<>) { if(/^\s*(\d+).*IHK\-SMP\s*$/) {print $1;}}'`
sed "s/%mask%/$smp_affinity_mask/g" $ETCDIR/irqbalance_mck.in | sed "s/%banirq%/$banirq/g" > $ETCDIR/irqbalance_mck
if ! systemctl link $ETCDIR/irqbalance_mck.service >/dev/null 2>/dev/null; then
echo "error: linking irqbalance_mck" >&2
error_exit "mcos_sys_mounted"
fi
if ! systemctl start irqbalance_mck.service 2>/dev/null ; then
echo "error: starting irqbalance_mck" >&2
error_exit "mcos_sys_mounted"
fi
# echo cpus=$cpus mask=$smp_affinity_mask banirq=$banirq
fi

View File

@ -0,0 +1,16 @@
#!/bin/bash
# \file arch/x86/tools/mcshutdown-attached-mic.sh.in
# License details are found in the file LICENSE.
# \brief
# mckernel shutdown script
#
# \author McKernel Development Team
#
prefix="@prefix@"
BINDIR="@BINDIR@"
SBINDIR="@SBINDIR@"
KMODDIR="@KMODDIR@"
KERNDIR="@KERNDIR@"
"$SBINDIR/ihkosctl" 0 shutdown

View File

@ -0,0 +1,115 @@
#!/bin/bash
# IHK SMP-x86 example McKernel unload script.
# author: Balazs Gerofi <bgerofi@riken.jp>
# Copyright (C) 2015 RIKEN AICS
#
# This is an example script for destroying McKernel and releasing IHK resources
# Note that the script does no output anything unless an error occurs.
prefix="@prefix@"
BINDIR="@BINDIR@"
SBINDIR="@SBINDIR@"
ETCDIR=@ETCDIR@
KMODDIR="@KMODDIR@"
KERNDIR="@KERNDIR@"
mem=""
cpus=""
# No SMP module? Exit.
if [ "`lsmod | grep ihk_smp_x86`" == "" ]; then exit 0; fi
# Destroy all LWK instances
if ls /dev/mcos* 1>/dev/null 2>&1; then
for i in /dev/mcos*; do
ind=`echo $i|cut -c10-`;
if ! ${SBINDIR}/ihkconfig 0 destroy $ind; then
echo "error: destroying LWK instance $ind failed" >&2
exit 1
fi
done
fi
# Query IHK-SMP resources and release them
if ! ${SBINDIR}/ihkconfig 0 query cpu > /dev/null; then
echo "error: querying cpus" >&2
exit 1
fi
cpus=`${SBINDIR}/ihkconfig 0 query cpu`
if [ "${cpus}" != "" ]; then
if ! ${SBINDIR}/ihkconfig 0 release cpu $cpus > /dev/null; then
echo "error: releasing CPUs" >&2
exit 1
fi
fi
if ! ${SBINDIR}/ihkconfig 0 query mem > /dev/null; then
echo "error: querying memory" >&2
exit 1
fi
mem=`${SBINDIR}/ihkconfig 0 query mem`
if [ "${mem}" != "" ]; then
if ! ${SBINDIR}/ihkconfig 0 release mem $mem > /dev/null; then
echo "error: releasing memory" >&2
exit 1
fi
fi
# Remove delegator if loaded
if [ "`lsmod | grep mcctrl`" != "" ]; then
if ! rmmod mcctrl; then
echo "error: removing mcctrl" >&2
exit 1
fi
fi
# Remove mcoverlay if loaded
if [ "`lsmod | grep mcoverlay`" != "" ]; then
if [ "`cat /proc/mounts | grep /tmp/mcos/mcos0_sys`" != "" ]; then umount -l /tmp/mcos/mcos0_sys; fi
if [ "`cat /proc/mounts | grep /tmp/mcos/mcos0_proc`" != "" ]; then umount -l /tmp/mcos/mcos0_proc; fi
if [ "`cat /proc/mounts | grep /tmp/mcos/linux_proc`" != "" ]; then umount -l /tmp/mcos/linux_proc; fi
if [ "`cat /proc/mounts | grep /tmp/mcos`" != "" ]; then umount -l /tmp/mcos; fi
if [ -e /tmp/mcos ]; then rm -rf /tmp/mcos; fi
if ! rmmod mcoverlay; then
echo "warning: failed to remove mcoverlay" >&2
fi
fi
# Remove SMP module
if [ "`lsmod | grep ihk_smp_x86`" != "" ]; then
if ! rmmod ihk_smp_x86; then
echo "error: removing ihk_smp_x86" >&2
exit 1
fi
fi
# Remove core module
if [ "`lsmod | grep -E 'ihk\s' | awk '{print $1}'`" != "" ]; then
if ! rmmod ihk; then
echo "error: removing ihk" >&2
exit 1
fi
fi
# Stop mcklogd
pkill mcklogd
# Start irqbalance with the original settings
if [ "`systemctl status irqbalance_mck.service 2> /dev/null |grep -E 'Active: active'`" != "" ]; then
if ! systemctl stop irqbalance_mck.service 2>/dev/null; then
echo "warning: failed to stop irqbalance_mck" >&2
fi
if ! systemctl disable irqbalance_mck.service >/dev/null 2>/dev/null; then
echo "warning: failed to disable irqbalance_mck" >&2
fi
if ! etcdir=@ETCDIR@ perl -e '$etcdir=$ENV{'etcdir'}; @files = grep { -f } glob "$etcdir/proc/irq/*/smp_affinity"; foreach $file (@files) { $dest = substr($file, length($etcdir)); if(0) {print "cp $file $dest\n";} system("cp $file $dest 2>/dev/null"); }'; then
echo "warning: failed to restore /proc/irq/*/smp_affinity" >&2
fi
if ! systemctl start irqbalance.service; then
echo "warning: failed to start irqbalance" >&2;
fi
fi

5050
configure vendored

File diff suppressed because it is too large Load Diff

View File

@ -24,13 +24,30 @@ AC_ARG_WITH([kernelsrc],
AC_ARG_WITH([target],
AC_HELP_STRING(
[--with-target={attached-mic | builtin-mic | builtin-x86}],[target, default is attached-mic]),
[--with-target={attached-mic | builtin-mic | builtin-x86 | smp-x86}],[target, default is attached-mic]),
[WITH_TARGET=$withval],[WITH_TARGET=yes])
AC_ARG_WITH([system_map],
AS_HELP_STRING(
[--with-system_map=path],[Path to 'System.map file', default is /boot/System.map-uname_r]),
[WITH_SYSTEM_MAP=$withval],[WITH_SYSTEM_MAP=yes])
AC_ARG_ENABLE([dcfa],
[AS_HELP_STRING(
[--enable-dcfa],[Enable DCFA modules])],[],[enable_dcfa=no])
AC_ARG_ENABLE([memdump],
AC_HELP_STRING([--enable-memdump],
[enable dumping memory and analyzing a dump]),
[ENABLE_MEMDUMP=$enableval],
[ENABLE_MEMDUMP=default])
AC_ARG_ENABLE([mcoverlayfs],
AC_HELP_STRING([--enable-mcoverlayfs],
[enable mcoverlayfs implementation]),
[ENABLE_MCOVERLAYFS=$enableval],
[ENABLE_MCOVERLAYFS=yes])
case "X$WITH_KERNELSRC" in
Xyes | Xno | X)
WITH_KERNELSRC='/lib/modules/`uname -r`/build'
@ -49,9 +66,27 @@ fi
test "x$prefix" = xNONE && prefix="$ac_default_prefix"
case $WITH_TARGET in
attached-mic)
attached-mic|builtin-x86|smp-x86)
ARCH=`uname -m`
AC_PROG_CC
XCC=$CC
CFLAGS="$CFLAGS -ffreestanding -fno-tree-loop-distribute-patterns"
;;
builtin-mic)
ARCH=k1om
AC_CHECK_PROG(XCC,
[x86_64-$ARCH-linux-gcc],
[x86_64-$ARCH-linux-gcc],
[no])
CC=$XCC
;;
*)
AC_MSG_ERROR([target $WITH_TARGET is unknwon])
;;
esac
case $WITH_TARGET in
attached-mic)
if test "X$KERNDIR" = X; then
KERNDIR="$prefix/attached/kernel"
fi
@ -69,12 +104,6 @@ case $WITH_TARGET in
fi
;;
builtin-mic)
ARCH=k1om
AC_CHECK_PROG(XCC,
[x86_64-$ARCH-linux-gcc],
[x86_64-$ARCH-linux-gcc],
[no])
CC=$XCC
if test "X$KERNDIR" = X; then
KERNDIR="$prefix/attached/kernel"
fi
@ -92,9 +121,6 @@ case $WITH_TARGET in
fi
;;
builtin-x86)
ARCH=`uname -m`
AC_PROG_CC
XCC=$CC
if test "X$KERNDIR" = X; then
KERNDIR="$prefix/attached/kernel"
fi
@ -111,6 +137,26 @@ case $WITH_TARGET in
MANDIR="$prefix/attached/man"
fi
;;
smp-x86)
if test "X$KERNDIR" = X; then
KERNDIR="$prefix/smp-x86/kernel"
fi
if test "X$BINDIR" = X; then
BINDIR="$prefix/bin"
fi
if test "X$SBINDIR" = X; then
SBINDIR="$prefix/sbin"
fi
if test "X$ETCDIR" = X; then
ETCDIR="$prefix/etc"
fi
if test "X$KMODDIR" = X; then
KMODDIR="$prefix/kmod"
fi
if test "X$MANDIR" = X; then
MANDIR="$prefix/smp-x86/man"
fi
;;
*)
AC_MSG_ERROR([target $WITH_TARGET is unknwon])
;;
@ -119,6 +165,117 @@ esac
KDIR="$WITH_KERNELSRC"
TARGET="$WITH_TARGET"
MCCTRL_LINUX_SYMTAB=""
case "X$WITH_SYSTEM_MAP" in
Xyes | Xno | X)
MCCTRL_LINUX_SYMTAB=""
;;
*)
MCCTRL_LINUX_SYMTAB="$WITH_SYSTEM_MAP"
;;
esac
AC_MSG_CHECKING([[for System.map]])
if test -f "$MCCTRL_LINUX_SYMTAB"; then
MCCTRL_LINUX_SYMTAB="$MCCTRL_LINUX_SYMTAB"
elif test -f "/boot/System.map-`uname -r`"; then
MCCTRL_LINUX_SYMTAB="/boot/System.map-`uname -r`"
elif test -f "$KDIR/System.map"; then
MCCTRL_LINUX_SYMTAB="$KDIR/System.map"
fi
if test "$MCCTRL_LINUX_SYMTAB" == ""; then
AC_MSG_ERROR([could not find])
fi
if test -z "`eval cat $MCCTRL_LINUX_SYMTAB`"; then
AC_MSG_ERROR([could not read System.map file, no read permission?])
fi
AC_MSG_RESULT([$MCCTRL_LINUX_SYMTAB])
MCCTRL_LINUX_SYMTAB_CMD="cat $MCCTRL_LINUX_SYMTAB"
# MCCTRL_FIND_KSYM(SYMBOL)
# ------------------------------------------------------
# Search System.map for address of the given symbol and
# do one of three things in config.h:
# If not found, leave MCCTRL_KSYM_foo undefined
# If found to be exported, "#define MCCTRL_KSYM_foo 0"
# If found not to be exported, "#define MCCTRL_KSYM_foo 0x<value>"
AC_DEFUN([MCCTRL_FIND_KSYM],[
AC_MSG_CHECKING([[System.map for symbol $1]])
mcctrl_addr=`eval $MCCTRL_LINUX_SYMTAB_CMD | grep " $1\$" | cut -d\ -f1`
if test -z $mcctrl_addr; then
AC_MSG_RESULT([not found])
else
mcctrl_result=$mcctrl_addr
mcctrl_addr="0x$mcctrl_addr"
m4_ifval([$2],[],[
if `eval $MCCTRL_LINUX_SYMTAB_CMD | grep " __ksymtab_$1\$" >/dev/null`; then
mcctrl_result="exported"
mcctrl_addr="0"
fi
])
AC_MSG_RESULT([$mcctrl_result])
AC_DEFINE_UNQUOTED(MCCTRL_KSYM_[]$1,$mcctrl_addr,[Define to address of kernel symbol $1, or 0 if exported])
fi
])
MCCTRL_FIND_KSYM([sys_mount])
MCCTRL_FIND_KSYM([sys_umount])
MCCTRL_FIND_KSYM([sys_unshare])
MCCTRL_FIND_KSYM([zap_page_range])
MCCTRL_FIND_KSYM([vdso_image_64])
MCCTRL_FIND_KSYM([vdso_start])
MCCTRL_FIND_KSYM([vdso_end])
MCCTRL_FIND_KSYM([vdso_pages])
MCCTRL_FIND_KSYM([__vvar_page])
MCCTRL_FIND_KSYM([hpet_address])
MCCTRL_FIND_KSYM([hv_clock])
MCCTRL_FIND_KSYM([sys_readlink])
case $ENABLE_MEMDUMP in
yes|no|auto)
;;
default)
if test "x$WITH_TARGET" = "xsmp-x86" ; then
ENABLE_MEMDUMP=auto
else
ENABLE_MEMDUMP=no
fi
;;
*)
AC_MSG_ERROR([unknown memdump argument: $ENABLE_MEMDUMP])
;;
esac
if test "x$ENABLE_MEMDUMP" != "xno" ; then
enableval=yes
AC_CHECK_LIB([bfd],[bfd_init],[],[enableval=no])
AC_CHECK_HEADER([bfd.h],[],[enableval=no])
if test "x$ENABLE_MEMDUMP" = "xyes" -a "x$enableval" = "xno" ; then
AC_MSG_ERROR([memdump feature needs bfd.h and libbfd a.k.a bunutils-devel])
fi
ENABLE_MEMDUMP=$enableval
fi
if test "x$ENABLE_MEMDUMP" = "xyes" ; then
AC_MSG_NOTICE([memdump feature is enabled])
AC_DEFINE([ENABLE_MEMDUMP],[1],[whether memdump feature is enabled])
uncomment_if_ENABLE_MEMDUMP=''
else
AC_MSG_NOTICE([memdump feature is disabled])
uncomment_if_ENABLE_MEMDUMP='#'
fi
if test "x$ENABLE_MCOVERLAYFS" = "xyes" ; then
AC_DEFINE([ENABLE_MCOVERLAYFS],[1],[whether mcoverlayfs is enabled])
AC_MSG_NOTICE([mcoverlayfs is enabled])
else
AC_MSG_NOTICE([mcoverlayfs is disabled])
fi
AC_SUBST(CC)
AC_SUBST(XCC)
AC_SUBST(ARCH)
@ -126,9 +283,12 @@ AC_SUBST(KDIR)
AC_SUBST(TARGET)
AC_SUBST(BINDIR)
AC_SUBST(SBINDIR)
AC_SUBST(ETCDIR)
AC_SUBST(KMODDIR)
AC_SUBST(KERNDIR)
AC_SUBST(MANDIR)
AC_SUBST(CFLAGS)
AC_SUBST(ENABLE_MCOVERLAYFS)
AC_SUBST(IHK_VERSION)
AC_SUBST(MCKERNEL_VERSION)
@ -136,16 +296,29 @@ AC_SUBST(DCFA_VERSION)
AC_SUBST(IHK_RELEASE_DATE)
AC_SUBST(MCKERNEL_RELEASE_DATE)
AC_SUBST(DCFA_RESEASE_DATE)
AC_SUBST(uncomment_if_ENABLE_MEMDUMP)
AC_CONFIG_HEADERS([executer/config.h])
AC_CONFIG_FILES([
Makefile
executer/user/Makefile
executer/kernel/Makefile
executer/kernel/mcctrl/Makefile
executer/kernel/mcctrl/arch/x86_64/Makefile
executer/kernel/mcoverlayfs/Makefile
executer/kernel/mcoverlayfs/linux-3.10.0-327.36.1.el7/Makefile
executer/kernel/mcoverlayfs/linux-4.0.9/Makefile
executer/kernel/mcoverlayfs/linux-4.6.7/Makefile
kernel/Makefile
kernel/Makefile.build
arch/x86/tools/mcreboot-attached-mic.sh
arch/x86/tools/mcshutdown-attached-mic.sh
arch/x86/tools/mcreboot-builtin-x86.sh
arch/x86/tools/mcreboot-smp-x86.sh
arch/x86/tools/mcstop+release-smp-x86.sh
arch/x86/tools/mcshutdown-builtin-x86.sh
arch/x86/tools/mcreboot.1:arch/x86/tools/mcreboot.1in
arch/x86/tools/irqbalance_mck.service
arch/x86/tools/irqbalance_mck.in
])
AS_IF([test "x$enable_dcfa" = xyes], [

94
executer/config.h.in Normal file
View File

@ -0,0 +1,94 @@
/* executer/config.h.in. Generated from configure.ac by autoheader. */
/* whether mcoverlayfs is enabled */
#undef ENABLE_MCOVERLAYFS
/* whether memdump feature is enabled */
#undef ENABLE_MEMDUMP
/* Define to 1 if you have the <inttypes.h> header file. */
#undef HAVE_INTTYPES_H
/* Define to 1 if you have the `bfd' library (-lbfd). */
#undef HAVE_LIBBFD
/* Define to 1 if you have the <memory.h> header file. */
#undef HAVE_MEMORY_H
/* Define to 1 if you have the <stdint.h> header file. */
#undef HAVE_STDINT_H
/* Define to 1 if you have the <stdlib.h> header file. */
#undef HAVE_STDLIB_H
/* Define to 1 if you have the <strings.h> header file. */
#undef HAVE_STRINGS_H
/* Define to 1 if you have the <string.h> header file. */
#undef HAVE_STRING_H
/* Define to 1 if you have the <sys/stat.h> header file. */
#undef HAVE_SYS_STAT_H
/* Define to 1 if you have the <sys/types.h> header file. */
#undef HAVE_SYS_TYPES_H
/* Define to 1 if you have the <unistd.h> header file. */
#undef HAVE_UNISTD_H
/* Define to address of kernel symbol __vvar_page, or 0 if exported */
#undef MCCTRL_KSYM___vvar_page
/* Define to address of kernel symbol hpet_address, or 0 if exported */
#undef MCCTRL_KSYM_hpet_address
/* Define to address of kernel symbol hv_clock, or 0 if exported */
#undef MCCTRL_KSYM_hv_clock
/* Define to address of kernel symbol sys_mount, or 0 if exported */
#undef MCCTRL_KSYM_sys_mount
/* Define to address of kernel symbol sys_readlink, or 0 if exported */
#undef MCCTRL_KSYM_sys_readlink
/* Define to address of kernel symbol sys_umount, or 0 if exported */
#undef MCCTRL_KSYM_sys_umount
/* Define to address of kernel symbol sys_unshare, or 0 if exported */
#undef MCCTRL_KSYM_sys_unshare
/* Define to address of kernel symbol vdso_end, or 0 if exported */
#undef MCCTRL_KSYM_vdso_end
/* Define to address of kernel symbol vdso_image_64, or 0 if exported */
#undef MCCTRL_KSYM_vdso_image_64
/* Define to address of kernel symbol vdso_pages, or 0 if exported */
#undef MCCTRL_KSYM_vdso_pages
/* Define to address of kernel symbol vdso_start, or 0 if exported */
#undef MCCTRL_KSYM_vdso_start
/* Define to address of kernel symbol zap_page_range, or 0 if exported */
#undef MCCTRL_KSYM_zap_page_range
/* Define to the address where bug reports for this package should be sent. */
#undef PACKAGE_BUGREPORT
/* Define to the full name of this package. */
#undef PACKAGE_NAME
/* Define to the full name and version of this package. */
#undef PACKAGE_STRING
/* Define to the one symbol short name of this package. */
#undef PACKAGE_TARNAME
/* Define to the home page for this package. */
#undef PACKAGE_URL
/* Define to the version of this package. */
#undef PACKAGE_VERSION
/* Define to 1 if you have the ANSI C header files. */
#undef STDC_HEADERS

View File

@ -38,6 +38,10 @@
#define MCEXEC_UP_SEND_SIGNAL 0x30a02906
#define MCEXEC_UP_GET_CPU 0x30a02907
#define MCEXEC_UP_STRNCPY_FROM_USER 0x30a02908
#define MCEXEC_UP_NEW_PROCESS 0x30a02909
#define MCEXEC_UP_GET_CRED 0x30a0290a
#define MCEXEC_UP_GET_CREDV 0x30a0290b
#define MCEXEC_UP_GET_NODES 0x30a0290c
#define MCEXEC_UP_PREPARE_DMA 0x30a02910
#define MCEXEC_UP_FREE_DMA 0x30a02911
@ -45,6 +49,12 @@
#define MCEXEC_UP_OPEN_EXEC 0x30a02912
#define MCEXEC_UP_CLOSE_EXEC 0x30a02913
#define MCEXEC_UP_SYS_MOUNT 0x30a02914
#define MCEXEC_UP_SYS_UMOUNT 0x30a02915
#define MCEXEC_UP_SYS_UNSHARE 0x30a02916
#define MCEXEC_UP_DEBUG_LOG 0x40000000
#define MCEXEC_UP_TRANSFER_TO_REMOTE 0
#define MCEXEC_UP_TRANSFER_FROM_REMOTE 1
@ -67,6 +77,7 @@ struct program_image_section {
};
#define SHELL_PATH_MAX_LEN 1024
#define MCK_RLIM_MAX 20
struct program_load_desc {
int num_sections;
@ -76,6 +87,10 @@ struct program_load_desc {
int err;
int stack_prot;
int pgid;
int cred[8];
int reloc;
char enable_vdso;
char padding[7];
unsigned long entry;
unsigned long user_start;
unsigned long user_end;
@ -90,14 +105,20 @@ struct program_load_desc {
unsigned long args_len;
char *envs;
unsigned long envs_len;
unsigned long rlimit_stack_cur;
unsigned long rlimit_stack_max;
struct rlimit rlimit[MCK_RLIM_MAX];
unsigned long interp_align;
char shell_path[SHELL_PATH_MAX_LEN];
struct program_image_section sections[0];
};
struct syscall_request {
/* TID of requesting thread */
int rtid;
/*
* TID of target thread. Remote page fault response needs to designate the
* thread that must serve the request, 0 indicates any thread from the pool
*/
int ttid;
unsigned long valid;
unsigned long number;
unsigned long args[6];
@ -116,8 +137,17 @@ struct syscall_load_desc {
unsigned long size;
};
#define IHK_SCD_REQ_THREAD_SPINNING 0
#define IHK_SCD_REQ_THREAD_TO_BE_WOKEN 1
#define IHK_SCD_REQ_THREAD_DESCHEDULED 2
struct syscall_response {
/* TID of the thread that requested the service */
int ttid;
/* TID of the mcexec thread that is serving or has served the request */
int stid;
unsigned long status;
unsigned long req_thread_status;
long ret;
unsigned long fault_address;
unsigned long fault_reason;
@ -156,4 +186,24 @@ struct signal_desc {
char info[128];
};
struct newprocess_desc {
int pid;
};
struct sys_mount_desc {
char *dev_name;
char *dir_name;
char *type;
unsigned long flags;
void *data;
};
struct sys_umount_desc {
char *dir_name;
};
struct sys_unshare_desc {
unsigned long unshare_flags;
};
#endif

View File

@ -1,25 +0,0 @@
KDIR ?= @KDIR@
ARCH ?= @ARCH@
src = @abs_srcdir@
KMODDIR=@KMODDIR@
IHK_BASE=$(src)/../../../ihk
obj-m += mcctrl.o
ccflags-y := -I$(IHK_BASE)/linux/include -I$(IHK_BASE)/ikc/include -I$(IHK_BASE)/include -I$(src)/../include
mcctrl-y := driver.o control.o ikc.o syscall.o procfs.o
KBUILD_EXTRA_SYMBOLS = @abs_builddir@/../../../ihk/linux/core/Module.symvers
.PHONY: clean install modules
modules:
$(MAKE) -C $(KDIR) M=$(PWD) SUBDIRS=$(PWD) ARCH=$(ARCH) modules
clean:
$(RM) .*.cmd *.mod.c *.o *.ko* Module.symvers modules.order -r .tmp*
install:
mkdir -p -m 755 $(KMODDIR)
install -m 644 mcctrl.ko $(KMODDIR)

View File

@ -1,916 +0,0 @@
/**
* \file executer/kernel/control.c
* License details are found in the file LICENSE.
* \brief
* kernel module control
* \author Taku Shimosawa <shimosawa@is.s.u-tokyo.ac.jp> \par
* Copyright (C) 2011 - 2012 Taku Shimosawa
* \author Balazs Gerofi <bgerofi@riken.jp> \par
* Copyright (C) 2012 RIKEN AICS
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com> \par
* Copyright (C) 2012 - 2013 Hitachi, Ltd.
* \author Tomoki Shirasawa <tomoki.shirasawa.kk@hitachi-solutions.com> \par
* Copyright (C) 2012 - 2013 Hitachi, Ltd.
* \author Balazs Gerofi <bgerofi@is.s.u-tokyo.ac.jp> \par
* Copyright (C) 2013 The University of Tokyo
*/
/*
* HISTORY:
* 2013/09/02 shirasawa add terminate thread
* 2013/08/19 shirasawa mcexec forward signal to MIC process
* 2013/08/07 nakamura add page fault forwarding
* 2013/07/05 shirasawa propagate error code for prepare image
* 2013/07/02 shirasawa add error handling for prepare_process
* 2013/04/17 nakamura add generic system call forwarding
*/
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/wait.h>
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <asm/uaccess.h>
#include <asm/delay.h>
#include <asm/msr.h>
#include <asm/io.h>
#include "mcctrl.h"
#ifdef DEBUG
#define dprintk printk
#else
#define dprintk(...)
#endif
//static DECLARE_WAIT_QUEUE_HEAD(wq_prepare);
//extern struct mcctrl_channel *channels;
int mcctrl_ikc_set_recv_cpu(ihk_os_t os, int cpu);
static long mcexec_prepare_image(ihk_os_t os,
struct program_load_desc * __user udesc)
{
struct program_load_desc desc, *pdesc;
struct ikc_scd_packet isp;
void *args, *envs;
long ret = 0;
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
unsigned long flags;
struct mcctrl_per_proc_data *ppd = NULL;
if (copy_from_user(&desc, udesc,
sizeof(struct program_load_desc))) {
return -EFAULT;
}
if (desc.num_sections <= 0 || desc.num_sections > 16) {
printk("# of sections: %d\n", desc.num_sections);
return -EINVAL;
}
pdesc = kmalloc(sizeof(struct program_load_desc) +
sizeof(struct program_image_section)
* desc.num_sections, GFP_KERNEL);
memcpy(pdesc, &desc, sizeof(struct program_load_desc));
if (copy_from_user(pdesc->sections, udesc->sections,
sizeof(struct program_image_section)
* desc.num_sections)) {
kfree(pdesc);
return -EFAULT;
}
pdesc->pid = task_tgid_vnr(current);
if (reserve_user_space(usrdata, &pdesc->user_start, &pdesc->user_end)) {
kfree(pdesc);
return -ENOMEM;
}
args = kmalloc(pdesc->args_len, GFP_KERNEL);
if (copy_from_user(args, pdesc->args, pdesc->args_len)) {
kfree(args);
kfree(pdesc);
return -EFAULT;
}
envs = kmalloc(pdesc->envs_len, GFP_KERNEL);
if (copy_from_user(envs, pdesc->envs, pdesc->envs_len)) {
ret = -EFAULT;
goto free_out;
}
pdesc->args = (void*)virt_to_phys(args);
printk("args: 0x%lX\n", (unsigned long)pdesc->args);
printk("argc: %d\n", *(int*)args);
pdesc->envs = (void*)virt_to_phys(envs);
printk("envs: 0x%lX\n", (unsigned long)pdesc->envs);
printk("envc: %d\n", *(int*)envs);
isp.msg = SCD_MSG_PREPARE_PROCESS;
isp.ref = pdesc->cpu;
isp.arg = virt_to_phys(pdesc);
printk("# of sections: %d\n", pdesc->num_sections);
printk("%p (%lx)\n", pdesc, isp.arg);
pdesc->status = 0;
mcctrl_ikc_send(os, pdesc->cpu, &isp);
wait_event_interruptible(usrdata->wq_prepare, pdesc->status);
if(pdesc->err < 0){
ret = pdesc->err;
goto free_out;
}
ppd = kmalloc(sizeof(*ppd), GFP_ATOMIC);
if (!ppd) {
printk("ERROR: allocating per process data\n");
ret = -ENOMEM;
goto free_out;
}
ppd->pid = pdesc->pid;
ppd->rpgtable = pdesc->rpgtable;
flags = ihk_ikc_spinlock_lock(&usrdata->per_proc_list_lock);
list_add_tail(&ppd->list, &usrdata->per_proc_list);
ihk_ikc_spinlock_unlock(&usrdata->per_proc_list_lock, flags);
dprintk("pid %d, rpgtable: 0x%lx added\n",
ppd->pid, ppd->rpgtable);
if (copy_to_user(udesc, pdesc, sizeof(struct program_load_desc) +
sizeof(struct program_image_section) * desc.num_sections)) {
ret = -EFAULT;
goto free_out;
}
ret = 0;
free_out:
kfree(args);
kfree(pdesc);
kfree(envs);
return ret;
}
int mcexec_transfer_image(ihk_os_t os, struct remote_transfer *__user upt)
{
struct remote_transfer pt;
unsigned long phys, ret = 0;
void *rpm;
#if 0
unsigned long dma_status = 0;
ihk_dma_channel_t channel;
struct ihk_dma_request request;
void *p;
channel = ihk_device_get_dma_channel(ihk_os_to_dev(os), 0);
if (!channel) {
return -EINVAL;
}
#endif
if (copy_from_user(&pt, upt, sizeof(pt))) {
return -EFAULT;
}
if (pt.size > PAGE_SIZE) {
printk("mcexec_transfer_image(): ERROR: size exceeds PAGE_SIZE\n");
return -EFAULT;
}
phys = ihk_device_map_memory(ihk_os_to_dev(os), pt.rphys, PAGE_SIZE);
#ifdef CONFIG_MIC
rpm = ioremap_wc(phys, PAGE_SIZE);
#else
rpm = ihk_device_map_virtual(ihk_os_to_dev(os), phys, PAGE_SIZE, NULL, 0);
#endif
if (pt.direction == MCEXEC_UP_TRANSFER_TO_REMOTE) {
if (copy_from_user(rpm, pt.userp, pt.size)) {
ret = -EFAULT;
}
}
else if (pt.direction == MCEXEC_UP_TRANSFER_FROM_REMOTE) {
if (copy_to_user(pt.userp, rpm, pt.size)) {
ret = -EFAULT;
}
}
else {
printk("mcexec_transfer_image(): ERROR: invalid direction\n");
ret = -EINVAL;
}
#ifdef CONFIG_MIC
iounmap(rpm);
#else
ihk_device_unmap_virtual(ihk_os_to_dev(os), rpm, PAGE_SIZE);
#endif
ihk_device_unmap_memory(ihk_os_to_dev(os), phys, PAGE_SIZE);
return ret;
#if 0
p = (void *)__get_free_page(GFP_KERNEL);
if (copy_from_user(p, pt.src, PAGE_SIZE)) {
return -EFAULT;
}
memset(&request, 0, sizeof(request));
request.src_os = NULL;
request.src_phys = virt_to_phys(p);
request.dest_os = os;
request.dest_phys = pt.dest;
request.size = PAGE_SIZE;
request.notify = (void *)virt_to_phys(&dma_status);
request.priv = (void *)1;
ihk_dma_request(channel, &request);
while (!dma_status) {
mb();
udelay(1);
}
free_page((unsigned long)p);
return 0;
#endif
}
//extern unsigned long last_thread_exec;
static long mcexec_start_image(ihk_os_t os,
struct program_load_desc * __user udesc)
{
struct program_load_desc desc;
struct ikc_scd_packet isp;
struct mcctrl_channel *c;
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
if (copy_from_user(&desc, udesc,
sizeof(struct program_load_desc))) {
return -EFAULT;
}
c = usrdata->channels + desc.cpu;
mcctrl_ikc_set_recv_cpu(os, desc.cpu);
usrdata->last_thread_exec = desc.cpu;
isp.msg = SCD_MSG_SCHEDULE_PROCESS;
isp.ref = desc.cpu;
isp.arg = desc.rprocess;
mcctrl_ikc_send(os, desc.cpu, &isp);
return 0;
}
static DECLARE_WAIT_QUEUE_HEAD(signalq);
static long mcexec_send_signal(ihk_os_t os, struct signal_desc *sigparam)
{
struct ikc_scd_packet isp;
struct mcctrl_channel *c;
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
struct signal_desc sig;
struct mcctrl_signal msig[2];
struct mcctrl_signal *msigp;
int rc;
if (copy_from_user(&sig, sigparam, sizeof(struct signal_desc))) {
return -EFAULT;
}
msigp = msig;
if(((unsigned long)msig & 0xfffffffffffff000L) !=
((unsigned long)(msig + 1) & 0xfffffffffffff000L))
msigp++;
memset(msigp, '\0', sizeof msig);
msigp->sig = sig.sig;
msigp->pid = sig.pid;
msigp->tid = sig.tid;
memcpy(&msigp->info, &sig.info, 128);
c = usrdata->channels;
isp.msg = SCD_MSG_SEND_SIGNAL;
isp.ref = sig.cpu;
isp.pid = sig.pid;
isp.arg = virt_to_phys(msigp);
if((rc = mcctrl_ikc_send(os, sig.cpu, &isp)) < 0){
printk("mcexec_send_signal: mcctrl_ikc_send ret=%d\n", rc);
return rc;
}
wait_event_interruptible(signalq, msigp->cond != 0);
return 0;
}
void
sig_done(unsigned long arg, int err)
{
struct mcctrl_signal *msigp;
msigp = phys_to_virt(arg);
msigp->cond = 1;
wake_up_interruptible(&signalq);
}
static long mcexec_get_cpu(ihk_os_t os)
{
struct ihk_cpu_info *info;
info = ihk_os_get_cpu_info(os);
if (!info) {
printk("Error: cannot retrieve CPU info.\n");
return -EINVAL;
}
if (info->n_cpus < 1) {
printk("Error: # of cpu is invalid.\n");
return -EINVAL;
}
return info->n_cpus;
}
int mcexec_syscall(struct mcctrl_channel *c, int pid, unsigned long arg)
{
struct wait_queue_head_list_node *wqhln = NULL;
struct wait_queue_head_list_node *wqhln_iter;
unsigned long flags;
/* Look up per-process wait queue head with pid */
flags = ihk_ikc_spinlock_lock(&c->wq_list_lock);
list_for_each_entry(wqhln_iter, &c->wq_list, list) {
if (wqhln_iter->pid == pid) {
wqhln = wqhln_iter;
break;
}
}
if (!wqhln) {
retry_alloc:
wqhln = kmalloc(sizeof(*wqhln), GFP_ATOMIC);
if (!wqhln) {
printk("WARNING: coudln't alloc wait queue head, retrying..\n");
goto retry_alloc;
}
wqhln->pid = pid;
wqhln->req = 0;
init_waitqueue_head(&wqhln->wq_syscall);
list_add_tail(&wqhln->list, &c->wq_list);
}
ihk_ikc_spinlock_unlock(&c->wq_list_lock, flags);
wqhln->req = 1;
wake_up(&wqhln->wq_syscall);
return 0;
}
#ifndef DO_USER_MODE
// static int remaining_job, base_cpu, job_pos;
#endif
// extern int num_channels;
// extern int mcctrl_dma_abort;
int mcexec_wait_syscall(ihk_os_t os, struct syscall_wait_desc *__user req)
{
struct syscall_wait_desc swd;
struct mcctrl_channel *c;
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
struct wait_queue_head_list_node *wqhln;
struct wait_queue_head_list_node *wqhln_iter;
int ret = 0;
unsigned long irqflags;
#ifndef DO_USER_MODE
unsigned long s, w, d;
#endif
//printk("mcexec_wait_syscall swd=%p req=%p size=%d\n", &swd, req, sizeof(swd.cpu));
if (copy_from_user(&swd, req, sizeof(swd))) {
return -EFAULT;
}
if (swd.cpu >= usrdata->num_channels)
return -EINVAL;
c = get_peer_channel(usrdata, current);
if (c) {
printk("mcexec_wait_syscall:already registered. task %p ch %p\n",
current, c);
return -EBUSY;
}
c = usrdata->channels + swd.cpu;
#ifdef DO_USER_MODE
retry:
/* Prepare per-process wait queue head */
retry_alloc:
wqhln = kmalloc(sizeof(*wqhln), GFP_KERNEL);
if (!wqhln) {
printk("WARNING: coudln't alloc wait queue head, retrying..\n");
goto retry_alloc;
}
wqhln->pid = swd.pid;
wqhln->req = 0;
init_waitqueue_head(&wqhln->wq_syscall);
irqflags = ihk_ikc_spinlock_lock(&c->wq_list_lock);
/* First see if there is one wait queue already */
list_for_each_entry(wqhln_iter, &c->wq_list, list) {
if (wqhln_iter->pid == current->tgid) {
kfree(wqhln);
wqhln = wqhln_iter;
list_del(&wqhln->list);
break;
}
}
list_add_tail(&wqhln->list, &c->wq_list);
ihk_ikc_spinlock_unlock(&c->wq_list_lock, irqflags);
ret = wait_event_interruptible(wqhln->wq_syscall, wqhln->req);
if (ret) {
return -EINTR;
}
/* Remove per-process wait queue head */
irqflags = ihk_ikc_spinlock_lock(&c->wq_list_lock);
list_del(&wqhln->list);
ihk_ikc_spinlock_unlock(&c->wq_list_lock, irqflags);
kfree(wqhln);
if (c->param.request_va->number == 61 &&
c->param.request_va->args[0] == swd.pid) {
dprintk("pid: %d, tid: %d: SC %d, swd.cpu: %d, WARNING: wait4() for self?\n",
current->tgid,
current->pid,
c->param.request_va->number,
swd.cpu);
return -EINTR;
}
#if 1
mb();
if (!c->param.request_va->valid) {
printk("mcexec_wait_syscall:stray wakeup\n");
goto retry;
}
#endif
#else
while (1) {
c = usrdata->channels + swd.cpu;
rdtscll(s);
if (!usrdata->remaining_job) {
while (!(*c->param.doorbell_va)) {
mb();
cpu_relax();
rdtscll(w);
if (w > s + 1024UL * 1024 * 1024 * 10) {
return -EINTR;
}
}
d = (*c->param.doorbell_va) - 1;
*c->param.doorbell_va = 0;
if (d < 0 || d >= usrdata->num_channels) {
d = 0;
}
usrdata->base_cpu = d;
usrdata->job_pos = 0;
usrdata->remaining_job = 1;
} else {
usrdata->job_pos++;
}
for (; usrdata->job_pos < usrdata->num_channels; usrdata->job_pos++) {
if (base_cpu + job_pos >= num_channels) {
c = usrdata->channels +
(usrdata->base_cpu + usrdata->job_pos - usrdata->num_channels);
} else {
c = usrdata->channels + usrdata->base_cpu + usrdata->job_pos;
}
if (!c) {
continue;
}
if (c->param.request_va &&
c->param.request_va->valid) {
#endif
c->param.request_va->valid = 0; /* ack */
dprintk("SC #%lx, %lx\n",
c->param.request_va->number,
c->param.request_va->args[0]);
register_peer_channel(usrdata, current, c);
if (__do_in_kernel_syscall(os, c, c->param.request_va)) {
if (copy_to_user(&req->sr, c->param.request_va,
sizeof(struct syscall_request))) {
deregister_peer_channel(usrdata, current, c);
return -EFAULT;
}
return 0;
}
deregister_peer_channel(usrdata, current, c);
#ifdef DO_USER_MODE
goto retry;
#endif
#ifndef DO_USER_MODE
if (usrdata->mcctrl_dma_abort) {
return -2;
}
}
}
usrdata->remaining_job = 0;
}
#endif
return 0;
}
long mcexec_pin_region(ihk_os_t os, unsigned long *__user arg)
{
struct prepare_dma_desc desc;
int pin_shift = 16;
int order;
unsigned long a;
if (copy_from_user(&desc, arg, sizeof(struct prepare_dma_desc))) {
return -EFAULT;
}
order = pin_shift - PAGE_SHIFT;
if(desc.size > 0){
order = get_order (desc.size);
}
a = __get_free_pages(GFP_KERNEL, order);
if (!a) {
return -ENOMEM;
}
a = virt_to_phys((void *)a);
if (copy_to_user((void*)desc.pa, &a, sizeof(unsigned long))) {
return -EFAULT;
}
return 0;
}
long mcexec_free_region(ihk_os_t os, unsigned long *__user arg)
{
struct free_dma_desc desc;
int pin_shift = 16;
int order;
if (copy_from_user(&desc, arg, sizeof(struct free_dma_desc))) {
return -EFAULT;
}
order = pin_shift - PAGE_SHIFT;
if(desc.size > 0){
order = get_order (desc.size);
}
if(desc.pa > 0){
free_pages((unsigned long)phys_to_virt(desc.pa), order);
}
return 0;
}
long mcexec_load_syscall(ihk_os_t os, struct syscall_load_desc *__user arg)
{
struct syscall_load_desc desc;
unsigned long phys;
void *rpm;
if (copy_from_user(&desc, arg, sizeof(struct syscall_load_desc))) {
return -EFAULT;
}
phys = ihk_device_map_memory(ihk_os_to_dev(os), desc.src, desc.size);
#ifdef CONFIG_MIC
rpm = ioremap_wc(phys, desc.size);
#else
rpm = ihk_device_map_virtual(ihk_os_to_dev(os), phys, desc.size, NULL, 0);
#endif
dprintk("mcexec_load_syscall: %s (desc.size: %d)\n", rpm, desc.size);
if (copy_to_user((void *__user)desc.dest, rpm, desc.size)) {
return -EFAULT;
}
#ifdef CONFIG_MIC
iounmap(rpm);
#else
ihk_device_unmap_virtual(ihk_os_to_dev(os), rpm, desc.size);
#endif
ihk_device_unmap_memory(ihk_os_to_dev(os), phys, desc.size);
/*
ihk_dma_channel_t channel;
struct ihk_dma_request request;
unsigned long dma_status = 0;
channel = ihk_device_get_dma_channel(ihk_os_to_dev(os), 0);
if (!channel) {
return -EINVAL;
}
memset(&request, 0, sizeof(request));
request.src_os = os;
request.src_phys = desc.src;
request.dest_os = NULL;
request.dest_phys = desc.dest;
request.size = desc.size;
request.notify = (void *)virt_to_phys(&dma_status);
request.priv = (void *)1;
ihk_dma_request(channel, &request);
while (!dma_status) {
mb();
udelay(1);
}
*/
return 0;
}
long mcexec_ret_syscall(ihk_os_t os, struct syscall_ret_desc *__user arg)
{
struct syscall_ret_desc ret;
struct mcctrl_channel *mc;
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
#if 0
ihk_dma_channel_t channel;
struct ihk_dma_request request;
channel = ihk_device_get_dma_channel(ihk_os_to_dev(os), 0);
if (!channel) {
return -EINVAL;
}
#endif
if (copy_from_user(&ret, arg, sizeof(struct syscall_ret_desc))) {
return -EFAULT;
}
mc = usrdata->channels + ret.cpu;
if (!mc) {
return -EINVAL;
}
deregister_peer_channel(usrdata, current, mc);
mc->param.response_va->ret = ret.ret;
if (ret.size > 0) {
/* Host => Accel. Write is fast. */
unsigned long phys;
void *rpm;
phys = ihk_device_map_memory(ihk_os_to_dev(os), ret.dest,
ret.size);
#ifdef CONFIG_MIC
rpm = ioremap_wc(phys, ret.size);
#else
rpm = ihk_device_map_virtual(ihk_os_to_dev(os), phys,
ret.size, NULL, 0);
#endif
if (copy_from_user(rpm, (void *__user)ret.src, ret.size)) {
return -EFAULT;
}
mb();
mc->param.response_va->status = 1;
#ifdef CONFIG_MIC
iounmap(rpm);
#else
ihk_device_unmap_virtual(ihk_os_to_dev(os), rpm, ret.size);
#endif
ihk_device_unmap_memory(ihk_os_to_dev(os), phys, ret.size);
/*
memset(&request, 0, sizeof(request));
request.src_os = NULL;
request.src_phys = ret.src;
request.dest_os = os;
request.dest_phys = ret.dest;
request.size = ret.size;
request.notify_os = os;
request.notify = (void *)mc->param.response_rpa;
request.priv = (void *)1;
ihk_dma_request(channel, &request);
*/
} else {
mb();
mc->param.response_va->status = 1;
}
return 0;
}
LIST_HEAD(mckernel_exec_files);
spinlock_t mckernel_exec_file_lock = SPIN_LOCK_UNLOCKED;
struct mckernel_exec_file {
ihk_os_t os;
pid_t pid;
struct file *fp;
struct list_head list;
};
int mcexec_open_exec(ihk_os_t os, char * __user filename)
{
struct file *file;
struct mckernel_exec_file *mcef;
struct mckernel_exec_file *mcef_iter;
int retval;
file = open_exec(filename);
retval = PTR_ERR(file);
if (IS_ERR(file)) {
goto out_return;
}
mcef = kmalloc(sizeof(*mcef), GFP_KERNEL);
if (!mcef) {
retval = ENOMEM;
goto out_put_file;
}
spin_lock_irq(&mckernel_exec_file_lock);
/* Find previous file (if exists) and drop it */
list_for_each_entry(mcef_iter, &mckernel_exec_files, list) {
if (mcef_iter->os == os && mcef_iter->pid == current->tgid) {
allow_write_access(mcef_iter->fp);
fput(mcef_iter->fp);
list_del(&mcef_iter->list);
kfree(mcef_iter);
dprintk("%d open_exec dropped previous executable \n", (int)current->tgid);
break;
}
}
/* Add new exec file to the list */
mcef->os = os;
mcef->pid = current->tgid;
mcef->fp = file;
list_add_tail(&mcef->list, &mckernel_exec_files);
spin_unlock(&mckernel_exec_file_lock);
dprintk("%d open_exec and holding file: %s\n", (int)current->tgid, filename);
return 0;
out_put_file:
fput(file);
out_return:
return -retval;
}
int mcexec_close_exec(ihk_os_t os)
{
struct mckernel_exec_file *mcef = NULL;
int found = 0;
spin_lock_irq(&mckernel_exec_file_lock);
list_for_each_entry(mcef, &mckernel_exec_files, list) {
if (mcef->os == os && mcef->pid == current->tgid) {
allow_write_access(mcef->fp);
fput(mcef->fp);
list_del(&mcef->list);
kfree(mcef);
found = 1;
dprintk("%d close_exec dropped executable \n", (int)current->tgid);
break;
}
}
spin_unlock(&mckernel_exec_file_lock);
return (found ? 0 : EINVAL);
}
long mcexec_strncpy_from_user(ihk_os_t os, struct strncpy_from_user_desc * __user arg)
{
struct strncpy_from_user_desc desc;
void *buf;
void *dest;
void *src;
unsigned long remain;
long want;
long copied;
if (copy_from_user(&desc, arg, sizeof(desc))) {
return -EFAULT;
}
buf = (void *)__get_free_page(GFP_KERNEL);
if (!buf) {
return -ENOMEM;
}
dest = desc.dest;
src = desc.src;
remain = desc.n;
want = 0;
copied = 0;
while ((remain > 0) && (want == copied)) {
want = (remain > PAGE_SIZE)? PAGE_SIZE: remain;
copied = strncpy_from_user(buf, src, want);
if (copied == want) {
if (copy_to_user(dest, buf, copied)) {
copied = -EFAULT;
}
}
else if (copied >= 0) {
if (copy_to_user(dest, buf, copied+1)) {
copied = -EFAULT;
}
}
dest += copied;
src += copied;
remain -= copied;
}
desc.result = (copied >= 0)? (desc.n - remain): copied;
free_page((unsigned long)buf);
if (copy_to_user(arg, &desc, sizeof(*arg))) {
return -EFAULT;
}
return 0;
}
long __mcctrl_control(ihk_os_t os, unsigned int req, unsigned long arg)
{
switch (req) {
case MCEXEC_UP_PREPARE_IMAGE:
return mcexec_prepare_image(os,
(struct program_load_desc *)arg);
case MCEXEC_UP_TRANSFER:
return mcexec_transfer_image(os, (struct remote_transfer *)arg);
case MCEXEC_UP_START_IMAGE:
return mcexec_start_image(os, (struct program_load_desc *)arg);
case MCEXEC_UP_WAIT_SYSCALL:
return mcexec_wait_syscall(os, (struct syscall_wait_desc *)arg);
case MCEXEC_UP_RET_SYSCALL:
return mcexec_ret_syscall(os, (struct syscall_ret_desc *)arg);
case MCEXEC_UP_LOAD_SYSCALL:
return mcexec_load_syscall(os, (struct syscall_load_desc *)arg);
case MCEXEC_UP_SEND_SIGNAL:
return mcexec_send_signal(os, (struct signal_desc *)arg);
case MCEXEC_UP_GET_CPU:
return mcexec_get_cpu(os);
case MCEXEC_UP_STRNCPY_FROM_USER:
return mcexec_strncpy_from_user(os,
(struct strncpy_from_user_desc *)arg);
case MCEXEC_UP_OPEN_EXEC:
return mcexec_open_exec(os, (char *)arg);
case MCEXEC_UP_CLOSE_EXEC:
return mcexec_close_exec(os);
case MCEXEC_UP_PREPARE_DMA:
return mcexec_pin_region(os, (unsigned long *)arg);
case MCEXEC_UP_FREE_DMA:
return mcexec_free_region(os, (unsigned long *)arg);
}
return -EINVAL;
}
void mcexec_prepare_ack(ihk_os_t os, unsigned long arg, int err)
{
struct program_load_desc *desc = phys_to_virt(arg);
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
desc->err = err;
desc->status = 1;
wake_up_all(&usrdata->wq_prepare);
}

View File

@ -1,135 +0,0 @@
/**
* \file executer/kernel/driver.c
* License details are found in the file LICENSE.
* \brief
* kernel module entry
* \author Taku Shimosawa <shimosawa@is.s.u-tokyo.ac.jp> \par
* Copyright (C) 2011 - 2012 Taku Shimosawa
* \author Balazs Gerofi <bgerofi@riken.jp> \par
* Copyright (C) 2012 RIKEN AICS
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com> \par
* Copyright (C) 2012 - 2013 Hitachi, Ltd.
* \author Tomoki Shirasawa <tomoki.shirasawa.kk@hitachi-solutions.com> \par
* Copyright (C) 2012 - 2013 Hitachi, Ltd.
* \author Balazs Gerofi <bgerofi@is.s.u-tokyo.ac.jp> \par
* Copyright (C) 2013 The University of Tokyo
*/
/*
* HISTORY:
* 2013/09/02 shirasawa add terminate thread
* 2013/08/19 shirasawa mcexec forward signal to MIC process
*/
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/miscdevice.h>
#include <linux/slab.h>
#include "mcctrl.h"
#define OS_MAX_MINOR 64
extern long __mcctrl_control(ihk_os_t, unsigned int, unsigned long);
extern int prepare_ikc_channels(ihk_os_t os);
extern void destroy_ikc_channels(ihk_os_t os);
#ifndef DO_USER_MODE
extern void mcctrl_syscall_init(void);
#endif
extern void procfs_init(int);
extern void procfs_exit(int);
static long mcctrl_ioctl(ihk_os_t os, unsigned int request, void *priv,
unsigned long arg)
{
return __mcctrl_control(os, request, arg);
}
static struct ihk_os_user_call_handler mcctrl_uchs[] = {
{ .request = MCEXEC_UP_PREPARE_IMAGE, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_TRANSFER, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_START_IMAGE, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_WAIT_SYSCALL, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_RET_SYSCALL, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_LOAD_SYSCALL, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_SEND_SIGNAL, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_GET_CPU, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_STRNCPY_FROM_USER, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_PREPARE_DMA, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_FREE_DMA, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_OPEN_EXEC, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_CLOSE_EXEC, .func = mcctrl_ioctl },
};
static struct ihk_os_user_call mcctrl_uc_proto = {
.num_handlers = sizeof(mcctrl_uchs) / sizeof(mcctrl_uchs[0]),
.handlers = mcctrl_uchs,
};
static struct ihk_os_user_call mcctrl_uc[OS_MAX_MINOR];
static ihk_os_t os[OS_MAX_MINOR];
static int __init mcctrl_init(void)
{
int i;
int rc;
rc = -ENOENT;
for(i = 0; i < OS_MAX_MINOR; i++){
os[i] = ihk_host_find_os(i, NULL);
if (os[i]) {
printk("OS #%d found.\n", i);
rc = 0;
}
}
if(rc){
printk("OS not found.\n");
return rc;
}
for(i = 0; i < OS_MAX_MINOR; i++){
if (os[i]) {
if (prepare_ikc_channels(os[i]) != 0) {
printk("Preparing syscall channels failed.\n");
os[i] = NULL;
}
}
}
#ifndef DO_USER_MODE
mcctrl_syscall_init();
#endif
for(i = 0; i < OS_MAX_MINOR; i++){
if (os[i]) {
memcpy(mcctrl_uc + i, &mcctrl_uc_proto, sizeof mcctrl_uc_proto);
rc = ihk_os_register_user_call_handlers(os[i], mcctrl_uc + i);
if(rc < 0){
destroy_ikc_channels(os[i]);
os[i] = NULL;
}
procfs_init(i);
}
}
return 0;
}
static void __exit mcctrl_exit(void)
{
int i;
printk("mcctrl: unregistered.\n");
for(i = 0; i < OS_MAX_MINOR; i++){
if(os[i]){
ihk_os_unregister_user_call_handlers(os[i], mcctrl_uc + i);
destroy_ikc_channels(os[i]);
procfs_exit(i);
}
}
}
MODULE_LICENSE("GPL v2");
module_init(mcctrl_init);
module_exit(mcctrl_exit);

View File

@ -1,188 +0,0 @@
/**
* \file mcctrl.h
* License details are found in the file LICENSE.
* \brief
* define data structure
* \author Taku Shimosawa <shimosawa@is.s.u-tokyo.ac.jp> \par
* Copyright (C) 2011 - 2012 Taku Shimosawa
* \author Balazs Gerofi <bgerofi@riken.jp> \par
* Copyright (C) 2012 RIKEN AICS
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com> \par
* Copyright (C) 2012 - 2013 Hitachi, Ltd.
* \author Tomoki Shirasawa <tomoki.shirasawa.kk@hitachi-solutions.com> \par
* Copyright (C) 2012 - 2013 Hitachi, Ltd.
* \author Balazs Gerofi <bgerofi@is.s.u-tokyo.ac.jp> \par
* Copyright (C) 2013 The University of Tokyo
*/
/*
* HISTORY:
* 2013/11/07 hamada added <sys/resource.h> which is required by getrlimit(2)
* 2013/10/21 nakamura exclude interpreter's segment from data region
* 2013/10/11 nakamura mcexec: add a upper limit of the stack size
* 2013/10/11 nakamura mcexec: add a path prefix for interpreter search
* 2013/10/11 nakamura mcexec: add a interpreter invocation
* 2013/10/08 nakamura add a AT_ENTRY entry to the auxiliary vector
* 2013/09/02 shirasawa add terminate thread
* 2013/08/19 shirasawa mcexec forward signal to MIC process
* 2013/08/07 nakamura add page fault forwarding
* 2013/07/26 shirasawa mcexec print signum or exit status
* 2013/07/17 nakamura create more mcexec thread so that all cpu to be serviced
* 2013/04/17 nakamura add generic system call forwarding
*/
#ifndef HEADER_MCCTRL_H
#define HEADER_MCCTRL_H
#include <ihk/ihk_host_driver.h>
#include <uprotocol.h>
#include <linux/wait.h>
#include <ihk/ikc.h>
#include <ikc/master.h>
#define SCD_MSG_PREPARE_PROCESS 0x1
#define SCD_MSG_PREPARE_PROCESS_ACKED 0x2
#define SCD_MSG_PREPARE_PROCESS_NACKED 0x7
#define SCD_MSG_SCHEDULE_PROCESS 0x3
#define SCD_MSG_INIT_CHANNEL 0x5
#define SCD_MSG_INIT_CHANNEL_ACKED 0x6
#define SCD_MSG_SYSCALL_ONESIDE 0x4
#define SCD_MSG_SEND_SIGNAL 0x8
#define SCD_MSG_PROCFS_CREATE 0x10
#define SCD_MSG_PROCFS_DELETE 0x11
#define SCD_MSG_PROCFS_REQUEST 0x12
#define SCD_MSG_PROCFS_ANSWER 0x13
#define DMA_PIN_SHIFT 21
#define DO_USER_MODE
#define __NR_coredump 999
struct coretable {
int len;
unsigned long addr;
};
struct ikc_scd_packet {
int msg;
int ref;
int osnum;
int pid;
int err;
unsigned long arg;
};
struct mcctrl_priv {
ihk_os_t os;
struct program_load_desc *desc;
};
struct ikc_scd_init_param {
unsigned long request_page;
unsigned long response_page;
unsigned long doorbell_page;
unsigned long post_page;
};
struct syscall_post {
unsigned long v[8];
};
struct syscall_params {
unsigned long request_pa;
struct syscall_request *request_va;
unsigned long response_rpa, response_pa;
struct syscall_response *response_va;
unsigned long post_pa;
struct syscall_post *post_va;
unsigned long doorbell_pa;
unsigned long *doorbell_va;
};
struct wait_queue_head_list_node {
struct list_head list;
wait_queue_head_t wq_syscall;
int pid;
int req;
};
struct mcctrl_channel {
struct ihk_ikc_channel_desc *c;
struct syscall_params param;
struct ikc_scd_init_param init;
void *dma_buf;
struct list_head wq_list;
ihk_spinlock_t wq_list_lock;
};
struct mcctrl_per_proc_data {
struct list_head list;
int pid;
unsigned long rpgtable; /* per process, not per OS */
};
struct mcctrl_usrdata {
struct ihk_ikc_listen_param listen_param;
struct ihk_ikc_listen_param listen_param2;
ihk_os_t os;
int num_channels;
struct mcctrl_channel *channels;
unsigned long *mcctrl_doorbell_va;
unsigned long mcctrl_doorbell_pa;
int remaining_job;
int base_cpu;
int job_pos;
int mcctrl_dma_abort;
unsigned long last_thread_exec;
wait_queue_head_t wq_prepare;
struct list_head per_proc_list;
ihk_spinlock_t per_proc_list_lock;
void **keys;
};
struct mcctrl_signal {
int cond;
int sig;
int pid;
int tid;
char info[128];
};
int mcctrl_ikc_send(ihk_os_t os, int cpu, struct ikc_scd_packet *pisp);
int mcctrl_ikc_send_msg(ihk_os_t os, int cpu, int msg, int ref, unsigned long arg);
int mcctrl_ikc_is_valid_thread(ihk_os_t os, int cpu);
int reserve_user_space(struct mcctrl_usrdata *usrdata, unsigned long *startp,
unsigned long *endp);
/* syscall.c */
int init_peer_channel_registry(struct mcctrl_usrdata *ud);
int register_peer_channel(struct mcctrl_usrdata *ud, void *key, struct mcctrl_channel *ch);
int deregister_peer_channel(struct mcctrl_usrdata *ud, void *key, struct mcctrl_channel *ch);
struct mcctrl_channel *get_peer_channel(struct mcctrl_usrdata *ud, void *key);
int __do_in_kernel_syscall(ihk_os_t os, struct mcctrl_channel *c, struct syscall_request *sc);
#define PROCFS_NAME_MAX 1000
struct procfs_read {
unsigned long pbuf; /* physical address of the host buffer (request) */
unsigned long offset; /* offset to read (request) */
int count; /* bytes to read (request) */
int eof; /* if eof is detected, 1 otherwise 0. (answer)*/
int ret; /* read bytes (answer) */
int status; /* non-zero if done (answer) */
int newcpu; /* migrated new cpu (answer) */
char fname[PROCFS_NAME_MAX]; /* procfs filename (request) */
};
struct procfs_file {
int status; /* status of processing (answer) */
int mode; /* file mode (request) */
char fname[PROCFS_NAME_MAX]; /* procfs filename (request) */
};
#endif

View File

@ -0,0 +1,27 @@
KDIR ?= @KDIR@
ARCH ?= @ARCH@
src = @abs_srcdir@
KMODDIR=@KMODDIR@
BINDIR=@BINDIR@
IHK_BASE=$(src)/../../../../ihk
obj-m += mcctrl.o
ccflags-y := -I$(IHK_BASE)/linux/include -I$(IHK_BASE)/linux/include/ihk/arch/$(ARCH) -I$(IHK_BASE)/ikc/include -I$(IHK_BASE)/ikc/include/ikc/arch/$(ARCH) -I$(IHK_BASE)/include -I$(IHK_BASE)/include/arch/$(ARCH) -I$(src)/../../include -mcmodel=kernel -mno-red-zone -DMCEXEC_PATH=\"$(BINDIR)/mcexec\" -I@abs_builddir@
mcctrl-y := driver.o control.o ikc.o syscall.o procfs.o binfmt_mcexec.o
mcctrl-y += sysfs.o sysfs_files.o arch/$(ARCH)/archdeps.o
KBUILD_EXTRA_SYMBOLS = @abs_builddir@/../../../../ihk/linux/core/Module.symvers
.PHONY: clean install modules
modules:
$(MAKE) -C $(KDIR) M=$(PWD) SUBDIRS=$(PWD) ARCH=$(ARCH) modules
clean:
$(RM) .*.cmd *.mod.c *.o *.ko* Module.symvers modules.order -r .tmp*
install:
mkdir -p -m 755 $(KMODDIR)
install -m 644 mcctrl.ko $(KMODDIR)

View File

@ -0,0 +1 @@
# dummy file

View File

@ -0,0 +1,192 @@
#include <linux/version.h>
#include "../../config.h"
#include "../../mcctrl.h"
#ifdef MCCTRL_KSYM_vdso_image_64
#if MCCTRL_KSYM_vdso_image_64
struct vdso_image *vdso_image = (void *)MCCTRL_KSYM_vdso_image_64;
#endif
#endif
#ifdef MCCTRL_KSYM_vdso_start
#if MCCTRL_KSYM_vdso_start
void *vdso_start = (void *)MCCTRL_KSYM_vdso_start;
#endif
#endif
#ifdef MCCTRL_KSYM_vdso_end
#if MCCTRL_KSYM_vdso_end
void *vdso_end = (void *)MCCTRL_KSYM_vdso_end;
#endif
#endif
#ifdef MCCTRL_KSYM_vdso_pages
#if MCCTRL_KSYM_vdso_pages
struct page **vdso_pages = (void *)MCCTRL_KSYM_vdso_pages;
#endif
#endif
#ifdef MCCTRL_KSYM___vvar_page
#if MCCTRL_KSYM___vvar_page
void *__vvar_page = (void *)MCCTRL_KSYM___vvar_page;
#endif
#endif
long *hpet_addressp
#ifdef MCCTRL_KSYM_hpet_address
#if MCCTRL_KSYM_hpet_address
= (void *)MCCTRL_KSYM_hpet_address;
#else
= &hpet_address;
#endif
#else
= NULL;
#endif
void **hv_clockp
#ifdef MCCTRL_KSYM_hv_clock
#if MCCTRL_KSYM_hv_clock
= (void *)MCCTRL_KSYM_hv_clock;
#else
= &hv_clock;
#endif
#else
= NULL;
#endif
unsigned long
reserve_user_space_common(struct mcctrl_usrdata *usrdata, unsigned long start, unsigned long end);
int
reserve_user_space(struct mcctrl_usrdata *usrdata, unsigned long *startp, unsigned long *endp)
{
struct vm_area_struct *vma;
unsigned long start = 0L;
unsigned long end;
#define DESIRED_USER_END 0x800000000000
#define GAP_FOR_MCEXEC 0x008000000000UL
end = DESIRED_USER_END;
down_write(&current->mm->mmap_sem);
vma = find_vma(current->mm, 0);
if (vma) {
end = (vma->vm_start - GAP_FOR_MCEXEC) & ~(GAP_FOR_MCEXEC - 1);
}
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,5,0)
up_write(&current->mm->mmap_sem);
#endif
start = reserve_user_space_common(usrdata, start, end);
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,5,0)
up_write(&current->mm->mmap_sem);
#endif
if (IS_ERR_VALUE(start)) {
return start;
}
*startp = start;
*endp = end;
return 0;
}
void get_vdso_info(ihk_os_t os, long vdso_rpa)
{
ihk_device_t dev = ihk_os_to_dev(os);
long vdso_pa;
struct vdso *vdso;
size_t size;
int i;
vdso_pa = ihk_device_map_memory(dev, vdso_rpa, sizeof(*vdso));
vdso = ihk_device_map_virtual(dev, vdso_pa, sizeof(*vdso), NULL, 0);
/* VDSO pages */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,16,0)
size = vdso_image->size;
vdso->vdso_npages = size >> PAGE_SHIFT;
if (vdso->vdso_npages > VDSO_MAXPAGES) {
vdso->vdso_npages = 0;
goto out;
}
for (i = 0; i < vdso->vdso_npages; ++i) {
vdso->vdso_physlist[i] = virt_to_phys(
vdso_image->data + (i * PAGE_SIZE));
}
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,23)
size = vdso_end - vdso_start;
size = (size + PAGE_SIZE - 1) & PAGE_MASK;
vdso->vdso_npages = size >> PAGE_SHIFT;
if (vdso->vdso_npages > VDSO_MAXPAGES) {
vdso->vdso_npages = 0;
goto out;
}
for (i = 0; i < vdso->vdso_npages; ++i) {
vdso->vdso_physlist[i] = page_to_phys(vdso_pages[i]);
}
#endif
/* VVAR page */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,5,0)
vdso->vvar_is_global = 0;
vdso->vvar_virt = (void *)(-3 * PAGE_SIZE);
vdso->vvar_phys = virt_to_phys(__vvar_page);
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,17,0)
vdso->vvar_is_global = 0;
vdso->vvar_virt = (void *)(-2 * PAGE_SIZE);
vdso->vvar_phys = virt_to_phys(__vvar_page);
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,16,0)
vdso->vvar_is_global = 0;
vdso->vvar_virt = (void *)(vdso->vdso_npages * PAGE_SIZE);
vdso->vvar_phys = virt_to_phys(__vvar_page);
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,1,0)
vdso->vvar_is_global = 1;
vdso->vvar_virt = (void *)fix_to_virt(VVAR_PAGE);
vdso->vvar_phys = virt_to_phys(__vvar_page);
#endif
/* HPET page */
if (hpet_addressp && *hpet_addressp) {
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,5,0)
vdso->hpet_is_global = 0;
vdso->hpet_virt = (void *)(-2 * PAGE_SIZE);
vdso->hpet_phys = *hpet_addressp;
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,17,0)
vdso->hpet_is_global = 0;
vdso->hpet_virt = (void *)(-1 * PAGE_SIZE);
vdso->hpet_phys = *hpet_addressp;
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,16,0)
vdso->hpet_is_global = 0;
vdso->hpet_virt = (void *)((vdso->vdso_npages + 1) * PAGE_SIZE);
vdso->hpet_phys = *hpet_addressp;
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,23)
vdso->hpet_is_global = 1;
vdso->hpet_virt = (void *)fix_to_virt(VSYSCALL_HPET);
vdso->hpet_phys = *hpet_addressp;
#endif
}
/* struct pvlock_vcpu_time_info table */
if (hv_clockp && *hv_clockp) {
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,5,0)
vdso->pvti_is_global = 0;
vdso->pvti_virt = (void *)(-1 * PAGE_SIZE);
vdso->pvti_phys = virt_to_phys(*hv_clockp);
#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,8,0)
vdso->pvti_is_global = 1;
vdso->pvti_virt = (void *)fix_to_virt(PVCLOCK_FIXMAP_BEGIN);
vdso->pvti_phys = virt_to_phys(*hv_clockp);
#endif
}
out:
wmb();
vdso->busy = 0;
ihk_device_unmap_virtual(dev, vdso, sizeof(*vdso));
ihk_device_unmap_memory(dev, vdso_pa, sizeof(*vdso));
return;
} /* get_vdso_info() */

View File

@ -0,0 +1,282 @@
#include <linux/module.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/binfmts.h>
#include <linux/elfcore.h>
#include <linux/elf.h>
#include <linux/init.h>
#include <linux/file.h>
#include <linux/err.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/version.h>
#include "mcctrl.h"
static int pathcheck(const char *file, const char *list)
{
const char *p;
const char *q;
const char *r;
int l;
if(!*list)
return 1;
p = list;
do{
q = strchr(p, ':');
if(!q)
q = strchr(p, '\0');
for(r = q - 1; r >= p && *r == '/'; r--);
l = r - p + 1;
if(!strncmp(file, p, l) &&
file[l] == '/')
return 1;
p = q + 1;
} while(*q);
return 0;
}
static int load_elf(struct linux_binprm *bprm
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,8,0)
, struct pt_regs *regs
#endif
)
{
#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,36)
const
#endif
char *wp;
char *cp;
struct file *file;
int rc;
struct elfhdr *elf_ex = (struct elfhdr *)bprm->buf;
typedef struct {
char *name;
char *val;
int l;
} envdata;
envdata env[] = {
{.name = "MCEXEC_WL"},
#define env_mcexec_wl (env[0].val)
{.name = NULL}
};
envdata *ep;
unsigned long off = 0;
struct page *page;
char *addr = NULL;
int i;
unsigned long p;
int st;
int mode;
int cnt[2];
char buf[32];
int l;
int pass;
char *pbuf;
const char *path;
if(bprm->envc == 0)
return -ENOEXEC;
if(memcmp(elf_ex->e_ident, ELFMAG, SELFMAG) != 0)
return -ENOEXEC;
if(elf_ex->e_type != ET_EXEC && elf_ex->e_type != ET_DYN)
return -ENOEXEC;
if(elf_ex->e_ident[EI_CLASS] != ELFCLASS64)
return -ENOEXEC;
pbuf = kmalloc(1024, GFP_ATOMIC);
if (!pbuf) {
printk("%s: error: allocating pbuf\n", __FUNCTION__);
return -ENOMEM;
}
path = d_path(&bprm->file->f_path, pbuf, 1024);
if(!path || IS_ERR(path))
path = bprm->interp;
cp = strrchr(path, '/');
if(!cp ||
!strcmp(cp, "/mcexec") ||
!strcmp(cp, "/ihkosctl") ||
!strcmp(cp, "/ihkconfig")) {
kfree(pbuf);
return -ENOEXEC;
}
cnt[0] = bprm->argc;
cnt[1] = bprm->envc;
for(pass = 0; pass < 2; pass++){
p = bprm->p;
mode = cnt[0] == 0? 1: 0;
if(pass == 1){
for(ep = env; ep->name; ep++){
if(ep->l)
ep->val = kmalloc(ep->l, GFP_KERNEL);
}
}
ep = NULL;
l = 0;
for(i = 0, st = 0; mode != 2;){
if(st == 0){
off = p & ~PAGE_MASK;
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,6,0)
rc = get_user_pages_remote(current, bprm->mm,
bprm->p, 1, 0, 1,
&page, NULL);
#else
rc = get_user_pages(current, bprm->mm,
bprm->p, 1, 0, 1,
&page, NULL);
#endif
if(rc <= 0) {
kfree(pbuf);
return -EFAULT;
}
addr = kmap_atomic(page
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,4,0)
, KM_USER0
#endif
);
st = 1;
}
if(addr[off]){
if(mode == 1){
if(ep){
if(pass == 1)
ep->val[l] = addr[off];
l++;
}
else if(addr[off] == '='){
if(l < 32)
buf[l] = '\0';
buf[31] = '\0';
for(ep = env; ep->name; ep++)
if(!strcmp(ep->name, buf))
break;
if(ep->name)
l = 0;
else
ep = NULL;
}
else{
if(l < 32)
buf[l] = addr[off];
l++;
}
}
}
else{
if(mode == 1 && ep){
if(pass == 0){
ep->l = l + 1;
}
else{
ep->val[l] = '\0';
}
}
ep = NULL;
l = 0;
i++;
if(i == cnt[mode]){
i = 0;
mode++;
}
}
off++;
p++;
if(off == PAGE_SIZE || mode == 2){
kunmap_atomic(addr
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,4,0)
, KM_USER0
#endif
);
put_page(page);
st = 0;
}
}
}
if(env_mcexec_wl)
rc = !pathcheck(path, env_mcexec_wl);
else
rc = 1;
for(ep = env; ep->name; ep++)
if(ep->val)
kfree(ep->val);
if(rc) {
kfree(pbuf);
return -ENOEXEC;
}
file = open_exec(MCEXEC_PATH);
if (IS_ERR(file)) {
kfree(pbuf);
return -ENOEXEC;
}
rc = remove_arg_zero(bprm);
if (rc){
fput(file);
kfree(pbuf);
return rc;
}
rc = copy_strings_kernel(1, &bprm->interp, bprm);
if (rc < 0){
fput(file);
kfree(pbuf);
return rc;
}
bprm->argc++;
wp = MCEXEC_PATH;
rc = copy_strings_kernel(1, &wp, bprm);
if (rc){
fput(file);
kfree(pbuf);
return rc;
}
bprm->argc++;
rc = bprm_change_interp(MCEXEC_PATH, bprm);
if (rc < 0){
fput(file);
kfree(pbuf);
return rc;
}
allow_write_access(bprm->file);
fput(bprm->file);
bprm->file = file;
rc = prepare_binprm(bprm);
if (rc < 0){
kfree(pbuf);
return rc;
}
kfree(pbuf);
return search_binary_handler(bprm
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,8,0)
, regs
#endif
);
}
static struct linux_binfmt mcexec_format = {
.module = THIS_MODULE,
.load_binary = load_elf,
};
void __init binfmt_mcexec_init(void)
{
insert_binfmt(&mcexec_format);
}
void binfmt_mcexec_exit(void)
{
unregister_binfmt(&mcexec_format);
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,203 @@
/**
* \file executer/kernel/driver.c
* License details are found in the file LICENSE.
* \brief
* kernel module entry
* \author Taku Shimosawa <shimosawa@is.s.u-tokyo.ac.jp> \par
* Copyright (C) 2011 - 2012 Taku Shimosawa
* \author Balazs Gerofi <bgerofi@riken.jp> \par
* Copyright (C) 2012 RIKEN AICS
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com> \par
* Copyright (C) 2012 - 2013 Hitachi, Ltd.
* \author Tomoki Shirasawa <tomoki.shirasawa.kk@hitachi-solutions.com> \par
* Copyright (C) 2012 - 2013 Hitachi, Ltd.
* \author Balazs Gerofi <bgerofi@is.s.u-tokyo.ac.jp> \par
* Copyright (C) 2013 The University of Tokyo
*/
/*
* HISTORY:
* 2013/09/02 shirasawa add terminate thread
* 2013/08/19 shirasawa mcexec forward signal to MIC process
*/
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/miscdevice.h>
#include <linux/slab.h>
#include <linux/device.h>
#include "mcctrl.h"
#define OS_MAX_MINOR 64
extern long __mcctrl_control(ihk_os_t, unsigned int, unsigned long,
struct file *);
extern int prepare_ikc_channels(ihk_os_t os);
extern void destroy_ikc_channels(ihk_os_t os);
#ifndef DO_USER_MODE
extern void mcctrl_syscall_init(void);
#endif
extern void procfs_init(int);
extern void procfs_exit(int);
extern void rus_page_hash_init(void);
extern void rus_page_hash_put_pages(void);
extern void binfmt_mcexec_init(void);
extern void binfmt_mcexec_exit(void);
static long mcctrl_ioctl(ihk_os_t os, unsigned int request, void *priv,
unsigned long arg, struct file *file)
{
return __mcctrl_control(os, request, arg, file);
}
static struct ihk_os_user_call_handler mcctrl_uchs[] = {
{ .request = MCEXEC_UP_PREPARE_IMAGE, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_TRANSFER, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_START_IMAGE, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_WAIT_SYSCALL, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_RET_SYSCALL, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_LOAD_SYSCALL, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_SEND_SIGNAL, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_GET_CPU, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_GET_NODES, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_STRNCPY_FROM_USER, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_NEW_PROCESS, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_PREPARE_DMA, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_FREE_DMA, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_OPEN_EXEC, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_CLOSE_EXEC, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_GET_CRED, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_GET_CREDV, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_SYS_MOUNT, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_SYS_UMOUNT, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_SYS_UNSHARE, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_DEBUG_LOG, .func = mcctrl_ioctl },
};
static struct ihk_os_user_call mcctrl_uc_proto = {
.num_handlers = sizeof(mcctrl_uchs) / sizeof(mcctrl_uchs[0]),
.handlers = mcctrl_uchs,
};
static struct ihk_os_user_call mcctrl_uc[OS_MAX_MINOR];
static ihk_os_t os[OS_MAX_MINOR];
ihk_os_t osnum_to_os(int n)
{
return os[n];
}
/* OS event notifier implementation */
int mcctrl_os_boot_notifier(int os_index)
{
int rc;
os[os_index] = ihk_host_find_os(os_index, NULL);
if (!os[os_index]) {
printk("mcctrl: error: OS ID %d couldn't be found\n", os_index);
return -EINVAL;
}
if (prepare_ikc_channels(os[os_index]) != 0) {
printk("mcctrl: error: preparing IKC channels for OS %d\n", os_index);
os[os_index] = NULL;
return -EFAULT;
}
memcpy(mcctrl_uc + os_index, &mcctrl_uc_proto, sizeof mcctrl_uc_proto);
rc = ihk_os_register_user_call_handlers(os[os_index], mcctrl_uc + os_index);
if (rc < 0) {
destroy_ikc_channels(os[os_index]);
printk("mcctrl: error: registering callbacks for OS %d\n", os_index);
goto error_cleanup_channels;
}
procfs_init(os_index);
printk("mcctrl: OS ID %d boot event handled\n", os_index);
return 0;
error_cleanup_channels:
destroy_ikc_channels(os[os_index]);
os[os_index] = NULL;
return rc;
}
int mcctrl_os_shutdown_notifier(int os_index)
{
if (os[os_index]) {
sysfsm_cleanup(os[os_index]);
free_topology_info(os[os_index]);
ihk_os_unregister_user_call_handlers(os[os_index], mcctrl_uc + os_index);
destroy_ikc_channels(os[os_index]);
procfs_exit(os_index);
}
os[os_index] = NULL;
printk("mcctrl: OS ID %d shutdown event handled\n", os_index);
return 0;
}
static struct ihk_os_notifier_ops mcctrl_os_notifier_ops = {
.boot = mcctrl_os_boot_notifier,
.shutdown = mcctrl_os_shutdown_notifier,
};
static struct ihk_os_notifier mcctrl_os_notifier = {
.ops = &mcctrl_os_notifier_ops,
};
static int __init mcctrl_init(void)
{
int ret = 0;
int i;
#ifndef DO_USER_MODE
mcctrl_syscall_init();
#endif
for (i = 0; i < OS_MAX_MINOR; ++i) {
os[i] = NULL;
}
rus_page_hash_init();
binfmt_mcexec_init();
if ((ret = ihk_host_register_os_notifier(&mcctrl_os_notifier)) != 0) {
printk("mcctrl: error: registering OS notifier\n");
goto error;
}
printk("mcctrl: initialized successfully.\n");
return ret;
error:
binfmt_mcexec_exit();
rus_page_hash_put_pages();
return ret;
}
static void __exit mcctrl_exit(void)
{
if (ihk_host_deregister_os_notifier(&mcctrl_os_notifier) != 0) {
printk("mcctrl: warning: failed to deregister OS notifier??\n");
}
binfmt_mcexec_exit();
rus_page_hash_put_pages();
printk("mcctrl: unregistered.\n");
}
MODULE_LICENSE("GPL v2");
module_init(mcctrl_init);
module_exit(mcctrl_exit);

View File

@ -27,6 +27,7 @@
#include <linux/miscdevice.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/interrupt.h>
#include "mcctrl.h"
#ifdef ATTACHED_MIC
#include <sysdeps/mic/mic/micconst.h>
@ -34,25 +35,34 @@
#define REQUEST_SHIFT 16
//#define DEBUG_IKC
#ifdef DEBUG_IKC
#define dkprintf(...) kprintf(__VA_ARGS__)
#define ekprintf(...) kprintf(__VA_ARGS__)
#else
#define dkprintf(...) do { if (0) printk(__VA_ARGS__); } while (0)
#define ekprintf(...) printk(__VA_ARGS__)
#endif
//int num_channels;
//struct mcctrl_channel *channels;
void mcexec_prepare_ack(ihk_os_t os, unsigned long arg, int err);
static void mcctrl_ikc_init(ihk_os_t os, int cpu, unsigned long rphys, struct ihk_ikc_channel_desc *c);
int mcexec_syscall(struct mcctrl_channel *c, int pid, unsigned long arg);
void procfs_create(void *__os, int ref, int osnum, int pid, unsigned long arg);
void procfs_delete(void *__os, int osnum, unsigned long arg);
void procfs_answer(unsigned long arg, int err);
int mcexec_syscall(struct mcctrl_usrdata *ud, struct ikc_scd_packet *packet);
void sig_done(unsigned long arg, int err);
/* XXX: this runs in atomic context! */
static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
void *__packet, void *__os)
{
struct ikc_scd_packet *pisp = __packet;
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(__os);
int msg = pisp->msg;
switch (pisp->msg) {
switch (msg) {
case SCD_MSG_INIT_CHANNEL:
mcctrl_ikc_init(__os, pisp->ref, pisp->arg, c);
break;
@ -66,15 +76,7 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
break;
case SCD_MSG_SYSCALL_ONESIDE:
mcexec_syscall(usrdata->channels + pisp->ref, pisp->pid, pisp->arg);
break;
case SCD_MSG_PROCFS_CREATE:
procfs_create(__os, pisp->ref, pisp->osnum, pisp->pid, pisp->arg);
break;
case SCD_MSG_PROCFS_DELETE:
procfs_delete(__os, pisp->osnum, pisp->arg);
mcexec_syscall(usrdata, pisp);
break;
case SCD_MSG_PROCFS_ANSWER:
@ -84,6 +86,43 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
case SCD_MSG_SEND_SIGNAL:
sig_done(pisp->arg, pisp->err);
break;
case SCD_MSG_SYSFS_REQ_CREATE:
case SCD_MSG_SYSFS_REQ_MKDIR:
case SCD_MSG_SYSFS_REQ_SYMLINK:
case SCD_MSG_SYSFS_REQ_LOOKUP:
case SCD_MSG_SYSFS_REQ_UNLINK:
case SCD_MSG_SYSFS_REQ_SETUP:
case SCD_MSG_SYSFS_RESP_SHOW:
case SCD_MSG_SYSFS_RESP_STORE:
case SCD_MSG_SYSFS_RESP_RELEASE:
sysfsm_packet_handler(__os, pisp->msg, pisp->err,
pisp->sysfs_arg1, pisp->sysfs_arg2);
break;
case SCD_MSG_PROCFS_TID_CREATE:
case SCD_MSG_PROCFS_TID_DELETE:
procfsm_packet_handler(__os, pisp->msg, pisp->pid, pisp->arg);
break;
case SCD_MSG_GET_VDSO_INFO:
get_vdso_info(__os, pisp->arg);
break;
default:
printk(KERN_ERR "mcctrl:syscall_packet_handler:"
"unknown message (%d.%d.%d.%d.%d.%#lx)\n",
pisp->msg, pisp->ref, pisp->osnum, pisp->pid,
pisp->err, pisp->arg);
break;
}
/*
* SCD_MSG_SYSCALL_ONESIDE holds the packet and frees is it
* mcexec_ret_syscall(), for the rest, free it here.
*/
if (msg != SCD_MSG_SYSCALL_ONESIDE) {
ihk_ikc_release_packet((struct ihk_ikc_free_packet *)__packet, c);
}
return 0;
}
@ -121,8 +160,6 @@ int mcctrl_ikc_set_recv_cpu(ihk_os_t os, int cpu)
ihk_ikc_channel_set_cpu(usrdata->channels[cpu].c,
ihk_ikc_get_processor_id());
kprintf("Setting the target to %d\n",
ihk_ikc_get_processor_id());
return 0;
}
@ -137,91 +174,26 @@ int mcctrl_ikc_is_valid_thread(ihk_os_t os, int cpu)
}
}
//unsigned long *mcctrl_doorbell_va;
//unsigned long mcctrl_doorbell_pa;
static void mcctrl_ikc_init(ihk_os_t os, int cpu, unsigned long rphys, struct ihk_ikc_channel_desc *c)
{
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
struct ikc_scd_packet packet;
struct mcctrl_channel *pmc = usrdata->channels + cpu;
unsigned long phys;
struct ikc_scd_init_param *rpm;
if(c->port == 502)
if (c->port == 502) {
pmc = usrdata->channels + usrdata->num_channels - 1;
if (!pmc) {
return;
}
printk("IKC init: cpu=%d port=%d\n", cpu, c->port);
phys = ihk_device_map_memory(ihk_os_to_dev(os), rphys,
sizeof(struct ikc_scd_init_param));
#ifdef CONFIG_MIC
rpm = ioremap_wc(phys, sizeof(struct ikc_scd_init_param));
#else
rpm = ihk_device_map_virtual(ihk_os_to_dev(os), phys,
sizeof(struct ikc_scd_init_param),
NULL, 0);
#endif
pmc->param.request_va =
(void *)__get_free_pages(GFP_KERNEL,
REQUEST_SHIFT - PAGE_SHIFT);
pmc->param.request_pa = virt_to_phys(pmc->param.request_va);
pmc->param.doorbell_va = usrdata->mcctrl_doorbell_va;
pmc->param.doorbell_pa = usrdata->mcctrl_doorbell_pa;
pmc->param.post_va = (void *)__get_free_page(GFP_KERNEL);
pmc->param.post_pa = virt_to_phys(pmc->param.post_va);
memset(pmc->param.doorbell_va, 0, PAGE_SIZE);
memset(pmc->param.request_va, 0, PAGE_SIZE);
memset(pmc->param.post_va, 0, PAGE_SIZE);
pmc->param.response_rpa = rpm->response_page;
pmc->param.response_pa
= ihk_device_map_memory(ihk_os_to_dev(os),
pmc->param.response_rpa,
PAGE_SIZE);
#ifdef CONFIG_MIC
pmc->param.response_va = ioremap_cache(pmc->param.response_pa,
PAGE_SIZE);
#else
pmc->param.response_va = ihk_device_map_virtual(ihk_os_to_dev(os),
pmc->param.response_pa,
PAGE_SIZE, NULL, 0);
#endif
pmc->dma_buf = (void *)__get_free_pages(GFP_KERNEL,
DMA_PIN_SHIFT - PAGE_SHIFT);
rpm->request_page = pmc->param.request_pa;
rpm->doorbell_page = pmc->param.doorbell_pa;
rpm->post_page = pmc->param.post_pa;
if (!pmc) {
kprintf("%s: error: no channel found?\n", __FUNCTION__);
return;
}
packet.msg = SCD_MSG_INIT_CHANNEL_ACKED;
packet.ref = cpu;
packet.arg = rphys;
printk("Request: %lx, Response: %lx, Doorbell: %lx\n",
pmc->param.request_pa, pmc->param.response_rpa,
pmc->param.doorbell_pa);
printk("Request: %p, Response: %p, Doorbell: %p\n",
pmc->param.request_va, pmc->param.response_va,
pmc->param.doorbell_va);
ihk_ikc_send(pmc->c, &packet, 0);
#ifdef CONFIG_MIC
iounmap(rpm);
#else
ihk_device_unmap_virtual(ihk_os_to_dev(os), rpm,
sizeof(struct ikc_scd_init_param));
#endif
ihk_device_unmap_memory(ihk_os_to_dev(os), phys,
sizeof(struct ikc_scd_init_param));
}
static int connect_handler(struct ihk_ikc_channel_info *param)
@ -240,11 +212,8 @@ static int connect_handler(struct ihk_ikc_channel_info *param)
}
param->packet_handler = syscall_packet_handler;
INIT_LIST_HEAD(&usrdata->channels[cpu].wq_list);
spin_lock_init(&usrdata->channels[cpu].wq_list_lock);
usrdata->channels[cpu].c = c;
kprintf("syscall: MC CPU %d connected. c=%p\n", cpu, c);
dkprintf("syscall: MC CPU %d connected. c=%p\n", cpu, c);
return 0;
}
@ -261,11 +230,8 @@ static int connect_handler2(struct ihk_ikc_channel_info *param)
param->packet_handler = syscall_packet_handler;
INIT_LIST_HEAD(&usrdata->channels[cpu].wq_list);
spin_lock_init(&usrdata->channels[cpu].wq_list_lock);
usrdata->channels[cpu].c = c;
kprintf("syscall: MC CPU %d connected. c=%p\n", cpu, c);
dkprintf("syscall: MC CPU %d connected. c=%p\n", cpu, c);
return 0;
}
@ -288,27 +254,29 @@ static struct ihk_ikc_listen_param listen_param2 = {
int prepare_ikc_channels(ihk_os_t os)
{
struct ihk_cpu_info *info;
struct mcctrl_usrdata *usrdata;
int error;
struct mcctrl_usrdata *usrdata;
int i;
usrdata = kzalloc(sizeof(struct mcctrl_usrdata), GFP_KERNEL);
usrdata->mcctrl_doorbell_va = (void *)__get_free_page(GFP_KERNEL);
usrdata->mcctrl_doorbell_pa = virt_to_phys(usrdata->mcctrl_doorbell_va);
info = ihk_os_get_cpu_info(os);
if (!info) {
printk("Error: cannot retrieve CPU info.\n");
usrdata->cpu_info = ihk_os_get_cpu_info(os);
usrdata->mem_info = ihk_os_get_memory_info(os);
if (!usrdata->cpu_info || !usrdata->mem_info) {
printk("Error: cannot obtain OS CPU and memory information.\n");
return -EINVAL;
}
if (info->n_cpus < 1) {
if (usrdata->cpu_info->n_cpus < 1) {
printk("Error: # of cpu is invalid.\n");
return -EINVAL;
}
usrdata->num_channels = info->n_cpus + 1;
usrdata->channels = kzalloc(sizeof(struct mcctrl_channel) * usrdata->num_channels,
GFP_KERNEL);
usrdata->num_channels = usrdata->cpu_info->n_cpus + 1;
usrdata->channels = kzalloc(sizeof(struct mcctrl_channel) *
usrdata->num_channels,
GFP_KERNEL);
if (!usrdata->channels) {
printk("Error: cannot allocate channels.\n");
return -ENOMEM;
@ -322,33 +290,20 @@ int prepare_ikc_channels(ihk_os_t os)
memcpy(&usrdata->listen_param2, &listen_param2, sizeof listen_param2);
ihk_ikc_listen_port(os, &usrdata->listen_param2);
INIT_LIST_HEAD(&usrdata->per_proc_list);
spin_lock_init(&usrdata->per_proc_list_lock);
error = init_peer_channel_registry(usrdata);
if (error) {
return error;
for (i = 0; i < MCCTRL_PER_PROC_DATA_HASH_SIZE; ++i) {
INIT_LIST_HEAD(&usrdata->per_proc_data_hash[i]);
rwlock_init(&usrdata->per_proc_data_hash_lock[i]);
}
INIT_LIST_HEAD(&usrdata->cpu_topology_list);
INIT_LIST_HEAD(&usrdata->node_topology_list);
return 0;
}
void __destroy_ikc_channel(ihk_os_t os, struct mcctrl_channel *pmc)
{
free_pages((unsigned long)pmc->param.request_va,
REQUEST_SHIFT - PAGE_SHIFT);
free_page((unsigned long)pmc->param.post_va);
#ifdef CONFIG_MIC
iounmap(pmc->param.response_va);
#else
ihk_device_unmap_virtual(ihk_os_to_dev(os), pmc->param.response_va,
PAGE_SIZE);
#endif
ihk_device_unmap_memory(ihk_os_to_dev(os),
pmc->param.response_pa, PAGE_SIZE);
free_pages((unsigned long)pmc->dma_buf,
DMA_PIN_SHIFT - PAGE_SHIFT);
return;
}
void destroy_ikc_channels(ihk_os_t os)
@ -356,6 +311,11 @@ void destroy_ikc_channels(ihk_os_t os)
int i;
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
if (!usrdata) {
printk("%s: WARNING: no mcctrl_usrdata found\n", __FUNCTION__);
return;
}
ihk_host_os_set_usrdata(os, NULL);
for (i = 0; i < usrdata->num_channels; i++) {
@ -366,7 +326,6 @@ void destroy_ikc_channels(ihk_os_t os)
printk("Channel #%d freed.\n", i);
}
}
free_page((unsigned long)usrdata->mcctrl_doorbell_va);
kfree(usrdata->channels);
kfree(usrdata);

View File

@ -0,0 +1,389 @@
/**
* \file mcctrl.h
* License details are found in the file LICENSE.
* \brief
* define data structure
* \author Taku Shimosawa <shimosawa@is.s.u-tokyo.ac.jp> \par
* Copyright (C) 2011 - 2012 Taku Shimosawa
* \author Balazs Gerofi <bgerofi@riken.jp> \par
* Copyright (C) 2012 RIKEN AICS
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com> \par
* Copyright (C) 2012 - 2013 Hitachi, Ltd.
* \author Tomoki Shirasawa <tomoki.shirasawa.kk@hitachi-solutions.com> \par
* Copyright (C) 2012 - 2013 Hitachi, Ltd.
* \author Balazs Gerofi <bgerofi@is.s.u-tokyo.ac.jp> \par
* Copyright (C) 2013 The University of Tokyo
*/
/*
* HISTORY:
* 2013/11/07 hamada added <sys/resource.h> which is required by getrlimit(2)
* 2013/10/21 nakamura exclude interpreter's segment from data region
* 2013/10/11 nakamura mcexec: add a upper limit of the stack size
* 2013/10/11 nakamura mcexec: add a path prefix for interpreter search
* 2013/10/11 nakamura mcexec: add a interpreter invocation
* 2013/10/08 nakamura add a AT_ENTRY entry to the auxiliary vector
* 2013/09/02 shirasawa add terminate thread
* 2013/08/19 shirasawa mcexec forward signal to MIC process
* 2013/08/07 nakamura add page fault forwarding
* 2013/07/26 shirasawa mcexec print signum or exit status
* 2013/07/17 nakamura create more mcexec thread so that all cpu to be serviced
* 2013/04/17 nakamura add generic system call forwarding
*/
#ifndef HEADER_MCCTRL_H
#define HEADER_MCCTRL_H
#include <linux/fs.h>
#include <ihk/ihk_host_driver.h>
#include <linux/resource.h>
#include <uprotocol.h>
#include <linux/wait.h>
#include <ihk/ikc.h>
#include <ikc/master.h>
#include <ihk/msr.h>
#include <linux/semaphore.h>
#include <linux/rwlock.h>
#include <linux/threads.h>
#include "sysfs.h"
#define SCD_MSG_PREPARE_PROCESS 0x1
#define SCD_MSG_PREPARE_PROCESS_ACKED 0x2
#define SCD_MSG_PREPARE_PROCESS_NACKED 0x7
#define SCD_MSG_SCHEDULE_PROCESS 0x3
#define SCD_MSG_WAKE_UP_SYSCALL_THREAD 0x14
#define SCD_MSG_INIT_CHANNEL 0x5
#define SCD_MSG_INIT_CHANNEL_ACKED 0x6
#define SCD_MSG_SYSCALL_ONESIDE 0x4
#define SCD_MSG_SEND_SIGNAL 0x8
#define SCD_MSG_CLEANUP_PROCESS 0x9
#define SCD_MSG_GET_VDSO_INFO 0xa
//#define SCD_MSG_GET_CPU_MAPPING 0xc
//#define SCD_MSG_REPLY_GET_CPU_MAPPING 0xd
#define SCD_MSG_PROCFS_CREATE 0x10
#define SCD_MSG_PROCFS_DELETE 0x11
#define SCD_MSG_PROCFS_REQUEST 0x12
#define SCD_MSG_PROCFS_ANSWER 0x13
#define SCD_MSG_DEBUG_LOG 0x20
#define SCD_MSG_SYSFS_REQ_CREATE 0x30
/* #define SCD_MSG_SYSFS_RESP_CREATE 0x31 */
#define SCD_MSG_SYSFS_REQ_MKDIR 0x32
/* #define SCD_MSG_SYSFS_RESP_MKDIR 0x33 */
#define SCD_MSG_SYSFS_REQ_SYMLINK 0x34
/* #define SCD_MSG_SYSFS_RESP_SYMLINK 0x35 */
#define SCD_MSG_SYSFS_REQ_LOOKUP 0x36
/* #define SCD_MSG_SYSFS_RESP_LOOKUP 0x37 */
#define SCD_MSG_SYSFS_REQ_UNLINK 0x38
/* #define SCD_MSG_SYSFS_RESP_UNLINK 0x39 */
#define SCD_MSG_SYSFS_REQ_SHOW 0x3a
#define SCD_MSG_SYSFS_RESP_SHOW 0x3b
#define SCD_MSG_SYSFS_REQ_STORE 0x3c
#define SCD_MSG_SYSFS_RESP_STORE 0x3d
#define SCD_MSG_SYSFS_REQ_RELEASE 0x3e
#define SCD_MSG_SYSFS_RESP_RELEASE 0x3f
#define SCD_MSG_SYSFS_REQ_SETUP 0x40
#define SCD_MSG_SYSFS_RESP_SETUP 0x41
/* #define SCD_MSG_SYSFS_REQ_CLEANUP 0x42 */
/* #define SCD_MSG_SYSFS_RESP_CLEANUP 0x43 */
#define SCD_MSG_PROCFS_TID_CREATE 0x44
#define SCD_MSG_PROCFS_TID_DELETE 0x45
#define DMA_PIN_SHIFT 21
#define DO_USER_MODE
#define __NR_coredump 999
struct coretable {
int len;
unsigned long addr;
};
struct ikc_scd_packet {
int msg;
int err;
union {
/* for traditional SCD_MSG_* */
struct {
int ref;
int osnum;
int pid;
unsigned long arg;
struct syscall_request req;
unsigned long resp_pa;
};
/* for SCD_MSG_SYSFS_* */
struct {
long sysfs_arg1;
long sysfs_arg2;
long sysfs_arg3;
};
/* SCD_MSG_SCHEDULE_THREAD */
struct {
int ttid;
};
};
char padding[12];
};
struct mcctrl_priv {
ihk_os_t os;
struct program_load_desc *desc;
};
struct ikc_scd_init_param {
unsigned long request_page;
unsigned long response_page;
unsigned long doorbell_page;
unsigned long post_page;
};
struct syscall_post {
unsigned long v[8];
};
struct syscall_params {
unsigned long request_pa;
struct syscall_request *request_va;
unsigned long response_rpa, response_pa;
struct syscall_response *response_va;
unsigned long post_pa;
struct syscall_post *post_va;
unsigned long doorbell_pa;
unsigned long *doorbell_va;
};
struct wait_queue_head_list_node {
struct list_head list;
wait_queue_head_t wq_syscall;
struct task_struct *task;
/* Denotes an exclusive wait for requester TID rtid */
int rtid;
int req;
struct ikc_scd_packet *packet;
};
struct mcctrl_channel {
struct ihk_ikc_channel_desc *c;
struct ikc_scd_init_param init;
void *dma_buf;
};
struct mcctrl_per_thread_data {
struct list_head hash;
struct task_struct *task;
void *data;
};
#define MCCTRL_PER_THREAD_DATA_HASH_SHIFT 8
#define MCCTRL_PER_THREAD_DATA_HASH_SIZE (1 << MCCTRL_PER_THREAD_DATA_HASH_SHIFT)
#define MCCTRL_PER_THREAD_DATA_HASH_MASK (MCCTRL_PER_THREAD_DATA_HASH_SIZE - 1)
struct mcctrl_per_proc_data {
struct list_head hash;
int pid;
unsigned long rpgtable; /* per process, not per OS */
struct list_head wq_list;
struct list_head wq_req_list;
struct list_head wq_list_exact;
ihk_spinlock_t wq_list_lock;
struct list_head per_thread_data_hash[MCCTRL_PER_THREAD_DATA_HASH_SIZE];
rwlock_t per_thread_data_hash_lock[MCCTRL_PER_THREAD_DATA_HASH_SIZE];
};
struct sysfsm_req {
int busy;
int padding;
long lresult;
wait_queue_head_t wq;
};
struct sysfsm_data {
size_t sysfs_bufsize;
void *sysfs_buf;
long sysfs_buf_rpa;
long sysfs_buf_pa;
struct kobject *sysfs_kobj;
struct sysfsm_node *sysfs_root;
struct semaphore sysfs_tree_sem;
struct semaphore sysfs_io_sem;
struct sysfsm_req sysfs_req;
ihk_os_t sysfs_os;
};
static inline int sysfs_inited(struct sysfsm_data *sdp)
{
return !!(sdp->sysfs_buf);
} /* sysfs_inited() */
struct cache_topology {
struct ihk_cache_topology *saved;
cpumask_t shared_cpu_map;
struct list_head chain;
};
struct cpu_topology {
//struct mcctrl_usrdata *udp;
struct ihk_cpu_topology *saved;
int mckernel_cpu_id;
cpumask_t core_siblings;
cpumask_t thread_siblings;
struct list_head chain;
struct list_head cache_list;
};
#define NODE_DISTANCE_S_SIZE 1024
struct node_topology {
struct ihk_node_topology *saved;
int mckernel_numa_id;
char mckernel_numa_distance_s[NODE_DISTANCE_S_SIZE];
cpumask_t cpumap;
struct list_head chain;
};
#define CPU_LONGS (((NR_CPUS) + (BITS_PER_LONG) - 1) / (BITS_PER_LONG))
#define MCCTRL_PER_PROC_DATA_HASH_SHIFT 7
#define MCCTRL_PER_PROC_DATA_HASH_SIZE (1 << MCCTRL_PER_PROC_DATA_HASH_SHIFT)
#define MCCTRL_PER_PROC_DATA_HASH_MASK (MCCTRL_PER_PROC_DATA_HASH_SIZE - 1)
struct mcctrl_usrdata {
struct ihk_ikc_listen_param listen_param;
struct ihk_ikc_listen_param listen_param2;
ihk_os_t os;
int num_channels;
struct mcctrl_channel *channels;
int remaining_job;
int base_cpu;
int job_pos;
int mcctrl_dma_abort;
unsigned long last_thread_exec;
wait_queue_head_t wq_prepare;
struct list_head per_proc_data_hash[MCCTRL_PER_PROC_DATA_HASH_SIZE];
rwlock_t per_proc_data_hash_lock[MCCTRL_PER_PROC_DATA_HASH_SIZE];
void **keys;
struct sysfsm_data sysfsm_data;
unsigned long cpu_online[CPU_LONGS];
struct ihk_cpu_info *cpu_info;
struct ihk_mem_info *mem_info;
nodemask_t numa_online;
struct list_head cpu_topology_list;
struct list_head node_topology_list;
};
struct mcctrl_signal {
int cond;
int sig;
int pid;
int tid;
char info[128];
};
int mcctrl_ikc_send(ihk_os_t os, int cpu, struct ikc_scd_packet *pisp);
int mcctrl_ikc_send_msg(ihk_os_t os, int cpu, int msg, int ref, unsigned long arg);
int mcctrl_ikc_is_valid_thread(ihk_os_t os, int cpu);
ihk_os_t osnum_to_os(int n);
/* syscall.c */
int __do_in_kernel_syscall(ihk_os_t os, struct ikc_scd_packet *packet);
int mcctrl_add_per_proc_data(struct mcctrl_usrdata *ud, int pid,
struct mcctrl_per_proc_data *ppd);
int mcctrl_delete_per_proc_data(struct mcctrl_usrdata *ud, int pid);
inline struct mcctrl_per_proc_data *mcctrl_get_per_proc_data(
struct mcctrl_usrdata *ud, int pid);
int mcctrl_add_per_thread_data(struct mcctrl_per_proc_data* ppd,
struct task_struct *task, void *data);
int mcctrl_delete_per_thread_data(struct mcctrl_per_proc_data* ppd,
struct task_struct *task);
inline struct mcctrl_per_thread_data *mcctrl_get_per_thread_data(
struct mcctrl_per_proc_data *ppd, struct task_struct *task);
void __return_syscall(ihk_os_t os, struct ikc_scd_packet *packet,
long ret, int stid);
#define PROCFS_NAME_MAX 768
struct procfs_read {
unsigned long pbuf; /* physical address of the host buffer (request) */
unsigned long offset; /* offset to read (request) */
int count; /* bytes to read (request) */
int eof; /* if eof is detected, 1 otherwise 0. (answer)*/
int ret; /* read bytes (answer) */
int status; /* non-zero if done (answer) */
int newcpu; /* migrated new cpu (answer) */
int readwrite; /* 0:read, 1:write */
char fname[PROCFS_NAME_MAX]; /* procfs filename (request) */
};
struct procfs_file {
int status; /* status of processing (answer) */
int mode; /* file mode (request) */
char fname[PROCFS_NAME_MAX]; /* procfs filename (request) */
};
void procfs_answer(unsigned int arg, int err);
int procfsm_packet_handler(void *os, int msg, int pid, unsigned long arg);
void add_tid_entry(int osnum, int pid, int tid);
void add_pid_entry(int osnum, int pid);
void delete_tid_entry(int osnum, int pid, int tid);
void delete_pid_entry(int osnum, int pid);
void proc_exe_link(int osnum, int pid, const char *path);
void procfs_init(int osnum);
void procfs_exit(int osnum);
/* sysfs_files.c */
void setup_sysfs_files(ihk_os_t os);
void reply_get_cpu_mapping(long req_pa);
void free_topology_info(ihk_os_t os);
/* archdep.c */
#define VDSO_MAXPAGES 2
struct vdso {
long busy;
int vdso_npages;
char vvar_is_global;
char hpet_is_global;
char pvti_is_global;
char padding;
long vdso_physlist[VDSO_MAXPAGES];
void *vvar_virt;
long vvar_phys;
void *hpet_virt;
long hpet_phys;
void *pvti_virt;
long pvti_phys;
};
int reserve_user_space(struct mcctrl_usrdata *usrdata, unsigned long *startp,
unsigned long *endp);
void get_vdso_info(ihk_os_t os, long vdso_pa);
struct get_cpu_mapping_req {
int busy; /* INOUT: */
int error; /* OUT: */
long buf_rpa; /* OUT: physical address of struct cpu_mapping */
int buf_elems; /* OUT: # of elements of buf */
int padding;
/* work for mcctrl */
wait_queue_head_t wq;
};
#endif

View File

@ -0,0 +1,838 @@
/**
* \file procfs.c
* License details are found in the file LICENSE.
* \brief
* mcctrl procfs
* \author Naoki Hamada <nao@axe.bz> \par
* Copyright (C) 2014 AXE, Inc.
*/
/*
* HISTORY:
*/
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/proc_fs.h>
#include <linux/list.h>
#include <linux/uaccess.h>
#include <linux/fs.h>
#include <linux/resource.h>
#include <linux/interrupt.h>
#include "mcctrl.h"
#include <linux/version.h>
#include <linux/semaphore.h>
//#define PROCFS_DEBUG
#ifdef PROCFS_DEBUG
#define dprintk(...) printk(__VA_ARGS__)
#else
#define dprintk(...)
#endif
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,5,0)
typedef uid_t kuid_t;
typedef gid_t kgid_t;
#endif
struct procfs_entry {
char *name;
mode_t mode;
const struct file_operations *fops;
};
#define NOD(NAME, MODE, FOP) { \
.name = (NAME), \
.mode = MODE, \
.fops = FOP, \
}
#define PROC_DIR(NAME, MODE) \
NOD(NAME, (S_IFDIR|(MODE)), NULL)
#define PROC_REG(NAME, MODE, fops) \
NOD(NAME, (S_IFREG|(MODE)), fops)
#define PROC_TERM \
NOD(NULL, 0, NULL)
static const struct procfs_entry tid_entry_stuff[];
static const struct procfs_entry pid_entry_stuff[];
static const struct procfs_entry base_entry_stuff[];
static const struct file_operations mckernel_forward_ro;
static const struct file_operations mckernel_forward;
static DECLARE_WAIT_QUEUE_HEAD(procfsq);
static ssize_t mckernel_procfs_read(struct file *file, char __user *buf,
size_t nbytes, loff_t *ppos);
/* A private data for the procfs driver. */
struct procfs_list_entry;
struct procfs_list_entry {
struct list_head list;
struct proc_dir_entry *entry;
struct procfs_list_entry *parent;
struct list_head children;
int osnum;
char *data;
char name[0];
};
/*
* In the procfs_file_list, mckenrel procfs files are
* listed in the manner that the leaf file is located
* always nearer to the list top than its parent node
* file.
*/
LIST_HEAD(procfs_file_list);
DEFINE_SEMAPHORE(procfs_file_list_lock);
static char *
getpath(struct procfs_list_entry *e, char *buf, int bufsize)
{
char *w = buf + bufsize - 1;
*w = '\0';
for(;;){
int l = strlen(e->name);
w -= l;
memcpy(w, e->name, l);
e = e->parent;
if(!e)
return w;
w--;
*w = '/';
}
}
/**
* \brief Process SCD_MSG_PROCFS_ANSWER message.
*
* \param arg sent argument
* \param err error info (redundant)
*/
void
procfs_answer(unsigned int arg, int err)
{
dprintk("procfs: received SCD_MSG_PROCFS_ANSWER message(err = %d).\n", err);
wake_up_interruptible(&procfsq);
}
static struct procfs_list_entry *
find_procfs_entry(struct procfs_list_entry *parent, const char *name)
{
struct list_head *list;
struct procfs_list_entry *e;
if(parent == NULL)
list = &procfs_file_list;
else
list = &parent->children;
list_for_each_entry(e, list, list) {
if(!strcmp(e->name, name))
return e;
}
return NULL;
}
static void
delete_procfs_entries(struct procfs_list_entry *top)
{
struct procfs_list_entry *e;
struct procfs_list_entry *n;
list_del(&top->list);
list_for_each_entry_safe(e, n, &top->children, list) {
delete_procfs_entries(e);
}
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
e->entry->read_proc = NULL;
e->entry->data = NULL;
#endif
remove_proc_entry(top->name, top->parent? top->parent->entry: NULL);
if(top->data)
kfree(top->data);
kfree(top);
}
static struct procfs_list_entry *
add_procfs_entry(struct procfs_list_entry *parent, const char *name, int mode,
kuid_t uid, kgid_t gid, const void *opaque)
{
struct procfs_list_entry *e = find_procfs_entry(parent, name);
struct proc_dir_entry *pde;
struct proc_dir_entry *parent_pde = NULL;
int f_mode = mode & 0777;
if(e)
delete_procfs_entries(e);
e = kmalloc(sizeof(struct procfs_list_entry) + strlen(name) + 1,
GFP_KERNEL);
if(!e){
kprintf("ERROR: not enough memory to create PROCFS entry.\n");
return NULL;
}
memset(e, '\0', sizeof(struct procfs_list_entry));
INIT_LIST_HEAD(&e->children);
strcpy(e->name, name);
if(parent)
parent_pde = parent->entry;
if (mode & S_IFDIR) {
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
pde = proc_mkdir(name, parent_pde);
#else
pde = proc_mkdir_data(name, f_mode, parent_pde, e);
#endif
}
else if ((mode & S_IFLNK) == S_IFLNK) {
pde = proc_symlink(name, parent_pde, (char *)opaque);
}
else {
const struct file_operations *fop;
if(opaque)
fop = (const struct file_operations *)opaque;
else if(mode & S_IWUSR)
fop = &mckernel_forward;
else
fop = &mckernel_forward_ro;
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
pde = create_proc_entry(name, f_mode, parent_pde);
if(pde)
pde->proc_fops = fop;
#else
pde = proc_create_data(name, f_mode, parent_pde, fop, e);
if(pde)
proc_set_user(pde, uid, gid);
#endif
}
if(!pde){
kprintf("ERROR: cannot create a PROCFS entry for %s.\n", name);
kfree(e);
return NULL;
}
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
pde->uid = uid;
pde->gid = gid;
pde->data = e;
#endif
if(parent)
e->osnum = parent->osnum;
e->entry = pde;
e->parent = parent;
list_add(&(e->list), parent? &(parent->children): &procfs_file_list);
return e;
}
static void
add_procfs_entries(struct procfs_list_entry *parent,
const struct procfs_entry *entries, kuid_t uid, kgid_t gid)
{
const struct procfs_entry *p;
for(p = entries; p->name; p++){
add_procfs_entry(parent, p->name, p->mode, uid, gid, p->fops);
}
}
static const struct cred *
get_pid_cred(int pid)
{
struct task_struct *task = NULL;
if(pid > 0){
task = pid_task(find_vpid(pid), PIDTYPE_PID);
if(task){
return __task_cred(task);
}
}
return NULL;
}
static struct procfs_list_entry *
find_base_entry(int osnum)
{
char name[12];
sprintf(name, "mcos%d", osnum);
return find_procfs_entry(NULL, name);
}
static struct procfs_list_entry *
find_pid_entry(int osnum, int pid)
{
struct procfs_list_entry *e;
char name[12];
if(!(e = find_base_entry(osnum)))
return NULL;
sprintf(name, "%d", pid);
return find_procfs_entry(e, name);
}
static struct procfs_list_entry *
find_tid_entry(int osnum, int pid, int tid)
{
struct procfs_list_entry *e;
char name[12];
if(!(e = find_pid_entry(osnum, pid)))
return NULL;
if(!(e = find_procfs_entry(e, "task")))
return NULL;
sprintf(name, "%d", tid);
return find_procfs_entry(e, name);
}
static struct procfs_list_entry *
get_base_entry(int osnum)
{
struct procfs_list_entry *e;
char name[12];
kuid_t uid = KUIDT_INIT(0);
kgid_t gid = KGIDT_INIT(0);
sprintf(name, "mcos%d", osnum);
e = find_procfs_entry(NULL, name);
if(!e){
e = add_procfs_entry(NULL, name, S_IFDIR | 0555,
uid, gid, NULL);
e->osnum = osnum;
}
return e;
}
static struct procfs_list_entry *
get_pid_entry(int osnum, int pid)
{
struct procfs_list_entry *parent;
struct procfs_list_entry *e;
char name[12];
kuid_t uid = KUIDT_INIT(0);
kgid_t gid = KGIDT_INIT(0);
sprintf(name, "mcos%d", osnum);
if(!(parent = find_procfs_entry(NULL, name)))
return NULL;
sprintf(name, "%d", pid);
e = find_procfs_entry(parent, name);
if(!e)
e = add_procfs_entry(parent, name, S_IFDIR | 0555,
uid, gid, NULL);
return e;
}
static struct procfs_list_entry *
get_tid_entry(int osnum, int pid, int tid)
{
struct procfs_list_entry *parent;
struct procfs_list_entry *e;
char name[12];
kuid_t uid = KUIDT_INIT(0);
kgid_t gid = KGIDT_INIT(0);
sprintf(name, "mcos%d", osnum);
if(!(parent = find_procfs_entry(NULL, name)))
return NULL;
sprintf(name, "%d", pid);
if(!(parent = find_procfs_entry(parent, name)))
return NULL;
if(!(parent = find_procfs_entry(parent, "task")))
return NULL;
sprintf(name, "%d", tid);
e = find_procfs_entry(parent, name);
if(!e)
e = add_procfs_entry(parent, name, S_IFDIR | 0555,
uid, gid, NULL);
return e;
}
static void
_add_tid_entry(int osnum, int pid, int tid, const struct cred *cred)
{
struct procfs_list_entry *parent;
struct procfs_list_entry *exe;
parent = get_tid_entry(osnum, pid, tid);
if(parent){
add_procfs_entries(parent, tid_entry_stuff,
cred->uid, cred->gid);
exe = find_procfs_entry(parent->parent->parent, "exe");
if(exe){
add_procfs_entry(parent, "exe", S_IFLNK | 0777,
cred->uid, cred->gid, exe->data);
}
}
}
void
add_tid_entry(int osnum, int pid, int tid)
{
const struct cred *cred = get_pid_cred(pid);
if(!cred)
return;
down(&procfs_file_list_lock);
_add_tid_entry(osnum, pid, tid, cred);
up(&procfs_file_list_lock);
}
void
add_pid_entry(int osnum, int pid)
{
struct procfs_list_entry *parent;
const struct cred *cred = get_pid_cred(pid);
if(!cred)
return;
down(&procfs_file_list_lock);
parent = get_pid_entry(osnum, pid);
add_procfs_entries(parent, pid_entry_stuff, cred->uid, cred->gid);
_add_tid_entry(osnum, pid, pid, cred);
up(&procfs_file_list_lock);
}
void
delete_tid_entry(int osnum, int pid, int tid)
{
struct procfs_list_entry *e;
down(&procfs_file_list_lock);
e = find_tid_entry(osnum, pid, tid);
if(e)
delete_procfs_entries(e);
up(&procfs_file_list_lock);
}
void
delete_pid_entry(int osnum, int pid)
{
struct procfs_list_entry *e;
down(&procfs_file_list_lock);
e = find_pid_entry(osnum, pid);
if(e)
delete_procfs_entries(e);
up(&procfs_file_list_lock);
}
void
proc_exe_link(int osnum, int pid, const char *path)
{
struct procfs_list_entry *parent;
kuid_t uid = KUIDT_INIT(0);
kgid_t gid = KGIDT_INIT(0);
down(&procfs_file_list_lock);
parent = find_pid_entry(osnum, pid);
if(parent){
struct procfs_list_entry *task;
struct procfs_list_entry *e;
e = add_procfs_entry(parent, "exe", S_IFLNK | 0777, uid, gid,
path);
e->data = kmalloc(strlen(path) + 1, GFP_KERNEL);
strcpy(e->data, path);
task = find_procfs_entry(parent, "task");
list_for_each_entry(parent, &task->children, list) {
add_procfs_entry(parent, "exe", S_IFLNK | 0777,
uid, gid, path);
}
}
up(&procfs_file_list_lock);
}
/**
* \brief Initialization for procfs
*
* \param osnum os number
*/
void
procfs_init(int osnum)
{
struct procfs_list_entry *parent;
kuid_t uid = KUIDT_INIT(0);
kgid_t gid = KGIDT_INIT(0);
down(&procfs_file_list_lock);
parent = get_base_entry(osnum);
add_procfs_entries(parent, base_entry_stuff, uid, gid);
up(&procfs_file_list_lock);
}
/**
* \brief Finalization for procfs
*
* \param osnum os number
*/
void
procfs_exit(int osnum)
{
struct procfs_list_entry *e;
down(&procfs_file_list_lock);
e = find_base_entry(osnum);
if (e) {
delete_procfs_entries(e);
}
up(&procfs_file_list_lock);
}
/**
* \brief The callback funciton for McKernel procfs
*
* This function conforms to the 2) way of fs/proc/generic.c
* from linux-2.6.39.4.
*/
static ssize_t
mckernel_procfs_read(struct file *file, char __user *buf, size_t nbytes,
loff_t *ppos)
{
struct inode * inode = file->f_inode;
char *kern_buffer = NULL;
int order = 0;
volatile struct procfs_read *r = NULL;
struct ikc_scd_packet isp;
int ret;
unsigned long pbuf;
unsigned long count = nbytes;
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
struct proc_dir_entry *dp = PDE(inode);
struct procfs_list_entry *e = dp->data;
#else
struct procfs_list_entry *e = PDE_DATA(inode);
#endif
loff_t offset = *ppos;
char pathbuf[PROCFS_NAME_MAX];
char *path;
path = getpath(e, pathbuf, 256);
dprintk("mckernel_procfs_read: invoked for %s, offset: %lu, count: %d\n",
path, offset, count);
if (count <= 0 || offset < 0) {
return 0;
}
while ((1 << order) < count) ++order;
if (order > 12) {
order -= 12;
}
else {
order = 1;
}
/* NOTE: we need physically contigous memory to pass through IKC */
kern_buffer = (char *)__get_free_pages(GFP_KERNEL, order);
if (!kern_buffer) {
printk("mckernel_procfs_read(): ERROR: allocating kernel buffer\n");
return -ENOMEM;
}
pbuf = virt_to_phys(kern_buffer);
r = kmalloc(sizeof(struct procfs_read), GFP_KERNEL);
if (r == NULL) {
ret = -ENOMEM;
goto out;
}
r->pbuf = pbuf;
r->eof = 0;
r->ret = -EIO; /* default */
r->status = 0;
r->offset = offset;
r->count = count;
r->readwrite = 0;
strncpy((char *)r->fname, path, PROCFS_NAME_MAX);
isp.msg = SCD_MSG_PROCFS_REQUEST;
isp.ref = 0;
isp.arg = virt_to_phys(r);
ret = mcctrl_ikc_send(osnum_to_os(e->osnum), 0, &isp);
if (ret < 0) {
goto out; /* error */
}
/* Wait for a reply. */
ret = -EIO; /* default exit code */
dprintk("now wait for a relpy\n");
/* Wait for the status field of the procfs_read structure set ready. */
if (wait_event_interruptible_timeout(procfsq, r->status != 0, HZ) == 0) {
kprintf("ERROR: mckernel_procfs_read: timeout (1 sec).\n");
goto out;
}
/* Wake up and check the result. */
dprintk("mckernel_procfs_read: woke up. ret: %d, eof: %d\n", r->ret, r->eof);
if (r->ret > 0) {
if (copy_to_user(buf, kern_buffer, r->ret)) {
kprintf("ERROR: mckernel_procfs_read: copy_to_user failed.\n");
ret = -EFAULT;
goto out;
}
*ppos += r->ret;
}
ret = r->ret;
out:
if(kern_buffer)
free_pages((uintptr_t)kern_buffer, order);
if(r)
kfree((void *)r);
return ret;
}
static ssize_t
mckernel_procfs_write(struct file *file, const char __user *buf, size_t nbytes,
loff_t *ppos)
{
struct inode * inode = file->f_inode;
char *kern_buffer = NULL;
int order = 0;
volatile struct procfs_read *r = NULL;
struct ikc_scd_packet isp;
int ret;
unsigned long pbuf;
unsigned long count = nbytes;
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0)
struct proc_dir_entry *dp = PDE(inode);
struct procfs_list_entry *e = dp->data;
#else
struct procfs_list_entry *e = PDE_DATA(inode);
#endif
loff_t offset = *ppos;
char pathbuf[PROCFS_NAME_MAX];
char *path;
path = getpath(e, pathbuf, 256);
dprintk("mckernel_procfs_read: invoked for %s, offset: %lu, count: %d\n",
path, offset, count);
if (count <= 0 || offset < 0) {
return 0;
}
while ((1 << order) < count) ++order;
if (order > 12) {
order -= 12;
}
else {
order = 1;
}
/* NOTE: we need physically contigous memory to pass through IKC */
kern_buffer = (char *)__get_free_pages(GFP_KERNEL, order);
if (!kern_buffer) {
printk("mckernel_procfs_read(): ERROR: allocating kernel buffer\n");
return -ENOMEM;
}
if (copy_from_user(kern_buffer, buf, nbytes)) {
ret = -EFAULT;
goto out;
}
pbuf = virt_to_phys(kern_buffer);
r = kmalloc(sizeof(struct procfs_read), GFP_KERNEL);
if (r == NULL) {
ret = -ENOMEM;
goto out;
}
dprintk("offset: %lx, count: %d, cpu: %d\n", offset, count, e->cpu);
r->pbuf = pbuf;
r->eof = 0;
r->ret = -EIO; /* default */
r->status = 0;
r->offset = offset;
r->count = count;
r->readwrite = 1;
strncpy((char *)r->fname, path, PROCFS_NAME_MAX);
isp.msg = SCD_MSG_PROCFS_REQUEST;
isp.ref = 0;
isp.arg = virt_to_phys(r);
ret = mcctrl_ikc_send(osnum_to_os(e->osnum), 0, &isp);
if (ret < 0) {
goto out; /* error */
}
/* Wait for a reply. */
ret = -EIO; /* default exit code */
dprintk("now wait for a relpy\n");
/* Wait for the status field of the procfs_read structure set ready. */
if (wait_event_interruptible_timeout(procfsq, r->status != 0, HZ) == 0) {
kprintf("ERROR: mckernel_procfs_read: timeout (1 sec).\n");
goto out;
}
/* Wake up and check the result. */
dprintk("mckernel_procfs_read: woke up. ret: %d, eof: %d\n", r->ret, r->eof);
if (r->ret > 0) {
*ppos += r->ret;
}
ret = r->ret;
out:
if(kern_buffer)
free_pages((uintptr_t)kern_buffer, order);
if(r)
kfree((void *)r);
return ret;
}
static loff_t
mckernel_procfs_lseek(struct file *file, loff_t offset, int orig)
{
switch (orig) {
case 0:
file->f_pos = offset;
break;
case 1:
file->f_pos += offset;
break;
default:
return -EINVAL;
}
return file->f_pos;
}
struct procfs_work {
void *os;
int msg;
int pid;
unsigned long arg;
struct work_struct work;
};
static void procfsm_work_main(struct work_struct *work0)
{
struct procfs_work *work = container_of(work0, struct procfs_work, work);
switch (work->msg) {
case SCD_MSG_PROCFS_TID_CREATE:
add_tid_entry(ihk_host_os_get_index(work->os), work->pid, work->arg);
break;
case SCD_MSG_PROCFS_TID_DELETE:
delete_tid_entry(ihk_host_os_get_index(work->os), work->pid, work->arg);
break;
default:
printk("%s: unknown work: msg: %d, pid: %d, arg: %lu)\n",
__FUNCTION__, work->msg, work->pid, work->arg);
break;
}
kfree(work);
return;
}
int procfsm_packet_handler(void *os, int msg, int pid, unsigned long arg)
{
struct procfs_work *work = NULL;
work = kzalloc(sizeof(*work), GFP_ATOMIC);
if (!work) {
printk("%s: kzalloc failed\n", __FUNCTION__);
return -1;
}
work->os = os;
work->msg = msg;
work->pid = pid;
work->arg = arg;
INIT_WORK(&work->work, &procfsm_work_main);
schedule_work(&work->work);
return 0;
}
static const struct file_operations mckernel_forward_ro = {
.llseek = mckernel_procfs_lseek,
.read = mckernel_procfs_read,
.write = NULL,
};
static const struct file_operations mckernel_forward = {
.llseek = mckernel_procfs_lseek,
.read = mckernel_procfs_read,
.write = mckernel_procfs_write,
};
static const struct procfs_entry tid_entry_stuff[] = {
// PROC_REG("auxv", S_IRUSR, NULL),
// PROC_REG("clear_refs", S_IWUSR, NULL),
// PROC_REG("cmdline", S_IRUGO, NULL),
// PROC_REG("comm", S_IRUGO|S_IWUSR, NULL),
// PROC_REG("environ", S_IRUSR, NULL),
// PROC_LNK("exe", mckernel_readlink),
// PROC_REG("limits", S_IRUSR|S_IWUSR, NULL),
// PROC_REG("maps", S_IRUGO, NULL),
PROC_REG("mem", S_IRUSR|S_IWUSR, NULL),
// PROC_REG("pagemap", S_IRUGO, NULL),
// PROC_REG("smaps", S_IRUGO, NULL),
PROC_REG("stat", S_IRUGO, NULL),
// PROC_REG("statm", S_IRUGO, NULL),
// PROC_REG("status", S_IRUGO, NULL),
// PROC_REG("syscall", S_IRUGO, NULL),
// PROC_REG("wchan", S_IRUGO, NULL),
PROC_TERM
};
static const struct procfs_entry pid_entry_stuff[] = {
PROC_REG("auxv", S_IRUSR, NULL),
PROC_REG("cgroup", S_IXUSR, NULL),
// PROC_REG("clear_refs", S_IWUSR, NULL),
PROC_REG("cmdline", S_IRUGO, NULL),
// PROC_REG("comm", S_IRUGO|S_IWUSR, NULL),
// PROC_REG("coredump_filter", S_IRUGO|S_IWUSR, NULL),
PROC_REG("cpuset", S_IXUSR, NULL),
// PROC_REG("environ", S_IRUSR, NULL),
// PROC_LNK("exe", mckernel_readlink),
// PROC_REG("limits", S_IRUSR|S_IWUSR, NULL),
PROC_REG("maps", S_IRUGO, NULL),
PROC_REG("mem", S_IRUSR|S_IWUSR, NULL),
PROC_REG("pagemap", S_IRUGO, NULL),
PROC_REG("smaps", S_IRUGO, NULL),
// PROC_REG("stat", S_IRUGO, NULL),
// PROC_REG("statm", S_IRUGO, NULL),
PROC_REG("status", S_IRUGO, NULL),
// PROC_REG("syscall", S_IRUGO, NULL),
PROC_DIR("task", S_IRUGO|S_IXUGO),
// PROC_REG("wchan", S_IRUGO, NULL),
PROC_TERM
};
static const struct procfs_entry base_entry_stuff[] = {
// PROC_REG("cmdline", S_IRUGO, NULL),
// PROC_REG("cpuinfo", S_IRUGO, NULL),
// PROC_REG("meminfo", S_IRUGO, NULL),
// PROC_REG("pagetypeinfo",S_IRUGO, NULL),
// PROC_REG("softirq", S_IRUGO, NULL),
PROC_REG("stat", S_IRUGO, NULL),
// PROC_REG("uptime", S_IRUGO, NULL),
// PROC_REG("version", S_IRUGO, NULL),
// PROC_REG("vmallocinfo",S_IRUSR, NULL),
// PROC_REG("vmstat", S_IRUGO, NULL),
// PROC_REG("zoneinfo", S_IRUGO, NULL),
PROC_TERM
};

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,73 @@
/**
* \file sysfs.h
* License details are found in the file LICENSE.
* \brief
* sysfs framework API definitions
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com> \par
* Copyright (C) 2016 RIKEN AICS
*/
/*
* HISTORY:
*/
#ifndef MCCTRL_SYSFS_H
#define MCCTRL_SYSFS_H
#define SYSFS_PATH_MAX 1024
/* for sysfs_unlinkf() */
#define SYSFS_UNLINK_KEEP_ANCESTOR 0x01
struct sysfsm_ops {
ssize_t (*show)(struct sysfsm_ops *ops, void *instance, void *buf,
size_t bufsize);
ssize_t (*store)(struct sysfsm_ops *ops, void *instance,
const void *buf, size_t bufsize);
void (*release)(struct sysfsm_ops *ops, void *instance);
};
struct sysfs_handle {
long handle;
};
typedef struct sysfs_handle sysfs_handle_t;
struct sysfsm_bitmap_param {
int nbits;
int padding;
void *ptr;
};
#define SYSFS_SPECIAL_OPS_MIN ((void *)1)
#define SYSFS_SPECIAL_OPS_MAX ((void *)1000)
#define SYSFS_SNOOPING_OPS_d32 ((void *)1)
#define SYSFS_SNOOPING_OPS_d64 ((void *)2)
#define SYSFS_SNOOPING_OPS_u32 ((void *)3)
#define SYSFS_SNOOPING_OPS_u64 ((void *)4)
#define SYSFS_SNOOPING_OPS_s ((void *)5)
#define SYSFS_SNOOPING_OPS_pbl ((void *)6)
#define SYSFS_SNOOPING_OPS_pb ((void *)7)
#define SYSFS_SNOOPING_OPS_u32K ((void *)8)
static inline int is_special_sysfs_ops(void *ops)
{
return (((long)SYSFS_SPECIAL_OPS_MIN <= (long)ops)
&& ((long)ops <= (long)SYSFS_SPECIAL_OPS_MAX));
}
extern int sysfsm_createf(ihk_os_t os, struct sysfsm_ops *ops, void *instance,
int mode, const char *fmt, ...);
extern int sysfsm_mkdirf(ihk_os_t os, sysfs_handle_t *dirhp,
const char *fmt, ...);
extern int sysfsm_symlinkf(ihk_os_t os, sysfs_handle_t targeth,
const char *fmt, ...);
extern int sysfsm_lookupf(ihk_os_t os, sysfs_handle_t *objhp,
const char *fmt, ...);
extern int sysfsm_unlinkf(ihk_os_t os, int flags, const char *fmt, ...);
extern void sysfsm_cleanup(ihk_os_t os);
extern void sysfsm_packet_handler(void *os, int msg, int err, long arg1,
long arg2);
#endif /* MCCTRL_SYSFS_H */

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,88 @@
/**
* \file sysfs_msg.h
* License details are found in the file LICENSE.
* \brief
* message declarations for sysfs framework
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com> \par
* Copyright (C) 2015 RIKEN AICS
*/
/*
* HISTORY:
*/
#ifndef MCKERNEL_SYSFS_MSG_H
#define MCKERNEL_SYSFS_MSG_H
#define SYSFS_PATH_MAX 1024
struct sysfs_req_create_param {
int mode;
int error;
long client_ops;
long client_instance;
char path[SYSFS_PATH_MAX];
int padding;
int busy;
}; /* struct sysfs_req_create_param */
#define SYSFS_SPECIAL_OPS_MIN ((void *)1)
#define SYSFS_SPECIAL_OPS_MAX ((void *)1000)
#define SYSFS_SNOOPING_OPS_d32 ((void *)1)
#define SYSFS_SNOOPING_OPS_d64 ((void *)2)
#define SYSFS_SNOOPING_OPS_u32 ((void *)3)
#define SYSFS_SNOOPING_OPS_u64 ((void *)4)
#define SYSFS_SNOOPING_OPS_s ((void *)5)
#define SYSFS_SNOOPING_OPS_pbl ((void *)6)
#define SYSFS_SNOOPING_OPS_pb ((void *)7)
#define SYSFS_SNOOPING_OPS_u32K ((void *)8)
struct sysfs_req_mkdir_param {
int error;
int padding;
long handle;
char path[SYSFS_PATH_MAX];
int padding2;
int busy;
}; /* struct sysfs_req_mkdir_param */
struct sysfs_req_symlink_param {
int error;
int padding;
long target;
char path[SYSFS_PATH_MAX];
int padding2;
int busy;
}; /* struct sysfs_req_symlink_param */
struct sysfs_req_lookup_param {
int error;
int padding;
long handle;
char path[SYSFS_PATH_MAX];
int padding2;
int busy;
}; /* struct sysfs_req_lookup_param */
/* for sysfs_req_unlink_param.flags */
#define SYSFS_UNLINK_KEEP_ANCESTOR 0x01
struct sysfs_req_unlink_param {
int flags;
int error;
char path[SYSFS_PATH_MAX];
int padding;
int busy;
}; /* struct sysfs_req_unlink_param */
struct sysfs_req_setup_param {
int error;
int padding;
long buf_rpa;
long bufsize;
char padding3[SYSFS_PATH_MAX];
int padding2;
int busy;
}; /* struct sysfs_req_setup_param */
#endif /* MCKERNEL_SYSFS_MSG_H */

View File

@ -0,0 +1,44 @@
ENABLE_MCOVERLAYFS=@ENABLE_MCOVERLAYFS@
RELEASE=$(shell uname -r)
MAJOR=$(shell echo ${RELEASE} | sed -e 's/^\([0-9]*\).*/\1/')
MINOR=$(shell echo ${RELEASE} | sed -e 's/^[0-9]*.\([0-9]*\).*/\1/')
PATCH=$(shell echo ${RELEASE} | sed -e 's/^[0-9]*.[0-9]*.\([0-9]*\).*/\1/')
LINUX_VERSION_CODE=$(shell expr \( ${MAJOR} \* 65536 \) + \( ${MINOR} \* 256 \) + ${PATCH})
RHEL_RELEASE_TMP=$(shell echo ${RELEASE} | sed -e 's/^[0-9]*.[0-9]*.[0-9]*-\([0-9]*\).*/\1/')
RHEL_RELEASE=$(shell if [ "${RELEASE}" == "${RHEL_RELEASE_TMP}" ]; then echo ""; else echo ${RHEL_RELEASE_TMP}; fi)
BUILD_MODULE_TMP=$(shell if [ "${RHEL_RELEASE}" == "" ]; then echo "org"; else echo "rhel"; fi)
BUILD_MODULE=none
ifeq ($(ENABLE_MCOVERLAYFS),yes)
ifeq ($(BUILD_MODULE_TMP),org)
ifeq ($(BUILD_MODULE),none)
BUILD_MODULE=$(shell if [ ${LINUX_VERSION_CODE} -ge 262144 -a ${LINUX_VERSION_CODE} -lt 262400 ]; then echo "linux-4.0.9"; else echo "none"; fi)
endif
ifeq ($(BUILD_MODULE),none)
BUILD_MODULE=$(shell if [ ${LINUX_VERSION_CODE} -ge 243680 -a ${LINUX_VERSION_CODE} -lt 263936 ]; then echo "linux-4.6.7"; else echo "none"; fi)
endif
endif
ifeq ($(BUILD_MODULE_TMP),rhel)
ifeq ($(BUILD_MODULE),none)
BUILD_MODULE=$(shell if [ ${LINUX_VERSION_CODE} -eq 199168 -a ${RHEL_RELEASE} -eq 327 ]; then echo "linux-3.10.0-327.36.1.el7"; else echo "none"; fi)
endif
endif
endif
.PHONY: clean install modules
modules:
ifneq ($(BUILD_MODULE),none)
@(cd $(BUILD_MODULE); make modules)
endif
clean:
@(cd linux-3.10.0-327.36.1.el7; make clean)
@(cd linux-4.0.9; make clean)
@(cd linux-4.6.7; make clean)
install:
ifneq ($(BUILD_MODULE),none)
@(cd $(BUILD_MODULE); make install)
endif

View File

@ -0,0 +1,21 @@
KDIR ?= @KDIR@
ARCH ?= @ARCH@
KMODDIR = @KMODDIR@
src = @abs_srcdir@
obj-m += mcoverlay.o
mcoverlay-y := copy_up.o dir.o inode.o readdir.o super.o
.PHONY: clean install modules
modules:
$(MAKE) -C $(KDIR) M=$(PWD) SUBDIRS=$(PWD) ARCH=$(ARCH) modules
clean:
$(RM) .*.cmd *.mod.c *.o *.ko* Module.symvers modules.order -r .tmp*
install:
mkdir -p -m 755 $(KMODDIR)
install -m 644 mcoverlay.ko $(KMODDIR)

View File

@ -0,0 +1,461 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/splice.h>
#include <linux/xattr.h>
#include <linux/security.h>
#include <linux/uaccess.h>
#include <linux/sched.h>
#include <linux/namei.h>
#include <linux/fdtable.h>
#include <linux/ratelimit.h>
#include "overlayfs.h"
#define OVL_COPY_UP_CHUNK_SIZE (1 << 20)
static unsigned ovl_check_copy_up = 1;
module_param_named(check_copy_up, ovl_check_copy_up, uint,
S_IWUSR | S_IRUGO);
MODULE_PARM_DESC(ovl_check_copy_up,
"Warn on copy-up when causing process also has a R/O fd open");
static int ovl_check_fd(const void *data, struct file *f, unsigned fd)
{
const struct dentry *dentry = data;
if (f->f_path.dentry == dentry)
pr_warn_ratelimited("overlayfs: Warning: Copying up %pD, but open R/O on fd %u which will cease to be coherent [pid=%d %s]\n",
f, fd, current->pid, current->comm);
return 0;
}
/*
* Check the fds open by this process and warn if something like the following
* scenario is about to occur:
*
* fd1 = open("foo", O_RDONLY);
* fd2 = open("foo", O_RDWR);
*/
static void ovl_do_check_copy_up(struct dentry *dentry)
{
if (ovl_check_copy_up)
iterate_fd(current->files, 0, ovl_check_fd, dentry);
}
int ovl_copy_xattr(struct dentry *old, struct dentry *new)
{
ssize_t list_size, size, value_size = 0;
char *buf, *name, *value = NULL;
int uninitialized_var(error);
if (!old->d_inode->i_op->getxattr ||
!new->d_inode->i_op->getxattr)
return 0;
list_size = vfs_listxattr(old, NULL, 0);
if (list_size <= 0) {
if (list_size == -EOPNOTSUPP)
return 0;
return list_size;
}
buf = kzalloc(list_size, GFP_KERNEL);
if (!buf)
return -ENOMEM;
list_size = vfs_listxattr(old, buf, list_size);
if (list_size <= 0) {
error = list_size;
goto out;
}
for (name = buf; name < (buf + list_size); name += strlen(name) + 1) {
retry:
size = vfs_getxattr(old, name, value, value_size);
if (size == -ERANGE)
size = vfs_getxattr(old, name, NULL, 0);
if (size < 0) {
error = size;
break;
}
if (size > value_size) {
void *new;
new = krealloc(value, size, GFP_KERNEL);
if (!new) {
error = -ENOMEM;
break;
}
value = new;
value_size = size;
goto retry;
}
error = vfs_setxattr(new, name, value, size, 0);
if (error)
break;
}
kfree(value);
out:
kfree(buf);
return error;
}
static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
{
struct file *old_file;
struct file *new_file;
loff_t old_pos = 0;
loff_t new_pos = 0;
int error = 0;
if (len == 0)
return 0;
old_file = ovl_path_open(old, O_RDONLY);
if (IS_ERR(old_file))
return PTR_ERR(old_file);
new_file = ovl_path_open(new, O_WRONLY);
if (IS_ERR(new_file)) {
error = PTR_ERR(new_file);
goto out_fput;
}
/* FIXME: copy up sparse files efficiently */
while (len) {
size_t this_len = OVL_COPY_UP_CHUNK_SIZE;
long bytes;
if (len < this_len)
this_len = len;
if (signal_pending_state(TASK_KILLABLE, current)) {
error = -EINTR;
break;
}
bytes = do_splice_direct(old_file, &old_pos,
new_file, &new_pos,
this_len, SPLICE_F_MOVE);
if (bytes <= 0) {
error = bytes;
break;
}
WARN_ON(old_pos != new_pos);
len -= bytes;
}
fput(new_file);
out_fput:
fput(old_file);
return error;
}
static char *ovl_read_symlink(struct dentry *realdentry)
{
int res;
char *buf;
struct inode *inode = realdentry->d_inode;
mm_segment_t old_fs;
res = -EINVAL;
if (!inode->i_op->readlink)
goto err;
res = -ENOMEM;
buf = (char *) __get_free_page(GFP_KERNEL);
if (!buf)
goto err;
old_fs = get_fs();
set_fs(get_ds());
/* The cast to a user pointer is valid due to the set_fs() */
res = inode->i_op->readlink(realdentry,
(char __user *)buf, PAGE_SIZE - 1);
set_fs(old_fs);
if (res < 0) {
free_page((unsigned long) buf);
goto err;
}
buf[res] = '\0';
return buf;
err:
return ERR_PTR(res);
}
static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
{
struct iattr attr = {
.ia_valid =
ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET,
.ia_atime = stat->atime,
.ia_mtime = stat->mtime,
};
return notify_change(upperdentry, &attr, NULL);
}
int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
{
int err = 0;
if (!S_ISLNK(stat->mode)) {
struct iattr attr = {
.ia_valid = ATTR_MODE,
.ia_mode = stat->mode,
};
err = notify_change(upperdentry, &attr, NULL);
}
if (!err) {
struct iattr attr = {
.ia_valid = ATTR_UID | ATTR_GID,
.ia_uid = stat->uid,
.ia_gid = stat->gid,
};
err = notify_change(upperdentry, &attr, NULL);
}
if (!err)
ovl_set_timestamps(upperdentry, stat);
return err;
}
static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
struct dentry *dentry, struct path *lowerpath,
struct kstat *stat, struct iattr *attr,
const char *link)
{
struct inode *wdir = workdir->d_inode;
struct inode *udir = upperdir->d_inode;
struct dentry *newdentry = NULL;
struct dentry *upper = NULL;
umode_t mode = stat->mode;
int err;
newdentry = ovl_lookup_temp(workdir, dentry);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out;
upper = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto out1;
/* Can't properly set mode on creation because of the umask */
stat->mode &= S_IFMT;
err = ovl_create_real(wdir, newdentry, stat, link, NULL, true);
stat->mode = mode;
if (err)
goto out2;
if (S_ISREG(stat->mode)) {
struct path upperpath;
ovl_path_upper(dentry, &upperpath);
BUG_ON(upperpath.dentry != NULL);
upperpath.dentry = newdentry;
err = ovl_copy_up_data(lowerpath, &upperpath, stat->size);
if (err)
goto out_cleanup;
}
err = ovl_copy_xattr(lowerpath->dentry, newdentry);
if (err)
goto out_cleanup;
mutex_lock(&newdentry->d_inode->i_mutex);
err = ovl_set_attr(newdentry, stat);
if (!err && attr)
err = notify_change(newdentry, attr, NULL);
mutex_unlock(&newdentry->d_inode->i_mutex);
if (err)
goto out_cleanup;
err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
if (err)
goto out_cleanup;
ovl_dentry_update(dentry, newdentry);
newdentry = NULL;
/*
* Non-directores become opaque when copied up.
*/
if (!S_ISDIR(stat->mode))
ovl_dentry_set_opaque(dentry, true);
out2:
dput(upper);
out1:
dput(newdentry);
out:
return err;
out_cleanup:
ovl_cleanup(wdir, newdentry);
goto out;
}
/*
* Copy up a single dentry
*
* Directory renames only allowed on "pure upper" (already created on
* upper filesystem, never copied up). Directories which are on lower or
* are merged may not be renamed. For these -EXDEV is returned and
* userspace has to deal with it. This means, when copying up a
* directory we can rely on it and ancestors being stable.
*
* Non-directory renames start with copy up of source if necessary. The
* actual rename will only proceed once the copy up was successful. Copy
* up uses upper parent i_mutex for exclusion. Since rename can change
* d_parent it is possible that the copy up will lock the old parent. At
* that point the file will have already been copied up anyway.
*/
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
struct path *lowerpath, struct kstat *stat,
struct iattr *attr)
{
struct dentry *workdir = ovl_workdir(dentry);
int err;
struct kstat pstat;
struct path parentpath;
struct dentry *upperdir;
struct dentry *upperdentry;
const struct cred *old_cred;
struct cred *override_cred;
char *link = NULL;
if (WARN_ON(!workdir))
return -EROFS;
ovl_do_check_copy_up(lowerpath->dentry);
ovl_path_upper(parent, &parentpath);
upperdir = parentpath.dentry;
err = vfs_getattr(&parentpath, &pstat);
if (err)
return err;
if (S_ISLNK(stat->mode)) {
link = ovl_read_symlink(lowerpath->dentry);
if (IS_ERR(link))
return PTR_ERR(link);
}
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_free_link;
override_cred->fsuid = stat->uid;
override_cred->fsgid = stat->gid;
/*
* CAP_SYS_ADMIN for copying up extended attributes
* CAP_DAC_OVERRIDE for create
* CAP_FOWNER for chmod, timestamp update
* CAP_FSETID for chmod
* CAP_CHOWN for chown
* CAP_MKNOD for mknod
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
cap_raise(override_cred->cap_effective, CAP_FSETID);
cap_raise(override_cred->cap_effective, CAP_CHOWN);
cap_raise(override_cred->cap_effective, CAP_MKNOD);
old_cred = override_creds(override_cred);
err = -EIO;
if (lock_rename(workdir, upperdir) != NULL) {
pr_err("overlayfs: failed to lock workdir+upperdir\n");
goto out_unlock;
}
upperdentry = ovl_dentry_upper(dentry);
if (upperdentry) {
unlock_rename(workdir, upperdir);
err = 0;
/* Raced with another copy-up? Do the setattr here */
if (attr) {
mutex_lock(&upperdentry->d_inode->i_mutex);
err = notify_change(upperdentry, attr, NULL);
mutex_unlock(&upperdentry->d_inode->i_mutex);
}
goto out_put_cred;
}
err = ovl_copy_up_locked(workdir, upperdir, dentry, lowerpath,
stat, attr, link);
if (!err) {
/* Restore timestamps on parent (best effort) */
ovl_set_timestamps(upperdir, &pstat);
}
out_unlock:
unlock_rename(workdir, upperdir);
out_put_cred:
revert_creds(old_cred);
put_cred(override_cred);
out_free_link:
if (link)
free_page((unsigned long) link);
return err;
}
int ovl_copy_up(struct dentry *dentry)
{
int err;
err = 0;
while (!err) {
struct dentry *next;
struct dentry *parent;
struct path lowerpath;
struct kstat stat;
enum ovl_path_type type = ovl_path_type(dentry);
if (OVL_TYPE_UPPER(type))
break;
next = dget(dentry);
/* find the topmost dentry not yet copied up */
for (;;) {
parent = dget_parent(next);
type = ovl_path_type(parent);
if (OVL_TYPE_UPPER(type))
break;
dput(next);
next = parent;
}
ovl_path_lower(next, &lowerpath);
err = vfs_getattr(&lowerpath, &stat);
if (!err)
err = ovl_copy_up_one(parent, next, &lowerpath, &stat, NULL);
dput(parent);
dput(next);
}
return err;
}

View File

@ -0,0 +1,972 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/xattr.h>
#include <linux/security.h>
#include <linux/cred.h>
#include "overlayfs.h"
void ovl_cleanup(struct inode *wdir, struct dentry *wdentry)
{
int err;
dget(wdentry);
if (S_ISDIR(wdentry->d_inode->i_mode))
err = ovl_do_rmdir(wdir, wdentry);
else
err = ovl_do_unlink(wdir, wdentry);
dput(wdentry);
if (err) {
pr_err("overlayfs: cleanup of '%pd2' failed (%i)\n",
wdentry, err);
}
}
struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry)
{
struct dentry *temp;
char name[20];
snprintf(name, sizeof(name), "#%lx", (unsigned long) dentry);
temp = lookup_one_len(name, workdir, strlen(name));
if (!IS_ERR(temp) && temp->d_inode) {
pr_err("overlayfs: workdir/%s already exists\n", name);
dput(temp);
temp = ERR_PTR(-EIO);
}
return temp;
}
/* caller holds i_mutex on workdir */
static struct dentry *ovl_whiteout(struct dentry *workdir,
struct dentry *dentry)
{
int err;
struct dentry *whiteout;
struct inode *wdir = workdir->d_inode;
whiteout = ovl_lookup_temp(workdir, dentry);
if (IS_ERR(whiteout))
return whiteout;
err = ovl_do_whiteout(wdir, whiteout);
if (err) {
dput(whiteout);
whiteout = ERR_PTR(err);
}
return whiteout;
}
int ovl_create_real(struct inode *dir, struct dentry *newdentry,
struct kstat *stat, const char *link,
struct dentry *hardlink, bool debug)
{
int err;
if (newdentry->d_inode)
return -ESTALE;
if (hardlink) {
err = ovl_do_link(hardlink, dir, newdentry, debug);
} else {
switch (stat->mode & S_IFMT) {
case S_IFREG:
err = ovl_do_create(dir, newdentry, stat->mode, debug);
break;
case S_IFDIR:
err = ovl_do_mkdir(dir, newdentry, stat->mode, debug);
break;
case S_IFCHR:
case S_IFBLK:
case S_IFIFO:
case S_IFSOCK:
err = ovl_do_mknod(dir, newdentry,
stat->mode, stat->rdev, debug);
break;
case S_IFLNK:
err = ovl_do_symlink(dir, newdentry, link, debug);
break;
default:
err = -EPERM;
}
}
if (!err && WARN_ON(!newdentry->d_inode)) {
/*
* Not quite sure if non-instantiated dentry is legal or not.
* VFS doesn't seem to care so check and warn here.
*/
err = -ENOENT;
}
return err;
}
static int ovl_set_opaque(struct dentry *upperdentry)
{
return ovl_do_setxattr(upperdentry, OVL_XATTR_OPAQUE, "y", 1, 0);
}
static void ovl_remove_opaque(struct dentry *upperdentry)
{
int err;
err = ovl_do_removexattr(upperdentry, OVL_XATTR_OPAQUE);
if (err) {
pr_warn("overlayfs: failed to remove opaque from '%s' (%i)\n",
upperdentry->d_name.name, err);
}
}
static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
int err;
enum ovl_path_type type;
struct path realpath;
type = ovl_path_real(dentry, &realpath);
err = vfs_getattr(&realpath, stat);
if (err)
return err;
stat->dev = dentry->d_sb->s_dev;
stat->ino = dentry->d_inode->i_ino;
/*
* It's probably not worth it to count subdirs to get the
* correct link count. nlink=1 seems to pacify 'find' and
* other utilities.
*/
if (OVL_TYPE_MERGE(type))
stat->nlink = 1;
return 0;
}
static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
struct kstat *stat, const char *link,
struct dentry *hardlink)
{
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *udir = upperdir->d_inode;
struct dentry *newdentry;
int err;
mutex_lock_nested(&udir->i_mutex, I_MUTEX_PARENT);
newdentry = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out_unlock;
err = ovl_create_real(udir, newdentry, stat, link, hardlink, false);
if (err)
goto out_dput;
ovl_dentry_version_inc(dentry->d_parent);
ovl_dentry_update(dentry, newdentry);
ovl_copyattr(newdentry->d_inode, inode);
d_instantiate(dentry, inode);
newdentry = NULL;
out_dput:
dput(newdentry);
out_unlock:
mutex_unlock(&udir->i_mutex);
return err;
}
static int ovl_lock_rename_workdir(struct dentry *workdir,
struct dentry *upperdir)
{
/* Workdir should not be the same as upperdir */
if (workdir == upperdir)
goto err;
/* Workdir should not be subdir of upperdir and vice versa */
if (lock_rename(workdir, upperdir) != NULL)
goto err_unlock;
return 0;
err_unlock:
unlock_rename(workdir, upperdir);
err:
pr_err("overlayfs: failed to lock workdir+upperdir\n");
return -EIO;
}
static struct dentry *ovl_clear_empty(struct dentry *dentry,
struct list_head *list)
{
struct dentry *workdir = ovl_workdir(dentry);
struct inode *wdir = workdir->d_inode;
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *udir = upperdir->d_inode;
struct path upperpath;
struct dentry *upper;
struct dentry *opaquedir;
struct kstat stat;
int err;
if (WARN_ON(!workdir))
return ERR_PTR(-EROFS);
err = ovl_lock_rename_workdir(workdir, upperdir);
if (err)
goto out;
ovl_path_upper(dentry, &upperpath);
err = vfs_getattr(&upperpath, &stat);
if (err)
goto out_unlock;
err = -ESTALE;
if (!S_ISDIR(stat.mode))
goto out_unlock;
upper = upperpath.dentry;
if (upper->d_parent->d_inode != udir)
goto out_unlock;
opaquedir = ovl_lookup_temp(workdir, dentry);
err = PTR_ERR(opaquedir);
if (IS_ERR(opaquedir))
goto out_unlock;
err = ovl_create_real(wdir, opaquedir, &stat, NULL, NULL, true);
if (err)
goto out_dput;
err = ovl_copy_xattr(upper, opaquedir);
if (err)
goto out_cleanup;
err = ovl_set_opaque(opaquedir);
if (err)
goto out_cleanup;
mutex_lock(&opaquedir->d_inode->i_mutex);
err = ovl_set_attr(opaquedir, &stat);
mutex_unlock(&opaquedir->d_inode->i_mutex);
if (err)
goto out_cleanup;
err = ovl_do_rename(wdir, opaquedir, udir, upper, RENAME_EXCHANGE);
if (err)
goto out_cleanup;
ovl_cleanup_whiteouts(upper, list);
ovl_cleanup(wdir, upper);
unlock_rename(workdir, upperdir);
/* dentry's upper doesn't match now, get rid of it */
d_drop(dentry);
return opaquedir;
out_cleanup:
ovl_cleanup(wdir, opaquedir);
out_dput:
dput(opaquedir);
out_unlock:
unlock_rename(workdir, upperdir);
out:
return ERR_PTR(err);
}
static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry)
{
int err;
struct dentry *ret = NULL;
LIST_HEAD(list);
err = ovl_check_empty_dir(dentry, &list);
if (err)
ret = ERR_PTR(err);
else {
/*
* If no upperdentry then skip clearing whiteouts.
*
* Can race with copy-up, since we don't hold the upperdir
* mutex. Doesn't matter, since copy-up can't create a
* non-empty directory from an empty one.
*/
if (ovl_dentry_upper(dentry))
ret = ovl_clear_empty(dentry, &list);
}
ovl_cache_free(&list);
return ret;
}
static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
struct kstat *stat, const char *link,
struct dentry *hardlink)
{
struct dentry *workdir = ovl_workdir(dentry);
struct inode *wdir = workdir->d_inode;
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *udir = upperdir->d_inode;
struct dentry *upper;
struct dentry *newdentry;
int err;
if (WARN_ON(!workdir))
return -EROFS;
err = ovl_lock_rename_workdir(workdir, upperdir);
if (err)
goto out;
newdentry = ovl_lookup_temp(workdir, dentry);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out_unlock;
upper = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto out_dput;
err = ovl_create_real(wdir, newdentry, stat, link, hardlink, true);
if (err)
goto out_dput2;
if (S_ISDIR(stat->mode)) {
err = ovl_set_opaque(newdentry);
if (err)
goto out_cleanup;
err = ovl_do_rename(wdir, newdentry, udir, upper,
RENAME_EXCHANGE);
if (err)
goto out_cleanup;
ovl_cleanup(wdir, upper);
} else {
err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
if (err)
goto out_cleanup;
}
ovl_dentry_version_inc(dentry->d_parent);
ovl_dentry_update(dentry, newdentry);
ovl_copyattr(newdentry->d_inode, inode);
d_instantiate(dentry, inode);
newdentry = NULL;
out_dput2:
dput(upper);
out_dput:
dput(newdentry);
out_unlock:
unlock_rename(workdir, upperdir);
out:
return err;
out_cleanup:
ovl_cleanup(wdir, newdentry);
goto out_dput2;
}
static int ovl_create_or_link(struct dentry *dentry, int mode, dev_t rdev,
const char *link, struct dentry *hardlink)
{
int err;
struct inode *inode;
struct kstat stat = {
.mode = mode,
.rdev = rdev,
};
err = -ENOMEM;
inode = ovl_new_inode(dentry->d_sb, mode, dentry->d_fsdata);
if (!inode)
goto out;
err = ovl_copy_up(dentry->d_parent);
if (err)
goto out_iput;
if (!ovl_dentry_is_opaque(dentry)) {
err = ovl_create_upper(dentry, inode, &stat, link, hardlink);
} else {
const struct cred *old_cred;
struct cred *override_cred;
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_iput;
/*
* CAP_SYS_ADMIN for setting opaque xattr
* CAP_DAC_OVERRIDE for create in workdir, rename
* CAP_FOWNER for removing whiteout from sticky dir
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
old_cred = override_creds(override_cred);
err = ovl_create_over_whiteout(dentry, inode, &stat, link,
hardlink);
revert_creds(old_cred);
put_cred(override_cred);
}
if (!err)
inode = NULL;
out_iput:
iput(inode);
out:
return err;
}
static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
const char *link)
{
int err;
err = ovl_want_write(dentry);
if (!err) {
err = ovl_create_or_link(dentry, mode, rdev, link, NULL);
ovl_drop_write(dentry);
}
return err;
}
static int ovl_create(struct inode *dir, struct dentry *dentry, umode_t mode,
bool excl)
{
return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL);
}
static int ovl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL);
}
static int ovl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
dev_t rdev)
{
/* Don't allow creation of "whiteout" on overlay */
if (S_ISCHR(mode) && rdev == WHITEOUT_DEV)
return -EPERM;
return ovl_create_object(dentry, mode, rdev, NULL);
}
static int ovl_symlink(struct inode *dir, struct dentry *dentry,
const char *link)
{
return ovl_create_object(dentry, S_IFLNK, 0, link);
}
static int ovl_link(struct dentry *old, struct inode *newdir,
struct dentry *new)
{
int err;
struct dentry *upper;
err = ovl_want_write(old);
if (err)
goto out;
err = ovl_copy_up(old);
if (err)
goto out_drop_write;
upper = ovl_dentry_upper(old);
err = ovl_create_or_link(new, upper->d_inode->i_mode, 0, NULL, upper);
out_drop_write:
ovl_drop_write(old);
out:
return err;
}
static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir)
{
struct dentry *workdir = ovl_workdir(dentry);
struct inode *wdir = workdir->d_inode;
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *udir = upperdir->d_inode;
struct dentry *whiteout;
struct dentry *upper;
struct dentry *opaquedir = NULL;
int err;
int flags = 0;
if (WARN_ON(!workdir))
return -EROFS;
if (is_dir) {
if (OVL_TYPE_MERGE_OR_LOWER(ovl_path_type(dentry))) {
opaquedir = ovl_check_empty_and_clear(dentry);
err = PTR_ERR(opaquedir);
if (IS_ERR(opaquedir))
goto out;
} else {
LIST_HEAD(list);
/*
* When removing an empty opaque directory, then it
* makes no sense to replace it with an exact replica of
* itself. But emptiness still needs to be checked.
*/
err = ovl_check_empty_dir(dentry, &list);
ovl_cache_free(&list);
if (err)
goto out;
}
}
err = ovl_lock_rename_workdir(workdir, upperdir);
if (err)
goto out_dput;
upper = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto out_unlock;
err = -ESTALE;
if ((opaquedir && upper != opaquedir) ||
(!opaquedir && ovl_dentry_upper(dentry) &&
upper != ovl_dentry_upper(dentry))) {
goto out_dput_upper;
}
whiteout = ovl_whiteout(workdir, dentry);
err = PTR_ERR(whiteout);
if (IS_ERR(whiteout))
goto out_dput_upper;
if (d_is_dir(upper))
flags = RENAME_EXCHANGE;
err = ovl_do_rename(wdir, whiteout, udir, upper, flags);
if (err)
goto kill_whiteout;
if (flags)
ovl_cleanup(wdir, upper);
ovl_dentry_version_inc(dentry->d_parent);
out_d_drop:
d_drop(dentry);
dput(whiteout);
out_dput_upper:
dput(upper);
out_unlock:
unlock_rename(workdir, upperdir);
out_dput:
dput(opaquedir);
out:
return err;
kill_whiteout:
ovl_cleanup(wdir, whiteout);
goto out_d_drop;
}
static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
{
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *dir = upperdir->d_inode;
struct dentry *upper;
int err;
mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
upper = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto out_unlock;
err = -ESTALE;
if (upper == ovl_dentry_upper(dentry)) {
if (is_dir)
err = vfs_rmdir(dir, upper);
else
err = vfs_unlink(dir, upper, NULL);
ovl_dentry_version_inc(dentry->d_parent);
}
dput(upper);
/*
* Keeping this dentry hashed would mean having to release
* upperpath/lowerpath, which could only be done if we are the
* sole user of this dentry. Too tricky... Just unhash for
* now.
*/
if (!err)
d_drop(dentry);
out_unlock:
mutex_unlock(&dir->i_mutex);
return err;
}
static inline int ovl_check_sticky(struct dentry *dentry)
{
struct inode *dir = ovl_dentry_real(dentry->d_parent)->d_inode;
struct inode *inode = ovl_dentry_real(dentry)->d_inode;
if (check_sticky(dir, inode))
return -EPERM;
return 0;
}
static int ovl_do_remove(struct dentry *dentry, bool is_dir)
{
enum ovl_path_type type;
int err;
err = ovl_check_sticky(dentry);
if (err)
goto out;
err = ovl_want_write(dentry);
if (err)
goto out;
err = ovl_copy_up(dentry->d_parent);
if (err)
goto out_drop_write;
type = ovl_path_type(dentry);
if (OVL_TYPE_PURE_UPPER(type)) {
err = ovl_remove_upper(dentry, is_dir);
} else {
const struct cred *old_cred;
struct cred *override_cred;
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_drop_write;
/*
* CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
* CAP_DAC_OVERRIDE for create in workdir, rename
* CAP_FOWNER for removing whiteout from sticky dir
* CAP_FSETID for chmod of opaque dir
* CAP_CHOWN for chown of opaque dir
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
cap_raise(override_cred->cap_effective, CAP_FSETID);
cap_raise(override_cred->cap_effective, CAP_CHOWN);
old_cred = override_creds(override_cred);
err = ovl_remove_and_whiteout(dentry, is_dir);
revert_creds(old_cred);
put_cred(override_cred);
}
out_drop_write:
ovl_drop_write(dentry);
out:
return err;
}
static int ovl_unlink(struct inode *dir, struct dentry *dentry)
{
return ovl_do_remove(dentry, false);
}
static int ovl_rmdir(struct inode *dir, struct dentry *dentry)
{
return ovl_do_remove(dentry, true);
}
static int ovl_rename2(struct inode *olddir, struct dentry *old,
struct inode *newdir, struct dentry *new,
unsigned int flags)
{
int err;
enum ovl_path_type old_type;
enum ovl_path_type new_type;
struct dentry *old_upperdir;
struct dentry *new_upperdir;
struct dentry *olddentry;
struct dentry *newdentry;
struct dentry *trap;
bool old_opaque;
bool new_opaque;
bool new_create = false;
bool cleanup_whiteout = false;
bool overwrite = !(flags & RENAME_EXCHANGE);
bool is_dir = S_ISDIR(old->d_inode->i_mode);
bool new_is_dir = false;
struct dentry *opaquedir = NULL;
const struct cred *old_cred = NULL;
struct cred *override_cred = NULL;
err = -EINVAL;
if (flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE))
goto out;
flags &= ~RENAME_NOREPLACE;
err = ovl_check_sticky(old);
if (err)
goto out;
/* Don't copy up directory trees */
old_type = ovl_path_type(old);
err = -EXDEV;
if (OVL_TYPE_MERGE_OR_LOWER(old_type) && is_dir)
goto out;
if (new->d_inode) {
err = ovl_check_sticky(new);
if (err)
goto out;
if (S_ISDIR(new->d_inode->i_mode))
new_is_dir = true;
new_type = ovl_path_type(new);
err = -EXDEV;
if (!overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir)
goto out;
err = 0;
if (!OVL_TYPE_UPPER(new_type) && !OVL_TYPE_UPPER(old_type)) {
if (ovl_dentry_lower(old)->d_inode ==
ovl_dentry_lower(new)->d_inode)
goto out;
}
if (OVL_TYPE_UPPER(new_type) && OVL_TYPE_UPPER(old_type)) {
if (ovl_dentry_upper(old)->d_inode ==
ovl_dentry_upper(new)->d_inode)
goto out;
}
} else {
if (ovl_dentry_is_opaque(new))
new_type = __OVL_PATH_UPPER;
else
new_type = __OVL_PATH_UPPER | __OVL_PATH_PURE;
}
err = ovl_want_write(old);
if (err)
goto out;
err = ovl_copy_up(old);
if (err)
goto out_drop_write;
err = ovl_copy_up(new->d_parent);
if (err)
goto out_drop_write;
if (!overwrite) {
err = ovl_copy_up(new);
if (err)
goto out_drop_write;
}
old_opaque = !OVL_TYPE_PURE_UPPER(old_type);
new_opaque = !OVL_TYPE_PURE_UPPER(new_type);
if (old_opaque || new_opaque) {
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_drop_write;
/*
* CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
* CAP_DAC_OVERRIDE for create in workdir
* CAP_FOWNER for removing whiteout from sticky dir
* CAP_FSETID for chmod of opaque dir
* CAP_CHOWN for chown of opaque dir
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
cap_raise(override_cred->cap_effective, CAP_FSETID);
cap_raise(override_cred->cap_effective, CAP_CHOWN);
old_cred = override_creds(override_cred);
}
if (overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir) {
opaquedir = ovl_check_empty_and_clear(new);
err = PTR_ERR(opaquedir);
if (IS_ERR(opaquedir)) {
opaquedir = NULL;
goto out_revert_creds;
}
}
if (overwrite) {
if (old_opaque) {
if (new->d_inode || !new_opaque) {
/* Whiteout source */
flags |= RENAME_WHITEOUT;
} else {
/* Switch whiteouts */
flags |= RENAME_EXCHANGE;
}
} else if (is_dir && !new->d_inode && new_opaque) {
flags |= RENAME_EXCHANGE;
cleanup_whiteout = true;
}
}
old_upperdir = ovl_dentry_upper(old->d_parent);
new_upperdir = ovl_dentry_upper(new->d_parent);
trap = lock_rename(new_upperdir, old_upperdir);
olddentry = lookup_one_len(old->d_name.name, old_upperdir,
old->d_name.len);
err = PTR_ERR(olddentry);
if (IS_ERR(olddentry))
goto out_unlock;
err = -ESTALE;
if (olddentry != ovl_dentry_upper(old))
goto out_dput_old;
newdentry = lookup_one_len(new->d_name.name, new_upperdir,
new->d_name.len);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out_dput_old;
err = -ESTALE;
if (ovl_dentry_upper(new)) {
if (opaquedir) {
if (newdentry != opaquedir)
goto out_dput;
} else {
if (newdentry != ovl_dentry_upper(new))
goto out_dput;
}
} else {
new_create = true;
if (!d_is_negative(newdentry) &&
(!new_opaque || !ovl_is_whiteout(newdentry)))
goto out_dput;
}
if (olddentry == trap)
goto out_dput;
if (newdentry == trap)
goto out_dput;
if (is_dir && !old_opaque && new_opaque) {
err = ovl_set_opaque(olddentry);
if (err)
goto out_dput;
}
if (!overwrite && new_is_dir && old_opaque && !new_opaque) {
err = ovl_set_opaque(newdentry);
if (err)
goto out_dput;
}
if (old_opaque || new_opaque) {
err = ovl_do_rename(old_upperdir->d_inode, olddentry,
new_upperdir->d_inode, newdentry,
flags);
} else {
/* No debug for the plain case */
BUG_ON(flags & ~RENAME_EXCHANGE);
err = vfs_rename(old_upperdir->d_inode, olddentry,
new_upperdir->d_inode, newdentry,
NULL, flags);
}
if (err) {
if (is_dir && !old_opaque && new_opaque)
ovl_remove_opaque(olddentry);
if (!overwrite && new_is_dir && old_opaque && !new_opaque)
ovl_remove_opaque(newdentry);
goto out_dput;
}
if (is_dir && old_opaque && !new_opaque)
ovl_remove_opaque(olddentry);
if (!overwrite && new_is_dir && !old_opaque && new_opaque)
ovl_remove_opaque(newdentry);
if (old_opaque != new_opaque) {
ovl_dentry_set_opaque(old, new_opaque);
if (!overwrite)
ovl_dentry_set_opaque(new, old_opaque);
}
if (cleanup_whiteout)
ovl_cleanup(old_upperdir->d_inode, newdentry);
ovl_dentry_version_inc(old->d_parent);
ovl_dentry_version_inc(new->d_parent);
out_dput:
dput(newdentry);
out_dput_old:
dput(olddentry);
out_unlock:
unlock_rename(new_upperdir, old_upperdir);
out_revert_creds:
if (old_opaque || new_opaque) {
revert_creds(old_cred);
put_cred(override_cred);
}
out_drop_write:
ovl_drop_write(old);
out:
dput(opaquedir);
return err;
}
static int ovl_rename(struct inode *olddir, struct dentry *old,
struct inode *newdir, struct dentry *new)
{
return ovl_rename2(olddir, old, newdir, new, 0);
}
const struct inode_operations_wrapper ovl_dir_inode_operations = {
.ops = {
.lookup = ovl_lookup,
.mkdir = ovl_mkdir,
.symlink = ovl_symlink,
.unlink = ovl_unlink,
.rmdir = ovl_rmdir,
.rename = ovl_rename,
.link = ovl_link,
.setattr = ovl_setattr,
.create = ovl_create,
.mknod = ovl_mknod,
.permission = ovl_permission,
.getattr = ovl_dir_getattr,
.setxattr = ovl_setxattr,
.getxattr = ovl_getxattr,
.listxattr = ovl_listxattr,
.removexattr = ovl_removexattr,
},
.rename2 = ovl_rename2,
};

View File

@ -0,0 +1,442 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/xattr.h>
#include "overlayfs.h"
static int ovl_copy_up_last(struct dentry *dentry, struct iattr *attr,
bool no_data)
{
int err;
struct dentry *parent;
struct kstat stat;
struct path lowerpath;
parent = dget_parent(dentry);
err = ovl_copy_up(parent);
if (err)
goto out_dput_parent;
ovl_path_lower(dentry, &lowerpath);
err = vfs_getattr(&lowerpath, &stat);
if (err)
goto out_dput_parent;
if (no_data)
stat.size = 0;
err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat, attr);
out_dput_parent:
dput(parent);
return err;
}
int ovl_setattr(struct dentry *dentry, struct iattr *attr)
{
int err;
struct dentry *upperdentry;
err = ovl_want_write(dentry);
if (err)
goto out;
err = ovl_copy_up(dentry);
if (!err) {
upperdentry = ovl_dentry_upper(dentry);
mutex_lock(&upperdentry->d_inode->i_mutex);
err = notify_change(upperdentry, attr, NULL);
mutex_unlock(&upperdentry->d_inode->i_mutex);
}
ovl_drop_write(dentry);
out:
return err;
}
static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
struct path realpath;
ovl_path_real(dentry, &realpath);
return vfs_getattr(&realpath, stat);
}
int ovl_permission(struct inode *inode, int mask)
{
struct ovl_entry *oe;
struct dentry *alias = NULL;
struct inode *realinode;
struct dentry *realdentry;
bool is_upper;
int err;
if (S_ISDIR(inode->i_mode)) {
oe = inode->i_private;
} else if (mask & MAY_NOT_BLOCK) {
return -ECHILD;
} else {
/*
* For non-directories find an alias and get the info
* from there.
*/
alias = d_find_any_alias(inode);
if (WARN_ON(!alias))
return -ENOENT;
oe = alias->d_fsdata;
}
realdentry = ovl_entry_real(oe, &is_upper);
/* Careful in RCU walk mode */
realinode = ACCESS_ONCE(realdentry->d_inode);
if (!realinode) {
WARN_ON(!(mask & MAY_NOT_BLOCK));
err = -ENOENT;
goto out_dput;
}
if (mask & MAY_WRITE) {
umode_t mode = realinode->i_mode;
/*
* Writes will always be redirected to upper layer, so
* ignore lower layer being read-only.
*
* If the overlay itself is read-only then proceed
* with the permission check, don't return EROFS.
* This will only happen if this is the lower layer of
* another overlayfs.
*
* If upper fs becomes read-only after the overlay was
* constructed return EROFS to prevent modification of
* upper layer.
*/
err = -EROFS;
if (is_upper && !IS_RDONLY(inode) && IS_RDONLY(realinode) &&
(S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
goto out_dput;
}
err = __inode_permission(realinode, mask);
out_dput:
dput(alias);
return err;
}
struct ovl_link_data {
struct dentry *realdentry;
void *cookie;
};
static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd)
{
void *ret;
struct dentry *realdentry;
struct inode *realinode;
struct ovl_link_data *data = NULL;
realdentry = ovl_dentry_real(dentry);
realinode = realdentry->d_inode;
if (WARN_ON(!realinode->i_op->follow_link))
return ERR_PTR(-EPERM);
if (realinode->i_op->put_link) {
data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL);
if (!data)
return ERR_PTR(-ENOMEM);
data->realdentry = realdentry;
}
ret = realinode->i_op->follow_link(realdentry, nd);
if (IS_ERR(ret)) {
kfree(data);
return ret;
}
if (data)
data->cookie = ret;
return data;
}
static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
{
struct inode *realinode;
struct ovl_link_data *data = c;
if (!data)
return;
realinode = data->realdentry->d_inode;
realinode->i_op->put_link(data->realdentry, nd, data->cookie);
kfree(data);
}
static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
{
struct path realpath;
struct inode *realinode;
ovl_path_real(dentry, &realpath);
realinode = realpath.dentry->d_inode;
if (!realinode->i_op->readlink)
return -EINVAL;
touch_atime(&realpath);
return realinode->i_op->readlink(realpath.dentry, buf, bufsiz);
}
static bool ovl_is_private_xattr(const char *name)
{
return strncmp(name, OVL_XATTR_PRE_NAME, OVL_XATTR_PRE_LEN) == 0;
}
int ovl_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags)
{
int err;
struct dentry *upperdentry;
err = ovl_want_write(dentry);
if (err)
goto out;
err = -EPERM;
if (ovl_is_private_xattr(name))
goto out_drop_write;
err = ovl_copy_up(dentry);
if (err)
goto out_drop_write;
upperdentry = ovl_dentry_upper(dentry);
err = vfs_setxattr(upperdentry, name, value, size, flags);
out_drop_write:
ovl_drop_write(dentry);
out:
return err;
}
static bool ovl_need_xattr_filter(struct dentry *dentry,
enum ovl_path_type type)
{
if ((type & (__OVL_PATH_PURE | __OVL_PATH_UPPER)) == __OVL_PATH_UPPER)
return S_ISDIR(dentry->d_inode->i_mode);
else
return false;
}
ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
void *value, size_t size)
{
struct path realpath;
enum ovl_path_type type = ovl_path_real(dentry, &realpath);
if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
return -ENODATA;
return vfs_getxattr(realpath.dentry, name, value, size);
}
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
{
struct path realpath;
enum ovl_path_type type = ovl_path_real(dentry, &realpath);
ssize_t res;
int off;
res = vfs_listxattr(realpath.dentry, list, size);
if (res <= 0 || size == 0)
return res;
if (!ovl_need_xattr_filter(dentry, type))
return res;
/* filter out private xattrs */
for (off = 0; off < res;) {
char *s = list + off;
size_t slen = strlen(s) + 1;
BUG_ON(off + slen > res);
if (ovl_is_private_xattr(s)) {
res -= slen;
memmove(s, s + slen, res - off);
} else {
off += slen;
}
}
return res;
}
int ovl_removexattr(struct dentry *dentry, const char *name)
{
int err;
struct path realpath;
enum ovl_path_type type = ovl_path_real(dentry, &realpath);
err = ovl_want_write(dentry);
if (err)
goto out;
err = -ENODATA;
if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
goto out_drop_write;
if (!OVL_TYPE_UPPER(type)) {
err = vfs_getxattr(realpath.dentry, name, NULL, 0);
if (err < 0)
goto out_drop_write;
err = ovl_copy_up(dentry);
if (err)
goto out_drop_write;
ovl_path_upper(dentry, &realpath);
}
err = vfs_removexattr(realpath.dentry, name);
out_drop_write:
ovl_drop_write(dentry);
out:
return err;
}
static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type,
struct dentry *realdentry)
{
if (OVL_TYPE_UPPER(type))
return false;
if (special_file(realdentry->d_inode->i_mode))
return false;
if (!(OPEN_FMODE(flags) & FMODE_WRITE) && !(flags & O_TRUNC))
return false;
return true;
}
static int ovl_dentry_open(struct dentry *dentry, struct file *file,
const struct cred *cred)
{
int err;
struct path realpath;
enum ovl_path_type type;
bool want_write = false;
type = ovl_path_real(dentry, &realpath);
if (!ovl_is_nocopyupw(dentry)) {
if (ovl_open_need_copy_up(file->f_flags, type,
realpath.dentry)) {
want_write = true;
err = ovl_want_write(dentry);
if (err)
goto out;
if (file->f_flags & O_TRUNC)
err = ovl_copy_up_last(dentry, NULL, true);
else
err = ovl_copy_up(dentry);
if (err)
goto out_drop_write;
ovl_path_upper(dentry, &realpath);
}
}
err = vfs_open(&realpath, file, cred);
out_drop_write:
if (want_write)
ovl_drop_write(dentry);
out:
return err;
}
static const struct inode_operations_wrapper ovl_file_inode_operations = {
.ops = {
.setattr = ovl_setattr,
.permission = ovl_permission,
.getattr = ovl_getattr,
.setxattr = ovl_setxattr,
.getxattr = ovl_getxattr,
.listxattr = ovl_listxattr,
.removexattr = ovl_removexattr,
},
.dentry_open = ovl_dentry_open,
};
static const struct inode_operations ovl_symlink_inode_operations = {
.setattr = ovl_setattr,
.follow_link = ovl_follow_link,
.put_link = ovl_put_link,
.readlink = ovl_readlink,
.getattr = ovl_getattr,
.setxattr = ovl_setxattr,
.getxattr = ovl_getxattr,
.listxattr = ovl_listxattr,
.removexattr = ovl_removexattr,
};
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
struct ovl_entry *oe)
{
struct inode *inode;
inode = new_inode(sb);
if (!inode)
return NULL;
mode &= S_IFMT;
inode->i_ino = get_next_ino();
inode->i_mode = mode;
inode->i_flags |= S_NOATIME | S_NOCMTIME;
switch (mode) {
case S_IFDIR:
inode->i_private = oe;
inode->i_op = &ovl_dir_inode_operations.ops;
inode->i_fop = &ovl_dir_operations;
inode->i_flags |= S_IOPS_WRAPPER;
break;
case S_IFLNK:
inode->i_op = &ovl_symlink_inode_operations;
break;
case S_IFREG:
case S_IFSOCK:
case S_IFBLK:
case S_IFCHR:
case S_IFIFO:
inode->i_op = &ovl_file_inode_operations.ops;
inode->i_flags |= S_IOPS_WRAPPER;
break;
default:
WARN(1, "illegal file type: %i\n", mode);
iput(inode);
inode = NULL;
}
return inode;
}

View File

@ -0,0 +1,200 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/kernel.h>
struct ovl_entry;
enum ovl_path_type {
__OVL_PATH_PURE = (1 << 0),
__OVL_PATH_UPPER = (1 << 1),
__OVL_PATH_MERGE = (1 << 2),
};
#define OVL_TYPE_UPPER(type) ((type) & __OVL_PATH_UPPER)
#define OVL_TYPE_MERGE(type) ((type) & __OVL_PATH_MERGE)
#define OVL_TYPE_PURE_UPPER(type) ((type) & __OVL_PATH_PURE)
#define OVL_TYPE_MERGE_OR_LOWER(type) \
(OVL_TYPE_MERGE(type) || !OVL_TYPE_UPPER(type))
#define OVL_XATTR_PRE_NAME "trusted.overlay."
#define OVL_XATTR_PRE_LEN 16
#define OVL_XATTR_OPAQUE OVL_XATTR_PRE_NAME"opaque"
static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry)
{
int err = vfs_rmdir(dir, dentry);
pr_debug("rmdir(%pd2) = %i\n", dentry, err);
return err;
}
static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry)
{
int err = vfs_unlink(dir, dentry, NULL);
pr_debug("unlink(%pd2) = %i\n", dentry, err);
return err;
}
static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir,
struct dentry *new_dentry, bool debug)
{
int err = vfs_link(old_dentry, dir, new_dentry, NULL);
if (debug) {
pr_debug("link(%pd2, %pd2) = %i\n",
old_dentry, new_dentry, err);
}
return err;
}
static inline int ovl_do_create(struct inode *dir, struct dentry *dentry,
umode_t mode, bool debug)
{
int err = vfs_create(dir, dentry, mode, true);
if (debug)
pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err);
return err;
}
static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry,
umode_t mode, bool debug)
{
int err = vfs_mkdir(dir, dentry, mode);
if (debug)
pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err);
return err;
}
static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry,
umode_t mode, dev_t dev, bool debug)
{
int err = vfs_mknod(dir, dentry, mode, dev);
if (debug) {
pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n",
dentry, mode, dev, err);
}
return err;
}
static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry,
const char *oldname, bool debug)
{
int err = vfs_symlink(dir, dentry, oldname);
if (debug)
pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err);
return err;
}
static inline int ovl_do_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags)
{
int err = vfs_setxattr(dentry, name, value, size, flags);
pr_debug("setxattr(%pd2, \"%s\", \"%*s\", 0x%x) = %i\n",
dentry, name, (int) size, (char *) value, flags, err);
return err;
}
static inline int ovl_do_removexattr(struct dentry *dentry, const char *name)
{
int err = vfs_removexattr(dentry, name);
pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, err);
return err;
}
static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry,
struct inode *newdir, struct dentry *newdentry,
unsigned int flags)
{
int err;
pr_debug("rename2(%pd2, %pd2, 0x%x)\n",
olddentry, newdentry, flags);
err = vfs_rename(olddir, olddentry, newdir, newdentry, NULL, flags);
if (err) {
pr_debug("...rename2(%pd2, %pd2, ...) = %i\n",
olddentry, newdentry, err);
}
return err;
}
static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry)
{
int err = vfs_whiteout(dir, dentry);
pr_debug("whiteout(%pd2) = %i\n", dentry, err);
return err;
}
bool ovl_is_nocopyupw(struct dentry *dentry);
enum ovl_path_type ovl_path_type(struct dentry *dentry);
u64 ovl_dentry_version_get(struct dentry *dentry);
void ovl_dentry_version_inc(struct dentry *dentry);
void ovl_path_upper(struct dentry *dentry, struct path *path);
void ovl_path_lower(struct dentry *dentry, struct path *path);
enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path);
int ovl_path_next(int idx, struct dentry *dentry, struct path *path);
struct dentry *ovl_dentry_upper(struct dentry *dentry);
struct dentry *ovl_dentry_lower(struct dentry *dentry);
struct dentry *ovl_dentry_real(struct dentry *dentry);
struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper);
struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry);
void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache);
struct dentry *ovl_workdir(struct dentry *dentry);
int ovl_want_write(struct dentry *dentry);
void ovl_drop_write(struct dentry *dentry);
bool ovl_dentry_is_opaque(struct dentry *dentry);
void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque);
bool ovl_is_whiteout(struct dentry *dentry);
void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry);
struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags);
struct file *ovl_path_open(struct path *path, int flags);
struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry,
struct kstat *stat, const char *link);
/* readdir.c */
extern const struct file_operations ovl_dir_operations;
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list);
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list);
void ovl_cache_free(struct list_head *list);
/* inode.c */
int ovl_setattr(struct dentry *dentry, struct iattr *attr);
int ovl_permission(struct inode *inode, int mask);
int ovl_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags);
ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
void *value, size_t size);
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
int ovl_removexattr(struct dentry *dentry, const char *name);
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
struct ovl_entry *oe);
static inline void ovl_copyattr(struct inode *from, struct inode *to)
{
to->i_uid = from->i_uid;
to->i_gid = from->i_gid;
}
/* dir.c */
extern const struct inode_operations_wrapper ovl_dir_inode_operations;
struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry);
int ovl_create_real(struct inode *dir, struct dentry *newdentry,
struct kstat *stat, const char *link,
struct dentry *hardlink, bool debug);
void ovl_cleanup(struct inode *dir, struct dentry *dentry);
/* copy_up.c */
int ovl_copy_up(struct dentry *dentry);
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
struct path *lowerpath, struct kstat *stat,
struct iattr *attr);
int ovl_copy_xattr(struct dentry *old, struct dentry *new);
int ovl_set_attr(struct dentry *upper, struct kstat *stat);

View File

@ -0,0 +1,588 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/file.h>
#include <linux/xattr.h>
#include <linux/rbtree.h>
#include <linux/security.h>
#include <linux/cred.h>
#include "overlayfs.h"
struct ovl_cache_entry {
unsigned int len;
unsigned int type;
u64 ino;
struct list_head l_node;
struct rb_node node;
struct ovl_cache_entry *next_maybe_whiteout;
bool is_whiteout;
char name[];
};
struct ovl_dir_cache {
long refcount;
u64 version;
struct list_head entries;
};
struct dir_context {
const filldir_t actor;
//loff_t pos;
};
struct ovl_readdir_data {
struct dir_context ctx;
bool is_merge;
struct rb_root root;
struct list_head *list;
struct list_head middle;
struct ovl_cache_entry *first_maybe_whiteout;
int count;
int err;
};
struct ovl_dir_file {
bool is_real;
bool is_upper;
struct ovl_dir_cache *cache;
struct list_head *cursor;
struct file *realfile;
struct file *upperfile;
};
static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n)
{
return container_of(n, struct ovl_cache_entry, node);
}
static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root,
const char *name, int len)
{
struct rb_node *node = root->rb_node;
int cmp;
while (node) {
struct ovl_cache_entry *p = ovl_cache_entry_from_node(node);
cmp = strncmp(name, p->name, len);
if (cmp > 0)
node = p->node.rb_right;
else if (cmp < 0 || len < p->len)
node = p->node.rb_left;
else
return p;
}
return NULL;
}
static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd,
const char *name, int len,
u64 ino, unsigned int d_type)
{
struct ovl_cache_entry *p;
size_t size = offsetof(struct ovl_cache_entry, name[len + 1]);
p = kmalloc(size, GFP_KERNEL);
if (!p)
return NULL;
memcpy(p->name, name, len);
p->name[len] = '\0';
p->len = len;
p->type = d_type;
p->ino = ino;
p->is_whiteout = false;
if (d_type == DT_CHR) {
p->next_maybe_whiteout = rdd->first_maybe_whiteout;
rdd->first_maybe_whiteout = p;
}
return p;
}
static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
const char *name, int len, u64 ino,
unsigned int d_type)
{
struct rb_node **newp = &rdd->root.rb_node;
struct rb_node *parent = NULL;
struct ovl_cache_entry *p;
while (*newp) {
int cmp;
struct ovl_cache_entry *tmp;
parent = *newp;
tmp = ovl_cache_entry_from_node(*newp);
cmp = strncmp(name, tmp->name, len);
if (cmp > 0)
newp = &tmp->node.rb_right;
else if (cmp < 0 || len < tmp->len)
newp = &tmp->node.rb_left;
else
return 0;
}
p = ovl_cache_entry_new(rdd, name, len, ino, d_type);
if (p == NULL)
return -ENOMEM;
list_add_tail(&p->l_node, rdd->list);
rb_link_node(&p->node, parent, newp);
rb_insert_color(&p->node, &rdd->root);
return 0;
}
static int ovl_fill_lower(struct ovl_readdir_data *rdd,
const char *name, int namelen,
loff_t offset, u64 ino, unsigned int d_type)
{
struct ovl_cache_entry *p;
p = ovl_cache_entry_find(&rdd->root, name, namelen);
if (p) {
list_move_tail(&p->l_node, &rdd->middle);
} else {
p = ovl_cache_entry_new(rdd, name, namelen, ino, d_type);
if (p == NULL)
rdd->err = -ENOMEM;
else
list_add_tail(&p->l_node, &rdd->middle);
}
return rdd->err;
}
void ovl_cache_free(struct list_head *list)
{
struct ovl_cache_entry *p;
struct ovl_cache_entry *n;
list_for_each_entry_safe(p, n, list, l_node)
kfree(p);
INIT_LIST_HEAD(list);
}
static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry)
{
struct ovl_dir_cache *cache = od->cache;
WARN_ON(cache->refcount <= 0);
cache->refcount--;
if (!cache->refcount) {
if (ovl_dir_cache(dentry) == cache)
ovl_set_dir_cache(dentry, NULL);
ovl_cache_free(&cache->entries);
kfree(cache);
}
}
static int ovl_fill_merge(void *buf, const char *name, int namelen,
loff_t offset, u64 ino, unsigned int d_type)
{
struct dir_context *ctx = buf;
struct ovl_readdir_data *rdd =
container_of(ctx, struct ovl_readdir_data, ctx);
rdd->count++;
if (!rdd->is_merge)
return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type);
else
return ovl_fill_lower(rdd, name, namelen, offset, ino, d_type);
}
static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd)
{
int err;
struct ovl_cache_entry *p;
struct dentry *dentry;
const struct cred *old_cred;
struct cred *override_cred;
override_cred = prepare_creds();
if (!override_cred)
return -ENOMEM;
/*
* CAP_DAC_OVERRIDE for lookup
*/
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
old_cred = override_creds(override_cred);
err = mutex_lock_killable(&dir->d_inode->i_mutex);
if (!err) {
while (rdd->first_maybe_whiteout) {
p = rdd->first_maybe_whiteout;
rdd->first_maybe_whiteout = p->next_maybe_whiteout;
dentry = lookup_one_len(p->name, dir, p->len);
if (!IS_ERR(dentry)) {
p->is_whiteout = ovl_is_whiteout(dentry);
dput(dentry);
}
}
mutex_unlock(&dir->d_inode->i_mutex);
}
revert_creds(old_cred);
put_cred(override_cred);
return err;
}
static inline int ovl_dir_read(struct path *realpath,
struct ovl_readdir_data *rdd)
{
struct file *realfile;
int err;
realfile = ovl_path_open(realpath, O_RDONLY | O_DIRECTORY);
if (IS_ERR(realfile))
return PTR_ERR(realfile);
rdd->first_maybe_whiteout = NULL;
//rdd->ctx.pos = 0;
do {
rdd->count = 0;
rdd->err = 0;
err = vfs_readdir(realfile, rdd->ctx.actor, rdd);
if (err >= 0)
err = rdd->err;
} while (!err && rdd->count);
if (!err && rdd->first_maybe_whiteout)
err = ovl_check_whiteouts(realpath->dentry, rdd);
fput(realfile);
return err;
}
static void ovl_dir_reset(struct file *file)
{
struct ovl_dir_file *od = file->private_data;
struct ovl_dir_cache *cache = od->cache;
struct dentry *dentry = file->f_path.dentry;
enum ovl_path_type type = ovl_path_type(dentry);
if (cache && ovl_dentry_version_get(dentry) != cache->version) {
ovl_cache_put(od, dentry);
od->cache = NULL;
od->cursor = NULL;
}
WARN_ON(!od->is_real && !OVL_TYPE_MERGE(type));
if (od->is_real && OVL_TYPE_MERGE(type))
od->is_real = false;
}
static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list)
{
int err;
struct path realpath;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_fill_merge,
.list = list,
.root = RB_ROOT,
.is_merge = false,
};
int idx, next;
for (idx = 0; idx != -1; idx = next) {
next = ovl_path_next(idx, dentry, &realpath);
if (next != -1) {
err = ovl_dir_read(&realpath, &rdd);
if (err)
break;
} else {
/*
* Insert lowest layer entries before upper ones, this
* allows offsets to be reasonably constant
*/
list_add(&rdd.middle, rdd.list);
rdd.is_merge = true;
err = ovl_dir_read(&realpath, &rdd);
list_del(&rdd.middle);
}
}
return err;
}
static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos)
{
struct list_head *p;
loff_t off = 0;
list_for_each(p, &od->cache->entries) {
if (off >= pos)
break;
off++;
}
/* Cursor is safe since the cache is stable */
od->cursor = p;
}
static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
{
int res;
struct ovl_dir_cache *cache;
cache = ovl_dir_cache(dentry);
if (cache && ovl_dentry_version_get(dentry) == cache->version) {
cache->refcount++;
return cache;
}
ovl_set_dir_cache(dentry, NULL);
cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL);
if (!cache)
return ERR_PTR(-ENOMEM);
cache->refcount = 1;
INIT_LIST_HEAD(&cache->entries);
res = ovl_dir_read_merged(dentry, &cache->entries);
if (res) {
ovl_cache_free(&cache->entries);
kfree(cache);
return ERR_PTR(res);
}
cache->version = ovl_dentry_version_get(dentry);
ovl_set_dir_cache(dentry, cache);
return cache;
}
static int ovl_readdir(struct file *file, void *buf, filldir_t filler)
{
struct ovl_dir_file *od = file->private_data;
struct dentry *dentry = file->f_path.dentry;
struct ovl_cache_entry *p;
int res;
if (!file->f_pos)
ovl_dir_reset(file);
if (od->is_real) {
res = vfs_readdir(od->realfile, filler, buf);
file->f_pos = od->realfile->f_pos;
return res;
}
if (!od->cache) {
struct ovl_dir_cache *cache;
cache = ovl_cache_get(dentry);
if (IS_ERR(cache))
return PTR_ERR(cache);
od->cache = cache;
ovl_seek_cursor(od, file->f_pos);
}
while (od->cursor != &od->cache->entries) {
p = list_entry(od->cursor, struct ovl_cache_entry, l_node);
if (!p->is_whiteout)
if (filler(buf, p->name, p->len, file->f_pos, p->ino, p->type))
break;
od->cursor = p->l_node.next;
file->f_pos++;
}
return 0;
}
static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin)
{
loff_t res;
struct ovl_dir_file *od = file->private_data;
mutex_lock(&file_inode(file)->i_mutex);
if (!file->f_pos)
ovl_dir_reset(file);
if (od->is_real) {
res = vfs_llseek(od->realfile, offset, origin);
file->f_pos = od->realfile->f_pos;
} else {
res = -EINVAL;
switch (origin) {
case SEEK_CUR:
offset += file->f_pos;
break;
case SEEK_SET:
break;
default:
goto out_unlock;
}
if (offset < 0)
goto out_unlock;
if (offset != file->f_pos) {
file->f_pos = offset;
if (od->cache)
ovl_seek_cursor(od, offset);
}
res = offset;
}
out_unlock:
mutex_unlock(&file_inode(file)->i_mutex);
return res;
}
static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
int datasync)
{
struct ovl_dir_file *od = file->private_data;
struct dentry *dentry = file->f_path.dentry;
struct file *realfile = od->realfile;
/*
* Need to check if we started out being a lower dir, but got copied up
*/
if (!od->is_upper && OVL_TYPE_UPPER(ovl_path_type(dentry))) {
struct inode *inode = file_inode(file);
realfile = lockless_dereference(od->upperfile);
if (!realfile) {
struct path upperpath;
ovl_path_upper(dentry, &upperpath);
realfile = ovl_path_open(&upperpath, O_RDONLY);
smp_mb__before_spinlock();
mutex_lock(&inode->i_mutex);
if (!od->upperfile) {
if (IS_ERR(realfile)) {
mutex_unlock(&inode->i_mutex);
return PTR_ERR(realfile);
}
od->upperfile = realfile;
} else {
/* somebody has beaten us to it */
if (!IS_ERR(realfile))
fput(realfile);
realfile = od->upperfile;
}
mutex_unlock(&inode->i_mutex);
}
}
return vfs_fsync_range(realfile, start, end, datasync);
}
static int ovl_dir_release(struct inode *inode, struct file *file)
{
struct ovl_dir_file *od = file->private_data;
if (od->cache) {
mutex_lock(&inode->i_mutex);
ovl_cache_put(od, file->f_path.dentry);
mutex_unlock(&inode->i_mutex);
}
fput(od->realfile);
if (od->upperfile)
fput(od->upperfile);
kfree(od);
return 0;
}
static int ovl_dir_open(struct inode *inode, struct file *file)
{
struct path realpath;
struct file *realfile;
struct ovl_dir_file *od;
enum ovl_path_type type;
od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL);
if (!od)
return -ENOMEM;
type = ovl_path_real(file->f_path.dentry, &realpath);
realfile = ovl_path_open(&realpath, file->f_flags);
if (IS_ERR(realfile)) {
kfree(od);
return PTR_ERR(realfile);
}
od->realfile = realfile;
od->is_real = !OVL_TYPE_MERGE(type);
od->is_upper = OVL_TYPE_UPPER(type);
file->private_data = od;
return 0;
}
const struct file_operations ovl_dir_operations = {
.read = generic_read_dir,
.open = ovl_dir_open,
.readdir = ovl_readdir,
.llseek = ovl_dir_llseek,
.fsync = ovl_dir_fsync,
.release = ovl_dir_release,
};
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list)
{
int err;
struct ovl_cache_entry *p;
err = ovl_dir_read_merged(dentry, list);
if (err)
return err;
err = 0;
list_for_each_entry(p, list, l_node) {
if (p->is_whiteout)
continue;
if (p->name[0] == '.') {
if (p->len == 1)
continue;
if (p->len == 2 && p->name[1] == '.')
continue;
}
err = -ENOTEMPTY;
break;
}
return err;
}
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
{
struct ovl_cache_entry *p;
mutex_lock_nested(&upper->d_inode->i_mutex, I_MUTEX_CHILD);
list_for_each_entry(p, list, l_node) {
struct dentry *dentry;
if (!p->is_whiteout)
continue;
dentry = lookup_one_len(p->name, upper, p->len);
if (IS_ERR(dentry)) {
pr_err("overlayfs: lookup '%s/%.*s' failed (%i)\n",
upper->d_name.name, p->len, p->name,
(int) PTR_ERR(dentry));
continue;
}
ovl_cleanup(upper->d_inode, dentry);
dput(dentry);
}
mutex_unlock(&upper->d_inode->i_mutex);
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,21 @@
KDIR ?= @KDIR@
ARCH ?= @ARCH@
KMODDIR = @KMODDIR@
src = @abs_srcdir@
obj-m += mcoverlay.o
mcoverlay-y := copy_up.o dir.o inode.o readdir.o super.o
.PHONY: clean install modules
modules:
$(MAKE) -C $(KDIR) M=$(PWD) SUBDIRS=$(PWD) ARCH=$(ARCH) modules
clean:
$(RM) .*.cmd *.mod.c *.o *.ko* Module.symvers modules.order -r .tmp*
install:
mkdir -p -m 755 $(KMODDIR)
install -m 644 mcoverlay.ko $(KMODDIR)

View File

@ -0,0 +1,416 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/splice.h>
#include <linux/xattr.h>
#include <linux/security.h>
#include <linux/uaccess.h>
#include <linux/sched.h>
#include <linux/namei.h>
#include "overlayfs.h"
#define OVL_COPY_UP_CHUNK_SIZE (1 << 20)
int ovl_copy_xattr(struct dentry *old, struct dentry *new)
{
ssize_t list_size, size;
char *buf, *name, *value;
int error;
if (!old->d_inode->i_op->getxattr ||
!new->d_inode->i_op->getxattr)
return 0;
list_size = vfs_listxattr(old, NULL, 0);
if (list_size <= 0) {
if (list_size == -EOPNOTSUPP)
return 0;
return list_size;
}
buf = kzalloc(list_size, GFP_KERNEL);
if (!buf)
return -ENOMEM;
error = -ENOMEM;
value = kmalloc(XATTR_SIZE_MAX, GFP_KERNEL);
if (!value)
goto out;
list_size = vfs_listxattr(old, buf, list_size);
if (list_size <= 0) {
error = list_size;
goto out_free_value;
}
for (name = buf; name < (buf + list_size); name += strlen(name) + 1) {
size = vfs_getxattr(old, name, value, XATTR_SIZE_MAX);
if (size <= 0) {
error = size;
goto out_free_value;
}
error = vfs_setxattr(new, name, value, size, 0);
if (error)
goto out_free_value;
}
out_free_value:
kfree(value);
out:
kfree(buf);
return error;
}
static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
{
struct file *old_file;
struct file *new_file;
loff_t old_pos = 0;
loff_t new_pos = 0;
int error = 0;
if (len == 0)
return 0;
old_file = ovl_path_open(old, O_RDONLY);
if (IS_ERR(old_file))
return PTR_ERR(old_file);
new_file = ovl_path_open(new, O_WRONLY);
if (IS_ERR(new_file)) {
error = PTR_ERR(new_file);
goto out_fput;
}
/* FIXME: copy up sparse files efficiently */
while (len) {
size_t this_len = OVL_COPY_UP_CHUNK_SIZE;
long bytes;
if (len < this_len)
this_len = len;
if (signal_pending_state(TASK_KILLABLE, current)) {
error = -EINTR;
break;
}
bytes = do_splice_direct(old_file, &old_pos,
new_file, &new_pos,
this_len, SPLICE_F_MOVE);
if (bytes <= 0) {
error = bytes;
break;
}
WARN_ON(old_pos != new_pos);
len -= bytes;
}
fput(new_file);
out_fput:
fput(old_file);
return error;
}
static char *ovl_read_symlink(struct dentry *realdentry)
{
int res;
char *buf;
struct inode *inode = realdentry->d_inode;
mm_segment_t old_fs;
res = -EINVAL;
if (!inode->i_op->readlink)
goto err;
res = -ENOMEM;
buf = (char *) __get_free_page(GFP_KERNEL);
if (!buf)
goto err;
old_fs = get_fs();
set_fs(get_ds());
/* The cast to a user pointer is valid due to the set_fs() */
res = inode->i_op->readlink(realdentry,
(char __user *)buf, PAGE_SIZE - 1);
set_fs(old_fs);
if (res < 0) {
free_page((unsigned long) buf);
goto err;
}
buf[res] = '\0';
return buf;
err:
return ERR_PTR(res);
}
static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
{
struct iattr attr = {
.ia_valid =
ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET,
.ia_atime = stat->atime,
.ia_mtime = stat->mtime,
};
return notify_change(upperdentry, &attr, NULL);
}
int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
{
int err = 0;
if (!S_ISLNK(stat->mode)) {
struct iattr attr = {
.ia_valid = ATTR_MODE,
.ia_mode = stat->mode,
};
err = notify_change(upperdentry, &attr, NULL);
}
if (!err) {
struct iattr attr = {
.ia_valid = ATTR_UID | ATTR_GID,
.ia_uid = stat->uid,
.ia_gid = stat->gid,
};
err = notify_change(upperdentry, &attr, NULL);
}
if (!err)
ovl_set_timestamps(upperdentry, stat);
return err;
}
static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
struct dentry *dentry, struct path *lowerpath,
struct kstat *stat, struct iattr *attr,
const char *link)
{
struct inode *wdir = workdir->d_inode;
struct inode *udir = upperdir->d_inode;
struct dentry *newdentry = NULL;
struct dentry *upper = NULL;
umode_t mode = stat->mode;
int err;
newdentry = ovl_lookup_temp(workdir, dentry);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out;
upper = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto out1;
/* Can't properly set mode on creation because of the umask */
stat->mode &= S_IFMT;
err = ovl_create_real(wdir, newdentry, stat, link, NULL, true);
stat->mode = mode;
if (err)
goto out2;
if (S_ISREG(stat->mode)) {
struct path upperpath;
ovl_path_upper(dentry, &upperpath);
BUG_ON(upperpath.dentry != NULL);
upperpath.dentry = newdentry;
err = ovl_copy_up_data(lowerpath, &upperpath, stat->size);
if (err)
goto out_cleanup;
}
err = ovl_copy_xattr(lowerpath->dentry, newdentry);
if (err)
goto out_cleanup;
mutex_lock(&newdentry->d_inode->i_mutex);
err = ovl_set_attr(newdentry, stat);
if (!err && attr)
err = notify_change(newdentry, attr, NULL);
mutex_unlock(&newdentry->d_inode->i_mutex);
if (err)
goto out_cleanup;
err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
if (err)
goto out_cleanup;
ovl_dentry_update(dentry, newdentry);
newdentry = NULL;
/*
* Non-directores become opaque when copied up.
*/
if (!S_ISDIR(stat->mode))
ovl_dentry_set_opaque(dentry, true);
out2:
dput(upper);
out1:
dput(newdentry);
out:
return err;
out_cleanup:
ovl_cleanup(wdir, newdentry);
goto out;
}
/*
* Copy up a single dentry
*
* Directory renames only allowed on "pure upper" (already created on
* upper filesystem, never copied up). Directories which are on lower or
* are merged may not be renamed. For these -EXDEV is returned and
* userspace has to deal with it. This means, when copying up a
* directory we can rely on it and ancestors being stable.
*
* Non-directory renames start with copy up of source if necessary. The
* actual rename will only proceed once the copy up was successful. Copy
* up uses upper parent i_mutex for exclusion. Since rename can change
* d_parent it is possible that the copy up will lock the old parent. At
* that point the file will have already been copied up anyway.
*/
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
struct path *lowerpath, struct kstat *stat,
struct iattr *attr)
{
struct dentry *workdir = ovl_workdir(dentry);
int err;
struct kstat pstat;
struct path parentpath;
struct dentry *upperdir;
struct dentry *upperdentry;
const struct cred *old_cred;
struct cred *override_cred;
char *link = NULL;
if (WARN_ON(!workdir))
return -EROFS;
ovl_path_upper(parent, &parentpath);
upperdir = parentpath.dentry;
err = vfs_getattr(&parentpath, &pstat);
if (err)
return err;
if (S_ISLNK(stat->mode)) {
link = ovl_read_symlink(lowerpath->dentry);
if (IS_ERR(link))
return PTR_ERR(link);
}
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_free_link;
override_cred->fsuid = stat->uid;
override_cred->fsgid = stat->gid;
/*
* CAP_SYS_ADMIN for copying up extended attributes
* CAP_DAC_OVERRIDE for create
* CAP_FOWNER for chmod, timestamp update
* CAP_FSETID for chmod
* CAP_CHOWN for chown
* CAP_MKNOD for mknod
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
cap_raise(override_cred->cap_effective, CAP_FSETID);
cap_raise(override_cred->cap_effective, CAP_CHOWN);
cap_raise(override_cred->cap_effective, CAP_MKNOD);
old_cred = override_creds(override_cred);
err = -EIO;
if (lock_rename(workdir, upperdir) != NULL) {
pr_err("overlayfs: failed to lock workdir+upperdir\n");
goto out_unlock;
}
upperdentry = ovl_dentry_upper(dentry);
if (upperdentry) {
unlock_rename(workdir, upperdir);
err = 0;
/* Raced with another copy-up? Do the setattr here */
if (attr) {
mutex_lock(&upperdentry->d_inode->i_mutex);
err = notify_change(upperdentry, attr, NULL);
mutex_unlock(&upperdentry->d_inode->i_mutex);
}
goto out_put_cred;
}
err = ovl_copy_up_locked(workdir, upperdir, dentry, lowerpath,
stat, attr, link);
if (!err) {
/* Restore timestamps on parent (best effort) */
ovl_set_timestamps(upperdir, &pstat);
}
out_unlock:
unlock_rename(workdir, upperdir);
out_put_cred:
revert_creds(old_cred);
put_cred(override_cred);
out_free_link:
if (link)
free_page((unsigned long) link);
return err;
}
int ovl_copy_up(struct dentry *dentry)
{
int err;
err = 0;
while (!err) {
struct dentry *next;
struct dentry *parent;
struct path lowerpath;
struct kstat stat;
enum ovl_path_type type = ovl_path_type(dentry);
if (OVL_TYPE_UPPER(type))
break;
next = dget(dentry);
/* find the topmost dentry not yet copied up */
for (;;) {
parent = dget_parent(next);
type = ovl_path_type(parent);
if (OVL_TYPE_UPPER(type))
break;
dput(next);
next = parent;
}
ovl_path_lower(next, &lowerpath);
err = vfs_getattr(&lowerpath, &stat);
if (!err)
err = ovl_copy_up_one(parent, next, &lowerpath, &stat, NULL);
dput(parent);
dput(next);
}
return err;
}

View File

@ -0,0 +1,951 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/xattr.h>
#include <linux/security.h>
#include <linux/cred.h>
#include "overlayfs.h"
void ovl_cleanup(struct inode *wdir, struct dentry *wdentry)
{
int err;
dget(wdentry);
if (d_is_dir(wdentry))
err = ovl_do_rmdir(wdir, wdentry);
else
err = ovl_do_unlink(wdir, wdentry);
dput(wdentry);
if (err) {
pr_err("overlayfs: cleanup of '%pd2' failed (%i)\n",
wdentry, err);
}
}
struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry)
{
struct dentry *temp;
char name[20];
snprintf(name, sizeof(name), "#%lx", (unsigned long) dentry);
temp = lookup_one_len(name, workdir, strlen(name));
if (!IS_ERR(temp) && temp->d_inode) {
pr_err("overlayfs: workdir/%s already exists\n", name);
dput(temp);
temp = ERR_PTR(-EIO);
}
return temp;
}
/* caller holds i_mutex on workdir */
static struct dentry *ovl_whiteout(struct dentry *workdir,
struct dentry *dentry)
{
int err;
struct dentry *whiteout;
struct inode *wdir = workdir->d_inode;
whiteout = ovl_lookup_temp(workdir, dentry);
if (IS_ERR(whiteout))
return whiteout;
err = ovl_do_whiteout(wdir, whiteout);
if (err) {
dput(whiteout);
whiteout = ERR_PTR(err);
}
return whiteout;
}
int ovl_create_real(struct inode *dir, struct dentry *newdentry,
struct kstat *stat, const char *link,
struct dentry *hardlink, bool debug)
{
int err;
if (newdentry->d_inode)
return -ESTALE;
if (hardlink) {
err = ovl_do_link(hardlink, dir, newdentry, debug);
} else {
switch (stat->mode & S_IFMT) {
case S_IFREG:
err = ovl_do_create(dir, newdentry, stat->mode, debug);
break;
case S_IFDIR:
err = ovl_do_mkdir(dir, newdentry, stat->mode, debug);
break;
case S_IFCHR:
case S_IFBLK:
case S_IFIFO:
case S_IFSOCK:
err = ovl_do_mknod(dir, newdentry,
stat->mode, stat->rdev, debug);
break;
case S_IFLNK:
err = ovl_do_symlink(dir, newdentry, link, debug);
break;
default:
err = -EPERM;
}
}
if (!err && WARN_ON(!newdentry->d_inode)) {
/*
* Not quite sure if non-instantiated dentry is legal or not.
* VFS doesn't seem to care so check and warn here.
*/
err = -ENOENT;
}
return err;
}
static int ovl_set_opaque(struct dentry *upperdentry)
{
return ovl_do_setxattr(upperdentry, OVL_XATTR_OPAQUE, "y", 1, 0);
}
static void ovl_remove_opaque(struct dentry *upperdentry)
{
int err;
err = ovl_do_removexattr(upperdentry, OVL_XATTR_OPAQUE);
if (err) {
pr_warn("overlayfs: failed to remove opaque from '%s' (%i)\n",
upperdentry->d_name.name, err);
}
}
static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
int err;
enum ovl_path_type type;
struct path realpath;
type = ovl_path_real(dentry, &realpath);
err = vfs_getattr(&realpath, stat);
if (err)
return err;
stat->dev = dentry->d_sb->s_dev;
stat->ino = dentry->d_inode->i_ino;
/*
* It's probably not worth it to count subdirs to get the
* correct link count. nlink=1 seems to pacify 'find' and
* other utilities.
*/
if (OVL_TYPE_MERGE(type))
stat->nlink = 1;
return 0;
}
static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
struct kstat *stat, const char *link,
struct dentry *hardlink)
{
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *udir = upperdir->d_inode;
struct dentry *newdentry;
int err;
mutex_lock_nested(&udir->i_mutex, I_MUTEX_PARENT);
newdentry = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out_unlock;
err = ovl_create_real(udir, newdentry, stat, link, hardlink, false);
if (err)
goto out_dput;
ovl_dentry_version_inc(dentry->d_parent);
ovl_dentry_update(dentry, newdentry);
ovl_copyattr(newdentry->d_inode, inode);
d_instantiate(dentry, inode);
newdentry = NULL;
out_dput:
dput(newdentry);
out_unlock:
mutex_unlock(&udir->i_mutex);
return err;
}
static int ovl_lock_rename_workdir(struct dentry *workdir,
struct dentry *upperdir)
{
/* Workdir should not be the same as upperdir */
if (workdir == upperdir)
goto err;
/* Workdir should not be subdir of upperdir and vice versa */
if (lock_rename(workdir, upperdir) != NULL)
goto err_unlock;
return 0;
err_unlock:
unlock_rename(workdir, upperdir);
err:
pr_err("overlayfs: failed to lock workdir+upperdir\n");
return -EIO;
}
static struct dentry *ovl_clear_empty(struct dentry *dentry,
struct list_head *list)
{
struct dentry *workdir = ovl_workdir(dentry);
struct inode *wdir = workdir->d_inode;
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *udir = upperdir->d_inode;
struct path upperpath;
struct dentry *upper;
struct dentry *opaquedir;
struct kstat stat;
int err;
if (WARN_ON(!workdir))
return ERR_PTR(-EROFS);
err = ovl_lock_rename_workdir(workdir, upperdir);
if (err)
goto out;
ovl_path_upper(dentry, &upperpath);
err = vfs_getattr(&upperpath, &stat);
if (err)
goto out_unlock;
err = -ESTALE;
if (!S_ISDIR(stat.mode))
goto out_unlock;
upper = upperpath.dentry;
if (upper->d_parent->d_inode != udir)
goto out_unlock;
opaquedir = ovl_lookup_temp(workdir, dentry);
err = PTR_ERR(opaquedir);
if (IS_ERR(opaquedir))
goto out_unlock;
err = ovl_create_real(wdir, opaquedir, &stat, NULL, NULL, true);
if (err)
goto out_dput;
err = ovl_copy_xattr(upper, opaquedir);
if (err)
goto out_cleanup;
err = ovl_set_opaque(opaquedir);
if (err)
goto out_cleanup;
mutex_lock(&opaquedir->d_inode->i_mutex);
err = ovl_set_attr(opaquedir, &stat);
mutex_unlock(&opaquedir->d_inode->i_mutex);
if (err)
goto out_cleanup;
err = ovl_do_rename(wdir, opaquedir, udir, upper, RENAME_EXCHANGE);
if (err)
goto out_cleanup;
ovl_cleanup_whiteouts(upper, list);
ovl_cleanup(wdir, upper);
unlock_rename(workdir, upperdir);
/* dentry's upper doesn't match now, get rid of it */
d_drop(dentry);
return opaquedir;
out_cleanup:
ovl_cleanup(wdir, opaquedir);
out_dput:
dput(opaquedir);
out_unlock:
unlock_rename(workdir, upperdir);
out:
return ERR_PTR(err);
}
static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry)
{
int err;
struct dentry *ret = NULL;
LIST_HEAD(list);
err = ovl_check_empty_dir(dentry, &list);
if (err)
ret = ERR_PTR(err);
else {
/*
* If no upperdentry then skip clearing whiteouts.
*
* Can race with copy-up, since we don't hold the upperdir
* mutex. Doesn't matter, since copy-up can't create a
* non-empty directory from an empty one.
*/
if (ovl_dentry_upper(dentry))
ret = ovl_clear_empty(dentry, &list);
}
ovl_cache_free(&list);
return ret;
}
static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
struct kstat *stat, const char *link,
struct dentry *hardlink)
{
struct dentry *workdir = ovl_workdir(dentry);
struct inode *wdir = workdir->d_inode;
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *udir = upperdir->d_inode;
struct dentry *upper;
struct dentry *newdentry;
int err;
if (WARN_ON(!workdir))
return -EROFS;
err = ovl_lock_rename_workdir(workdir, upperdir);
if (err)
goto out;
newdentry = ovl_lookup_temp(workdir, dentry);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out_unlock;
upper = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto out_dput;
err = ovl_create_real(wdir, newdentry, stat, link, hardlink, true);
if (err)
goto out_dput2;
if (S_ISDIR(stat->mode)) {
err = ovl_set_opaque(newdentry);
if (err)
goto out_cleanup;
err = ovl_do_rename(wdir, newdentry, udir, upper,
RENAME_EXCHANGE);
if (err)
goto out_cleanup;
ovl_cleanup(wdir, upper);
} else {
err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
if (err)
goto out_cleanup;
}
ovl_dentry_version_inc(dentry->d_parent);
ovl_dentry_update(dentry, newdentry);
ovl_copyattr(newdentry->d_inode, inode);
d_instantiate(dentry, inode);
newdentry = NULL;
out_dput2:
dput(upper);
out_dput:
dput(newdentry);
out_unlock:
unlock_rename(workdir, upperdir);
out:
return err;
out_cleanup:
ovl_cleanup(wdir, newdentry);
goto out_dput2;
}
static int ovl_create_or_link(struct dentry *dentry, int mode, dev_t rdev,
const char *link, struct dentry *hardlink)
{
int err;
struct inode *inode;
struct kstat stat = {
.mode = mode,
.rdev = rdev,
};
err = -ENOMEM;
inode = ovl_new_inode(dentry->d_sb, mode, dentry->d_fsdata);
if (!inode)
goto out;
err = ovl_copy_up(dentry->d_parent);
if (err)
goto out_iput;
if (!ovl_dentry_is_opaque(dentry)) {
err = ovl_create_upper(dentry, inode, &stat, link, hardlink);
} else {
const struct cred *old_cred;
struct cred *override_cred;
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_iput;
/*
* CAP_SYS_ADMIN for setting opaque xattr
* CAP_DAC_OVERRIDE for create in workdir, rename
* CAP_FOWNER for removing whiteout from sticky dir
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
old_cred = override_creds(override_cred);
err = ovl_create_over_whiteout(dentry, inode, &stat, link,
hardlink);
revert_creds(old_cred);
put_cred(override_cred);
}
if (!err)
inode = NULL;
out_iput:
iput(inode);
out:
return err;
}
static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
const char *link)
{
int err;
err = ovl_want_write(dentry);
if (!err) {
err = ovl_create_or_link(dentry, mode, rdev, link, NULL);
ovl_drop_write(dentry);
}
return err;
}
static int ovl_create(struct inode *dir, struct dentry *dentry, umode_t mode,
bool excl)
{
return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL);
}
static int ovl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL);
}
static int ovl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
dev_t rdev)
{
/* Don't allow creation of "whiteout" on overlay */
if (S_ISCHR(mode) && rdev == WHITEOUT_DEV)
return -EPERM;
return ovl_create_object(dentry, mode, rdev, NULL);
}
static int ovl_symlink(struct inode *dir, struct dentry *dentry,
const char *link)
{
return ovl_create_object(dentry, S_IFLNK, 0, link);
}
static int ovl_link(struct dentry *old, struct inode *newdir,
struct dentry *new)
{
int err;
struct dentry *upper;
err = ovl_want_write(old);
if (err)
goto out;
err = ovl_copy_up(old);
if (err)
goto out_drop_write;
upper = ovl_dentry_upper(old);
err = ovl_create_or_link(new, upper->d_inode->i_mode, 0, NULL, upper);
out_drop_write:
ovl_drop_write(old);
out:
return err;
}
static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir)
{
struct dentry *workdir = ovl_workdir(dentry);
struct inode *wdir = workdir->d_inode;
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *udir = upperdir->d_inode;
struct dentry *whiteout;
struct dentry *upper;
struct dentry *opaquedir = NULL;
int err;
if (WARN_ON(!workdir))
return -EROFS;
if (is_dir) {
if (OVL_TYPE_MERGE_OR_LOWER(ovl_path_type(dentry))) {
opaquedir = ovl_check_empty_and_clear(dentry);
err = PTR_ERR(opaquedir);
if (IS_ERR(opaquedir))
goto out;
} else {
LIST_HEAD(list);
/*
* When removing an empty opaque directory, then it
* makes no sense to replace it with an exact replica of
* itself. But emptiness still needs to be checked.
*/
err = ovl_check_empty_dir(dentry, &list);
ovl_cache_free(&list);
if (err)
goto out;
}
}
err = ovl_lock_rename_workdir(workdir, upperdir);
if (err)
goto out_dput;
whiteout = ovl_whiteout(workdir, dentry);
err = PTR_ERR(whiteout);
if (IS_ERR(whiteout))
goto out_unlock;
upper = ovl_dentry_upper(dentry);
if (!upper) {
upper = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto kill_whiteout;
err = ovl_do_rename(wdir, whiteout, udir, upper, 0);
dput(upper);
if (err)
goto kill_whiteout;
} else {
int flags = 0;
if (opaquedir)
upper = opaquedir;
err = -ESTALE;
if (upper->d_parent != upperdir)
goto kill_whiteout;
if (is_dir)
flags |= RENAME_EXCHANGE;
err = ovl_do_rename(wdir, whiteout, udir, upper, flags);
if (err)
goto kill_whiteout;
if (is_dir)
ovl_cleanup(wdir, upper);
}
ovl_dentry_version_inc(dentry->d_parent);
out_d_drop:
d_drop(dentry);
dput(whiteout);
out_unlock:
unlock_rename(workdir, upperdir);
out_dput:
dput(opaquedir);
out:
return err;
kill_whiteout:
ovl_cleanup(wdir, whiteout);
goto out_d_drop;
}
static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
{
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *dir = upperdir->d_inode;
struct dentry *upper = ovl_dentry_upper(dentry);
int err;
mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
err = -ESTALE;
if (upper->d_parent == upperdir) {
/* Don't let d_delete() think it can reset d_inode */
dget(upper);
if (is_dir)
err = vfs_rmdir(dir, upper);
else
err = vfs_unlink(dir, upper, NULL);
dput(upper);
ovl_dentry_version_inc(dentry->d_parent);
}
/*
* Keeping this dentry hashed would mean having to release
* upperpath/lowerpath, which could only be done if we are the
* sole user of this dentry. Too tricky... Just unhash for
* now.
*/
d_drop(dentry);
mutex_unlock(&dir->i_mutex);
return err;
}
static inline int ovl_check_sticky(struct dentry *dentry)
{
struct inode *dir = ovl_dentry_real(dentry->d_parent)->d_inode;
struct inode *inode = ovl_dentry_real(dentry)->d_inode;
if (check_sticky(dir, inode))
return -EPERM;
return 0;
}
static int ovl_do_remove(struct dentry *dentry, bool is_dir)
{
enum ovl_path_type type;
int err;
err = ovl_check_sticky(dentry);
if (err)
goto out;
err = ovl_want_write(dentry);
if (err)
goto out;
err = ovl_copy_up(dentry->d_parent);
if (err)
goto out_drop_write;
type = ovl_path_type(dentry);
if (OVL_TYPE_PURE_UPPER(type)) {
err = ovl_remove_upper(dentry, is_dir);
} else {
const struct cred *old_cred;
struct cred *override_cred;
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_drop_write;
/*
* CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
* CAP_DAC_OVERRIDE for create in workdir, rename
* CAP_FOWNER for removing whiteout from sticky dir
* CAP_FSETID for chmod of opaque dir
* CAP_CHOWN for chown of opaque dir
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
cap_raise(override_cred->cap_effective, CAP_FSETID);
cap_raise(override_cred->cap_effective, CAP_CHOWN);
old_cred = override_creds(override_cred);
err = ovl_remove_and_whiteout(dentry, is_dir);
revert_creds(old_cred);
put_cred(override_cred);
}
out_drop_write:
ovl_drop_write(dentry);
out:
return err;
}
static int ovl_unlink(struct inode *dir, struct dentry *dentry)
{
return ovl_do_remove(dentry, false);
}
static int ovl_rmdir(struct inode *dir, struct dentry *dentry)
{
return ovl_do_remove(dentry, true);
}
static int ovl_rename2(struct inode *olddir, struct dentry *old,
struct inode *newdir, struct dentry *new,
unsigned int flags)
{
int err;
enum ovl_path_type old_type;
enum ovl_path_type new_type;
struct dentry *old_upperdir;
struct dentry *new_upperdir;
struct dentry *olddentry;
struct dentry *newdentry;
struct dentry *trap;
bool old_opaque;
bool new_opaque;
bool new_create = false;
bool cleanup_whiteout = false;
bool overwrite = !(flags & RENAME_EXCHANGE);
bool is_dir = d_is_dir(old);
bool new_is_dir = false;
struct dentry *opaquedir = NULL;
const struct cred *old_cred = NULL;
struct cred *override_cred = NULL;
err = -EINVAL;
if (flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE))
goto out;
flags &= ~RENAME_NOREPLACE;
err = ovl_check_sticky(old);
if (err)
goto out;
/* Don't copy up directory trees */
old_type = ovl_path_type(old);
err = -EXDEV;
if (OVL_TYPE_MERGE_OR_LOWER(old_type) && is_dir)
goto out;
if (new->d_inode) {
err = ovl_check_sticky(new);
if (err)
goto out;
if (d_is_dir(new))
new_is_dir = true;
new_type = ovl_path_type(new);
err = -EXDEV;
if (!overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir)
goto out;
err = 0;
if (!OVL_TYPE_UPPER(new_type) && !OVL_TYPE_UPPER(old_type)) {
if (ovl_dentry_lower(old)->d_inode ==
ovl_dentry_lower(new)->d_inode)
goto out;
}
if (OVL_TYPE_UPPER(new_type) && OVL_TYPE_UPPER(old_type)) {
if (ovl_dentry_upper(old)->d_inode ==
ovl_dentry_upper(new)->d_inode)
goto out;
}
} else {
if (ovl_dentry_is_opaque(new))
new_type = __OVL_PATH_UPPER;
else
new_type = __OVL_PATH_UPPER | __OVL_PATH_PURE;
}
err = ovl_want_write(old);
if (err)
goto out;
err = ovl_copy_up(old);
if (err)
goto out_drop_write;
err = ovl_copy_up(new->d_parent);
if (err)
goto out_drop_write;
if (!overwrite) {
err = ovl_copy_up(new);
if (err)
goto out_drop_write;
}
old_opaque = !OVL_TYPE_PURE_UPPER(old_type);
new_opaque = !OVL_TYPE_PURE_UPPER(new_type);
if (old_opaque || new_opaque) {
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_drop_write;
/*
* CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
* CAP_DAC_OVERRIDE for create in workdir
* CAP_FOWNER for removing whiteout from sticky dir
* CAP_FSETID for chmod of opaque dir
* CAP_CHOWN for chown of opaque dir
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
cap_raise(override_cred->cap_effective, CAP_FSETID);
cap_raise(override_cred->cap_effective, CAP_CHOWN);
old_cred = override_creds(override_cred);
}
if (overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir) {
opaquedir = ovl_check_empty_and_clear(new);
err = PTR_ERR(opaquedir);
if (IS_ERR(opaquedir)) {
opaquedir = NULL;
goto out_revert_creds;
}
}
if (overwrite) {
if (old_opaque) {
if (new->d_inode || !new_opaque) {
/* Whiteout source */
flags |= RENAME_WHITEOUT;
} else {
/* Switch whiteouts */
flags |= RENAME_EXCHANGE;
}
} else if (is_dir && !new->d_inode && new_opaque) {
flags |= RENAME_EXCHANGE;
cleanup_whiteout = true;
}
}
old_upperdir = ovl_dentry_upper(old->d_parent);
new_upperdir = ovl_dentry_upper(new->d_parent);
trap = lock_rename(new_upperdir, old_upperdir);
olddentry = ovl_dentry_upper(old);
newdentry = ovl_dentry_upper(new);
if (newdentry) {
if (opaquedir) {
newdentry = opaquedir;
opaquedir = NULL;
} else {
dget(newdentry);
}
} else {
new_create = true;
newdentry = lookup_one_len(new->d_name.name, new_upperdir,
new->d_name.len);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out_unlock;
}
err = -ESTALE;
if (olddentry->d_parent != old_upperdir)
goto out_dput;
if (newdentry->d_parent != new_upperdir)
goto out_dput;
if (olddentry == trap)
goto out_dput;
if (newdentry == trap)
goto out_dput;
if (is_dir && !old_opaque && new_opaque) {
err = ovl_set_opaque(olddentry);
if (err)
goto out_dput;
}
if (!overwrite && new_is_dir && old_opaque && !new_opaque) {
err = ovl_set_opaque(newdentry);
if (err)
goto out_dput;
}
if (old_opaque || new_opaque) {
err = ovl_do_rename(old_upperdir->d_inode, olddentry,
new_upperdir->d_inode, newdentry,
flags);
} else {
/* No debug for the plain case */
BUG_ON(flags & ~RENAME_EXCHANGE);
err = vfs_rename(old_upperdir->d_inode, olddentry,
new_upperdir->d_inode, newdentry,
NULL, flags);
}
if (err) {
if (is_dir && !old_opaque && new_opaque)
ovl_remove_opaque(olddentry);
if (!overwrite && new_is_dir && old_opaque && !new_opaque)
ovl_remove_opaque(newdentry);
goto out_dput;
}
if (is_dir && old_opaque && !new_opaque)
ovl_remove_opaque(olddentry);
if (!overwrite && new_is_dir && !old_opaque && new_opaque)
ovl_remove_opaque(newdentry);
if (old_opaque != new_opaque) {
ovl_dentry_set_opaque(old, new_opaque);
if (!overwrite)
ovl_dentry_set_opaque(new, old_opaque);
}
if (cleanup_whiteout)
ovl_cleanup(old_upperdir->d_inode, newdentry);
ovl_dentry_version_inc(old->d_parent);
ovl_dentry_version_inc(new->d_parent);
out_dput:
dput(newdentry);
out_unlock:
unlock_rename(new_upperdir, old_upperdir);
out_revert_creds:
if (old_opaque || new_opaque) {
revert_creds(old_cred);
put_cred(override_cred);
}
out_drop_write:
ovl_drop_write(old);
out:
dput(opaquedir);
return err;
}
const struct inode_operations ovl_dir_inode_operations = {
.lookup = ovl_lookup,
.mkdir = ovl_mkdir,
.symlink = ovl_symlink,
.unlink = ovl_unlink,
.rmdir = ovl_rmdir,
.rename2 = ovl_rename2,
.link = ovl_link,
.setattr = ovl_setattr,
.create = ovl_create,
.mknod = ovl_mknod,
.permission = ovl_permission,
.getattr = ovl_dir_getattr,
.setxattr = ovl_setxattr,
.getxattr = ovl_getxattr,
.listxattr = ovl_listxattr,
.removexattr = ovl_removexattr,
};

View File

@ -0,0 +1,438 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/xattr.h>
#include "overlayfs.h"
static int ovl_copy_up_last(struct dentry *dentry, struct iattr *attr,
bool no_data)
{
int err;
struct dentry *parent;
struct kstat stat;
struct path lowerpath;
parent = dget_parent(dentry);
err = ovl_copy_up(parent);
if (err)
goto out_dput_parent;
ovl_path_lower(dentry, &lowerpath);
err = vfs_getattr(&lowerpath, &stat);
if (err)
goto out_dput_parent;
if (no_data)
stat.size = 0;
err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat, attr);
out_dput_parent:
dput(parent);
return err;
}
int ovl_setattr(struct dentry *dentry, struct iattr *attr)
{
int err;
struct dentry *upperdentry;
err = ovl_want_write(dentry);
if (err)
goto out;
upperdentry = ovl_dentry_upper(dentry);
if (upperdentry) {
mutex_lock(&upperdentry->d_inode->i_mutex);
err = notify_change(upperdentry, attr, NULL);
mutex_unlock(&upperdentry->d_inode->i_mutex);
} else {
err = ovl_copy_up_last(dentry, attr, false);
}
ovl_drop_write(dentry);
out:
return err;
}
static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
struct path realpath;
ovl_path_real(dentry, &realpath);
return vfs_getattr(&realpath, stat);
}
int ovl_permission(struct inode *inode, int mask)
{
struct ovl_entry *oe;
struct dentry *alias = NULL;
struct inode *realinode;
struct dentry *realdentry;
bool is_upper;
int err;
if (S_ISDIR(inode->i_mode)) {
oe = inode->i_private;
} else if (mask & MAY_NOT_BLOCK) {
return -ECHILD;
} else {
/*
* For non-directories find an alias and get the info
* from there.
*/
alias = d_find_any_alias(inode);
if (WARN_ON(!alias))
return -ENOENT;
oe = alias->d_fsdata;
}
realdentry = ovl_entry_real(oe, &is_upper);
/* Careful in RCU walk mode */
realinode = ACCESS_ONCE(realdentry->d_inode);
if (!realinode) {
WARN_ON(!(mask & MAY_NOT_BLOCK));
err = -ENOENT;
goto out_dput;
}
if (mask & MAY_WRITE) {
umode_t mode = realinode->i_mode;
/*
* Writes will always be redirected to upper layer, so
* ignore lower layer being read-only.
*
* If the overlay itself is read-only then proceed
* with the permission check, don't return EROFS.
* This will only happen if this is the lower layer of
* another overlayfs.
*
* If upper fs becomes read-only after the overlay was
* constructed return EROFS to prevent modification of
* upper layer.
*/
err = -EROFS;
if (is_upper && !IS_RDONLY(inode) && IS_RDONLY(realinode) &&
(S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
goto out_dput;
}
err = __inode_permission(realinode, mask);
out_dput:
dput(alias);
return err;
}
struct ovl_link_data {
struct dentry *realdentry;
void *cookie;
};
static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd)
{
void *ret;
struct dentry *realdentry;
struct inode *realinode;
realdentry = ovl_dentry_real(dentry);
realinode = realdentry->d_inode;
if (WARN_ON(!realinode->i_op->follow_link))
return ERR_PTR(-EPERM);
ret = realinode->i_op->follow_link(realdentry, nd);
if (IS_ERR(ret))
return ret;
if (realinode->i_op->put_link) {
struct ovl_link_data *data;
data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL);
if (!data) {
realinode->i_op->put_link(realdentry, nd, ret);
return ERR_PTR(-ENOMEM);
}
data->realdentry = realdentry;
data->cookie = ret;
return data;
} else {
return NULL;
}
}
static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
{
struct inode *realinode;
struct ovl_link_data *data = c;
if (!data)
return;
realinode = data->realdentry->d_inode;
realinode->i_op->put_link(data->realdentry, nd, data->cookie);
kfree(data);
}
static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
{
struct path realpath;
struct inode *realinode;
ovl_path_real(dentry, &realpath);
realinode = realpath.dentry->d_inode;
if (!realinode->i_op->readlink)
return -EINVAL;
touch_atime(&realpath);
return realinode->i_op->readlink(realpath.dentry, buf, bufsiz);
}
static bool ovl_is_private_xattr(const char *name)
{
return strncmp(name, OVL_XATTR_PRE_NAME, OVL_XATTR_PRE_LEN) == 0;
}
int ovl_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags)
{
int err;
struct dentry *upperdentry;
err = ovl_want_write(dentry);
if (err)
goto out;
err = -EPERM;
if (ovl_is_private_xattr(name))
goto out_drop_write;
err = ovl_copy_up(dentry);
if (err)
goto out_drop_write;
upperdentry = ovl_dentry_upper(dentry);
err = vfs_setxattr(upperdentry, name, value, size, flags);
out_drop_write:
ovl_drop_write(dentry);
out:
return err;
}
static bool ovl_need_xattr_filter(struct dentry *dentry,
enum ovl_path_type type)
{
if ((type & (__OVL_PATH_PURE | __OVL_PATH_UPPER)) == __OVL_PATH_UPPER)
return S_ISDIR(dentry->d_inode->i_mode);
else
return false;
}
ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
void *value, size_t size)
{
struct path realpath;
enum ovl_path_type type = ovl_path_real(dentry, &realpath);
if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
return -ENODATA;
return vfs_getxattr(realpath.dentry, name, value, size);
}
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
{
struct path realpath;
enum ovl_path_type type = ovl_path_real(dentry, &realpath);
ssize_t res;
int off;
res = vfs_listxattr(realpath.dentry, list, size);
if (res <= 0 || size == 0)
return res;
if (!ovl_need_xattr_filter(dentry, type))
return res;
/* filter out private xattrs */
for (off = 0; off < res;) {
char *s = list + off;
size_t slen = strlen(s) + 1;
BUG_ON(off + slen > res);
if (ovl_is_private_xattr(s)) {
res -= slen;
memmove(s, s + slen, res - off);
} else {
off += slen;
}
}
return res;
}
int ovl_removexattr(struct dentry *dentry, const char *name)
{
int err;
struct path realpath;
enum ovl_path_type type = ovl_path_real(dentry, &realpath);
err = ovl_want_write(dentry);
if (err)
goto out;
err = -ENODATA;
if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
goto out_drop_write;
if (!OVL_TYPE_UPPER(type)) {
err = vfs_getxattr(realpath.dentry, name, NULL, 0);
if (err < 0)
goto out_drop_write;
err = ovl_copy_up(dentry);
if (err)
goto out_drop_write;
ovl_path_upper(dentry, &realpath);
}
err = vfs_removexattr(realpath.dentry, name);
out_drop_write:
ovl_drop_write(dentry);
out:
return err;
}
static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type,
struct dentry *realdentry)
{
if (OVL_TYPE_UPPER(type))
return false;
if (special_file(realdentry->d_inode->i_mode))
return false;
if (!(OPEN_FMODE(flags) & FMODE_WRITE) && !(flags & O_TRUNC))
return false;
return true;
}
static int ovl_dentry_open(struct dentry *dentry, struct file *file,
const struct cred *cred)
{
int err;
struct path realpath;
enum ovl_path_type type;
bool want_write = false;
type = ovl_path_real(dentry, &realpath);
if (!ovl_is_nocopyupw(dentry)) {
if (ovl_open_need_copy_up(file->f_flags, type, realpath.dentry)) {
want_write = true;
err = ovl_want_write(dentry);
if (err)
goto out;
if (file->f_flags & O_TRUNC)
err = ovl_copy_up_last(dentry, NULL, true);
else
err = ovl_copy_up(dentry);
if (err)
goto out_drop_write;
ovl_path_upper(dentry, &realpath);
}
}
err = vfs_open(&realpath, file, cred);
out_drop_write:
if (want_write)
ovl_drop_write(dentry);
out:
return err;
}
static const struct inode_operations ovl_file_inode_operations = {
.setattr = ovl_setattr,
.permission = ovl_permission,
.getattr = ovl_getattr,
.setxattr = ovl_setxattr,
.getxattr = ovl_getxattr,
.listxattr = ovl_listxattr,
.removexattr = ovl_removexattr,
.dentry_open = ovl_dentry_open,
};
static const struct inode_operations ovl_symlink_inode_operations = {
.setattr = ovl_setattr,
.follow_link = ovl_follow_link,
.put_link = ovl_put_link,
.readlink = ovl_readlink,
.getattr = ovl_getattr,
.setxattr = ovl_setxattr,
.getxattr = ovl_getxattr,
.listxattr = ovl_listxattr,
.removexattr = ovl_removexattr,
};
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
struct ovl_entry *oe)
{
struct inode *inode;
inode = new_inode(sb);
if (!inode)
return NULL;
mode &= S_IFMT;
inode->i_ino = get_next_ino();
inode->i_mode = mode;
inode->i_flags |= S_NOATIME | S_NOCMTIME;
switch (mode) {
case S_IFDIR:
inode->i_private = oe;
inode->i_op = &ovl_dir_inode_operations;
inode->i_fop = &ovl_dir_operations;
break;
case S_IFLNK:
inode->i_op = &ovl_symlink_inode_operations;
break;
case S_IFREG:
case S_IFSOCK:
case S_IFBLK:
case S_IFCHR:
case S_IFIFO:
inode->i_op = &ovl_file_inode_operations;
break;
default:
WARN(1, "illegal file type: %i\n", mode);
iput(inode);
inode = NULL;
}
return inode;
}

View File

@ -0,0 +1,200 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/kernel.h>
struct ovl_entry;
enum ovl_path_type {
__OVL_PATH_PURE = (1 << 0),
__OVL_PATH_UPPER = (1 << 1),
__OVL_PATH_MERGE = (1 << 2),
};
#define OVL_TYPE_UPPER(type) ((type) & __OVL_PATH_UPPER)
#define OVL_TYPE_MERGE(type) ((type) & __OVL_PATH_MERGE)
#define OVL_TYPE_PURE_UPPER(type) ((type) & __OVL_PATH_PURE)
#define OVL_TYPE_MERGE_OR_LOWER(type) \
(OVL_TYPE_MERGE(type) || !OVL_TYPE_UPPER(type))
#define OVL_XATTR_PRE_NAME "trusted.overlay."
#define OVL_XATTR_PRE_LEN 16
#define OVL_XATTR_OPAQUE OVL_XATTR_PRE_NAME"opaque"
static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry)
{
int err = vfs_rmdir(dir, dentry);
pr_debug("rmdir(%pd2) = %i\n", dentry, err);
return err;
}
static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry)
{
int err = vfs_unlink(dir, dentry, NULL);
pr_debug("unlink(%pd2) = %i\n", dentry, err);
return err;
}
static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir,
struct dentry *new_dentry, bool debug)
{
int err = vfs_link(old_dentry, dir, new_dentry, NULL);
if (debug) {
pr_debug("link(%pd2, %pd2) = %i\n",
old_dentry, new_dentry, err);
}
return err;
}
static inline int ovl_do_create(struct inode *dir, struct dentry *dentry,
umode_t mode, bool debug)
{
int err = vfs_create(dir, dentry, mode, true);
if (debug)
pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err);
return err;
}
static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry,
umode_t mode, bool debug)
{
int err = vfs_mkdir(dir, dentry, mode);
if (debug)
pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err);
return err;
}
static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry,
umode_t mode, dev_t dev, bool debug)
{
int err = vfs_mknod(dir, dentry, mode, dev);
if (debug) {
pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n",
dentry, mode, dev, err);
}
return err;
}
static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry,
const char *oldname, bool debug)
{
int err = vfs_symlink(dir, dentry, oldname);
if (debug)
pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err);
return err;
}
static inline int ovl_do_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags)
{
int err = vfs_setxattr(dentry, name, value, size, flags);
pr_debug("setxattr(%pd2, \"%s\", \"%*s\", 0x%x) = %i\n",
dentry, name, (int) size, (char *) value, flags, err);
return err;
}
static inline int ovl_do_removexattr(struct dentry *dentry, const char *name)
{
int err = vfs_removexattr(dentry, name);
pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, err);
return err;
}
static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry,
struct inode *newdir, struct dentry *newdentry,
unsigned int flags)
{
int err;
pr_debug("rename2(%pd2, %pd2, 0x%x)\n",
olddentry, newdentry, flags);
err = vfs_rename(olddir, olddentry, newdir, newdentry, NULL, flags);
if (err) {
pr_debug("...rename2(%pd2, %pd2, ...) = %i\n",
olddentry, newdentry, err);
}
return err;
}
static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry)
{
int err = vfs_whiteout(dir, dentry);
pr_debug("whiteout(%pd2) = %i\n", dentry, err);
return err;
}
bool ovl_is_nocopyupw(struct dentry *dentry);
enum ovl_path_type ovl_path_type(struct dentry *dentry);
u64 ovl_dentry_version_get(struct dentry *dentry);
void ovl_dentry_version_inc(struct dentry *dentry);
void ovl_path_upper(struct dentry *dentry, struct path *path);
void ovl_path_lower(struct dentry *dentry, struct path *path);
enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path);
int ovl_path_next(int idx, struct dentry *dentry, struct path *path);
struct dentry *ovl_dentry_upper(struct dentry *dentry);
struct dentry *ovl_dentry_lower(struct dentry *dentry);
struct dentry *ovl_dentry_real(struct dentry *dentry);
struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper);
struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry);
void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache);
struct dentry *ovl_workdir(struct dentry *dentry);
int ovl_want_write(struct dentry *dentry);
void ovl_drop_write(struct dentry *dentry);
bool ovl_dentry_is_opaque(struct dentry *dentry);
void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque);
bool ovl_is_whiteout(struct dentry *dentry);
void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry);
struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags);
struct file *ovl_path_open(struct path *path, int flags);
struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry,
struct kstat *stat, const char *link);
/* readdir.c */
extern const struct file_operations ovl_dir_operations;
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list);
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list);
void ovl_cache_free(struct list_head *list);
/* inode.c */
int ovl_setattr(struct dentry *dentry, struct iattr *attr);
int ovl_permission(struct inode *inode, int mask);
int ovl_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags);
ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
void *value, size_t size);
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
int ovl_removexattr(struct dentry *dentry, const char *name);
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
struct ovl_entry *oe);
static inline void ovl_copyattr(struct inode *from, struct inode *to)
{
to->i_uid = from->i_uid;
to->i_gid = from->i_gid;
}
/* dir.c */
extern const struct inode_operations ovl_dir_inode_operations;
struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry);
int ovl_create_real(struct inode *dir, struct dentry *newdentry,
struct kstat *stat, const char *link,
struct dentry *hardlink, bool debug);
void ovl_cleanup(struct inode *dir, struct dentry *dentry);
/* copy_up.c */
int ovl_copy_up(struct dentry *dentry);
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
struct path *lowerpath, struct kstat *stat,
struct iattr *attr);
int ovl_copy_xattr(struct dentry *old, struct dentry *new);
int ovl_set_attr(struct dentry *upper, struct kstat *stat);

View File

@ -0,0 +1,557 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/file.h>
#include <linux/xattr.h>
#include <linux/rbtree.h>
#include <linux/security.h>
#include <linux/cred.h>
#include "overlayfs.h"
struct ovl_cache_entry {
unsigned int len;
unsigned int type;
u64 ino;
struct list_head l_node;
struct rb_node node;
bool is_whiteout;
char name[];
};
struct ovl_dir_cache {
long refcount;
u64 version;
struct list_head entries;
};
struct ovl_readdir_data {
struct dir_context ctx;
bool is_merge;
struct rb_root root;
struct list_head *list;
struct list_head middle;
struct dentry *dir;
int count;
int err;
};
struct ovl_dir_file {
bool is_real;
bool is_upper;
struct ovl_dir_cache *cache;
struct list_head *cursor;
struct file *realfile;
struct file *upperfile;
};
static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n)
{
return container_of(n, struct ovl_cache_entry, node);
}
static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root,
const char *name, int len)
{
struct rb_node *node = root->rb_node;
int cmp;
while (node) {
struct ovl_cache_entry *p = ovl_cache_entry_from_node(node);
cmp = strncmp(name, p->name, len);
if (cmp > 0)
node = p->node.rb_right;
else if (cmp < 0 || len < p->len)
node = p->node.rb_left;
else
return p;
}
return NULL;
}
static struct ovl_cache_entry *ovl_cache_entry_new(struct dentry *dir,
const char *name, int len,
u64 ino, unsigned int d_type)
{
struct ovl_cache_entry *p;
size_t size = offsetof(struct ovl_cache_entry, name[len + 1]);
p = kmalloc(size, GFP_KERNEL);
if (!p)
return NULL;
memcpy(p->name, name, len);
p->name[len] = '\0';
p->len = len;
p->type = d_type;
p->ino = ino;
p->is_whiteout = false;
if (d_type == DT_CHR) {
struct dentry *dentry;
const struct cred *old_cred;
struct cred *override_cred;
override_cred = prepare_creds();
if (!override_cred) {
kfree(p);
return NULL;
}
/*
* CAP_DAC_OVERRIDE for lookup
*/
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
old_cred = override_creds(override_cred);
dentry = lookup_one_len(name, dir, len);
if (!IS_ERR(dentry)) {
p->is_whiteout = ovl_is_whiteout(dentry);
dput(dentry);
}
revert_creds(old_cred);
put_cred(override_cred);
}
return p;
}
static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
const char *name, int len, u64 ino,
unsigned int d_type)
{
struct rb_node **newp = &rdd->root.rb_node;
struct rb_node *parent = NULL;
struct ovl_cache_entry *p;
while (*newp) {
int cmp;
struct ovl_cache_entry *tmp;
parent = *newp;
tmp = ovl_cache_entry_from_node(*newp);
cmp = strncmp(name, tmp->name, len);
if (cmp > 0)
newp = &tmp->node.rb_right;
else if (cmp < 0 || len < tmp->len)
newp = &tmp->node.rb_left;
else
return 0;
}
p = ovl_cache_entry_new(rdd->dir, name, len, ino, d_type);
if (p == NULL)
return -ENOMEM;
list_add_tail(&p->l_node, rdd->list);
rb_link_node(&p->node, parent, newp);
rb_insert_color(&p->node, &rdd->root);
return 0;
}
static int ovl_fill_lower(struct ovl_readdir_data *rdd,
const char *name, int namelen,
loff_t offset, u64 ino, unsigned int d_type)
{
struct ovl_cache_entry *p;
p = ovl_cache_entry_find(&rdd->root, name, namelen);
if (p) {
list_move_tail(&p->l_node, &rdd->middle);
} else {
p = ovl_cache_entry_new(rdd->dir, name, namelen, ino, d_type);
if (p == NULL)
rdd->err = -ENOMEM;
else
list_add_tail(&p->l_node, &rdd->middle);
}
return rdd->err;
}
void ovl_cache_free(struct list_head *list)
{
struct ovl_cache_entry *p;
struct ovl_cache_entry *n;
list_for_each_entry_safe(p, n, list, l_node)
kfree(p);
INIT_LIST_HEAD(list);
}
static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry)
{
struct ovl_dir_cache *cache = od->cache;
WARN_ON(cache->refcount <= 0);
cache->refcount--;
if (!cache->refcount) {
if (ovl_dir_cache(dentry) == cache)
ovl_set_dir_cache(dentry, NULL);
ovl_cache_free(&cache->entries);
kfree(cache);
}
}
static int ovl_fill_merge(struct dir_context *ctx, const char *name,
int namelen, loff_t offset, u64 ino,
unsigned int d_type)
{
struct ovl_readdir_data *rdd =
container_of(ctx, struct ovl_readdir_data, ctx);
rdd->count++;
if (!rdd->is_merge)
return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type);
else
return ovl_fill_lower(rdd, name, namelen, offset, ino, d_type);
}
static inline int ovl_dir_read(struct path *realpath,
struct ovl_readdir_data *rdd)
{
struct file *realfile;
int err;
realfile = ovl_path_open(realpath, O_RDONLY | O_DIRECTORY);
if (IS_ERR(realfile))
return PTR_ERR(realfile);
rdd->dir = realpath->dentry;
rdd->ctx.pos = 0;
do {
rdd->count = 0;
rdd->err = 0;
err = iterate_dir(realfile, &rdd->ctx);
if (err >= 0)
err = rdd->err;
} while (!err && rdd->count);
fput(realfile);
return err;
}
static void ovl_dir_reset(struct file *file)
{
struct ovl_dir_file *od = file->private_data;
struct ovl_dir_cache *cache = od->cache;
struct dentry *dentry = file->f_path.dentry;
enum ovl_path_type type = ovl_path_type(dentry);
if (cache && ovl_dentry_version_get(dentry) != cache->version) {
ovl_cache_put(od, dentry);
od->cache = NULL;
od->cursor = NULL;
}
WARN_ON(!od->is_real && !OVL_TYPE_MERGE(type));
if (od->is_real && OVL_TYPE_MERGE(type))
od->is_real = false;
}
static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list)
{
int err;
struct path realpath;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_fill_merge,
.list = list,
.root = RB_ROOT,
.is_merge = false,
};
int idx, next;
for (idx = 0; idx != -1; idx = next) {
next = ovl_path_next(idx, dentry, &realpath);
if (next != -1) {
err = ovl_dir_read(&realpath, &rdd);
if (err)
break;
} else {
/*
* Insert lowest layer entries before upper ones, this
* allows offsets to be reasonably constant
*/
list_add(&rdd.middle, rdd.list);
rdd.is_merge = true;
err = ovl_dir_read(&realpath, &rdd);
list_del(&rdd.middle);
}
}
return err;
}
static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos)
{
struct list_head *p;
loff_t off = 0;
list_for_each(p, &od->cache->entries) {
if (off >= pos)
break;
off++;
}
/* Cursor is safe since the cache is stable */
od->cursor = p;
}
static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
{
int res;
struct ovl_dir_cache *cache;
cache = ovl_dir_cache(dentry);
if (cache && ovl_dentry_version_get(dentry) == cache->version) {
cache->refcount++;
return cache;
}
ovl_set_dir_cache(dentry, NULL);
cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL);
if (!cache)
return ERR_PTR(-ENOMEM);
cache->refcount = 1;
INIT_LIST_HEAD(&cache->entries);
res = ovl_dir_read_merged(dentry, &cache->entries);
if (res) {
ovl_cache_free(&cache->entries);
kfree(cache);
return ERR_PTR(res);
}
cache->version = ovl_dentry_version_get(dentry);
ovl_set_dir_cache(dentry, cache);
return cache;
}
static int ovl_iterate(struct file *file, struct dir_context *ctx)
{
struct ovl_dir_file *od = file->private_data;
struct dentry *dentry = file->f_path.dentry;
struct ovl_cache_entry *p;
if (!ctx->pos)
ovl_dir_reset(file);
if (od->is_real)
return iterate_dir(od->realfile, ctx);
if (!od->cache) {
struct ovl_dir_cache *cache;
cache = ovl_cache_get(dentry);
if (IS_ERR(cache))
return PTR_ERR(cache);
od->cache = cache;
ovl_seek_cursor(od, ctx->pos);
}
while (od->cursor != &od->cache->entries) {
p = list_entry(od->cursor, struct ovl_cache_entry, l_node);
if (!p->is_whiteout)
if (!dir_emit(ctx, p->name, p->len, p->ino, p->type))
break;
od->cursor = p->l_node.next;
ctx->pos++;
}
return 0;
}
static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin)
{
loff_t res;
struct ovl_dir_file *od = file->private_data;
mutex_lock(&file_inode(file)->i_mutex);
if (!file->f_pos)
ovl_dir_reset(file);
if (od->is_real) {
res = vfs_llseek(od->realfile, offset, origin);
file->f_pos = od->realfile->f_pos;
} else {
res = -EINVAL;
switch (origin) {
case SEEK_CUR:
offset += file->f_pos;
break;
case SEEK_SET:
break;
default:
goto out_unlock;
}
if (offset < 0)
goto out_unlock;
if (offset != file->f_pos) {
file->f_pos = offset;
if (od->cache)
ovl_seek_cursor(od, offset);
}
res = offset;
}
out_unlock:
mutex_unlock(&file_inode(file)->i_mutex);
return res;
}
static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
int datasync)
{
struct ovl_dir_file *od = file->private_data;
struct dentry *dentry = file->f_path.dentry;
struct file *realfile = od->realfile;
/*
* Need to check if we started out being a lower dir, but got copied up
*/
if (!od->is_upper && OVL_TYPE_UPPER(ovl_path_type(dentry))) {
struct inode *inode = file_inode(file);
realfile = lockless_dereference(od->upperfile);
if (!realfile) {
struct path upperpath;
ovl_path_upper(dentry, &upperpath);
realfile = ovl_path_open(&upperpath, O_RDONLY);
smp_mb__before_spinlock();
mutex_lock(&inode->i_mutex);
if (!od->upperfile) {
if (IS_ERR(realfile)) {
mutex_unlock(&inode->i_mutex);
return PTR_ERR(realfile);
}
od->upperfile = realfile;
} else {
/* somebody has beaten us to it */
if (!IS_ERR(realfile))
fput(realfile);
realfile = od->upperfile;
}
mutex_unlock(&inode->i_mutex);
}
}
return vfs_fsync_range(realfile, start, end, datasync);
}
static int ovl_dir_release(struct inode *inode, struct file *file)
{
struct ovl_dir_file *od = file->private_data;
if (od->cache) {
mutex_lock(&inode->i_mutex);
ovl_cache_put(od, file->f_path.dentry);
mutex_unlock(&inode->i_mutex);
}
fput(od->realfile);
if (od->upperfile)
fput(od->upperfile);
kfree(od);
return 0;
}
static int ovl_dir_open(struct inode *inode, struct file *file)
{
struct path realpath;
struct file *realfile;
struct ovl_dir_file *od;
enum ovl_path_type type;
od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL);
if (!od)
return -ENOMEM;
type = ovl_path_real(file->f_path.dentry, &realpath);
realfile = ovl_path_open(&realpath, file->f_flags);
if (IS_ERR(realfile)) {
kfree(od);
return PTR_ERR(realfile);
}
od->realfile = realfile;
od->is_real = !OVL_TYPE_MERGE(type);
od->is_upper = OVL_TYPE_UPPER(type);
file->private_data = od;
return 0;
}
const struct file_operations ovl_dir_operations = {
.read = generic_read_dir,
.open = ovl_dir_open,
.iterate = ovl_iterate,
.llseek = ovl_dir_llseek,
.fsync = ovl_dir_fsync,
.release = ovl_dir_release,
};
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list)
{
int err;
struct ovl_cache_entry *p;
err = ovl_dir_read_merged(dentry, list);
if (err)
return err;
err = 0;
list_for_each_entry(p, list, l_node) {
if (p->is_whiteout)
continue;
if (p->name[0] == '.') {
if (p->len == 1)
continue;
if (p->len == 2 && p->name[1] == '.')
continue;
}
err = -ENOTEMPTY;
break;
}
return err;
}
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
{
struct ovl_cache_entry *p;
mutex_lock_nested(&upper->d_inode->i_mutex, I_MUTEX_CHILD);
list_for_each_entry(p, list, l_node) {
struct dentry *dentry;
if (!p->is_whiteout)
continue;
dentry = lookup_one_len(p->name, upper, p->len);
if (IS_ERR(dentry)) {
pr_err("overlayfs: lookup '%s/%.*s' failed (%i)\n",
upper->d_name.name, p->len, p->name,
(int) PTR_ERR(dentry));
continue;
}
ovl_cleanup(upper->d_inode, dentry);
dput(dentry);
}
mutex_unlock(&upper->d_inode->i_mutex);
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,21 @@
KDIR ?= @KDIR@
ARCH ?= @ARCH@
KMODDIR = @KMODDIR@
src = @abs_srcdir@
obj-m += mcoverlay.o
mcoverlay-y := copy_up.o dir.o inode.o readdir.o super.o
.PHONY: clean install modules
modules:
$(MAKE) -C $(KDIR) M=$(PWD) SUBDIRS=$(PWD) ARCH=$(ARCH) modules
clean:
$(RM) .*.cmd *.mod.c *.o *.ko* Module.symvers modules.order -r .tmp*
install:
mkdir -p -m 755 $(KMODDIR)
install -m 644 mcoverlay.ko $(KMODDIR)

View File

@ -0,0 +1,460 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/splice.h>
#include <linux/xattr.h>
#include <linux/security.h>
#include <linux/uaccess.h>
#include <linux/sched.h>
#include <linux/namei.h>
#include <linux/fdtable.h>
#include <linux/ratelimit.h>
#include "overlayfs.h"
#define OVL_COPY_UP_CHUNK_SIZE (1 << 20)
static bool __read_mostly ovl_check_copy_up;
module_param_named(check_copy_up, ovl_check_copy_up, bool,
S_IWUSR | S_IRUGO);
MODULE_PARM_DESC(ovl_check_copy_up,
"Warn on copy-up when causing process also has a R/O fd open");
static int ovl_check_fd(const void *data, struct file *f, unsigned int fd)
{
const struct dentry *dentry = data;
if (f->f_inode == d_inode(dentry))
pr_warn_ratelimited("overlayfs: Warning: Copying up %pD, but open R/O on fd %u which will cease to be coherent [pid=%d %s]\n",
f, fd, current->pid, current->comm);
return 0;
}
/*
* Check the fds open by this process and warn if something like the following
* scenario is about to occur:
*
* fd1 = open("foo", O_RDONLY);
* fd2 = open("foo", O_RDWR);
*/
static void ovl_do_check_copy_up(struct dentry *dentry)
{
if (ovl_check_copy_up)
iterate_fd(current->files, 0, ovl_check_fd, dentry);
}
int ovl_copy_xattr(struct dentry *old, struct dentry *new, unsigned opt)
{
ssize_t list_size, size, value_size = 0;
char *buf, *name, *value = NULL;
int uninitialized_var(error);
if (!old->d_inode->i_op->getxattr ||
!new->d_inode->i_op->getxattr)
return 0;
list_size = vfs_listxattr(old, NULL, 0);
if (list_size <= 0) {
if (list_size == -EOPNOTSUPP)
return 0;
return list_size;
}
buf = kzalloc(list_size, GFP_KERNEL);
if (!buf)
return -ENOMEM;
list_size = vfs_listxattr(old, buf, list_size);
if (list_size <= 0) {
error = list_size;
goto out;
}
for (name = buf; name < (buf + list_size); name += strlen(name) + 1) {
retry:
size = vfs_getxattr(old, name, value, value_size);
if (size == -ERANGE)
size = vfs_getxattr(old, name, NULL, 0);
if (size < 0) {
if (OVL_OPT_NOFSCHECK(opt)) {
OVL_DEBUG("fail: old=%pd4, i_ino=%lu, name=%s\n",
old, old->d_inode->i_ino, name);
continue;
} else {
error = size;
break;
}
}
OVL_DEBUG("success: old=%pd4, i_ino=%lu, name=%s\n",
old, old->d_inode->i_ino, name);
if (size > value_size) {
void *new;
new = krealloc(value, size, GFP_KERNEL);
if (!new) {
error = -ENOMEM;
break;
}
value = new;
value_size = size;
goto retry;
}
error = vfs_setxattr(new, name, value, size, 0);
if (error)
break;
}
kfree(value);
out:
kfree(buf);
return error;
}
static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
{
struct file *old_file;
struct file *new_file;
loff_t old_pos = 0;
loff_t new_pos = 0;
int error = 0;
if (len == 0)
return 0;
old_file = ovl_path_open(old, O_LARGEFILE | O_RDONLY);
if (IS_ERR(old_file))
return PTR_ERR(old_file);
new_file = ovl_path_open(new, O_LARGEFILE | O_WRONLY);
if (IS_ERR(new_file)) {
error = PTR_ERR(new_file);
goto out_fput;
}
/* FIXME: copy up sparse files efficiently */
while (len) {
size_t this_len = OVL_COPY_UP_CHUNK_SIZE;
long bytes;
if (len < this_len)
this_len = len;
if (signal_pending_state(TASK_KILLABLE, current)) {
error = -EINTR;
break;
}
bytes = do_splice_direct(old_file, &old_pos,
new_file, &new_pos,
this_len, SPLICE_F_MOVE);
if (bytes <= 0) {
error = bytes;
break;
}
WARN_ON(old_pos != new_pos);
len -= bytes;
}
fput(new_file);
out_fput:
fput(old_file);
return error;
}
static char *ovl_read_symlink(struct dentry *realdentry)
{
int res;
char *buf;
struct inode *inode = realdentry->d_inode;
mm_segment_t old_fs;
res = -EINVAL;
if (!inode->i_op->readlink)
goto err;
res = -ENOMEM;
buf = (char *) __get_free_page(GFP_KERNEL);
if (!buf)
goto err;
old_fs = get_fs();
set_fs(get_ds());
/* The cast to a user pointer is valid due to the set_fs() */
res = inode->i_op->readlink(realdentry,
(char __user *)buf, PAGE_SIZE - 1);
set_fs(old_fs);
if (res < 0) {
free_page((unsigned long) buf);
goto err;
}
buf[res] = '\0';
return buf;
err:
return ERR_PTR(res);
}
static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
{
struct iattr attr = {
.ia_valid =
ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET,
.ia_atime = stat->atime,
.ia_mtime = stat->mtime,
};
return notify_change(upperdentry, &attr, NULL);
}
int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
{
int err = 0;
if (!S_ISLNK(stat->mode)) {
struct iattr attr = {
.ia_valid = ATTR_MODE,
.ia_mode = stat->mode,
};
err = notify_change(upperdentry, &attr, NULL);
}
if (!err) {
struct iattr attr = {
.ia_valid = ATTR_UID | ATTR_GID,
.ia_uid = stat->uid,
.ia_gid = stat->gid,
};
err = notify_change(upperdentry, &attr, NULL);
}
if (!err)
ovl_set_timestamps(upperdentry, stat);
return err;
}
static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
struct dentry *dentry, struct path *lowerpath,
struct kstat *stat, const char *link)
{
struct inode *wdir = workdir->d_inode;
struct inode *udir = upperdir->d_inode;
struct dentry *newdentry = NULL;
struct dentry *upper = NULL;
umode_t mode = stat->mode;
unsigned opt = ovl_get_config_opt(dentry);
int err;
newdentry = ovl_lookup_temp(workdir, dentry);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out;
upper = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto out1;
/* Can't properly set mode on creation because of the umask */
stat->mode &= S_IFMT;
err = ovl_create_real(wdir, newdentry, stat, link, NULL, true);
stat->mode = mode;
if (err)
goto out2;
if (S_ISREG(stat->mode)) {
struct path upperpath;
ovl_path_upper(dentry, &upperpath);
BUG_ON(upperpath.dentry != NULL);
upperpath.dentry = newdentry;
err = ovl_copy_up_data(lowerpath, &upperpath, stat->size);
if (err)
goto out_cleanup;
}
err = ovl_copy_xattr(lowerpath->dentry, newdentry, opt);
if (err)
goto out_cleanup;
inode_lock(newdentry->d_inode);
err = ovl_set_attr(newdentry, stat);
inode_unlock(newdentry->d_inode);
if (err)
goto out_cleanup;
err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
if (err)
goto out_cleanup;
ovl_dentry_update(dentry, newdentry);
newdentry = NULL;
/*
* Non-directores become opaque when copied up.
*/
if (!S_ISDIR(stat->mode))
ovl_dentry_set_opaque(dentry, true);
out2:
dput(upper);
out1:
dput(newdentry);
out:
return err;
out_cleanup:
ovl_cleanup(wdir, newdentry);
goto out2;
}
/*
* Copy up a single dentry
*
* Directory renames only allowed on "pure upper" (already created on
* upper filesystem, never copied up). Directories which are on lower or
* are merged may not be renamed. For these -EXDEV is returned and
* userspace has to deal with it. This means, when copying up a
* directory we can rely on it and ancestors being stable.
*
* Non-directory renames start with copy up of source if necessary. The
* actual rename will only proceed once the copy up was successful. Copy
* up uses upper parent i_mutex for exclusion. Since rename can change
* d_parent it is possible that the copy up will lock the old parent. At
* that point the file will have already been copied up anyway.
*/
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
struct path *lowerpath, struct kstat *stat)
{
struct dentry *workdir = ovl_workdir(dentry);
int err;
struct kstat pstat;
struct path parentpath;
struct dentry *upperdir;
struct dentry *upperdentry;
const struct cred *old_cred;
struct cred *override_cred;
char *link = NULL;
if (WARN_ON(!workdir))
return -EROFS;
ovl_do_check_copy_up(lowerpath->dentry);
ovl_path_upper(parent, &parentpath);
upperdir = parentpath.dentry;
err = vfs_getattr(&parentpath, &pstat);
if (err)
return err;
if (S_ISLNK(stat->mode)) {
link = ovl_read_symlink(lowerpath->dentry);
if (IS_ERR(link))
return PTR_ERR(link);
}
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_free_link;
override_cred->fsuid = stat->uid;
override_cred->fsgid = stat->gid;
/*
* CAP_SYS_ADMIN for copying up extended attributes
* CAP_DAC_OVERRIDE for create
* CAP_FOWNER for chmod, timestamp update
* CAP_FSETID for chmod
* CAP_CHOWN for chown
* CAP_MKNOD for mknod
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
cap_raise(override_cred->cap_effective, CAP_FSETID);
cap_raise(override_cred->cap_effective, CAP_CHOWN);
cap_raise(override_cred->cap_effective, CAP_MKNOD);
old_cred = override_creds(override_cred);
err = -EIO;
if (lock_rename(workdir, upperdir) != NULL) {
pr_err("overlayfs: failed to lock workdir+upperdir\n");
goto out_unlock;
}
upperdentry = ovl_dentry_upper(dentry);
if (upperdentry) {
/* Raced with another copy-up? Nothing to do, then... */
err = 0;
goto out_unlock;
}
err = ovl_copy_up_locked(workdir, upperdir, dentry, lowerpath,
stat, link);
if (!err) {
/* Restore timestamps on parent (best effort) */
ovl_set_timestamps(upperdir, &pstat);
}
out_unlock:
unlock_rename(workdir, upperdir);
revert_creds(old_cred);
put_cred(override_cred);
out_free_link:
if (link)
free_page((unsigned long) link);
return err;
}
int ovl_copy_up(struct dentry *dentry)
{
int err;
err = 0;
while (!err) {
struct dentry *next;
struct dentry *parent;
struct path lowerpath;
struct kstat stat;
enum ovl_path_type type = ovl_path_type(dentry);
if (OVL_TYPE_UPPER(type))
break;
next = dget(dentry);
/* find the topmost dentry not yet copied up */
for (;;) {
parent = dget_parent(next);
type = ovl_path_type(parent);
if (OVL_TYPE_UPPER(type))
break;
dput(next);
next = parent;
}
ovl_path_lower(next, &lowerpath);
err = vfs_getattr(&lowerpath, &stat);
if (!err)
err = ovl_copy_up_one(parent, next, &lowerpath, &stat);
dput(parent);
dput(next);
}
return err;
}

View File

@ -0,0 +1,969 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/xattr.h>
#include <linux/security.h>
#include <linux/cred.h>
#include "overlayfs.h"
void ovl_cleanup(struct inode *wdir, struct dentry *wdentry)
{
int err;
dget(wdentry);
if (d_is_dir(wdentry))
err = ovl_do_rmdir(wdir, wdentry);
else
err = ovl_do_unlink(wdir, wdentry);
dput(wdentry);
if (err) {
pr_err("overlayfs: cleanup of '%pd2' failed (%i)\n",
wdentry, err);
}
}
struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry)
{
struct dentry *temp;
char name[20];
snprintf(name, sizeof(name), "#%lx", (unsigned long) dentry);
temp = lookup_one_len(name, workdir, strlen(name));
if (!IS_ERR(temp) && temp->d_inode) {
pr_err("overlayfs: workdir/%s already exists\n", name);
dput(temp);
temp = ERR_PTR(-EIO);
}
return temp;
}
/* caller holds i_mutex on workdir */
static struct dentry *ovl_whiteout(struct dentry *workdir,
struct dentry *dentry)
{
int err;
struct dentry *whiteout;
struct inode *wdir = workdir->d_inode;
whiteout = ovl_lookup_temp(workdir, dentry);
if (IS_ERR(whiteout))
return whiteout;
err = ovl_do_whiteout(wdir, whiteout);
if (err) {
dput(whiteout);
whiteout = ERR_PTR(err);
}
return whiteout;
}
int ovl_create_real(struct inode *dir, struct dentry *newdentry,
struct kstat *stat, const char *link,
struct dentry *hardlink, bool debug)
{
int err;
if (newdentry->d_inode)
return -ESTALE;
if (hardlink) {
err = ovl_do_link(hardlink, dir, newdentry, debug);
} else {
switch (stat->mode & S_IFMT) {
case S_IFREG:
err = ovl_do_create(dir, newdentry, stat->mode, debug);
break;
case S_IFDIR:
err = ovl_do_mkdir(dir, newdentry, stat->mode, debug);
break;
case S_IFCHR:
case S_IFBLK:
case S_IFIFO:
case S_IFSOCK:
err = ovl_do_mknod(dir, newdentry,
stat->mode, stat->rdev, debug);
break;
case S_IFLNK:
err = ovl_do_symlink(dir, newdentry, link, debug);
break;
default:
err = -EPERM;
}
}
if (!err && WARN_ON(!newdentry->d_inode)) {
/*
* Not quite sure if non-instantiated dentry is legal or not.
* VFS doesn't seem to care so check and warn here.
*/
err = -ENOENT;
}
return err;
}
static int ovl_set_opaque(struct dentry *upperdentry)
{
return ovl_do_setxattr(upperdentry, OVL_XATTR_OPAQUE, "y", 1, 0);
}
static void ovl_remove_opaque(struct dentry *upperdentry)
{
int err;
err = ovl_do_removexattr(upperdentry, OVL_XATTR_OPAQUE);
if (err) {
pr_warn("overlayfs: failed to remove opaque from '%s' (%i)\n",
upperdentry->d_name.name, err);
}
}
static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
int err;
enum ovl_path_type type;
struct path realpath;
type = ovl_path_real(dentry, &realpath);
err = vfs_getattr(&realpath, stat);
if (err)
return err;
stat->dev = dentry->d_sb->s_dev;
stat->ino = dentry->d_inode->i_ino;
/*
* It's probably not worth it to count subdirs to get the
* correct link count. nlink=1 seems to pacify 'find' and
* other utilities.
*/
if (OVL_TYPE_MERGE(type))
stat->nlink = 1;
return 0;
}
static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
struct kstat *stat, const char *link,
struct dentry *hardlink)
{
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *udir = upperdir->d_inode;
struct dentry *newdentry;
int err;
inode_lock_nested(udir, I_MUTEX_PARENT);
newdentry = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out_unlock;
err = ovl_create_real(udir, newdentry, stat, link, hardlink, false);
if (err)
goto out_dput;
ovl_dentry_version_inc(dentry->d_parent);
ovl_dentry_update(dentry, newdentry);
ovl_copyattr(newdentry->d_inode, inode);
d_instantiate(dentry, inode);
newdentry = NULL;
out_dput:
dput(newdentry);
out_unlock:
inode_unlock(udir);
return err;
}
static int ovl_lock_rename_workdir(struct dentry *workdir,
struct dentry *upperdir)
{
/* Workdir should not be the same as upperdir */
if (workdir == upperdir)
goto err;
/* Workdir should not be subdir of upperdir and vice versa */
if (lock_rename(workdir, upperdir) != NULL)
goto err_unlock;
return 0;
err_unlock:
unlock_rename(workdir, upperdir);
err:
pr_err("overlayfs: failed to lock workdir+upperdir\n");
return -EIO;
}
static struct dentry *ovl_clear_empty(struct dentry *dentry,
struct list_head *list)
{
struct dentry *workdir = ovl_workdir(dentry);
struct inode *wdir = workdir->d_inode;
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *udir = upperdir->d_inode;
struct path upperpath;
struct dentry *upper;
struct dentry *opaquedir;
struct kstat stat;
unsigned opt = ovl_get_config_opt(dentry);
int err;
if (WARN_ON(!workdir))
return ERR_PTR(-EROFS);
err = ovl_lock_rename_workdir(workdir, upperdir);
if (err)
goto out;
ovl_path_upper(dentry, &upperpath);
err = vfs_getattr(&upperpath, &stat);
if (err)
goto out_unlock;
err = -ESTALE;
if (!S_ISDIR(stat.mode))
goto out_unlock;
upper = upperpath.dentry;
if (upper->d_parent->d_inode != udir)
goto out_unlock;
opaquedir = ovl_lookup_temp(workdir, dentry);
err = PTR_ERR(opaquedir);
if (IS_ERR(opaquedir))
goto out_unlock;
err = ovl_create_real(wdir, opaquedir, &stat, NULL, NULL, true);
if (err)
goto out_dput;
err = ovl_copy_xattr(upper, opaquedir, opt);
if (err)
goto out_cleanup;
err = ovl_set_opaque(opaquedir);
if (err)
goto out_cleanup;
inode_lock(opaquedir->d_inode);
err = ovl_set_attr(opaquedir, &stat);
inode_unlock(opaquedir->d_inode);
if (err)
goto out_cleanup;
err = ovl_do_rename(wdir, opaquedir, udir, upper, RENAME_EXCHANGE);
if (err)
goto out_cleanup;
ovl_cleanup_whiteouts(upper, list);
ovl_cleanup(wdir, upper);
unlock_rename(workdir, upperdir);
/* dentry's upper doesn't match now, get rid of it */
d_drop(dentry);
return opaquedir;
out_cleanup:
ovl_cleanup(wdir, opaquedir);
out_dput:
dput(opaquedir);
out_unlock:
unlock_rename(workdir, upperdir);
out:
return ERR_PTR(err);
}
static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry)
{
int err;
struct dentry *ret = NULL;
LIST_HEAD(list);
err = ovl_check_empty_dir(dentry, &list);
if (err)
ret = ERR_PTR(err);
else {
/*
* If no upperdentry then skip clearing whiteouts.
*
* Can race with copy-up, since we don't hold the upperdir
* mutex. Doesn't matter, since copy-up can't create a
* non-empty directory from an empty one.
*/
if (ovl_dentry_upper(dentry))
ret = ovl_clear_empty(dentry, &list);
}
ovl_cache_free(&list);
return ret;
}
static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
struct kstat *stat, const char *link,
struct dentry *hardlink)
{
struct dentry *workdir = ovl_workdir(dentry);
struct inode *wdir = workdir->d_inode;
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *udir = upperdir->d_inode;
struct dentry *upper;
struct dentry *newdentry;
int err;
if (WARN_ON(!workdir))
return -EROFS;
err = ovl_lock_rename_workdir(workdir, upperdir);
if (err)
goto out;
newdentry = ovl_lookup_temp(workdir, dentry);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out_unlock;
upper = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto out_dput;
err = ovl_create_real(wdir, newdentry, stat, link, hardlink, true);
if (err)
goto out_dput2;
if (S_ISDIR(stat->mode)) {
err = ovl_set_opaque(newdentry);
if (err)
goto out_cleanup;
err = ovl_do_rename(wdir, newdentry, udir, upper,
RENAME_EXCHANGE);
if (err)
goto out_cleanup;
ovl_cleanup(wdir, upper);
} else {
err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
if (err)
goto out_cleanup;
}
ovl_dentry_version_inc(dentry->d_parent);
ovl_dentry_update(dentry, newdentry);
ovl_copyattr(newdentry->d_inode, inode);
d_instantiate(dentry, inode);
newdentry = NULL;
out_dput2:
dput(upper);
out_dput:
dput(newdentry);
out_unlock:
unlock_rename(workdir, upperdir);
out:
return err;
out_cleanup:
ovl_cleanup(wdir, newdentry);
goto out_dput2;
}
static int ovl_create_or_link(struct dentry *dentry, int mode, dev_t rdev,
const char *link, struct dentry *hardlink)
{
int err;
struct inode *inode;
struct kstat stat = {
.mode = mode,
.rdev = rdev,
};
err = -ENOMEM;
inode = ovl_new_inode(dentry->d_sb, mode, dentry->d_fsdata);
if (!inode)
goto out;
err = ovl_copy_up(dentry->d_parent);
if (err)
goto out_iput;
if (!ovl_dentry_is_opaque(dentry)) {
err = ovl_create_upper(dentry, inode, &stat, link, hardlink);
} else {
const struct cred *old_cred;
struct cred *override_cred;
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_iput;
/*
* CAP_SYS_ADMIN for setting opaque xattr
* CAP_DAC_OVERRIDE for create in workdir, rename
* CAP_FOWNER for removing whiteout from sticky dir
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
old_cred = override_creds(override_cred);
err = ovl_create_over_whiteout(dentry, inode, &stat, link,
hardlink);
revert_creds(old_cred);
put_cred(override_cred);
}
if (!err)
inode = NULL;
out_iput:
iput(inode);
out:
return err;
}
static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
const char *link)
{
int err;
err = ovl_want_write(dentry);
if (!err) {
err = ovl_create_or_link(dentry, mode, rdev, link, NULL);
ovl_drop_write(dentry);
}
return err;
}
static int ovl_create(struct inode *dir, struct dentry *dentry, umode_t mode,
bool excl)
{
return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL);
}
static int ovl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL);
}
static int ovl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
dev_t rdev)
{
/* Don't allow creation of "whiteout" on overlay */
if (S_ISCHR(mode) && rdev == WHITEOUT_DEV)
return -EPERM;
return ovl_create_object(dentry, mode, rdev, NULL);
}
static int ovl_symlink(struct inode *dir, struct dentry *dentry,
const char *link)
{
return ovl_create_object(dentry, S_IFLNK, 0, link);
}
static int ovl_link(struct dentry *old, struct inode *newdir,
struct dentry *new)
{
int err;
struct dentry *upper;
err = ovl_want_write(old);
if (err)
goto out;
err = ovl_copy_up(old);
if (err)
goto out_drop_write;
upper = ovl_dentry_upper(old);
err = ovl_create_or_link(new, upper->d_inode->i_mode, 0, NULL, upper);
out_drop_write:
ovl_drop_write(old);
out:
return err;
}
static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir)
{
struct dentry *workdir = ovl_workdir(dentry);
struct inode *wdir = workdir->d_inode;
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *udir = upperdir->d_inode;
struct dentry *whiteout;
struct dentry *upper;
struct dentry *opaquedir = NULL;
int err;
int flags = 0;
if (WARN_ON(!workdir))
return -EROFS;
if (is_dir) {
if (OVL_TYPE_MERGE_OR_LOWER(ovl_path_type(dentry))) {
opaquedir = ovl_check_empty_and_clear(dentry);
err = PTR_ERR(opaquedir);
if (IS_ERR(opaquedir))
goto out;
} else {
LIST_HEAD(list);
/*
* When removing an empty opaque directory, then it
* makes no sense to replace it with an exact replica of
* itself. But emptiness still needs to be checked.
*/
err = ovl_check_empty_dir(dentry, &list);
ovl_cache_free(&list);
if (err)
goto out;
}
}
err = ovl_lock_rename_workdir(workdir, upperdir);
if (err)
goto out_dput;
upper = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto out_unlock;
err = -ESTALE;
if ((opaquedir && upper != opaquedir) ||
(!opaquedir && ovl_dentry_upper(dentry) &&
upper != ovl_dentry_upper(dentry))) {
goto out_dput_upper;
}
whiteout = ovl_whiteout(workdir, dentry);
err = PTR_ERR(whiteout);
if (IS_ERR(whiteout))
goto out_dput_upper;
if (d_is_dir(upper))
flags = RENAME_EXCHANGE;
err = ovl_do_rename(wdir, whiteout, udir, upper, flags);
if (err)
goto kill_whiteout;
if (flags)
ovl_cleanup(wdir, upper);
ovl_dentry_version_inc(dentry->d_parent);
out_d_drop:
d_drop(dentry);
dput(whiteout);
out_dput_upper:
dput(upper);
out_unlock:
unlock_rename(workdir, upperdir);
out_dput:
dput(opaquedir);
out:
return err;
kill_whiteout:
ovl_cleanup(wdir, whiteout);
goto out_d_drop;
}
static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
{
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *dir = upperdir->d_inode;
struct dentry *upper;
int err;
inode_lock_nested(dir, I_MUTEX_PARENT);
upper = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto out_unlock;
err = -ESTALE;
if (upper == ovl_dentry_upper(dentry)) {
if (is_dir)
err = vfs_rmdir(dir, upper);
else
err = vfs_unlink(dir, upper, NULL);
ovl_dentry_version_inc(dentry->d_parent);
}
dput(upper);
/*
* Keeping this dentry hashed would mean having to release
* upperpath/lowerpath, which could only be done if we are the
* sole user of this dentry. Too tricky... Just unhash for
* now.
*/
if (!err)
d_drop(dentry);
out_unlock:
inode_unlock(dir);
return err;
}
static inline int ovl_check_sticky(struct dentry *dentry)
{
struct inode *dir = ovl_dentry_real(dentry->d_parent)->d_inode;
struct inode *inode = ovl_dentry_real(dentry)->d_inode;
if (check_sticky(dir, inode))
return -EPERM;
return 0;
}
static int ovl_do_remove(struct dentry *dentry, bool is_dir)
{
enum ovl_path_type type;
int err;
err = ovl_check_sticky(dentry);
if (err)
goto out;
err = ovl_want_write(dentry);
if (err)
goto out;
err = ovl_copy_up(dentry->d_parent);
if (err)
goto out_drop_write;
type = ovl_path_type(dentry);
if (OVL_TYPE_PURE_UPPER(type)) {
err = ovl_remove_upper(dentry, is_dir);
} else {
const struct cred *old_cred;
struct cred *override_cred;
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_drop_write;
/*
* CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
* CAP_DAC_OVERRIDE for create in workdir, rename
* CAP_FOWNER for removing whiteout from sticky dir
* CAP_FSETID for chmod of opaque dir
* CAP_CHOWN for chown of opaque dir
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
cap_raise(override_cred->cap_effective, CAP_FSETID);
cap_raise(override_cred->cap_effective, CAP_CHOWN);
old_cred = override_creds(override_cred);
err = ovl_remove_and_whiteout(dentry, is_dir);
revert_creds(old_cred);
put_cred(override_cred);
}
out_drop_write:
ovl_drop_write(dentry);
out:
return err;
}
static int ovl_unlink(struct inode *dir, struct dentry *dentry)
{
return ovl_do_remove(dentry, false);
}
static int ovl_rmdir(struct inode *dir, struct dentry *dentry)
{
return ovl_do_remove(dentry, true);
}
static int ovl_rename2(struct inode *olddir, struct dentry *old,
struct inode *newdir, struct dentry *new,
unsigned int flags)
{
int err;
enum ovl_path_type old_type;
enum ovl_path_type new_type;
struct dentry *old_upperdir;
struct dentry *new_upperdir;
struct dentry *olddentry;
struct dentry *newdentry;
struct dentry *trap;
bool old_opaque;
bool new_opaque;
bool cleanup_whiteout = false;
bool overwrite = !(flags & RENAME_EXCHANGE);
bool is_dir = d_is_dir(old);
bool new_is_dir = false;
struct dentry *opaquedir = NULL;
const struct cred *old_cred = NULL;
struct cred *override_cred = NULL;
err = -EINVAL;
if (flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE))
goto out;
flags &= ~RENAME_NOREPLACE;
err = ovl_check_sticky(old);
if (err)
goto out;
/* Don't copy up directory trees */
old_type = ovl_path_type(old);
err = -EXDEV;
if (OVL_TYPE_MERGE_OR_LOWER(old_type) && is_dir)
goto out;
if (new->d_inode) {
err = ovl_check_sticky(new);
if (err)
goto out;
if (d_is_dir(new))
new_is_dir = true;
new_type = ovl_path_type(new);
err = -EXDEV;
if (!overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir)
goto out;
err = 0;
if (!OVL_TYPE_UPPER(new_type) && !OVL_TYPE_UPPER(old_type)) {
if (ovl_dentry_lower(old)->d_inode ==
ovl_dentry_lower(new)->d_inode)
goto out;
}
if (OVL_TYPE_UPPER(new_type) && OVL_TYPE_UPPER(old_type)) {
if (ovl_dentry_upper(old)->d_inode ==
ovl_dentry_upper(new)->d_inode)
goto out;
}
} else {
if (ovl_dentry_is_opaque(new))
new_type = __OVL_PATH_UPPER;
else
new_type = __OVL_PATH_UPPER | __OVL_PATH_PURE;
}
err = ovl_want_write(old);
if (err)
goto out;
err = ovl_copy_up(old);
if (err)
goto out_drop_write;
err = ovl_copy_up(new->d_parent);
if (err)
goto out_drop_write;
if (!overwrite) {
err = ovl_copy_up(new);
if (err)
goto out_drop_write;
}
old_opaque = !OVL_TYPE_PURE_UPPER(old_type);
new_opaque = !OVL_TYPE_PURE_UPPER(new_type);
if (old_opaque || new_opaque) {
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_drop_write;
/*
* CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
* CAP_DAC_OVERRIDE for create in workdir
* CAP_FOWNER for removing whiteout from sticky dir
* CAP_FSETID for chmod of opaque dir
* CAP_CHOWN for chown of opaque dir
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
cap_raise(override_cred->cap_effective, CAP_FSETID);
cap_raise(override_cred->cap_effective, CAP_CHOWN);
old_cred = override_creds(override_cred);
}
if (overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir) {
opaquedir = ovl_check_empty_and_clear(new);
err = PTR_ERR(opaquedir);
if (IS_ERR(opaquedir)) {
opaquedir = NULL;
goto out_revert_creds;
}
}
if (overwrite) {
if (old_opaque) {
if (new->d_inode || !new_opaque) {
/* Whiteout source */
flags |= RENAME_WHITEOUT;
} else {
/* Switch whiteouts */
flags |= RENAME_EXCHANGE;
}
} else if (is_dir && !new->d_inode && new_opaque) {
flags |= RENAME_EXCHANGE;
cleanup_whiteout = true;
}
}
old_upperdir = ovl_dentry_upper(old->d_parent);
new_upperdir = ovl_dentry_upper(new->d_parent);
trap = lock_rename(new_upperdir, old_upperdir);
olddentry = lookup_one_len(old->d_name.name, old_upperdir,
old->d_name.len);
err = PTR_ERR(olddentry);
if (IS_ERR(olddentry))
goto out_unlock;
err = -ESTALE;
if (olddentry != ovl_dentry_upper(old))
goto out_dput_old;
newdentry = lookup_one_len(new->d_name.name, new_upperdir,
new->d_name.len);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out_dput_old;
err = -ESTALE;
if (ovl_dentry_upper(new)) {
if (opaquedir) {
if (newdentry != opaquedir)
goto out_dput;
} else {
if (newdentry != ovl_dentry_upper(new))
goto out_dput;
}
} else {
if (!d_is_negative(newdentry) &&
(!new_opaque || !ovl_is_whiteout(newdentry)))
goto out_dput;
}
if (olddentry == trap)
goto out_dput;
if (newdentry == trap)
goto out_dput;
if (is_dir && !old_opaque && new_opaque) {
err = ovl_set_opaque(olddentry);
if (err)
goto out_dput;
}
if (!overwrite && new_is_dir && old_opaque && !new_opaque) {
err = ovl_set_opaque(newdentry);
if (err)
goto out_dput;
}
if (old_opaque || new_opaque) {
err = ovl_do_rename(old_upperdir->d_inode, olddentry,
new_upperdir->d_inode, newdentry,
flags);
} else {
/* No debug for the plain case */
BUG_ON(flags & ~RENAME_EXCHANGE);
err = vfs_rename(old_upperdir->d_inode, olddentry,
new_upperdir->d_inode, newdentry,
NULL, flags);
}
if (err) {
if (is_dir && !old_opaque && new_opaque)
ovl_remove_opaque(olddentry);
if (!overwrite && new_is_dir && old_opaque && !new_opaque)
ovl_remove_opaque(newdentry);
goto out_dput;
}
if (is_dir && old_opaque && !new_opaque)
ovl_remove_opaque(olddentry);
if (!overwrite && new_is_dir && !old_opaque && new_opaque)
ovl_remove_opaque(newdentry);
/*
* Old dentry now lives in different location. Dentries in
* lowerstack are stale. We cannot drop them here because
* access to them is lockless. This could be only pure upper
* or opaque directory - numlower is zero. Or upper non-dir
* entry - its pureness is tracked by flag opaque.
*/
if (old_opaque != new_opaque) {
ovl_dentry_set_opaque(old, new_opaque);
if (!overwrite)
ovl_dentry_set_opaque(new, old_opaque);
}
if (cleanup_whiteout)
ovl_cleanup(old_upperdir->d_inode, newdentry);
ovl_dentry_version_inc(old->d_parent);
ovl_dentry_version_inc(new->d_parent);
out_dput:
dput(newdentry);
out_dput_old:
dput(olddentry);
out_unlock:
unlock_rename(new_upperdir, old_upperdir);
out_revert_creds:
if (old_opaque || new_opaque) {
revert_creds(old_cred);
put_cred(override_cred);
}
out_drop_write:
ovl_drop_write(old);
out:
dput(opaquedir);
return err;
}
const struct inode_operations ovl_dir_inode_operations = {
.lookup = ovl_lookup,
.mkdir = ovl_mkdir,
.symlink = ovl_symlink,
.unlink = ovl_unlink,
.rmdir = ovl_rmdir,
.rename2 = ovl_rename2,
.link = ovl_link,
.setattr = ovl_setattr,
.create = ovl_create,
.mknod = ovl_mknod,
.permission = ovl_permission,
.getattr = ovl_dir_getattr,
.setxattr = ovl_setxattr,
.getxattr = ovl_getxattr,
.listxattr = ovl_listxattr,
.removexattr = ovl_removexattr,
};

View File

@ -0,0 +1,494 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/xattr.h>
#include "overlayfs.h"
static int ovl_copy_up_truncate(struct dentry *dentry)
{
int err;
struct dentry *parent;
struct kstat stat;
struct path lowerpath;
parent = dget_parent(dentry);
err = ovl_copy_up(parent);
if (err)
goto out_dput_parent;
ovl_path_lower(dentry, &lowerpath);
err = vfs_getattr(&lowerpath, &stat);
if (err)
goto out_dput_parent;
stat.size = 0;
err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat);
out_dput_parent:
dput(parent);
return err;
}
int ovl_setattr(struct dentry *dentry, struct iattr *attr)
{
int err;
struct dentry *upperdentry;
unsigned opt = ovl_get_config_opt(dentry);
if (OVL_OPT_NOCOPYUPW(opt)) {
return 0;
}
/*
* Check for permissions before trying to copy-up. This is redundant
* since it will be rechecked later by ->setattr() on upper dentry. But
* without this, copy-up can be triggered by just about anybody.
*
* We don't initialize inode->size, which just means that
* inode_newsize_ok() will always check against MAX_LFS_FILESIZE and not
* check for a swapfile (which this won't be anyway).
*/
err = inode_change_ok(dentry->d_inode, attr);
if (err)
return err;
err = ovl_want_write(dentry);
if (err)
goto out;
if (attr->ia_valid & ATTR_SIZE) {
struct inode *realinode = d_inode(ovl_dentry_real(dentry));
err = -ETXTBSY;
if (atomic_read(&realinode->i_writecount) < 0)
goto out_drop_write;
}
err = ovl_copy_up(dentry);
if (!err) {
struct inode *winode = NULL;
upperdentry = ovl_dentry_upper(dentry);
if (attr->ia_valid & ATTR_SIZE) {
winode = d_inode(upperdentry);
err = get_write_access(winode);
if (err)
goto out_drop_write;
}
if (attr->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID))
attr->ia_valid &= ~ATTR_MODE;
inode_lock(upperdentry->d_inode);
err = notify_change(upperdentry, attr, NULL);
if (!err)
ovl_copyattr(upperdentry->d_inode, dentry->d_inode);
inode_unlock(upperdentry->d_inode);
if (winode)
put_write_access(winode);
}
out_drop_write:
ovl_drop_write(dentry);
out:
return err;
}
static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
struct path realpath;
ovl_path_real(dentry, &realpath);
return vfs_getattr(&realpath, stat);
}
int ovl_permission(struct inode *inode, int mask)
{
struct ovl_entry *oe;
struct dentry *alias = NULL;
struct inode *realinode;
struct dentry *realdentry;
bool is_upper;
int err;
if (S_ISDIR(inode->i_mode)) {
oe = inode->i_private;
} else if (mask & MAY_NOT_BLOCK) {
return -ECHILD;
} else {
/*
* For non-directories find an alias and get the info
* from there.
*/
alias = d_find_any_alias(inode);
if (WARN_ON(!alias))
return -ENOENT;
oe = alias->d_fsdata;
ovl_reset_ovl_entry(&oe, alias);
}
realdentry = ovl_entry_real(oe, &is_upper);
if (ovl_is_default_permissions(inode)) {
struct kstat stat;
struct path realpath = { .dentry = realdentry };
if (mask & MAY_NOT_BLOCK)
return -ECHILD;
realpath.mnt = ovl_entry_mnt_real(oe, inode, is_upper);
err = vfs_getattr(&realpath, &stat);
if (err)
goto out_dput;
err = -ESTALE;
if ((stat.mode ^ inode->i_mode) & S_IFMT)
goto out_dput;
inode->i_mode = stat.mode;
inode->i_uid = stat.uid;
inode->i_gid = stat.gid;
err = generic_permission(inode, mask);
goto out_dput;
}
/* Careful in RCU walk mode */
realinode = ACCESS_ONCE(realdentry->d_inode);
if (!realinode) {
WARN_ON(!(mask & MAY_NOT_BLOCK));
err = -ENOENT;
goto out_dput;
}
if (mask & MAY_WRITE) {
umode_t mode = realinode->i_mode;
/*
* Writes will always be redirected to upper layer, so
* ignore lower layer being read-only.
*
* If the overlay itself is read-only then proceed
* with the permission check, don't return EROFS.
* This will only happen if this is the lower layer of
* another overlayfs.
*
* If upper fs becomes read-only after the overlay was
* constructed return EROFS to prevent modification of
* upper layer.
*/
err = -EROFS;
if (is_upper && !IS_RDONLY(inode) && IS_RDONLY(realinode) &&
(S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
goto out_dput;
}
err = __inode_permission(realinode, mask);
out_dput:
dput(alias);
return err;
}
static const char *ovl_get_link(struct dentry *dentry,
struct inode *inode,
struct delayed_call *done)
{
struct dentry *realdentry;
struct inode *realinode;
if (!dentry)
return ERR_PTR(-ECHILD);
realdentry = ovl_dentry_real(dentry);
realinode = realdentry->d_inode;
if (WARN_ON(!realinode->i_op->get_link))
return ERR_PTR(-EPERM);
return realinode->i_op->get_link(realdentry, realinode, done);
}
static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
{
struct path realpath;
struct inode *realinode;
ovl_path_real(dentry, &realpath);
realinode = realpath.dentry->d_inode;
if (!realinode->i_op->readlink)
return -EINVAL;
touch_atime(&realpath);
return realinode->i_op->readlink(realpath.dentry, buf, bufsiz);
}
static bool ovl_is_private_xattr(const char *name)
{
return strncmp(name, OVL_XATTR_PRE_NAME, OVL_XATTR_PRE_LEN) == 0;
}
int ovl_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags)
{
int err;
struct dentry *upperdentry;
unsigned opt = ovl_get_config_opt(dentry);
if (OVL_OPT_NOCOPYUPW(opt)) {
return 0;
}
err = ovl_want_write(dentry);
if (err)
goto out;
err = -EPERM;
if (ovl_is_private_xattr(name))
goto out_drop_write;
err = ovl_copy_up(dentry);
if (err)
goto out_drop_write;
upperdentry = ovl_dentry_upper(dentry);
err = vfs_setxattr(upperdentry, name, value, size, flags);
out_drop_write:
ovl_drop_write(dentry);
out:
return err;
}
static bool ovl_need_xattr_filter(struct dentry *dentry,
enum ovl_path_type type)
{
if ((type & (__OVL_PATH_PURE | __OVL_PATH_UPPER)) == __OVL_PATH_UPPER)
return S_ISDIR(dentry->d_inode->i_mode);
else
return false;
}
ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
void *value, size_t size)
{
struct path realpath;
enum ovl_path_type type = ovl_path_real(dentry, &realpath);
if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
return -ENODATA;
return vfs_getxattr(realpath.dentry, name, value, size);
}
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
{
struct path realpath;
enum ovl_path_type type = ovl_path_real(dentry, &realpath);
ssize_t res;
int off;
res = vfs_listxattr(realpath.dentry, list, size);
if (res <= 0 || size == 0)
return res;
if (!ovl_need_xattr_filter(dentry, type))
return res;
/* filter out private xattrs */
for (off = 0; off < res;) {
char *s = list + off;
size_t slen = strlen(s) + 1;
BUG_ON(off + slen > res);
if (ovl_is_private_xattr(s)) {
res -= slen;
memmove(s, s + slen, res - off);
} else {
off += slen;
}
}
return res;
}
int ovl_removexattr(struct dentry *dentry, const char *name)
{
int err;
struct path realpath;
enum ovl_path_type type = ovl_path_real(dentry, &realpath);
unsigned opt = ovl_get_config_opt(dentry);
if (OVL_OPT_NOCOPYUPW(opt)) {
return 0;
}
err = ovl_want_write(dentry);
if (err)
goto out;
err = -ENODATA;
if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
goto out_drop_write;
if (!OVL_TYPE_UPPER(type)) {
err = vfs_getxattr(realpath.dentry, name, NULL, 0);
if (err < 0)
goto out_drop_write;
err = ovl_copy_up(dentry);
if (err)
goto out_drop_write;
ovl_path_upper(dentry, &realpath);
}
err = vfs_removexattr(realpath.dentry, name);
out_drop_write:
ovl_drop_write(dentry);
out:
return err;
}
static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type,
struct dentry *realdentry)
{
if (OVL_TYPE_UPPER(type))
return false;
if (special_file(realdentry->d_inode->i_mode))
return false;
if (!(OPEN_FMODE(flags) & FMODE_WRITE) && !(flags & O_TRUNC))
return false;
return true;
}
struct inode *ovl_d_select_inode(struct dentry *dentry, unsigned file_flags)
{
int err;
struct path realpath;
enum ovl_path_type type;
unsigned opt = ovl_get_config_opt(dentry);
if (d_is_dir(dentry))
return d_backing_inode(dentry);
type = ovl_path_real(dentry, &realpath);
if (!OVL_OPT_NOCOPYUPW(opt) &&
ovl_open_need_copy_up(file_flags, type, realpath.dentry)) {
OVL_DEBUG("copyup: realpath.dentry=%pd4, i_ino=%lu\n",
realpath.dentry, realpath.dentry->d_inode->i_ino);
err = ovl_want_write(dentry);
if (err)
return ERR_PTR(err);
if (file_flags & O_TRUNC)
err = ovl_copy_up_truncate(dentry);
else
err = ovl_copy_up(dentry);
ovl_drop_write(dentry);
if (err)
return ERR_PTR(err);
ovl_path_upper(dentry, &realpath);
}
if (realpath.dentry->d_flags & DCACHE_OP_SELECT_INODE)
return realpath.dentry->d_op->d_select_inode(realpath.dentry, file_flags);
if (OVL_OPT_NOFSCHECK(opt)) {
if (realpath.dentry->d_inode->i_sb->s_magic == SYSFS_MAGIC) {
OVL_DEBUG("sysfs: dentry=%pd4, i_ino=%lu\n",
dentry, dentry->d_inode->i_ino);
OVL_DEBUG("sysfs: realpath.dentry=%pd4, i_ino=%lu\n",
realpath.dentry, realpath.dentry->d_inode->i_ino);
if (!dentry->d_inode->i_private) {
dentry->d_inode->i_private = dentry->d_fsdata;
dentry->d_fsdata = realpath.dentry->d_fsdata;
}
}
}
return d_backing_inode(realpath.dentry);
}
static const struct inode_operations ovl_file_inode_operations = {
.setattr = ovl_setattr,
.permission = ovl_permission,
.getattr = ovl_getattr,
.setxattr = ovl_setxattr,
.getxattr = ovl_getxattr,
.listxattr = ovl_listxattr,
.removexattr = ovl_removexattr,
};
static const struct inode_operations ovl_symlink_inode_operations = {
.setattr = ovl_setattr,
.get_link = ovl_get_link,
.readlink = ovl_readlink,
.getattr = ovl_getattr,
.setxattr = ovl_setxattr,
.getxattr = ovl_getxattr,
.listxattr = ovl_listxattr,
.removexattr = ovl_removexattr,
};
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
struct ovl_entry *oe)
{
struct inode *inode;
inode = new_inode(sb);
if (!inode)
return NULL;
inode->i_ino = get_next_ino();
inode->i_mode = mode;
inode->i_flags |= S_NOATIME | S_NOCMTIME;
mode &= S_IFMT;
switch (mode) {
case S_IFDIR:
inode->i_private = oe;
inode->i_op = &ovl_dir_inode_operations;
inode->i_fop = &ovl_dir_operations;
break;
case S_IFLNK:
inode->i_op = &ovl_symlink_inode_operations;
break;
case S_IFREG:
case S_IFSOCK:
case S_IFBLK:
case S_IFCHR:
case S_IFIFO:
inode->i_op = &ovl_file_inode_operations;
break;
default:
WARN(1, "illegal file type: %i\n", mode);
iput(inode);
inode = NULL;
}
return inode;
}

View File

@ -0,0 +1,222 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/kernel.h>
//#define DEBUG
#ifdef DEBUG
#define OVL_DEBUG(format, ...) pr_err("[DEBUG] %s(): " format, __FUNCTION__, ##__VA_ARGS__)
#else
#define OVL_DEBUG(format, ...) {}
#endif
struct ovl_entry;
enum ovl_path_type {
__OVL_PATH_PURE = (1 << 0),
__OVL_PATH_UPPER = (1 << 1),
__OVL_PATH_MERGE = (1 << 2),
};
#define OVL_TYPE_UPPER(type) ((type) & __OVL_PATH_UPPER)
#define OVL_TYPE_MERGE(type) ((type) & __OVL_PATH_MERGE)
#define OVL_TYPE_PURE_UPPER(type) ((type) & __OVL_PATH_PURE)
#define OVL_TYPE_MERGE_OR_LOWER(type) \
(OVL_TYPE_MERGE(type) || !OVL_TYPE_UPPER(type))
#define OVL_XATTR_PRE_NAME "trusted.overlay."
#define OVL_XATTR_PRE_LEN 16
#define OVL_XATTR_OPAQUE OVL_XATTR_PRE_NAME"opaque"
enum ovl_opt_bit {
__OVL_OPT_DEFAULT = 0,
__OVL_OPT_NOCOPYUPW = (1 << 0),
__OVL_OPT_NOFSCHECK = (1 << 1),
};
#define OVL_OPT_NOCOPYUPW(opt) ((opt) & __OVL_OPT_NOCOPYUPW)
#define OVL_OPT_NOFSCHECK(opt) ((opt) & __OVL_OPT_NOFSCHECK)
static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry)
{
int err = vfs_rmdir(dir, dentry);
pr_debug("rmdir(%pd2) = %i\n", dentry, err);
return err;
}
static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry)
{
int err = vfs_unlink(dir, dentry, NULL);
pr_debug("unlink(%pd2) = %i\n", dentry, err);
return err;
}
static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir,
struct dentry *new_dentry, bool debug)
{
int err = vfs_link(old_dentry, dir, new_dentry, NULL);
if (debug) {
pr_debug("link(%pd2, %pd2) = %i\n",
old_dentry, new_dentry, err);
}
return err;
}
static inline int ovl_do_create(struct inode *dir, struct dentry *dentry,
umode_t mode, bool debug)
{
int err = vfs_create(dir, dentry, mode, true);
if (debug)
pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err);
return err;
}
static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry,
umode_t mode, bool debug)
{
int err = vfs_mkdir(dir, dentry, mode);
if (debug)
pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err);
return err;
}
static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry,
umode_t mode, dev_t dev, bool debug)
{
int err = vfs_mknod(dir, dentry, mode, dev);
if (debug) {
pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n",
dentry, mode, dev, err);
}
return err;
}
static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry,
const char *oldname, bool debug)
{
int err = vfs_symlink(dir, dentry, oldname);
if (debug)
pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err);
return err;
}
static inline int ovl_do_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags)
{
int err = vfs_setxattr(dentry, name, value, size, flags);
pr_debug("setxattr(%pd2, \"%s\", \"%*s\", 0x%x) = %i\n",
dentry, name, (int) size, (char *) value, flags, err);
return err;
}
static inline int ovl_do_removexattr(struct dentry *dentry, const char *name)
{
int err = vfs_removexattr(dentry, name);
pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, err);
return err;
}
static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry,
struct inode *newdir, struct dentry *newdentry,
unsigned int flags)
{
int err;
pr_debug("rename2(%pd2, %pd2, 0x%x)\n",
olddentry, newdentry, flags);
err = vfs_rename(olddir, olddentry, newdir, newdentry, NULL, flags);
if (err) {
pr_debug("...rename2(%pd2, %pd2, ...) = %i\n",
olddentry, newdentry, err);
}
return err;
}
static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry)
{
int err = vfs_whiteout(dir, dentry);
pr_debug("whiteout(%pd2) = %i\n", dentry, err);
return err;
}
unsigned ovl_get_config_opt(struct dentry *dentry);
void ovl_reset_ovl_entry(struct ovl_entry **oe, struct dentry *dentry);
enum ovl_path_type ovl_path_type(struct dentry *dentry);
u64 ovl_dentry_version_get(struct dentry *dentry);
void ovl_dentry_version_inc(struct dentry *dentry);
void ovl_path_upper(struct dentry *dentry, struct path *path);
void ovl_path_lower(struct dentry *dentry, struct path *path);
enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path);
int ovl_path_next(int idx, struct dentry *dentry, struct path *path);
struct dentry *ovl_dentry_upper(struct dentry *dentry);
struct dentry *ovl_dentry_lower(struct dentry *dentry);
struct dentry *ovl_dentry_real(struct dentry *dentry);
struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper);
struct vfsmount *ovl_entry_mnt_real(struct ovl_entry *oe, struct inode *inode,
bool is_upper);
struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry);
bool ovl_is_default_permissions(struct inode *inode);
void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache);
struct dentry *ovl_workdir(struct dentry *dentry);
int ovl_want_write(struct dentry *dentry);
void ovl_drop_write(struct dentry *dentry);
bool ovl_dentry_is_opaque(struct dentry *dentry);
void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque);
bool ovl_is_whiteout(struct dentry *dentry);
void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry);
struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags);
struct file *ovl_path_open(struct path *path, int flags);
struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry,
struct kstat *stat, const char *link);
/* readdir.c */
extern const struct file_operations ovl_dir_operations;
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list);
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list);
void ovl_cache_free(struct list_head *list);
int ovl_check_d_type_supported(struct path *realpath);
/* inode.c */
int ovl_setattr(struct dentry *dentry, struct iattr *attr);
int ovl_permission(struct inode *inode, int mask);
int ovl_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags);
ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
void *value, size_t size);
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
int ovl_removexattr(struct dentry *dentry, const char *name);
struct inode *ovl_d_select_inode(struct dentry *dentry, unsigned file_flags);
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
struct ovl_entry *oe);
static inline void ovl_copyattr(struct inode *from, struct inode *to)
{
to->i_uid = from->i_uid;
to->i_gid = from->i_gid;
to->i_mode = from->i_mode;
}
/* dir.c */
extern const struct inode_operations ovl_dir_inode_operations;
struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry);
int ovl_create_real(struct inode *dir, struct dentry *newdentry,
struct kstat *stat, const char *link,
struct dentry *hardlink, bool debug);
void ovl_cleanup(struct inode *dir, struct dentry *dentry);
/* copy_up.c */
int ovl_copy_up(struct dentry *dentry);
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
struct path *lowerpath, struct kstat *stat);
int ovl_copy_xattr(struct dentry *old, struct dentry *new, unsigned opt);
int ovl_set_attr(struct dentry *upper, struct kstat *stat);

View File

@ -0,0 +1,616 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/file.h>
#include <linux/xattr.h>
#include <linux/rbtree.h>
#include <linux/security.h>
#include <linux/cred.h>
#include "overlayfs.h"
struct ovl_cache_entry {
unsigned int len;
unsigned int type;
u64 ino;
struct list_head l_node;
struct rb_node node;
struct ovl_cache_entry *next_maybe_whiteout;
bool is_whiteout;
char name[];
};
struct ovl_dir_cache {
long refcount;
u64 version;
struct list_head entries;
};
struct ovl_readdir_data {
struct dir_context ctx;
bool is_lowest;
struct rb_root root;
struct list_head *list;
struct list_head middle;
struct ovl_cache_entry *first_maybe_whiteout;
int count;
int err;
bool d_type_supported;
};
struct ovl_dir_file {
bool is_real;
bool is_upper;
struct ovl_dir_cache *cache;
struct list_head *cursor;
struct file *realfile;
struct file *upperfile;
};
static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n)
{
return container_of(n, struct ovl_cache_entry, node);
}
static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root,
const char *name, int len)
{
struct rb_node *node = root->rb_node;
int cmp;
while (node) {
struct ovl_cache_entry *p = ovl_cache_entry_from_node(node);
cmp = strncmp(name, p->name, len);
if (cmp > 0)
node = p->node.rb_right;
else if (cmp < 0 || len < p->len)
node = p->node.rb_left;
else
return p;
}
return NULL;
}
static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd,
const char *name, int len,
u64 ino, unsigned int d_type)
{
struct ovl_cache_entry *p;
size_t size = offsetof(struct ovl_cache_entry, name[len + 1]);
p = kmalloc(size, GFP_KERNEL);
if (!p)
return NULL;
memcpy(p->name, name, len);
p->name[len] = '\0';
p->len = len;
p->type = d_type;
p->ino = ino;
p->is_whiteout = false;
if (d_type == DT_CHR) {
p->next_maybe_whiteout = rdd->first_maybe_whiteout;
rdd->first_maybe_whiteout = p;
}
return p;
}
static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
const char *name, int len, u64 ino,
unsigned int d_type)
{
struct rb_node **newp = &rdd->root.rb_node;
struct rb_node *parent = NULL;
struct ovl_cache_entry *p;
while (*newp) {
int cmp;
struct ovl_cache_entry *tmp;
parent = *newp;
tmp = ovl_cache_entry_from_node(*newp);
cmp = strncmp(name, tmp->name, len);
if (cmp > 0)
newp = &tmp->node.rb_right;
else if (cmp < 0 || len < tmp->len)
newp = &tmp->node.rb_left;
else
return 0;
}
p = ovl_cache_entry_new(rdd, name, len, ino, d_type);
if (p == NULL)
return -ENOMEM;
list_add_tail(&p->l_node, rdd->list);
rb_link_node(&p->node, parent, newp);
rb_insert_color(&p->node, &rdd->root);
return 0;
}
static int ovl_fill_lowest(struct ovl_readdir_data *rdd,
const char *name, int namelen,
loff_t offset, u64 ino, unsigned int d_type)
{
struct ovl_cache_entry *p;
p = ovl_cache_entry_find(&rdd->root, name, namelen);
if (p) {
list_move_tail(&p->l_node, &rdd->middle);
} else {
p = ovl_cache_entry_new(rdd, name, namelen, ino, d_type);
if (p == NULL)
rdd->err = -ENOMEM;
else
list_add_tail(&p->l_node, &rdd->middle);
}
return rdd->err;
}
void ovl_cache_free(struct list_head *list)
{
struct ovl_cache_entry *p;
struct ovl_cache_entry *n;
list_for_each_entry_safe(p, n, list, l_node)
kfree(p);
INIT_LIST_HEAD(list);
}
static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry)
{
struct ovl_dir_cache *cache = od->cache;
WARN_ON(cache->refcount <= 0);
cache->refcount--;
if (!cache->refcount) {
if (ovl_dir_cache(dentry) == cache)
ovl_set_dir_cache(dentry, NULL);
ovl_cache_free(&cache->entries);
kfree(cache);
}
}
static int ovl_fill_merge(struct dir_context *ctx, const char *name,
int namelen, loff_t offset, u64 ino,
unsigned int d_type)
{
struct ovl_readdir_data *rdd =
container_of(ctx, struct ovl_readdir_data, ctx);
rdd->count++;
if (!rdd->is_lowest)
return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type);
else
return ovl_fill_lowest(rdd, name, namelen, offset, ino, d_type);
}
static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd)
{
int err;
struct ovl_cache_entry *p;
struct dentry *dentry;
const struct cred *old_cred;
struct cred *override_cred;
override_cred = prepare_creds();
if (!override_cred)
return -ENOMEM;
/*
* CAP_DAC_OVERRIDE for lookup
*/
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
old_cred = override_creds(override_cred);
err = mutex_lock_killable(&dir->d_inode->i_mutex);
if (!err) {
while (rdd->first_maybe_whiteout) {
p = rdd->first_maybe_whiteout;
rdd->first_maybe_whiteout = p->next_maybe_whiteout;
dentry = lookup_one_len(p->name, dir, p->len);
if (!IS_ERR(dentry)) {
p->is_whiteout = ovl_is_whiteout(dentry);
dput(dentry);
}
}
inode_unlock(dir->d_inode);
}
revert_creds(old_cred);
put_cred(override_cred);
return err;
}
static inline int ovl_dir_read(struct path *realpath,
struct ovl_readdir_data *rdd)
{
struct file *realfile;
int err;
realfile = ovl_path_open(realpath, O_RDONLY | O_DIRECTORY);
if (IS_ERR(realfile))
return PTR_ERR(realfile);
rdd->first_maybe_whiteout = NULL;
rdd->ctx.pos = 0;
do {
rdd->count = 0;
rdd->err = 0;
err = iterate_dir(realfile, &rdd->ctx);
if (err >= 0)
err = rdd->err;
} while (!err && rdd->count);
if (!err && rdd->first_maybe_whiteout)
err = ovl_check_whiteouts(realpath->dentry, rdd);
fput(realfile);
return err;
}
static void ovl_dir_reset(struct file *file)
{
struct ovl_dir_file *od = file->private_data;
struct ovl_dir_cache *cache = od->cache;
struct dentry *dentry = file->f_path.dentry;
enum ovl_path_type type = ovl_path_type(dentry);
if (cache && ovl_dentry_version_get(dentry) != cache->version) {
ovl_cache_put(od, dentry);
od->cache = NULL;
od->cursor = NULL;
}
WARN_ON(!od->is_real && !OVL_TYPE_MERGE(type));
if (od->is_real && OVL_TYPE_MERGE(type))
od->is_real = false;
}
static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list)
{
int err;
struct path realpath;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_fill_merge,
.list = list,
.root = RB_ROOT,
.is_lowest = false,
};
int idx, next;
for (idx = 0; idx != -1; idx = next) {
next = ovl_path_next(idx, dentry, &realpath);
if (next != -1) {
err = ovl_dir_read(&realpath, &rdd);
if (err)
break;
} else {
/*
* Insert lowest layer entries before upper ones, this
* allows offsets to be reasonably constant
*/
list_add(&rdd.middle, rdd.list);
rdd.is_lowest = true;
err = ovl_dir_read(&realpath, &rdd);
list_del(&rdd.middle);
}
}
return err;
}
static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos)
{
struct list_head *p;
loff_t off = 0;
list_for_each(p, &od->cache->entries) {
if (off >= pos)
break;
off++;
}
/* Cursor is safe since the cache is stable */
od->cursor = p;
}
static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
{
int res;
struct ovl_dir_cache *cache;
cache = ovl_dir_cache(dentry);
if (cache && ovl_dentry_version_get(dentry) == cache->version) {
cache->refcount++;
return cache;
}
ovl_set_dir_cache(dentry, NULL);
cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL);
if (!cache)
return ERR_PTR(-ENOMEM);
cache->refcount = 1;
INIT_LIST_HEAD(&cache->entries);
res = ovl_dir_read_merged(dentry, &cache->entries);
if (res) {
ovl_cache_free(&cache->entries);
kfree(cache);
return ERR_PTR(res);
}
cache->version = ovl_dentry_version_get(dentry);
ovl_set_dir_cache(dentry, cache);
return cache;
}
static int ovl_iterate(struct file *file, struct dir_context *ctx)
{
struct ovl_dir_file *od = file->private_data;
struct dentry *dentry = file->f_path.dentry;
struct ovl_cache_entry *p;
if (!ctx->pos)
ovl_dir_reset(file);
if (od->is_real)
return iterate_dir(od->realfile, ctx);
if (!od->cache) {
struct ovl_dir_cache *cache;
cache = ovl_cache_get(dentry);
if (IS_ERR(cache))
return PTR_ERR(cache);
od->cache = cache;
ovl_seek_cursor(od, ctx->pos);
}
while (od->cursor != &od->cache->entries) {
p = list_entry(od->cursor, struct ovl_cache_entry, l_node);
if (!p->is_whiteout)
if (!dir_emit(ctx, p->name, p->len, p->ino, p->type))
break;
od->cursor = p->l_node.next;
ctx->pos++;
}
return 0;
}
static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin)
{
loff_t res;
struct ovl_dir_file *od = file->private_data;
inode_lock(file_inode(file));
if (!file->f_pos)
ovl_dir_reset(file);
if (od->is_real) {
res = vfs_llseek(od->realfile, offset, origin);
file->f_pos = od->realfile->f_pos;
} else {
res = -EINVAL;
switch (origin) {
case SEEK_CUR:
offset += file->f_pos;
break;
case SEEK_SET:
break;
default:
goto out_unlock;
}
if (offset < 0)
goto out_unlock;
if (offset != file->f_pos) {
file->f_pos = offset;
if (od->cache)
ovl_seek_cursor(od, offset);
}
res = offset;
}
out_unlock:
inode_unlock(file_inode(file));
return res;
}
static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
int datasync)
{
struct ovl_dir_file *od = file->private_data;
struct dentry *dentry = file->f_path.dentry;
struct file *realfile = od->realfile;
/*
* Need to check if we started out being a lower dir, but got copied up
*/
if (!od->is_upper && OVL_TYPE_UPPER(ovl_path_type(dentry))) {
struct inode *inode = file_inode(file);
realfile = lockless_dereference(od->upperfile);
if (!realfile) {
struct path upperpath;
ovl_path_upper(dentry, &upperpath);
realfile = ovl_path_open(&upperpath, O_RDONLY);
smp_mb__before_spinlock();
inode_lock(inode);
if (!od->upperfile) {
if (IS_ERR(realfile)) {
inode_unlock(inode);
return PTR_ERR(realfile);
}
od->upperfile = realfile;
} else {
/* somebody has beaten us to it */
if (!IS_ERR(realfile))
fput(realfile);
realfile = od->upperfile;
}
inode_unlock(inode);
}
}
return vfs_fsync_range(realfile, start, end, datasync);
}
static int ovl_dir_release(struct inode *inode, struct file *file)
{
struct ovl_dir_file *od = file->private_data;
if (od->cache) {
inode_lock(inode);
ovl_cache_put(od, file->f_path.dentry);
inode_unlock(inode);
}
fput(od->realfile);
if (od->upperfile)
fput(od->upperfile);
kfree(od);
return 0;
}
static int ovl_dir_open(struct inode *inode, struct file *file)
{
struct path realpath;
struct file *realfile;
struct ovl_dir_file *od;
enum ovl_path_type type;
od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL);
if (!od)
return -ENOMEM;
type = ovl_path_real(file->f_path.dentry, &realpath);
realfile = ovl_path_open(&realpath, file->f_flags);
if (IS_ERR(realfile)) {
kfree(od);
return PTR_ERR(realfile);
}
od->realfile = realfile;
od->is_real = !OVL_TYPE_MERGE(type);
od->is_upper = OVL_TYPE_UPPER(type);
file->private_data = od;
return 0;
}
const struct file_operations ovl_dir_operations = {
.read = generic_read_dir,
.open = ovl_dir_open,
.iterate = ovl_iterate,
.llseek = ovl_dir_llseek,
.fsync = ovl_dir_fsync,
.release = ovl_dir_release,
};
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list)
{
int err;
struct ovl_cache_entry *p;
err = ovl_dir_read_merged(dentry, list);
if (err)
return err;
err = 0;
list_for_each_entry(p, list, l_node) {
if (p->is_whiteout)
continue;
if (p->name[0] == '.') {
if (p->len == 1)
continue;
if (p->len == 2 && p->name[1] == '.')
continue;
}
err = -ENOTEMPTY;
break;
}
return err;
}
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
{
struct ovl_cache_entry *p;
inode_lock_nested(upper->d_inode, I_MUTEX_CHILD);
list_for_each_entry(p, list, l_node) {
struct dentry *dentry;
if (!p->is_whiteout)
continue;
dentry = lookup_one_len(p->name, upper, p->len);
if (IS_ERR(dentry)) {
pr_err("overlayfs: lookup '%s/%.*s' failed (%i)\n",
upper->d_name.name, p->len, p->name,
(int) PTR_ERR(dentry));
continue;
}
if (dentry->d_inode)
ovl_cleanup(upper->d_inode, dentry);
dput(dentry);
}
inode_unlock(upper->d_inode);
}
static int ovl_check_d_type(struct dir_context *ctx, const char *name,
int namelen, loff_t offset, u64 ino,
unsigned int d_type)
{
struct ovl_readdir_data *rdd =
container_of(ctx, struct ovl_readdir_data, ctx);
/* Even if d_type is not supported, DT_DIR is returned for . and .. */
if (!strncmp(name, ".", namelen) || !strncmp(name, "..", namelen))
return 0;
if (d_type != DT_UNKNOWN)
rdd->d_type_supported = true;
return 0;
}
/*
* Returns 1 if d_type is supported, 0 not supported/unknown. Negative values
* if error is encountered.
*/
int ovl_check_d_type_supported(struct path *realpath)
{
int err;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_check_d_type,
.d_type_supported = false,
};
err = ovl_dir_read(realpath, &rdd);
if (err)
return err;
return rdd.d_type_supported;
}

File diff suppressed because it is too large Load Diff

View File

@ -1,411 +0,0 @@
/**
* \file procfs.c
* License details are found in the file LICENSE.
* \brief
* mcctrl procfs
* \author Naoki Hamada <nao@axe.bz> \par
* Copyright (C) 2014 AXE, Inc.
*/
/*
* HISTORY:
*/
#include <linux/string.h>
#include <linux/proc_fs.h>
#include <linux/list.h>
#include <linux/uaccess.h>
#include <linux/fs.h>
#include "mcctrl.h"
//#define PROCFS_DEBUG
#ifdef PROCFS_DEBUG
#define dprintk(...) printk(__VA_ARGS__)
#else
#define dprintk(...)
#endif
static DECLARE_WAIT_QUEUE_HEAD(procfsq);
int mckernel_procfs_read(char *buffer, char **start, off_t offset,
int count, int *peof, void *dat);
/* A private data for the procfs driver. */
struct procfs_list_entry {
struct list_head list;
struct proc_dir_entry *entry;
struct proc_dir_entry *parent;
ihk_os_t os;
int osnum;
int pid;
int cpu;
char fname[PROCFS_NAME_MAX];
};
/*
* In the procfs_file_list, mckenrel procfs files are
* listed in the manner that the leaf file is located
* always nearer to the list top than its parent node
* file.
*/
LIST_HEAD(procfs_file_list);
static ihk_spinlock_t procfs_file_list_lock;
/**
* \brief Return specified procfs entry.
*
* \param p a name of the procfs file
* \param osnum os number
* \param mode if zero create a directory otherwise a file
*
* return value: NULL: Something wrong has occurred.
* otherwise: address of the proc_dir_entry structure of the procfs file
*
* p should not be NULL nor terminated by "/".
*
* We create a procfs entry if there is not already one.
* This process is recursive to the root of the procfs tree.
*/
/*
* XXX: Two or more entries which have same name can be created.
*
* get_procfs_entry() avoids creating an entry which has already been created.
* But, it allows creating an entry which is being created by another thread.
*
* This problem occurred when two requests which created files with a common
* ancestor directory which was not explicitly created were racing.
*/
static struct proc_dir_entry *get_procfs_entry(char *p, int osnum, int mode)
{
char *r;
struct proc_dir_entry *ret = NULL, *parent = NULL;
struct procfs_list_entry *e;
char name[PROCFS_NAME_MAX];
unsigned long irqflags;
dprintk("get_procfs_entry: %s for osnum %d mode %o\n", p, osnum, mode);
irqflags = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
list_for_each_entry(e, &procfs_file_list, list) {
if (e == NULL) {
kprintf("ERROR: The procfs_file_list has a null entry.\n");
return NULL;
}
if (strncmp(e->fname, p, PROCFS_NAME_MAX) == 0) {
/* We found the entry */
ret = e->entry;
}
}
ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflags);
if (ret != NULL) {
return ret;
}
r = strrchr(p, '/');
if (r != NULL) {
/* We have non-null parent dir. */
strncpy(name, p, r - p);
name[r - p] = '\0';
parent = get_procfs_entry(name, osnum, 0);
if (parent == NULL) {
/* We counld not get a parent procfs entry. Give up.*/
return NULL;
}
}
e = kmalloc(sizeof(struct procfs_list_entry), GFP_KERNEL);
if (e == NULL) {
kprintf("ERROR: not enough memory to create PROCFS entry.\n");
return NULL;
}
/* Fill the fname field of the entry */
strncpy(e->fname, p, PROCFS_NAME_MAX);
if (r != NULL) {
strncpy(name, r + 1, p + PROCFS_NAME_MAX - r - 1);
} else {
strncpy(name, p, PROCFS_NAME_MAX);
}
if (mode == 0) {
ret = proc_mkdir(name, parent);
} else {
ret = create_proc_entry(name, mode, parent);
}
if (ret == NULL) {
kprintf("ERROR: cannot create a PROCFS entry for %s.\n", p);
kfree(e);
return NULL;
}
ret->data = e;
e->osnum = osnum;
e->entry = ret;
e->parent = parent;
irqflags = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
list_add(&(e->list), &procfs_file_list);
ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflags);
dprintk("get_procfs_entry: %s done\n", p);
return ret;
}
/**
* \brief Create a procfs entry.
*
* \param __os (opeque) os variable
* \param ref cpuid of the requesting mckernel process
* \param osnum osnum of the requesting mckernel process
* \param pid pid of the requesting mckernel process
* \param arg sent argument
*/
void procfs_create(void *__os, int ref, int osnum, int pid, unsigned long arg)
{
struct proc_dir_entry *entry;
struct procfs_list_entry *e;
ihk_device_t dev = ihk_os_to_dev(__os);
unsigned long parg;
struct procfs_file *f;
int mode;
char name[PROCFS_NAME_MAX];
dprintk("procfs_create: osnum: %d, cpu: %d, pid: %d\n", osnum, ref, pid);
parg = ihk_device_map_memory(dev, arg, sizeof(struct procfs_file));
f = ihk_device_map_virtual(dev, parg, sizeof(struct procfs_file), NULL, 0);
dprintk("name: %s mode: %o\n", f->fname, f->mode);
strncpy(name, f->fname, PROCFS_NAME_MAX);
mode = f->mode;
if (name[PROCFS_NAME_MAX - 1] != '\0') {
printk("ERROR: procfs_creat: file name not properly terminated.\n");
goto quit;
}
entry = get_procfs_entry(name, osnum, mode);
if (entry == NULL) {
printk("ERROR: could not create a procfs entry for %s.\n", name);
goto quit;
}
e = entry->data;
e->os = __os;
e->cpu = ref;
e->pid = pid;
entry->read_proc = mckernel_procfs_read;
quit:
f->status = 1; /* Now the peer can free the data. */
ihk_device_unmap_virtual(dev, f, sizeof(struct procfs_file));
ihk_device_unmap_memory(dev, parg, sizeof(struct procfs_file));
dprintk("procfs_create: done\n");
}
/**
* \brief Delete a procfs entry.
*
* \param __os (opaque) os variable
* \param osnum os number
* \param arg sent argument
*/
void procfs_delete(void *__os, int osnum, unsigned long arg)
{
ihk_device_t dev = ihk_os_to_dev(__os);
unsigned long parg;
struct procfs_file *f;
struct procfs_list_entry *e;
struct proc_dir_entry *parent = NULL;
char name[PROCFS_NAME_MAX];
char *r;
unsigned long irqflags;
dprintk("procfs_delete: \n");
parg = ihk_device_map_memory(dev, arg, sizeof(struct procfs_file));
f = ihk_device_map_virtual(dev, parg, sizeof(struct procfs_file), NULL, 0);
dprintk("fname: %s.\n", f->fname);
irqflags = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
list_for_each_entry(e, &procfs_file_list, list) {
if ((strncmp(e->fname, f->fname, PROCFS_NAME_MAX) == 0) &&
(e->osnum == osnum)) {
list_del(&e->list);
e->entry->read_proc = NULL;
e->entry->data = NULL;
parent = e->parent;
kfree(e);
r = strrchr(f->fname, '/');
if (r == NULL) {
strncpy(name, f->fname, PROCFS_NAME_MAX);
} else {
strncpy(name, r + 1, PROCFS_NAME_MAX);
}
dprintk("found and remove %s from the list.\n", name);
remove_proc_entry(name, parent);
break;
}
}
ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflags);
f->status = 1; /* Now the peer can free the data. */
ihk_device_unmap_virtual(dev, f, sizeof(struct procfs_file));
ihk_device_unmap_memory(dev, parg, sizeof(struct procfs_file));
dprintk("procfs_delete: done\n");
}
/**
* \brief Process SCD_MSG_PROCFS_ANSWER message.
*
* \param arg sent argument
* \param err error info (redundant)
*/
void procfs_answer(unsigned int arg, int err)
{
dprintk("procfs: received SCD_MSG_PROCFS_ANSWER message(err = %d).\n", err);
wake_up_interruptible(&procfsq);
}
/**
* \brief The callback funciton for McKernel procfs
*
* This function conforms to the 2) way of fs/proc/generic.c
* from linux-2.6.39.4.
*/
int mckernel_procfs_read(char *buffer, char **start, off_t offset,
int count, int *peof, void *dat)
{
struct procfs_list_entry *e = dat;
volatile struct procfs_read *r;
struct ikc_scd_packet isp;
int ret, retrycount = 0;
unsigned long pbuf;
dprintk("mckernel_procfs_read: invoked for %s\n", e->fname);
if (count <= 0 || dat == NULL || offset < 0) {
return 0;
}
pbuf = virt_to_phys(buffer);
if (pbuf / PAGE_SIZE != (pbuf + count - 1) / PAGE_SIZE) {
/* Truncate the read count upto the nearest page boundary */
count = ((pbuf + count - 1) / PAGE_SIZE) * PAGE_SIZE - pbuf;
}
r = kmalloc(sizeof(struct procfs_read), GFP_KERNEL);
if (r == NULL) {
return -ENOMEM;
}
retry:
dprintk("offset: %lx, count: %d, cpu: %d\n", offset, count, e->cpu);
r->pbuf = pbuf;
r->eof = 0;
r->ret = -EIO; /* default */
r->status = 0;
r->offset = offset;
r->count = count;
strncpy((char *)r->fname, e->fname, PROCFS_NAME_MAX);
isp.msg = SCD_MSG_PROCFS_REQUEST;
isp.ref = e->cpu;
isp.arg = virt_to_phys(r);
ret = mcctrl_ikc_send(e->os, e->cpu, &isp);
if (ret < 0) {
goto out; /* error */
}
/* Wait for a reply. */
ret = -EIO; /* default exit code */
dprintk("now wait for a relpy\n");
/* Wait for the status field of the procfs_read structure set ready. */
if (wait_event_interruptible_timeout(procfsq, r->status != 0, HZ) == 0) {
kprintf("ERROR: mckernel_procfs_read: timeout (1 sec).\n");
goto out;
}
/* Wake up and check the result. */
dprintk("mckernel_procfs_read: woke up. ret: %d, eof: %d\n", r->ret, r->eof);
if ((r->ret == 0) && (r->eof != 1)) {
/* A miss-hit caused by migration has occurred.
* We simply retry the query with a new CPU.
*/
if (retrycount++ > 10) {
kprintf("ERROR: mckernel_procfs_read: excessive retry.\n");
goto out;
}
e->cpu = r->newcpu;
dprintk("retry\n");
goto retry;
}
if (r->eof == 1) {
dprintk("reached end of file.\n");
*peof = 1;
}
*start = buffer;
ret = r->ret;
out:
kfree((void *)r);
return ret;
}
/**
* \brief Initialization for procfs
*
* \param osnum os number
*/
void procfs_init(int osnum) {
}
/**
* \brief Finalization for procfs
*
* \param osnum os number
*/
void procfs_exit(int osnum) {
char buf[20], *r;
int error;
mm_segment_t old_fs = get_fs();
struct kstat stat;
struct proc_dir_entry *parent;
struct procfs_list_entry *e, *temp = NULL;
unsigned long irqflags;
dprintk("remove remaining mckernel procfs files.\n");
irqflags = ihk_ikc_spinlock_lock(&procfs_file_list_lock);
list_for_each_entry_safe(e, temp, &procfs_file_list, list) {
if (e->osnum == osnum) {
dprintk("found entry for %s.\n", e->fname);
list_del(&e->list);
e->entry->read_proc = NULL;
e->entry->data = NULL;
parent = e->parent;
r = strrchr(e->fname, '/');
if (r == NULL) {
r = e->fname;
} else {
r += 1;
}
remove_proc_entry(r, parent);
dprintk("free the entry\n");
kfree(e);
}
dprintk("iterate it.\n");
}
ihk_ikc_spinlock_unlock(&procfs_file_list_lock, irqflags);
sprintf(buf, "/proc/mcos%d", osnum);
set_fs(KERNEL_DS);
error = vfs_stat (buf, &stat);
set_fs(old_fs);
if (error != 0) {
return;
}
printk("procfs_exit: We have to remove unexpectedly remaining %s.\n", buf);
/* remove remnant of previous mcos%d */
remove_proc_entry(buf + 6, NULL);
}

View File

@ -1,13 +1,19 @@
CC=@CC@
BINDIR=@BINDIR@
CFLAGS=-Wall -O -fPIE -pie
KDIR ?= @KDIR@
CFLAGS=-Wall -O -I.
VPATH=@abs_srcdir@
TARGET=mcexec
@uncomment_if_ENABLE_MEMDUMP@TARGET+=eclair
LIBS=@LIBS@
all: $(TARGET)
mcexec: mcexec.c
$(CC) $(CFLAGS) $(EXTRA_CFLAGS) -pthread -o $@ $^ $(EXTRA_OBJS)
$(CC) -I${KDIR} $(CFLAGS) $(EXTRA_CFLAGS) -fPIE -pie -lrt -pthread -o $@ $^ $(EXTRA_OBJS)
eclair: eclair.c
$(CC) $(CFLAGS) -o $@ $^ $(LIBS)
clean:
$(RM) $(TARGET) *.o
@ -17,4 +23,5 @@ clean:
install:
mkdir -p -m 755 $(BINDIR)
install -m 755 mcexec $(BINDIR)
@uncomment_if_ENABLE_MEMDUMP@install -m 755 eclair $(BINDIR)

1070
executer/user/eclair.c Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -3,7 +3,7 @@ OBJS = init.o mem.o debug.o mikc.o listeners.o ap.o syscall.o cls.o host.o
OBJS += process.o copy.o waitq.o futex.o timer.o plist.o fileobj.o
DEPSRCS=$(wildcard $(SRC)/*.c)
CFLAGS += -I$(SRC)/include -mcmodel=kernel -D__KERNEL__
CFLAGS += -I$(SRC)/include -D__KERNEL__
CFLAGS += -DKNC_MAP_MICPA $(EXTRA_CFLAGS)
ifeq ("$(DCFA_MODE)", "kmod")

View File

@ -3,15 +3,15 @@ SRC=$(VPATH)
IHKDIR=$(IHKBASE)/$(TARGETDIR)
OBJS = init.o mem.o debug.o mikc.o listeners.o ap.o syscall.o cls.o host.o
OBJS += process.o copy.o waitq.o futex.o timer.o plist.o fileobj.o shmobj.o
OBJS += zeroobj.o procfs.o devobj.o
OBJS += zeroobj.o procfs.o devobj.o sysfs.o
DEPSRCS=$(wildcard $(SRC)/*.c)
CFLAGS += -I$(SRC)/include -mcmodel=kernel -D__KERNEL__
CFLAGS += -I$(SRC)/include -D__KERNEL__ -g -fno-omit-frame-pointer -fno-inline -fno-inline-small-functions
LDFLAGS += -e arch_start
IHKOBJ = ihk/ihk.o
include $(SRC)/config/config.$(TARGET)
include $(IHKBASE)/Makefile.common
include @abs_builddir@/../../ihk/cokernel/Makefile.common
# CFLAGS += -I$(SRC)/../arch/$(IHKARCH)/kernel/include -I$(SRC)/../lib/include

View File

@ -9,7 +9,7 @@ V ?= $(VERBOSE)
KERNEL = kernel.img
KERNELS = $(addsuffix /$(KERNEL),$(addprefix $(O)/,$(BUILD_TARGET)))
SUBCMD_OPTS = V='$(V)'
SUBCMD_OPTS = V='$(V)' BUILD_IHK_COKERNEL=@abs_builddir@/../../ihk/cokernel
$(if $(O),,$(error Specify the compilation target directory))
#$(if $(shell ls $(IHKBASE)/Makefile),,\

View File

@ -24,20 +24,23 @@
#include <process.h>
#include <init.h>
#include <march.h>
#include <cls.h>
int num_processors = 1;
static volatile int ap_stop = 1;
static void ap_wait(void)
{
wrmsr(MSR_IA32_TIME_STAMP_COUNTER, 0);
init_tick();
while (ap_stop) {
barrier();
cpu_pause();
}
sync_tick();
kmalloc_init();
sched_init();
arch_start_pvclock();
if (find_command_line("hidos")) {
init_host_syscall_channel();
@ -53,19 +56,20 @@ static void ap_wait(void)
void ap_start(void)
{
init_tick();
ap_stop = 0;
sync_tick();
}
void ap_init(void)
{
struct ihk_mc_cpu_info *cpu_info;
int i;
int bsp_hw_id;
int bsp_hw_id, bsp_cpu_id;
ihk_mc_init_ap();
init_delay();
wrmsr(MSR_IA32_TIME_STAMP_COUNTER, 0);
cpu_info = ihk_mc_get_cpu_info();
bsp_hw_id = ihk_mc_get_hardware_processor_id();
@ -74,18 +78,164 @@ void ap_init(void)
return;
}
kprintf("BSP HW ID = %d, ", bsp_hw_id);
kprintf("AP Booting :");
bsp_cpu_id = 0;
for (i = 0; i < cpu_info->ncpus; ++i) {
if (cpu_info->hw_ids[i] == bsp_hw_id) {
bsp_cpu_id = i;
break;
}
}
kprintf("BSP: %d (HW ID: %d @ NUMA %d)\n", bsp_cpu_id,
bsp_hw_id, cpu_info->nodes[0]);
for (i = 0; i < cpu_info->ncpus; i++) {
if (cpu_info->hw_ids[i] == bsp_hw_id) {
continue;
}
kprintf("AP Booting: %d (HW ID: %d @ NUMA %d)\n", i,
cpu_info->hw_ids[i], cpu_info->nodes[i]);
ihk_mc_boot_cpu(cpu_info->hw_ids[i], (unsigned long)ap_wait);
kprintf(" %d", cpu_info->hw_ids[i]);
num_processors++;
}
kprintf(" .. Done\n");
kprintf("AP Booting: Done\n");
}
#include <sysfs.h>
#include <kmalloc.h>
#include <string.h>
#include <vsprintf.h>
static ssize_t
show_int(struct sysfs_ops *ops, void *instance, void *buf, size_t size)
{
int *p = instance;
return snprintf(buf, size, "%d\n", *p);
}/* show_int() */
struct sysfs_ops show_int_ops = {
.show = &show_int,
};
struct fake_cpu_info {
int online;
};
static struct fake_cpu_info *fake_cpu_infos = NULL;
enum fake_cpu_info_member {
ONLINE,
};
struct fake_cpu_info_ops {
enum fake_cpu_info_member member;
struct sysfs_ops ops;
};
static ssize_t
show_fake_cpu_info(struct sysfs_ops *ops0, void *instance, void *buf,
size_t size)
{
struct fake_cpu_info_ops *ops
= container_of(ops0, struct fake_cpu_info_ops, ops);
struct fake_cpu_info *info = instance;
ssize_t n;
switch (ops->member) {
case ONLINE:
n = snprintf(buf, size, "%d\n", info->online);
break;
default:
n = -EINVAL;
break;
}
if (n >= size) {
n = -ENOSPC;
}
return n;
} /* show_fake_cpu_info() */
static ssize_t
store_fake_cpu_info(struct sysfs_ops *ops0, void *instance, void *buf,
size_t size)
{
struct fake_cpu_info_ops *ops
= container_of(ops0, struct fake_cpu_info_ops, ops);
struct fake_cpu_info *info = instance;
ssize_t n;
switch (ops->member) {
case ONLINE:
kprintf("NYI:store_fake_cpu_info(%p,%p,%p,%ld): "
"online %d --> \"%.*s\"\n",
ops0, instance, buf, size, info->online,
(int)size, buf);
n = size;
break;
default:
n = -EIO;
break;
}
return n;
} /* store_fake_cpu_info() */
static struct fake_cpu_info_ops show_fci_online = {
.member = ONLINE,
.ops.show = &show_fake_cpu_info,
.ops.store = &store_fake_cpu_info,
};
void
cpu_sysfs_setup(void)
{
int error;
int cpu;
sysfs_handle_t targeth;
struct fake_cpu_info *info;
/* sample of simple variable **********************************/
error = sysfs_createf(&show_int_ops, &num_processors, 0444,
"/sys/devices/system/cpu/num_processors");
if (error) {
panic("cpu_sysfs_setup:sysfs_createf(num_processors) failed\n");
}
/* sample of more complex variable ****************************/
/* setup table */
info = kmalloc(sizeof(*info) * num_processors, IHK_MC_AP_CRITICAL);
for (cpu = 0; cpu < num_processors; ++cpu) {
info[cpu].online = 1;
}
fake_cpu_infos = info;
/* setup sysfs tree */
for (cpu = 0; cpu < num_processors; ++cpu) {
/* online */
error = sysfs_createf(&show_fci_online.ops,
&fake_cpu_infos[cpu], 0644,
"/sys/devices/system/cpu/cpu%d/online", cpu);
if (error) {
panic("cpu_sysfs_setup:sysfs_createf failed\n");
}
/* link to cpu%d */
error = sysfs_lookupf(&targeth,
"/sys/devices/system/cpu/cpu%d", cpu);
if (error) {
panic("cpu_sysfs_setup:sysfs_lookupf failed\n");
}
error = sysfs_symlinkf(targeth, "/sys/bus/cpu/devices/cpu%d",
cpu);
if (error) {
panic("cpu_sysfs_setup:sysfs_symlinkf failed\n");
}
}
return;
} /* cpu_sysfs_setup() */

View File

@ -23,6 +23,7 @@
extern int num_processors;
struct cpu_local_var *clv;
static int cpu_local_var_initialized = 0;
void cpu_local_var_init(void)
{
@ -31,11 +32,24 @@ void cpu_local_var_init(void)
z = sizeof(struct cpu_local_var) * num_processors;
z = (z + PAGE_SIZE - 1) >> PAGE_SHIFT;
clv = allocate_pages(z, IHK_MC_AP_CRITICAL);
clv = ihk_mc_alloc_pages(z, IHK_MC_AP_CRITICAL);
memset(clv, 0, z * PAGE_SIZE);
cpu_local_var_initialized = 1;
}
struct cpu_local_var *get_cpu_local_var(int id)
{
return clv + id;
}
void preempt_enable(void)
{
if (cpu_local_var_initialized)
--cpu_local_var(no_preempt);
}
void preempt_disable(void)
{
if (cpu_local_var_initialized)
++cpu_local_var(no_preempt);
}

View File

@ -26,10 +26,14 @@ SECTIONS
. = vsyscall_page + 0x000;
*(.vsyscall.gettimeofday)
*(.vsyscall.gettimeofday.*)
. = vsyscall_page + 0x400;
*(.vsyscall.time)
. = vsyscall_page + 0x800;
*(.vsyscall.getcpu)
. = ALIGN(4096);
} : data = 0xf4

View File

@ -26,10 +26,14 @@ SECTIONS
. = vsyscall_page + 0x000;
*(.vsyscall.gettimeofday)
*(.vsyscall.gettimeofday.*)
. = vsyscall_page + 0x400;
*(.vsyscall.time)
. = vsyscall_page + 0x800;
*(.vsyscall.getcpu)
. = ALIGN(4096);
} : data = 0xf4

View File

@ -26,10 +26,14 @@ SECTIONS
. = vsyscall_page + 0x000;
*(.vsyscall.gettimeofday)
*(.vsyscall.gettimeofday.*)
. = vsyscall_page + 0x400;
*(.vsyscall.time)
. = vsyscall_page + 0x800;
*(.vsyscall.getcpu)
. = ALIGN(4096);
} : data = 0xf4
@ -39,8 +43,4 @@ SECTIONS
. = ALIGN(4096);
_end = .;
/DISCARD/ : {
*(.eh_frame)
*(.note.gnu.build-id)
}
}

View File

@ -1,6 +1,5 @@
CC = /usr/linux-k1om-4.7/bin/x86_64-k1om-linux-gcc
LD = /usr/linux-k1om-4.7/bin/x86_64-k1om-linux-ld
CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow
LDFLAGS += -m elf_k1om -T $(SRC)/config/attached-mic.lds
LDFLAGS_MKIMAGE = -m elf_k1om

View File

@ -3,6 +3,5 @@ LD = /usr/linux-k1om-4.7/bin/x86_64-k1om-linux-ld
OBJDUMP = /usr/linux-k1om-4.7/bin/x86_64-k1om-linux-objdump
OBJCOPY = /usr/linux-k1om-4.7/bin/x86_64-k1om-linux-objcopy
CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow
LDFLAGS += -m elf_k1om -T $(SRC)/config/builtin-mic.lds
LDFLAGS_MKIMAGE = -m elf_k1om

View File

@ -1,2 +1 @@
CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow
LDFLAGS += -T $(SRC)/config/builtin-x86.lds

View File

@ -0,0 +1 @@
LDFLAGS += -T $(SRC)/config/smp-x86.lds

45
kernel/config/smp-x86.lds Normal file
View File

@ -0,0 +1,45 @@
PHDRS
{
text PT_LOAD FLAGS(5);
data PT_LOAD FLAGS(7);
}
SECTIONS
{
. = 0xffffffff80001000;
_head = .;
.text : {
*(.text);
} : text
. = ALIGN(4096);
.data : {
*(.data)
*(.data.*)
} :data
.rodata : {
*(.rodata .rodata.*)
} :data
.vsyscall : ALIGN(0x1000) {
vsyscall_page = .;
. = vsyscall_page + 0x000;
*(.vsyscall.gettimeofday)
*(.vsyscall.gettimeofday.*)
. = vsyscall_page + 0x400;
*(.vsyscall.time)
. = vsyscall_page + 0x800;
*(.vsyscall.getcpu)
. = ALIGN(4096);
} : data = 0xf4
.bss : {
*(.bss .bss.*)
}
. = ALIGN(4096);
_end = .;
}

View File

@ -22,13 +22,46 @@ extern int vsnprintf(char *buf, size_t size, const char *fmt, va_list args);
extern int sprintf(char * buf, const char *fmt, ...);
static ihk_spinlock_t kmsg_lock;
static unsigned long kprintf_lock_head(void);
static void kprintf_unlock_head(unsigned long irqflags);
static void kprintf_wait(int len, unsigned long *flags_head, int *slide) {
int head, tail, buf_len, mode, adj;
mode = kmsg_buf.mode;
while (1) {
adj = 0;
tail = kmsg_buf.tail;
buf_len = kmsg_buf.len;
head = kmsg_buf.head;
if (head < tail) head += buf_len;
if (tail + len > buf_len) adj = buf_len - tail;
if (head > tail && head <= tail + len + adj) {
/* When proceeding tail (producer pointer) by len would
cross head (consumer pointer) in ring-buffer */
if (mode != 1) {
*slide = 1;
break;
} else {
kprintf_unlock_head(*flags_head);
*flags_head = kprintf_lock_head();
}
} else {
break;
}
}
}
/* TODO: lock */
void kputs(char *buf)
{
int len = strlen(buf);
unsigned long flags;
int slide = 0;
unsigned long flags_tail, flags_head;
flags = ihk_mc_spinlock_lock(&kmsg_lock);
flags_tail = kprintf_lock();
flags_head = kprintf_lock_head();
kprintf_wait(len, &flags_head, &slide);
if (len + kmsg_buf.tail > kmsg_buf.len) {
kmsg_buf.tail = 0;
@ -39,60 +72,57 @@ void kputs(char *buf)
memcpy(kmsg_buf.str + kmsg_buf.tail, buf, len);
kmsg_buf.tail += len;
ihk_mc_spinlock_unlock(&kmsg_lock, flags);
/* When proceeding tail (producer pointer) by len would
cross head (consumer pointer) in ring-buffer, give up
[head, tail] because the range is overwritten */
if (slide == 1) {
kmsg_buf.head = kmsg_buf.tail + 1;
if (kmsg_buf.head >= kmsg_buf.len) kmsg_buf.head = 0;
}
kprintf_unlock_head(flags_head);
kprintf_unlock(flags_tail);
}
#define KPRINTF_LOCAL_BUF_LEN 1024
int kprintf_lock()
unsigned long kprintf_lock(void)
{
return ihk_mc_spinlock_lock(&kmsg_lock);
return __ihk_mc_spinlock_lock(&kmsg_lock);
}
void kprintf_unlock(int irqflags)
void kprintf_unlock(unsigned long irqflags)
{
ihk_mc_spinlock_unlock(&kmsg_lock, irqflags);
__ihk_mc_spinlock_unlock(&kmsg_lock, irqflags);
}
static unsigned long kprintf_lock_head(void)
{
return __ihk_mc_spinlock_lock(&kmsg_buf.lock);
}
static void kprintf_unlock_head(unsigned long irqflags)
{
__ihk_mc_spinlock_unlock(&kmsg_buf.lock, irqflags);
}
/* Caller must hold kmsg_lock! */
int __kprintf(const char *format, ...)
{
int len = 0;
int slide = 0;
va_list va;
unsigned long flags_head;
char buf[KPRINTF_LOCAL_BUF_LEN];
/* Copy into the local buf */
va_start(va, format);
len += vsnprintf(buf + len, KPRINTF_LOCAL_BUF_LEN - len - 2, format, va);
va_end(va);
/* Append to kmsg buffer */
if (kmsg_buf.tail + len > kmsg_buf.len) {
kmsg_buf.tail = 0;
}
memcpy(kmsg_buf.str + kmsg_buf.tail, buf, len);
kmsg_buf.tail += len;
return len;
}
int kprintf(const char *format, ...)
{
int len = 0;
va_list va;
unsigned long flags;
char buf[KPRINTF_LOCAL_BUF_LEN];
flags = ihk_mc_spinlock_lock(&kmsg_lock);
/* Copy into the local buf */
len = sprintf(buf, "[%3d]: ", ihk_mc_get_processor_id());
va_start(va, format);
len += vsnprintf(buf + len, KPRINTF_LOCAL_BUF_LEN - len - 2, format, va);
va_end(va);
flags_head = kprintf_lock_head();
kprintf_wait(len, &flags_head, &slide);
/* Append to kmsg buffer */
if (kmsg_buf.tail + len > kmsg_buf.len) {
kmsg_buf.tail = 0;
@ -100,16 +130,69 @@ int kprintf(const char *format, ...)
memcpy(kmsg_buf.str + kmsg_buf.tail, buf, len);
kmsg_buf.tail += len;
if (slide == 1) {
kmsg_buf.head = kmsg_buf.tail + 1;
if (kmsg_buf.head >= kmsg_buf.len) kmsg_buf.head = 0;
}
ihk_mc_spinlock_unlock(&kmsg_lock, flags);
kprintf_unlock_head(flags_head);
return len;
}
int kprintf(const char *format, ...)
{
int len = 0;
int slide = 0;
va_list va;
unsigned long flags_tail, flags_head;
char buf[KPRINTF_LOCAL_BUF_LEN];
/* Copy into the local buf */
len = sprintf(buf, "[%3d]: ", ihk_mc_get_processor_id());
va_start(va, format);
len += vsnprintf(buf + len, KPRINTF_LOCAL_BUF_LEN - len - 2, format, va);
va_end(va);
flags_tail = kprintf_lock();
flags_head = kprintf_lock_head();
kprintf_wait(len, &flags_head, &slide);
/* Append to kmsg buffer */
if (kmsg_buf.tail + len > kmsg_buf.len) {
kmsg_buf.tail = 0;
}
memcpy(kmsg_buf.str + kmsg_buf.tail, buf, len);
kmsg_buf.tail += len;
if (slide == 1) {
kmsg_buf.head = kmsg_buf.tail + 1;
if (kmsg_buf.head >= kmsg_buf.len) kmsg_buf.head = 0;
}
kprintf_unlock_head(flags_head);
kprintf_unlock(flags_tail);
return len;
}
void kmsg_init(void)
/* mode:
0: mcklogd is not running.
When kmsg buffer is full, writer doesn't block
and overwrites the buffer.
1: mcklogd periodically retrieves kmsg.
When kmsg buffer is full, writer blocks until
someone retrieves kmsg.
2: mcklogd periodically retrieves kmsg.
When kmsg buffer is full, writer doesn't block
and overwrites the buffer.
*/
void kmsg_init(int mode)
{
ihk_mc_spinlock_init(&kmsg_lock);
kmsg_buf.tail = 0;
kmsg_buf.len = sizeof(kmsg_buf.str);
kmsg_buf.head = 0;
kmsg_buf.mode = mode;
ihk_mc_spinlock_init(&kmsg_buf.lock);
memset(kmsg_buf.str, 0, kmsg_buf.len);
}

View File

@ -3,7 +3,8 @@
* License details are found in the file LICENSE.
* \brief
* memory mapped device pager client
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com>
* \author Gou Nakamura <go.nakamura.yw@hitachi-solutions.com> \par
* Copyright (C) 2014 RIKEN AICS
*/
/*
* HISTORY:
@ -32,9 +33,18 @@
#include <pager.h>
#include <string.h>
#include <syscall.h>
#include <process.h>
//#define DEBUG_PRINT_DEVOBJ
#ifdef DEBUG_PRINT_DEVOBJ
#define dkprintf(...) kprintf(__VA_ARGS__)
#define ekprintf(...) kprintf(__VA_ARGS__)
#else
#define dkprintf(...) do { if (0) kprintf(__VA_ARGS__); } while (0)
#define ekprintf(...) kprintf(__VA_ARGS__)
#endif
#define dkprintf(...)
#define ekprintf(...) kprintf(__VA_ARGS__)
struct devobj {
struct memobj memobj; /* must be first */
@ -68,51 +78,52 @@ static struct memobj *to_memobj(struct devobj *devobj)
/***********************************************************************
* devobj
*/
int devobj_create(int fd, size_t len, off_t off, struct memobj **objp, int *maxprotp)
int devobj_create(int fd, size_t len, off_t off, struct memobj **objp, int *maxprotp,
int prot, int populate_flags)
{
ihk_mc_user_context_t ctx;
struct pager_map_result result; // XXX: assumes contiguous physical
int error;
struct devobj *obj = NULL;
const size_t npages = (len + PAGE_SIZE - 1) / PAGE_SIZE;
const size_t pfn_npages = (npages / (PAGE_SIZE / sizeof(uintptr_t))) + 1;
kprintf("devobj_create(%d,%lx,%lx)\n", fd, len, off);
#define MAX_PAGES_IN_DEVOBJ (PAGE_SIZE / sizeof(uintptr_t))
if (npages > MAX_PAGES_IN_DEVOBJ) {
error = -EFBIG;
kprintf("devobj_create(%d,%lx,%lx):too large len. %d\n", fd, len, off, error);
goto out;
}
dkprintf("%s: fd: %d, len: %lu, off: %lu \n", __FUNCTION__, fd, len, off);
obj = kmalloc(sizeof(*obj), IHK_MC_AP_NOWAIT);
if (!obj) {
error = -ENOMEM;
kprintf("devobj_create(%d,%lx,%lx):kmalloc failed. %d\n", fd, len, off, error);
kprintf("%s: error: fd: %d, len: %lu, off: %lu kmalloc failed.\n",
__FUNCTION__, fd, len, off);
goto out;
}
memset(obj, 0, sizeof(*obj));
obj->pfn_table = allocate_pages(1, IHK_MC_AP_NOWAIT);
obj->pfn_table = ihk_mc_alloc_pages(pfn_npages, IHK_MC_AP_NOWAIT);
if (!obj->pfn_table) {
error = -ENOMEM;
kprintf("devobj_create(%d,%lx,%lx):allocate_pages failed. %d\n", fd, len, off, error);
kprintf("%s: error: fd: %d, len: %lu, off: %lu allocating PFN failed.\n",
__FUNCTION__, fd, len, off);
goto out;
}
memset(obj->pfn_table, 0, 1*PAGE_SIZE);
memset(obj->pfn_table, 0, pfn_npages * PAGE_SIZE);
ihk_mc_syscall_arg0(&ctx) = PAGER_REQ_MAP;
ihk_mc_syscall_arg1(&ctx) = fd;
ihk_mc_syscall_arg2(&ctx) = len;
ihk_mc_syscall_arg3(&ctx) = off;
ihk_mc_syscall_arg4(&ctx) = virt_to_phys(&result);
ihk_mc_syscall_arg5(&ctx) = prot | populate_flags;
error = syscall_generic_forwarding(__NR_mmap, &ctx);
if (error) {
kprintf("devobj_create(%d,%lx,%lx):map failed. %d\n", fd, len, off, error);
kprintf("%s: error: fd: %d, len: %lu, off: %lu map failed.\n",
__FUNCTION__, fd, len, off);
goto out;
}
kprintf("devobj_create:handle: %lx\n", result.handle);
kprintf("devobj_create:maxprot: %x\n", result.maxprot);
dkprintf("%s: fd: %d, len: %lu, off: %lu, handle: %p, maxprot: %x\n",
__FUNCTION__, fd, len, off, result.handle, result.maxprot);
obj->memobj.ops = &devobj_ops;
obj->memobj.flags = MF_HAS_PAGER;
@ -130,11 +141,12 @@ int devobj_create(int fd, size_t len, off_t off, struct memobj **objp, int *maxp
out:
if (obj) {
if (obj->pfn_table) {
free_pages(obj->pfn_table, 1);
ihk_mc_free_pages(obj->pfn_table, pfn_npages);
}
kfree(obj);
}
kprintf("devobj_create(%d,%lx,%lx): %d %p %x%d\n", fd, len, off, error, *objp, *maxprotp);
dkprintf("%s: ret: %d, fd: %d, len: %lu, off: %lu, handle: %p, maxprot: %x \n",
__FUNCTION__, error, fd, len, off, result.handle, result.maxprot);
return error;
}
@ -142,7 +154,7 @@ static void devobj_ref(struct memobj *memobj)
{
struct devobj *obj = to_devobj(memobj);
kprintf("devobj_ref(%p %lx):\n", obj, obj->handle);
dkprintf("devobj_ref(%p %lx):\n", obj, obj->handle);
memobj_lock(&obj->memobj);
++obj->ref;
memobj_unlock(&obj->memobj);
@ -154,8 +166,10 @@ static void devobj_release(struct memobj *memobj)
struct devobj *obj = to_devobj(memobj);
struct devobj *free_obj = NULL;
uintptr_t handle;
const size_t pfn_npages =
(obj->npages / (PAGE_SIZE / sizeof(uintptr_t))) + 1;
kprintf("devobj_release(%p %lx)\n", obj, obj->handle);
dkprintf("devobj_release(%p %lx)\n", obj, obj->handle);
memobj_lock(&obj->memobj);
--obj->ref;
@ -182,19 +196,19 @@ static void devobj_release(struct memobj *memobj)
}
if (obj->pfn_table) {
free_pages(obj->pfn_table, 1);
ihk_mc_free_pages(obj->pfn_table, pfn_npages);
}
kfree(free_obj);
}
kprintf("devobj_release(%p %lx):free %p\n",
dkprintf("devobj_release(%p %lx):free %p\n",
obj, handle, free_obj);
return;
}
static int devobj_get_page(struct memobj *memobj, off_t off, int p2align, uintptr_t *physp)
static int devobj_get_page(struct memobj *memobj, off_t off, int p2align, uintptr_t *physp, unsigned long *flag)
{
const off_t pgoff = off >> PAGE_SHIFT;
const off_t pgoff = off / PAGE_SIZE;
struct devobj *obj = to_devobj(memobj);
int error;
uintptr_t pfn;
@ -202,15 +216,15 @@ static int devobj_get_page(struct memobj *memobj, off_t off, int p2align, uintpt
ihk_mc_user_context_t ctx;
int ix;
kprintf("devobj_get_page(%p %lx,%lx,%d)\n", memobj, obj->handle, off, p2align);
dkprintf("devobj_get_page(%p %lx,%lx,%d)\n", memobj, obj->handle, off, p2align);
if ((pgoff < obj->pfn_pgoff) || ((obj->pfn_pgoff + obj->npages) <= pgoff)) {
error = -EFBIG;
kprintf("devobj_get_page(%p %lx,%lx,%d): out of range. %d\n", memobj, obj->handle, off, p2align, error);
kprintf("%s: error: out of range: off: %lu, page off: %lu obj->npages: %d\n", __FUNCTION__, off, pgoff, obj->npages);
goto out;
}
ix = pgoff - obj->pfn_pgoff;
kprintf("ix: %ld\n", ix);
dkprintf("ix: %ld\n", ix);
memobj_lock(&obj->memobj);
pfn = obj->pfn_table[ix];
@ -230,12 +244,20 @@ kprintf("ix: %ld\n", ix);
if (pfn & PFN_PRESENT) {
/* convert remote physical into local physical */
kprintf("devobj_get_page(%p %lx,%lx,%d):PFN_PRESENT before %#lx\n", memobj, obj->handle, off, p2align, pfn);
dkprintf("devobj_get_page(%p %lx,%lx,%d):PFN_PRESENT before %#lx\n", memobj, obj->handle, off, p2align, pfn);
attr = pfn & ~PFN_PFN;
/* TODO: do an arch dependent PTE to mapping flag conversion
* instead of this inline check, also, we rely on having the
* same PAT config as Linux here.. */
if ((pfn & PFL1_PWT) && !(pfn & PFL1_PCD)) {
*flag |= VR_WRITE_COMBINED;
}
pfn = ihk_mc_map_memory(NULL, (pfn & PFN_PFN), PAGE_SIZE);
pfn &= PFN_PFN;
pfn |= attr;
kprintf("devobj_get_page(%p %lx,%lx,%d):PFN_PRESENT after %#lx\n", memobj, obj->handle, off, p2align, pfn);
dkprintf("devobj_get_page(%p %lx,%lx,%d):PFN_PRESENT after %#lx\n", memobj, obj->handle, off, p2align, pfn);
}
memobj_lock(&obj->memobj);
@ -253,6 +275,6 @@ kprintf("devobj_get_page(%p %lx,%lx,%d):PFN_PRESENT after %#lx\n", memobj, obj->
*physp = pfn & PFN_PFN;
out:
kprintf("devobj_get_page(%p %lx,%lx,%d): %d %lx\n", memobj, obj->handle, off, p2align, error, *physp);
dkprintf("devobj_get_page(%p %lx,%lx,%d): %d %lx\n", memobj, obj->handle, off, p2align, error, *physp);
return error;
}

Some files were not shown because too many files have changed in this diff Show More