Compare commits


358 Commits

Author SHA1 Message Date
63ed4e7af0 spec: prerelease 0.7 for testing hugetlb map for stack
Change-Id: I4997340cd984ca8915e45749b91b1d72c1de85af
2020-07-20 08:11:40 +09:00
d7cf39883f Revert "shmobj: Support large page"
This reverts commit 9a60997ea0.

Change-Id: Id60959b4e03451987239faa0bbc2e780b72fafaa
2020-07-19 12:53:45 +00:00
40f8091fab stack: grow on page fault
The steps of the technique to replace the stack with a hugetlbfs map
are as follows:

(1) Prepare a hugetlbfs map with the size of rlim_cur
(2) Copy the active region of the stack to the hugetlbfs map.
    The range to copy is determined by reading /proc/[pid]/maps.
(3) Replace the stack map with the hugetlbfs map

Step (2) would have to copy a huge region if McKernel didn't grow the
stack at run time; hence this commit makes the stack grow on page fault
(see the sketch below).

Change-Id: I5858c35b5c26dd0a42cccf9e3cc4c64b1a81f160
2020-07-19 12:53:31 +00:00
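
A minimal user-space sketch of steps (1) and (2), assuming huge pages
have been reserved so that mmap(MAP_HUGETLB) succeeds; step (3) happens
inside McKernel and is only noted in a comment:

    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <sys/resource.h>

    int main(void)
    {
        struct rlimit rl;
        unsigned long start = 0, end = 0;
        char line[256];
        FILE *fp;

        /* (1) hugetlbfs-backed map sized to the soft stack limit */
        getrlimit(RLIMIT_STACK, &rl);
        void *huge = mmap(NULL, rl.rlim_cur, PROT_READ | PROT_WRITE,
                          MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
        if (huge == MAP_FAILED) {
            perror("mmap(MAP_HUGETLB)");
            return 1;
        }

        /* (2) find the active stack region via /proc/self/maps and
         * copy it to the top of the hugetlbfs map */
        fp = fopen("/proc/self/maps", "r");
        while (fp && fgets(line, sizeof(line), fp)) {
            if (strstr(line, "[stack]")) {
                sscanf(line, "%lx-%lx", &start, &end);
                break;
            }
        }
        if (fp)
            fclose(fp);
        if (end > start)
            memcpy((char *)huge + rl.rlim_cur - (end - start),
                   (void *)start, end - start);

        /* (3) McKernel swaps the stack map for the hugetlbfs map in
         * the kernel; that step cannot be reproduced safely here */
        printf("copied %lu bytes of stack\n", end - start);
        return 0;
    }
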
a20e1acf01 syscall: add prlimit64
Change-Id: Iad882813d54b439c236c0df74dc81508190e6707
2020-07-19 21:52:46 +09:00
b3d7bbda56 rus_vm_fault: compat: RHEL-8.2
This applies the following patch:
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=1c8f422059ae5da07db7406ab916203f9417e396
mm: change return type to vm_fault_t

Change-Id: I7189fc92824d21b4906f1033f1de5899bbad4680
2020-07-15 13:02:32 +09:00
9a60997ea0 shmobj: Support large page
Mixing page sizes is allowed by shmobj.

Change-Id: Ic48b71da2db6ce3f68fa3dbc8ad5ae96347d6018
Refs: #1381
Refs: #1458
2020-07-15 03:50:56 +00:00
4b66373813 mcexec: Don't forward SIGTSTP SIGTTIN SIGTTOU to mckernel
Change-Id: I72bb74d6b98e1f0bf519c8f0fef742624a2a699a
Refs: #1425
2020-07-14 08:34:11 +00:00
b44b11ace7 set_robust_list: Add error check
set_robust_list is not supported by McKernel.

Change-Id: I1f679e2e4df24139cceb1f2294bc072cb7956002
Refs: #1399
2020-07-14 01:06:49 +00:00
ebc91cea0e tgkill: Fix argument validation
Formerly, if tgid was specified as -1, tgkill() was equivalent to tkill().
Now this is treated as an error (EINVAL).

Change-Id: I47bc75d439662a36dc6167c4446a5277422de507
Refs: #1380
2020-07-14 01:03:47 +00:00
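
A sketch of the stricter check, matching Linux's documented tgkill
semantics (variable names assumed):

    /* tgkill(2) rejects non-positive ids; tgid == -1 no longer
     * degrades to tkill() behavior */
    if (tgid <= 0 || tid <= 0)
        return -EINVAL;
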
58106d791a struct process: fix type of group_exit_status
Change-Id: Ib8492cbb077106cef1d0fa2d6d5e8e13bbb209c0
Refs: #1377
2020-07-13 08:33:07 +00:00
56b51d4f97 spec: prerelease 0.6 for testing cpuinfo and mmap overcommit
Change-Id: Iab5acc2c08ebe19251c37782cff87a4b5c914448
2020-07-13 10:14:23 +09:00
bafe540d86 mmap: allow unlimited overcommit
Change-Id: Iba07b5c504b4a202cd163ce682f3fc72a31284a0
2020-07-10 14:52:57 +09:00
d78a0fd05d sysinfo: support basic entries
Change-Id: I27f3e55058cc29f895831a1dddfafbc8585746a5
refs: #1389
2020-07-10 14:51:25 +09:00
999bc91b4f arch: Move some functions from arch-dependent to common part
Moved the rt_sigaction syscall and signal-related functions.

Change-Id: I39f619e008d9c6018d91099a76dfb30e48757673
Refs: #1487
2020-07-10 03:54:28 +00:00
b3bd2ea9b3 procfs cpuinfo: use sequence number as processor
Change-Id: Id54ea74c5fda198a0bb9c9b6a19e6799fee0ed3f
2020-07-09 13:10:08 +09:00
d3d9e2400d test: ihklib: syscall_list.h: add robust marker for patch
Change-Id: Ie5f72b4b296db4d44e9839f38fd9a68854be78c3
2020-07-06 16:25:11 +09:00
199407b2a1 spec: prerelease 0.5 for testing ppoll
Change-Id: I51deb1c1703a986ba0aa4e02da9f53009554dbb7
2020-07-01 08:49:08 +09:00
5973d66e2d Revert "epoll_wait(): make sure to schedule in offload"
This reverts commit 5e44c9c9f9.

Change-Id: I826336f1ece31a84072c3e62c6c6c68a641e8fb5
2020-06-30 17:11:26 +09:00
d7ef74659b Revert "epoll, ppoll: deschedule on offload, don't do it when exiting system call"
This reverts commit d4056acfc3.

Change-Id: I7df15b9d3957ca571f4b4e2d576799f8b97ae299
2020-06-30 17:11:23 +09:00
ac86affecc mcexec: fix FLIB_AFFINITY_ON_PROCESS mask for McKernel CPU numbers (Fugaku)
Change-Id: If42b139fb53866bcff0809d898d4a2a712946f0c
2020-06-30 16:29:03 +09:00
2026cf8dad mcexec: explicit CPU list in partitioned execution (for Fujitsu's FLIB_AFFINITY_ON_PROCESS)
Change-Id: I05c11f73553de8ccb5f79083ce2115ac57e62584
2020-06-30 16:29:00 +09:00
1d135492c3 mcexec: detect mismatch of mcexec -n and mpirun -ppn
Change-Id: I0c42e3119143da40ea2e69cd9ec99bde78a0ad2a
Refs: #929
2020-06-30 16:28:08 +09:00
1cfc5ca71f spec: prerelease 0.4 for testing cross-compile
Change-Id: I26908b6b415483711f55338e45d7b2d862b5c028
2020-06-23 08:34:10 +00:00
7ee533d620 spec: remove unnecessary mcinspect*.debug file
Fixes: 612f364 "spec: include recently added debug tools"
Change-Id: I29779132567d18f9468e3cecf2c713ad1c51729b
2020-06-23 08:34:10 +00:00
28334c7a29 cmake: treat libdwarf as required library when cross-compiling
Change-Id: I23ffb46c867b05de0e732c96912d62c630ebb44c
2020-06-23 16:18:35 +09:00
697e9386b3 cmake: fix resolving dwarf.h
Fixes: 0e787b7 "cmake: fix resolving libdwarf"
Change-Id: Iccb491c8ad07db0f15f6b1798ee8a91edc808cf7
2020-06-22 13:33:50 +09:00
0e787b731e cmake: fix resolving libdwarf
Change-Id: I14573f1ac7d779b4c90ed44cc310d4f584374559
2020-06-19 17:24:21 +09:00
612f364e6a spec: include recently added debug tools
Change-Id: I0318fe3551a75c7da774d26bc834c099bb235b67
2020-06-19 13:37:52 +09:00
ceee4c379f spec: prerelease 0.3 for testing fixes related to Fujitsu TSC and ihkmond
Change-Id: I4b9fcac086a3567e6e797f3e7515949c9e214c36
2020-06-18 16:23:43 +09:00
36c981bc34 sync with ihk
Change-Id: I052394121016a030d8873296b4a17b1f038d6b13
2020-06-18 16:23:43 +09:00
fd941dad44 Revert "procfs cpuinfo: use sequence number as processor"
This reverts commit bb7e140655.

Change-Id: If0c1719986706511c1e57d06bc61923d1adfc0aa
2020-06-16 13:26:55 +09:00
5f5b9f94d1 Revert "get_one_cpu_topology: Renumber core_id (physical core id)"
This reverts commit 0a4e6b49b4.

Change-Id: Icd9f2cda63d0daf661a40b146c72608b82cf2061
2020-06-16 13:26:55 +09:00
3f3c4acd71 madvise: do nothing (workaround for Fugaku)
Change-Id: Id2265e7eca4ae296dd22a8e99a2294a9a8b4c4dc
2020-06-16 13:26:54 +09:00
00007dafaa mbind: do nothing (workaround for Fugaku)
Change-Id: Id9d018304e18ed52ea7b0a872e03675c903bce6e
2020-06-16 13:26:54 +09:00
cbe2b2149d Revert "sysinfo, procfs: Support memory info partially"
This reverts commit 8f74888f87.

Change-Id: I65530dd8a4e1af2ca47cb02c02f5c54a9b4595a5
2020-06-16 13:26:54 +09:00
4cecde3fba Revert "mcexec: detect mismatch of mcexec -n and mpirun -ppn"
This reverts commit 72af689e69.

Change-Id: I25bc56cd8ac9c877852fc1092c8349fe318fd25d
2020-06-16 13:26:54 +09:00
8022a2a8c0 treat libfj90 as helper thread spawner (Fugaku specific)
Change-Id: I1f6170c7ebbfae4f575f13ac1f3106d292cd5b6a
2020-06-16 13:26:53 +09:00
3328ce03d9 Record pthread routine address in clone(), keep helper threads on caller CPU core (workaround for Fugaku)
Change-Id: I29d1589e430dc1396558cdf3df4d068c27173612
2020-06-16 13:26:53 +09:00
97b107f61c treat /var/opt/FJSVtcs/ple/daemonif/ as device file (Fugaku specific)
Change-Id: I047ec793a082f2fede3f2bd9c5fb358a30b8ea84
2020-06-16 13:26:53 +09:00
6f3be17c19 do_process_vm_read_writev: don't check vm_range (workaround for Fugaku)
Change-Id: I4ce9b5397ed876dff651c67658e43811d83658dd
2020-06-16 13:26:53 +09:00
dea7d00545 force allow_oversubscribe (workaround for Fugaku)
Change-Id: I5288f5ccbd967004fabbe71bca267feed3b9c2f8
2020-06-16 13:26:53 +09:00
4512778569 force time_sharing (workaround for Fugaku)
Change-Id: Ie3e3a0bbf00ef4e988bdee40d9d4dc93258dd4be
2020-06-16 13:26:52 +09:00
a7adb266ff mcinspect: add read memory value by specifying physical address
Change-Id: I2f2d6cb981e883c5e2ae1e0c764e10e0fec76a46
2020-06-16 13:26:52 +09:00
2566f4f213 devobj_free: don't report error on release-offload failure
Change-Id: I4179dab8cc46557a72eb3447ff0803743a1ba1a2
2020-06-16 13:26:52 +09:00
ac0081eddd handle_interrupt_gicv3: don't take runq_lock
This avoids a deadlock with functions that take the lock via
ihk_mc_spinlock_lock_noirq().

Change-Id: If689e8cc5fff81f627bcf98bfa7df7d4c13f4209
2020-06-16 13:26:52 +09:00
d4056acfc3 epoll, ppoll: deschedule on offload, don't do it when exiting system call
Change-Id: Ib1d0553ca5c50f4de055a1a5fe40b406c9c26dc7
2020-06-16 13:26:52 +09:00
1910543380 armv8pmu_write_counter: sign-extend properly
ihk_mc_event_set_period() calls armv8pmu_write_counter() via
cpu_pmu.write_counter(..., (uint64_t)(-left) & max_period).

Change-Id: I2ac8fbe5957db044ac54946f620163e3c486cb5f
2020-06-16 13:26:51 +09:00
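
A sketch of the fix, assuming the same approach as mainline Linux's
armv8pmu_write_counter(); the two helper names are placeholders:

    /* the caller passes (uint64_t)(-left) & max_period, so for a
     * 32-bit counter the upper half is zero; set it so the counter
     * overflows after 'left' events instead of after 2^64 - left */
    if (!counter_is_64bit(idx))
        value |= 0xffffffff00000000ULL;
    write_counter_reg(idx, value);
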
6332903f0d Revert "xpmem: Support large page attachment"
This reverts commit a8696d811d.

Conflicts:
	kernel/include/process.h
	kernel/syscall.c
	kernel/xpmem.c

Change-Id: I726e74450f6228d3fc78fc62dda15b2067732a53
2020-06-16 13:25:57 +09:00
29d27b7c8d Revert "xpmem: Use correct process_vm in xpmem functions"
This reverts commit 0c63a2a3cd.

Change-Id: I7a67def6c45a67396b15cc55e96ffb5fc5898f28
2020-06-16 13:25:51 +09:00
7136384384 Revert "xpmem: Make sure vm_range is used under memory_range_lock"
This reverts commit 91ea69cf8f.

Conflicts:
	kernel/xpmem.c

Change-Id: Iff3eed010ad3610d63e165f53484ac56528ce384
2020-06-16 13:22:49 +09:00
2fe5c8de2e Revert "xpmem: Fix deadlock in xpmem_remove_process_memory_range()"
This reverts commit d052acab1d.

Change-Id: I31e982465ef9e0936145f27c8d1587c01737ec81
2020-06-16 12:13:49 +09:00
e774e1b984 Revert "xpmem: fix mapping of attachment and segment"
This reverts commit a5fcc91656.

Change-Id: If29415369d724391b291939ecce76482138e82f5
2020-06-16 11:28:02 +09:00
33b7414615 Revert "xpmem: map only resident segment pages at attach time (workaround for Fugaku)"
This reverts commit 3c646e2485.

Change-Id: Ibae8100403586775a32d6eb36c74383131066ac9
2020-06-16 11:27:59 +09:00
3c646e2485 xpmem: map only resident segment pages at attach time (workaround for Fugaku)
Change-Id: I50ac8ba88b208608206b68b4c57e278041913503
2020-06-16 09:17:26 +09:00
a5fcc91656 xpmem: fix mapping of attachment and segment
* The attached part of a segment is now mapped at attach time instead
  of make time, to work with runtimes (e.g. OpenMPI) that xpmem_make
  the entire user address space (see the sketch below)
* Mapping the attached part of a segment at attach time can be turned
  off by specifying xpmem_remote_on_demand in the kernel arguments
* Mapping an attachment chooses the appropriate page sizes, i.e., the
  largest allowed by the memory range and segment page boundary

Fixes: a8696d8 "xpmem: Support large page attachment"
Change-Id: I44663865204036520e5f62fe22b9134ee4629f9b
2020-06-15 10:11:29 +09:00
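
The pattern described above, using the standard XPMEM user API (a
sketch: how segid travels between processes, e.g. over MPI, is out of
scope):

    #include <sys/types.h>
    #include <xpmem.h>

    /* exporter side: runtimes such as OpenMPI xpmem_make the entire
     * address space once at startup */
    xpmem_segid_t export_all(void)
    {
        return xpmem_make(0, XPMEM_MAXADDR_SIZE,
                          XPMEM_PERMIT_MODE, (void *)0600);
    }

    /* importer side: with this commit the attached part is mapped at
     * attach time (or on demand with xpmem_remote_on_demand) */
    void *attach_remote(xpmem_segid_t segid, off_t off, size_t len)
    {
        xpmem_apid_t apid = xpmem_get(segid, XPMEM_RDWR,
                                      XPMEM_PERMIT_MODE, (void *)0600);
        struct xpmem_addr addr = { .apid = apid, .offset = off };

        return xpmem_attach(addr, len, NULL);
    }
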
d370e9241f Toggle preemption while faulting pages
Change-Id: I74201061bb3e7c7c4032e3884658ace87cb85948
2020-06-15 10:11:29 +09:00
3e254c06bf SCD_MSG_WAKE_UP_SYSCALL_THREAD: hold target thread through wake-up
Change-Id: I35b2c56f78430135b2d197d2a2cfe364dbd03947
2020-06-15 10:11:29 +09:00
07537cd2e7 eclair-dump-backtrace: expect script to dump backtrace on all CPUs
Change-Id: I358c5d5ca81903b0eaab88d227c36373164c0950
2020-06-15 10:11:29 +09:00
a37f72da0e futex_wake(): disable IRQs while iterating plist
Change-Id: I796794b2159816183c6487ef0048f42f97aac73b
2020-06-15 10:11:28 +09:00
ab11b168f0 ptrace_setoptions: debug msg
Change-Id: Iea5fdb26884c7af6e3d5aa26b5f71932f730cc9d
2020-06-15 10:11:28 +09:00
eac414d6d8 CPU read/write reg: use generic IHK messaging interface
Change-Id: Ia9637d1516d9329fdadf37822bfce7594d69105f
2020-06-15 10:11:28 +09:00
bb725f5f50 crash: print actual PTE in lookup mode
Change-Id: Ie2c1b97780347d6172ef8961ed62258117cbf115
2020-06-15 10:11:28 +09:00
5224551782 mcinspect: vtop (in progress)
Change-Id: I09f487e96edc7c4f59c97e6fb6dde28baf84c1e5
2020-06-15 10:11:28 +09:00
91146acfe5 Make struct ihk_os_rusage compatible with mckernel_rusage (workaround for Fugaku)
Change-Id: Iebae1e8b0aaf9c23cb1c9411aa1ad111b2e61028
2020-06-15 10:10:57 +09:00
f64731ab34 do_migrate: kick scheduler on target CPU
Change-Id: Ib5875ecf0c6a3118d32973329a6f1595a910562f
2020-06-15 09:58:55 +09:00
cd46cbd4b3 mcinspect and mcps: DWARF based LWK inspection
Change-Id: Ie9e209d8f77999b61afa39c38832bfc416a2c34f
2020-06-15 09:58:54 +09:00
39780917af libdwarf: compile locally if not present
Change-Id: I70d1f653f4fc4ee4daeaa2c9c6bdbf1416e43c9b
2020-06-15 09:58:52 +09:00
0f8f6d298e CMakeLists.txt: fail on missing libraries at config time
Change-Id: Ia7e4cf469d94f97fa1c565e59d2d4587f3a3d081
2020-06-13 17:18:10 +09:00
f8e8b21f04 /dev/shm: use Linux PFNs and populate mappings
Change-Id: I921c1f43c8411f896343be17e0ac6762a1bc26d1
2020-06-13 17:18:10 +09:00
5c2f9b8239 pager: prefetch all shared libraries
Change-Id: Ic62e1284d540362df817098b3926ac223245e3b6
2020-06-13 17:18:10 +09:00
1afc3d9b70 Keep track of number of context switches per CPU
Change-Id: I7a2194c8777a7efcd34e1ed7f4734da03fb4d433
2020-06-13 17:18:10 +09:00
17a8f68d60 set_timer(): treat spin wait as PS_RUNNING
Change-Id: Iea1ad5b0a49a12d5e1aef38ad68fccb8d789af5e
2020-06-13 17:18:10 +09:00
2b9a053504 syscall offload: avoid double IRQ enabling
Change-Id: I202c9f348b66672b1c9f8c146d4e28ec1d9c7658
2020-06-13 17:18:09 +09:00
6441aa1abb __sched_wakeup_thread(): check if timesharing needs to be enabled
Change-Id: I081d700f345abbbdb14dcac3b6246b79475d059b
2020-06-13 17:18:09 +09:00
9b55b68934 Allow other threads to run while waiting for I/O in page faults
Change-Id: I51e847a02a698b0ecf1e356d51599aa1c9400b15
2020-06-13 17:18:09 +09:00
83ef96a739 fileobj: disable IRQs while holding page hash locks, schedule() in I/O loop
Change-Id: Iaf72d55980f1a5df6c93c4a57fa57b0ae5b1d229
2020-06-13 17:18:09 +09:00
b5337358cf IKC: increase message queue sizes
Change-Id: Ib1eee4d26b8304cbee16fe50caabfc2c19e5c2e3
2020-06-13 17:18:09 +09:00
2db3717e57 handle_interrupt_gicv3(): check for CPU_FLAG_NEED_RESCHED as well
Change-Id: Id6ade08e4e572a6d837476de2872126442d3591c
2020-06-13 17:18:09 +09:00
5395891966 pager_req_map: fix printk
Change-Id: I98488169f02656c2df711b827d0002762de69f7a
2020-06-13 17:18:09 +09:00
c32a5e261b PF handler: print VM range's file path if available
Change-Id: I5ba55b19a0b874bc9f4b58e94bfc4afc440e6a8a
2020-06-13 17:18:09 +09:00
c0c80b71ca mmap and fileobj: handle MF_ZEROFILL properly
Change-Id: I6ee52b4cab212b1973339bc8d49065c1ec9263b0
2020-06-13 17:18:09 +09:00
d15a396d5a pager: use host physical for PMIx shared memory
Change-Id: Idfebc768ba03b5536a0e5eb1c6076769806fa7aa
2020-06-13 17:18:08 +09:00
e35ec09da1 UCX: fix page size for shared memory
Change-Id: I75b0beef8345b391e7619887765ed1a89d74c29b
2020-06-13 17:18:08 +09:00
5e44c9c9f9 epoll_wait(): make sure to schedule in offload
Change-Id: I435416cb0ac005a03cd995bf1aae75c9ce7b2082
2020-06-13 17:18:08 +09:00
0f6c36870c mcexec_syscall(): disable "no per-process structure" warning
Change-Id: I951575f0077054ebcfe4b3f7e29416799ab6ade8
2020-06-13 17:18:08 +09:00
2ec2112cc5 IKC: use atomic allocation during initialization
Change-Id: I5bb5d7040092d47e4cdbdad87f9d1dd5b2ceaee5
2020-06-13 17:18:08 +09:00
c86a38e18f physical memory: guard rbtree allocator with IHK_RBTREE_ALLOCATOR macro
Change-Id: I468c6bf1f641875c02b091704ef63f59fd390be5
2020-06-13 17:18:08 +09:00
6aa7b50e26 profile: refactor display code and fix ARM support
Change-Id: Ic48102c42abe17eed014f2bfe7523d0d6f03c2e9
2020-06-13 17:18:08 +09:00
c3c57940ba Memory ordering and usage of ASM cmpxchg() instead of compiler atomic intrinsics
Change-Id: I4dadebc32721744dad982f3fc5b3eea7ab7ca745
2020-06-13 17:18:08 +09:00
7aa2d64294 obtain_clone_cpuid(): avoid locking while partitioned execution
Change-Id: Iabb4784835be7dc9b2f555acc3a711fcc23ee7da
2020-06-13 17:18:08 +09:00
51fe77cdae mmap()/shmget(): use Linux huge page size when not specified
Fixes: 089b443 "mmap()/shmget(): use Linux default huge page size when not specified"
Change-Id: If8043a0993d1131ea0344aa6d500b35c7a291884
2020-06-13 17:18:08 +09:00
d5aafca1ae VM: use RW spinlock for vm_range_lock
Change-Id: Id4654084207d55bf77cc9f8b42795e0f9873cfa0
2020-06-12 03:07:33 +00:00
54b529c82d An arch independent RW spinlock implementation
Change-Id: I426d3f7b643660e6685b5c39c0ae849a9f08b9bb
2020-06-12 03:07:33 +00:00
232bc9c44b README.md: add how to checkout to specific branch or version
Change-Id: Ie727c266d576e601f4901e2f84b98c07ff49aa24
2020-06-11 18:45:52 -04:00
f34373d1c0 README.md: add how to install with rpm
Change-Id: Ic3c0ff6971686d6d64dfcdd5850ae4a70f05f40f
2020-06-11 04:38:08 -04:00
4698ae166c spec: prerelease for testing hugefileobj premap fix
Test target: a2adb0a4 "hugefileobj: rewrite page allocation/handling"

Change-Id: Ibbae5222f54704248911da9f53ca8e4675627bc4
refs: #1475
2020-06-11 04:22:48 -04:00
db9ca358f9 sync with ihk
Change-Id: I769880c52c8cfd06523cea8d77cce5703e783532
2020-06-11 13:51:44 +09:00
16a6a1d08b mcexec: Fix LD_PRELOAD string manipulation (again)
Fixes: 8cf70900 "mcexec: Fix LD_PRELOAD string manipulation"
Change-Id: I6e0188bd60f8e3977beb22c1f9212baf37f37093
2020-06-05 09:25:15 +00:00
2e2e973d78 hugefileobj: rewrite page allocation/handling
* manage pages with an array
* fix: mmap of an fd created by memfd_create() populates the map
* refactor pgsize and pgshift handling

Change-Id: Icaf015b10afc35f2b95f93059adf1a1b6b92e14e
refs: #1475
2020-05-19 23:36:25 -04:00
c3c0b7197f test: perf: prevent overflow counter from stopping counter
Fixes: 1a204b6 "perf: overflow test"
Change-Id: I4d8e93b97f7a8d58ef7811f55b5c995b16c5af69
2020-05-14 01:10:14 +00:00
d086100b35 perf: REFRESH: Don't perform perf_start
Change-Id: I70194467d357770f982d90a6f9b132a61a817fc5
2020-05-14 01:09:52 +00:00
8f74888f87 sysinfo, procfs: Support memory info partially
Change-Id: I597dae4f82d64d3f23889cef960db18ae879ff06
refs: #1389
2020-05-14 00:53:25 +00:00
8e42c2a254 README.md: Add description of Utility Thread offloading Interface (UTI)
Change-Id: Ibeb6e6b91e5f280214e7f78049b6f35e648198c7
2020-05-12 14:14:08 +09:00
caf0f5ef63 cmake: do NOT install crash plugin sources
Fixes "Installed (but unpackaged) file(s) found" rpmbuild error.

Fixes: 04d17dd3 "Define MAP_KERNEL_START by resolving MODULES_END at cmake time"
Change-Id: I80df58ac3c581faf1c48080115b70724eac6aea5
2020-04-20 18:51:15 -04:00
3d030391e8 spec: Update version number to 1.7.0rc4
Change-Id: I1c999cfa632711195a9c8ec9de769075292c40b9
2020-04-17 11:57:53 +09:00
0aeab6b840 NEWS.md: Add 1.7.0rc4 updates
Change-Id: I66ccbe5e8454482155243b89d9b0398994186010
2020-04-17 02:43:56 +00:00
367bbda713 mcexec: Fix resolving library path for LD_PRELOAD
Fixes: 8ee1d61d "Revert "Detect hang of McKernel in mcexec""
Fixes: b87ac8b8 "reproductible builds: remove most install paths in c code"
Change-Id: I8ef9ab81cd0a41ccd0e227ebc3e45c0745c150e9
2020-04-16 20:46:46 +09:00
0082447043 mcctrl_get_request_os_cpu: Fix debug message
Change-Id: I0d2ae427b97b7284d61dd13825d4ba3d2130f26a
2020-04-16 07:44:36 +09:00
4f50c90f6e __mcctrl_os_read_write_cpu_register: Range-check cpu number
Change-Id: I9ef991e1f0a7e301430586c261bf55bf73a4bae9
2020-04-16 07:44:36 +09:00
79950e045e eclair: Improve error message
Change-Id: Ib8fe3df0a529a17a2e331b16cf396915ab6a3eb2
2020-04-16 07:44:36 +09:00
6cf7cebb2d __mcctrl_control: Check user privilege
Change-Id: Ia87ab241f980ea25df805bd31d66f07bf3681311
2020-04-16 07:44:36 +09:00
c9f05f238d Remove unused IHK_OS_STATUS_STOPPED
Change-Id: I4aad8dac06b79a85ca8951cc26c40981c64262bb
2020-04-16 07:28:20 +09:00
f1caaa9b74 freeze: arm64: use normal interrupt instead of NMI
Fixes: 55faba7 "dump: rewrite NMI handling (for resume) and fix PANIC register saving"
Fixes: ff982b8 "freeze: change freeze-thaw to normal interrupt"
Change-Id: I9445cac191f91d20357cae11b2839e4e9384ac6f
2020-04-15 01:04:20 +00:00
97cd379ee2 mcctrl_os_shutdown_notifier: Move wait for running state to ihk side
Change-Id: I363391c63d92d952fc9a60c1e88f964eb50687fd
2020-04-15 00:54:00 +00:00
8ee1d61d0f Revert "Detect hang of McKernel in mcexec"
Change-Id: Ie8a0cf725f84a2f5d85da8b8fb15b30a826ddfcb
2020-04-15 00:50:55 +00:00
04d17dd3e9 Define MAP_KERNEL_START by resolving MODULES_END at cmake time
Change-Id: Ib88fc045b64c4ad2dad6a4b13cb0372a735a26ab
2020-04-09 00:30:05 -04:00
33eef71133 spec: Update version number to 1.7.0rc3
Change-Id: Id07122ececb562ecb4e4cf91e4983b8273c96b34
2020-04-09 00:06:17 -04:00
c10b4a1c16 spec: fix mckernel-devel package
Also fixes the kernel-rpm-macros package resolution issue.

Fixes: 6d584fea "spec: Add mckernel-devel package"
Change-Id: Ide286753c89c3b931665f53dd8270427b19b39eb
2020-04-08 00:25:43 -04:00
8cf70900e7 mcexec: Fix LD_PRELOAD string manipulation
To suppress compiler warnings.

Change-Id: I4d6b5ce2d2a8fca3f2675a7fc309df40cfe3c04b
2020-04-01 01:18:10 -04:00
b2618a98f5 madvise: Support MADV_DONTDUMP and MADV_DODUMP on anonymous map
Change-Id: I231b62ed6803b797ec749ac70a66cdf8236204bd
refs: #1373
2020-03-23 13:06:26 +09:00
01d06cb218 madvise: Add locked-page check to MADV_REMOVE
Change-Id: I95465ef11aa4c772ad0ecf5d25f757192f31b93b
refs: #1372
2020-03-23 13:06:26 +09:00
c78803ac08 madvise: Support MADV_REMOVE on tmpfs
Change-Id: Ic99d374c4d2630944c7bc838937d7f45601783c6
refs: #1371
2020-03-23 13:06:26 +09:00
3300e65efc madvise: Support MADV_WIPEONFORK, MADV_KEEPONFORK and MADV_NORMAL
Change-Id: I1d4cf5affa580d7304dfdc34fa4f1707c0df617c
refs: #1374
2020-03-23 09:13:01 +09:00
d82ac31bc6 faccessat: Specify AT_SYMLINK_NOFOLLOW only when necessary.
- Specify AT_SYMLINK_NOFOLLOW in faccessat only when
   the symbolic link is resolved by overlay_path().

Change-Id: Ie3b1f7fedef7441fd4b39c5c8b2ef0f73cba770e
Refs: #1370
2020-03-20 00:22:50 +00:00
4946fbdd82 Fix "test: runq_lock and over-scheduling fix."
Change-Id: Iedd3b94d6ecd52b9ee67cc9b8a75735428c9fd84
Refs: #1400
2020-03-19 23:34:40 +00:00
33cba1ad48 test: ptrace: Record syscall return value before reporting
Change-Id: I8e9de3bb9bfa0b07eebe472131cc62b53ef5cc8b
Refs: #1287
2020-03-19 23:31:48 +00:00
7c69cfaf67 set_host_vma(): do NOT read protect Linux VMA
Change-Id: Id1e84464c9a06a3886b9cb16b35b1f2dda3c4c30
2020-03-19 02:15:29 +00:00
b3cbdeec84 Fix memory leak when a child exits without being wait()-ed
Change-Id: I8ad9e20e3f3e6f406548a6c4de2bf4dc07c40b0e
Refs: #1349
2020-03-16 04:26:54 +00:00
1d1ec39a27 exec: Correct wrong "=" to "+=".
Change-Id: Iec8c1bb7a12ad7f2e1d4ac07c75482e4d86a0ea2
Refs: #1382
2020-03-16 04:16:03 +00:00
0a4e6b49b4 get_one_cpu_topology: Renumber core_id (physical core id)
Change-Id: I4e4857e9a063d16d19d73adfabfc18a4b461bbfb
Refs: #1439
2020-03-12 05:19:25 +00:00
bb7e140655 procfs cpuinfo: use sequence number as processor
Change-Id: Idbfa48e9b60c03495d7ba72e962c55f0ffb8bec9
2020-03-12 05:19:25 +00:00
32b32f0c4a eclair: query phys memstart on arm64
Change-Id: I32db1153f5c1e4a217db69d8d55f0d0ccfa07c77
2020-03-12 10:53:41 +09:00
bf7fd81c1b Fix includes to handle module ref counter properly
Change-Id: If3f067a14e40c346f0455f8bfb8bbc8ab2934e88
2020-03-12 10:24:01 +09:00
92d191de9e xpmem: handle size 0xffffffffffffffff
Change-Id: I04fbe21966f8a831337576a14119afefe8a2ea4f
2020-03-09 16:26:09 +09:00
baf68f7e71 mcreboot: fix ETCDIR path (cmake 3.14.5 prepends etc)
Change-Id: Ib449ef294ddaf4a4d050d705fd05b8ede8b8150d
2020-03-09 07:21:10 +00:00
26bebb2749 sched_request_migrate(): fix race condition between migration req and IRQs
Make sure the caller thread holds the migration queue lock with IRQs
disabled until it notifies the target CPU, so that an interrupt cannot
deschedule it in the middle of the request (sketched below).

Change-Id: I85995018ca1e8478ccc9723985b6e8efc9c3acfb
2020-03-09 07:05:15 +00:00
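
A sketch of the resulting ordering; ihk_mc_spinlock_lock/unlock and
ihk_mc_interrupt_cpu appear elsewhere in this log, while the queue and
vector names are assumed:

    /* the IPI is sent while the lock is still held with IRQs off,
     * so no interrupt can deschedule the caller in between */
    flags = ihk_mc_spinlock_lock(&v->migq_lock);
    list_add_tail(&req->list, &v->migq);
    ihk_mc_interrupt_cpu(target_cpu, sched_ipi_vector);
    ihk_mc_spinlock_unlock(&v->migq_lock, flags);  /* IRQs back on here */
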
9e2196c9ce fix: memory leak due to forced termination during startup
Change-Id: Ide519f01702bfd17ae4576e04806b6d155ae846a
refs: #1397
2020-03-09 01:10:38 +00:00
93581cb142 test: runq_lock and over-scheduling fix.
Change-Id: I236ab585403076d716be350c8b51e8d352122f2b
Refs: #1400
2020-03-05 15:57:57 +09:00
67f5a1d4e0 migrate-cpu: Prevent migration target from calling schedule() twice
Symptom:
A thread could call schedule() twice.

Cause:
 (1) The migrator raises the rescheduling flag
 (2) The thread calls check_need_resched() for a reason other
     than the migrate IPI, e.g., a response to system call
     offload. It finds that the flag is set and prepares to
     call schedule().
 (3) The thread is interrupted by the migrate IPI, finds that
     the flag is set, and calls schedule() in the interrupt context.
 (4) The thread resumes execution and calls schedule() again

Solution (sketched below):
 (1) Reset the rescheduling flag when checking it and finding it set
 (2) Set it again if it is decided not to call schedule()

Change-Id: I5376662d0b02ca4ebb29b42732e347f3b82d766d
Refs: #1400
2020-03-05 15:51:28 +09:00
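
A sketch of the fixed check; CPU_FLAG_NEED_RESCHED appears elsewhere in
this log, the IRQ and scheduling helpers are assumed:

    flags = cpu_disable_interrupt_save();
    need = !!(v->flags & CPU_FLAG_NEED_RESCHED);
    v->flags &= ~CPU_FLAG_NEED_RESCHED;        /* (1) reset when checked */
    if (need && !ok_to_schedule()) {
        v->flags |= CPU_FLAG_NEED_RESCHED;     /* (2) re-arm instead */
        need = 0;
    }
    cpu_restore_interrupt(flags);
    if (need)
        schedule();    /* at most one path ends up in schedule() */
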
edf7b36669 runq_lock: Fix deadlock due to cpu migration.
Symptom and analysis:
runq_lock of the migration source is acquired on
the migration destination CPU.

This happens in the following steps:
 (1) The thread stores the address of cpu_local_var(runq_lock)
     in a register when trying to perform
     ihk_mc_spinlock_lock() on the lock variable.
 (2) The thread takes an IPI and migrates to another CPU.
 (3) The thread resumes execution and acquires the wrong lock.

Solution (sketched below):
* Disable interrupts before taking the address of
  cpu_local_var(runq_lock)

Change-Id: Ia0ea450b97f872dd6116252537e4a79f85adfc88
Refs: #1400
2020-03-05 01:51:40 +00:00
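
A sketch of the difference; cpu_local_var(runq_lock) and
ihk_mc_spinlock_lock_noirq() are from the messages above, the IRQ
helper is assumed:

    /* before: the lock address could be read on one CPU... */
    lock = &cpu_local_var(runq_lock);
    flags = ihk_mc_spinlock_lock(lock);    /* ...and taken on another */

    /* after: IRQs off first, so no migrate IPI can move the thread
     * between reading the address and taking the lock */
    flags = cpu_disable_interrupt_save();
    lock = &cpu_local_var(runq_lock);
    ihk_mc_spinlock_lock_noirq(lock);
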
1a204b6674 perf: overflow test
Change-Id: Ic7aa0d99ae9a5b7d3ce4436129a360275e6937ca
refs: #1358
2020-03-03 15:55:13 +09:00
305511b48f perf: accumulate counter in overflow handler
Change-Id: If5f5a913e0fde889d1835ffb16c19ea0ad5e685a
2020-03-03 13:23:30 +09:00
606db376fd perf: fix perf_reset
Change-Id: I98122b0f9866bc1cc8713e7bd46fa879917ac6a0
2020-03-03 13:23:30 +09:00
5719b4c64a perf: update event structure
Change-Id: I5bc0fdd42db509b5d2daca7d97e29ad1f7d11f1a
2020-03-03 13:23:30 +09:00
343121c3d0 perf: set event period
Change-Id: Ibf569de7af8697e766c10b8d70905b8cdc4df083
2020-03-03 13:23:30 +09:00
86c45484e3 perf: add struct hw_perf_event
Change-Id: I0938e2b18064ad805a9edb6e15d26cf438bf0a59
2020-03-03 13:23:29 +09:00
767792808a perf: change count variable type to ihk_atomic64_t
Change-Id: I2bb6fab2c040683830b44fa6b963a86a233b883a
2020-03-03 13:23:29 +09:00
117f070fd6 perf: fix PERF_EVENT_IOC_REFRESH
Change-Id: Ia5d3fbe344346aabc3b5d40a801b3c21cfbaac97
2020-03-03 13:23:29 +09:00
a27909be88 ihk_atomic64_set argument to long
Change-Id: Ie9b5978028000236ae5846214a2ea14fcdffaf56
2020-03-03 13:23:29 +09:00
cec6f24559 PMU register support for cpufreq driver.
Change-Id: I11462d25ef83867ddf2e643798d1e3d0257f7f33
2020-03-02 07:14:27 +00:00
b3b8283f87 Add NEWS.md
Change-Id: Iecf193e3d5dac57f87ef8db2f43add5fb99f6a6e
2020-02-27 06:13:25 +00:00
d62f80a7c0 spec: Prevent rpmbuild from including build-id directories into package
Change-Id: Ie935d684eed3780f79f29a588233f5ab54a5f5d7
2020-02-25 10:44:08 +09:00
6d584feaef spec: Add mckernel-devel package
Change-Id: I51e9b88ed18b5a0662d1d77e344b84cb14e2189e
2020-02-25 10:44:08 +09:00
e2e015e120 spec: Remind that kernel-rpm-macros is no longer included in kernel-devel in RHEL-8
Change-Id: I4fb6a2d5f9114d9947b0eb848a21f772a2bece5e
2020-02-25 10:44:06 +09:00
5fb3abe87b spec: Relax Linux kernel version requirement for RHEL-8
Eliminates the need to rebuild the RPM for every RHEL-8 errata release.

Change-Id: I483c22d0b578809117a4f56881b11e51fcc608a7
2020-02-25 10:42:19 +09:00
37fd9e0cd2 test: rt_sigtimedwait: Add test cases for SIG_IGN and real-time signal
Change-Id: I4abafe73d81cfa77167289477ea8c5af701e7f2e
Refs: #1378
Refs: #1440
2020-02-20 04:31:08 +00:00
7e748b4ecb rt_sigtimedwait: could not wait for realtime signal
Change-Id: I341d2f0c9657c3b14eae89dddba074b68c654a12
Refs: #1440
2020-02-13 06:23:22 +00:00
cafb46efc7 rt_sigtimedwait: could not wait for ignored signal
Change-Id: I0f5a8e2eaae2b7c08a01f4ebb2c405b8972269a2
Refs: #1378
2020-02-13 06:23:22 +00:00
41ea9d16c4 mremap: Fix to work correctly when old_page is large_page
Change-Id: I5a589383644a8098d910e49cd7ade6df325e0366
Refs: #1383
2020-02-13 06:15:25 +00:00
4bbdee395e ptrace: fix execve and return value handling (fixes strace on aarch64)
Change-Id: Icb5cb7f7e99fdb74a8628bc6b550688df5fb056b
2020-02-10 07:45:06 +00:00
597baf8445 eclair: support for live debug
Change-Id: Ia9bc126e198ba4a80722529ce09de5eb0775d429
2020-02-10 07:45:06 +00:00
55faba77a5 dump: rewrite NMI handling (for resume) and fix PANIC register saving
Change-Id: I360e9aa8efa64b6ebd99b209a5dd4ee0dc7806cf
2020-02-10 07:45:01 +00:00
6bef773741 eclair and ldump2mcdump: obtain PHYS_OFFSET from dump_mem_chunks
Change-Id: I5dd5f9e7e6b5817e50b0a1855b67f163d3029f17
2020-02-10 07:42:23 +00:00
7882110e9f eclair: obtain MAP_KERNEL_START from kernel image
Change-Id: I946c640ddb2e2b32362760254a86c611517becf3
2020-02-10 07:16:06 +00:00
d1df17ffb7 eclair: fix register GDB response for descheduled threads
Change-Id: I0001d094b624bc03f2b178ec28a4cab51e2acaf0
2020-02-10 07:16:06 +00:00
72af689e69 mcexec: detect mismatch of mcexec -n and mpirun -ppn
Change-Id: Iaf5cfb11c37bea6957b77a2114f783e9a46a48f2
Refs: #929
2020-02-05 06:39:57 +00:00
153d0609de ihk_os_{read,write}_cpu_register: Add async support
Change-Id: Ia2a2098550e856eeffbb20d8d0e0bcd57b85b6d7
2020-01-31 12:40:43 +09:00
83bbb87a0f mbind: fix processing when new range overlaps existing range(s)
Change-Id: I240a0205f0d836e4ff1a16b6739a3b366543bc06
Refs: #1384
2020-01-23 11:27:15 +09:00
f00d03445c epoll_pwait, ppoll, pselect: add sigmask processing
Change-Id: I6aa1db3b4c6ad81a8b5926fa87fc645269b103b6
Refs: #1361
2020-01-09 06:54:23 +00:00
911b07f507 fix: fork race condition caused by child and grandchild
Refs: #1329
Change-Id: Ia2d7641d1203f40155fef5db718d1bb2c583c1c5
2020-01-09 06:33:13 +00:00
5b26fe2956 do_process_vm_read_writev(): access local vector buffer using kernel virtual, PF if necessary
Change-Id: Ic90dca79e32d4151f585a5cbd5b2c7710534db0e
2019-12-23 02:54:52 +00:00
1db00ebc04 release_process_vm: free vm_range_numa_policy
Change-Id: I8084cd60a12b557b635b8e350f70d4e4f95d4c52
Refs: #1101
2019-12-20 07:12:16 +00:00
d5de68e97b eclair and crash: clean up architecture dependent codes and comply with Linux page_offset_base
Change-Id: Ie14ceb8bc9d816a9201dddd4020e2c21d6cfd686
Fujitsu: POSTK_DEBUG_ARCH_DEP_34
2019-12-18 01:53:29 +00:00
1526237bc6 x86 memory: use page_offset_base from linux
RHEL 7.5 and later kernels have a page offset that is no longer
necessarily 0xffff880000000000, leading to kernel panics if we
use the wrong address (lookup sketched below).

Change-Id: I3572fde1c31303a937855c23fbd3815ce0f96c64
2019-12-17 08:05:38 +00:00
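
A sketch of the module-side lookup, assuming the kallsyms approach
mentioned elsewhere in this log; kernels without page_offset_base fall
back to the old constant:

    #include <linux/kallsyms.h>

    static unsigned long mcctrl_linux_page_offset(void)
    {
        /* RHEL >= 7.5 can randomize the direct map; read the live
         * value instead of hardcoding it */
        unsigned long addr = kallsyms_lookup_name("page_offset_base");

        return addr ? *(unsigned long *)addr : 0xffff880000000000UL;
    }
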
b8d96a74ce Fix "arm64: Opt-out NMI for ThunderX2"
Change-Id: I95fabd17bfbae32320ed9e7a520c12e6f9527351
2019-12-17 14:48:10 +09:00
3c256e1a6c overlay: getdents: support lseek
Refs: #1421
Change-Id: Ife7ab1b50159a5897552ff695bb001ada27ec934
2019-12-13 03:49:20 +00:00
7fc4272b89 handle execveat system call on McKernel
Refs: #1366
Change-Id: I921e04a0df8d0d798fc94f675e5112dd2fec190a
2019-12-06 09:33:13 +09:00
d052acab1d xpmem: Fix deadlock in xpmem_remove_process_memory_range()
Refs: #1330
Change-Id: Ib62e3a7fe2811577ba8cabf174f64827e65c422c
2019-12-06 09:32:51 +09:00
91ea69cf8f xpmem: Make sure vm_range is used under memory_range_lock
Refs: #1330
Change-Id: I87a0d6042a2c388fbd260d8dff5d109106478872
2019-12-06 09:32:28 +09:00
0c63a2a3cd xpmem: Use correct process_vm in xpmem functions
Change-Id: I94c06ec69d0fe1e07d0b14bb44b448bbc63b9b63
2019-12-06 09:31:16 +09:00
a8696d811d xpmem: Support large page attachment
Change-Id: I4d672eee1c905160ece204d278f0afd9b6d7dc01
Refs: #1259
2019-12-06 09:30:51 +09:00
569dc33a9c mmap: fail and set -ENODEV when map to unmappable special file
The mappable special files are /dev/mem and /dev/zero (check sketched below).

Change-Id: Id1d4317104f901644e565007913e320d287e376f
2019-12-05 07:22:17 +00:00
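
A sketch of the check, with hypothetical helper names:

    /* only /dev/mem and /dev/zero may be mapped; any other special
     * file fails the way Linux fails drivers without mmap support */
    if ((S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) &&
        !is_mappable_special_file(&st))
        return -ENODEV;
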
4b252a990f SIGCONT: don't terminate process
Change-Id: Ib959a9e5341fda37bd055724ecb9319a469b7420
Refs: #1410
2019-12-05 07:13:56 +00:00
adb6cce3ce The process sending SIGCONT resumes the stopped process.
Change-Id: I64ee10172b99aa58540ffe8e9dd80fa0a64f4d01
Refs: #1420
2019-12-05 07:13:56 +00:00
ed21b6849d procfs: if memory_range_lock fails, process later
Change-Id: I3c5f24548455a63d8d5a4482f5081347f631885a
Refs: #452
2019-12-05 07:08:13 +00:00
37605740a4 support for backlog
Change-Id: Id8f503234e7afaa284e6b97dc264eb3a2af145c7
2019-12-05 07:08:13 +00:00
e069694c12 mem: Fix condition of whether in McKernel
Refs: #1324, #1329
Change-Id: I72bd69dbe65928f083b24513d50d29cabf3d6dff
2019-12-02 03:12:29 +00:00
dca1cb2625 arm64: Opt-out NMI for ThunderX2
Change-Id: I064da55e7e09e6d248c92ece5c56f9a9770c84a0
2019-11-28 02:22:55 +00:00
caac060684 mcctrl_getrusage: Round up cpuacct_stat_{system,user}
Change-Id: Ic1a236865fb3224dc9716c40a1eeb279c1fa1d70
2019-11-28 02:21:47 +00:00
d330721421 Rename struct cpu_topology to mcctrl_cpu_topology
To use a different name from the one in the Linux kernel.

Change-Id: I44d10279195dfc9cfdc4788914b7d65b78292921
Fujitsu: POSTK_DEBUG_ARCH_DEP_40
2019-11-28 02:21:13 +00:00
157eeca41a README.md: Add contact
Change-Id: I3b038780ce91325151dfaef806e43eaaf71fe7e7
2019-11-28 02:09:41 +00:00
8ba725b225 mcstop+release.sh: Continue when releasing CPUs failed
Change-Id: Ib947843006ae9caa602e7b55309e68365edf4b2a
2019-11-28 02:09:01 +00:00
a563d780c1 munmap: fix deadlock with remote pagefault on vm range lock
Add similar protection to clear_host_pte as to set_host_vma (see #986).

Also make the page fault handler skip taking the lock only if the
munmap happened on the same CPU id.

Change-Id: I6d9e68e8f8905b20bb2ccfa72848e04fe6404ab6
2019-11-28 02:07:45 +00:00
621533bbd3 Add ENABLE_PERF macros so that perf support can be toggled
Change-Id: Ic50c8b329af63e63579b6a60b9557344100eaac4
2019-11-26 09:15:05 +09:00
37ea770f8c mmap: Round up map size by pagesize when specified MAP_HUGETLB
To match the behavior of Linux.

Change-Id: I7bcc2cb3c1e678ffc28f6b825c7a55032441dded
2019-11-14 07:24:25 +00:00
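
A sketch of the rounding; MAP_HUGE_SHIFT and MAP_HUGE_MASK are Linux's
encoding of an explicit huge page size, default_hugepagesz is an
assumed variable:

    if (flags & MAP_HUGETLB) {
        /* explicit size is log2-encoded in the high flag bits */
        int shift = (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK;
        size_t hugesz = shift ? (1UL << shift) : default_hugepagesz;

        len = (len + hugesz - 1) & ~(hugesz - 1);  /* round up */
    }
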
edd3ea0103 Revert "memory_range_lock: Enable interrupt when trylock fails"
This reverts commit 0d3ef65092.

Reason for revert: this fix causes a circular dependency between memory_range manipulation and TLB flush. See #1394.

Change-Id: I4774e81ff300c199629e283e538c0a30ad0eeaae
2019-11-11 15:28:08 +09:00
41d37bcd30 mcstop+release: argument for rmmod path specification
Change-Id: I80e4e7136a90bc65050ab8f7d39615581c47f317
2019-10-03 13:58:20 +09:00
309145587f perf_event_open: Add support for counting REF_CPU_CYCLES
Uses the thread's TSC count instead of a performance counter.

Refs: #1025
Change-Id: I1d7a18f1c52f1d52087002d31818638a6b206014
2019-09-26 07:38:04 +00:00
bc06d68d84 sigsuspend: Make sure to receive the correct sigevent from do_kill
Change-Id: Ife9cf36a81f353e0575f6802f1e56f7dd4cb0425
Fujitsu: POSTK_DEBUG_TEMP_FIX_33
Refs: #1350
2019-09-26 07:34:34 +00:00
18412616e1 munmap: Change permission of VMA back to RWX on unmap
Change-Id: Ic02098e7458dd8fa2961fb03dc32e37fb18c5dc5
Refs: #988
2019-09-26 03:49:50 +00:00
c371fbf13b file map: cause SIGBUS when access to a page beyond EOF
Change-Id: Iaf7d792413e674267fd1c05c382212c8f67d8f5b
Refs: #1291
2019-09-26 03:41:23 +00:00
1492f16d67 make syscall_enter arch-dependent
Change-Id: I4317f3443902620ef5b3807ced05c80fa5eebbec
Fujitsu: POSTK_DEBUG_ARCH_DEP_90
Refs: #1357
2019-09-26 03:28:57 +00:00
fd38ab6fd0 Add test results for "syscall offload regardless of mcexec life and death"
Change-Id: Iee759ae8814aff4274ff81dc14f6d5d7a01494c5
Refs: #1321
2019-09-26 03:26:20 +00:00
f115bae8a7 include interrupt handling time into system time
Change-Id: If2ed2d488b4040d288d712f0a244505adbcec6f5
Refs: #1221
2019-09-26 03:21:28 +00:00
ba80dd8650 arm64: Fix for ptrace instruction rewrite on thunder-x2.
- Fix the problem that instruction rewrites via PTRACE_POKETEXT were
   not reflected. The cause is that the instruction cache was not flushed.

 - Add an instruction cache flush in ptrace_report_signal().

Change-Id: Ie9d34d3d33e1fd85aef5fe419345d82c6ca781fb
2019-09-26 02:57:07 +00:00
06960a41d9 test: signalonfork+wait: update error_injection.patch
Change-Id: Ia27e9b2fa6ec757bb05229ba3bf76e5e3bd43e5e
2019-09-26 02:34:46 +00:00
86a2aabb24 test: perf_event: add log of ThunderX2 machine (apollo)
Change-Id: I27aa1e30abdf4ed640a80b4016bcf108262ce9e3
2019-09-26 02:16:08 +00:00
b4101d9c36 brk: Fall back to demand-paging only when physically contiguous memory is unavailable
Change-Id: Id5d937b2cab7de1ad8925c9b95d85fcb620df9c6
Refs: #1353
Fujitsu: POSTK_DEBUG_ARCH_DEP_60
2019-09-26 02:16:08 +00:00
ec31d72483 freeze: add freeze_thaw test
Change-Id: I31db80b89adca9ac354a96ad21073b269d8a0e24
2019-09-26 02:13:23 +00:00
83ade5cdcd freeze: ignore multiple freeze request.
Change-Id: Ib7a7c4677137446cf7f7b387d016bacc7f0e9620
2019-09-26 02:13:23 +00:00
dec133c1dd freeze: restore state with thaw request
Change-Id: I7d6efd2c47020bedb716b6bd72d8a72b874c3cb2
2019-09-26 02:13:23 +00:00
04a528ab27 freeze: no process create in freeze state
Change-Id: Ia9cb7b8fb22d1c9d6c5a3fcdbd2873ef22f27c9f
2019-09-26 02:13:23 +00:00
8e4073c2ca freeze: allow interrupts in frozen state
Change-Id: I1d502f828ab9f9c0e1223d021979ac3dcf4d0c25
2019-09-26 02:13:23 +00:00
ff982b8594 freeze: change freeze-thaw to normal interrupt
Change-Id: Ib4dbac28f0074595e92ef316945b37ef4bc18327
2019-09-26 02:13:23 +00:00
299d47abf5 fork: memory leak detection test.
Change-Id: I9c64f8fdaee15642b3d1d2d7d869927b0bcd6511
2019-09-26 01:56:16 +00:00
f2460695c4 fork: do_fork: free resources when an error is detected
Change-Id: I0a29bb2cf886228effb088afe97d1b614728f517
2019-09-26 01:56:16 +00:00
6ce5c754f3 fork: settid: return error code.
Change-Id: I0678c266d8608b6d557b2b1e29e59bd6861314b8
2019-09-26 01:56:16 +00:00
e932f2e70c fork: release_thread: fix release of cloned thread
Change-Id: I390093bdb47a348cfec287cceaff22712df36bd9
2019-09-26 01:56:16 +00:00
bb08742467 fork: clone_thread: free resources when an error is detected
Change-Id: I922f3fddc35942ef2c67db6673980770731dced9
2019-09-26 01:56:16 +00:00
3e9fdfc0f1 fork: copy_user_ranges: rollback on error
Change-Id: Icdb8399cbce31835abcaeb783dde3ff14d30af6a
2019-09-26 01:56:16 +00:00
58f4593478 fork: fpregs: return error code.
Change-Id: I6ff150a39cd8952adad9b21d0c9f8514126ef957
2019-09-26 01:56:16 +00:00
de0e07f29e schedule: Skip save_fp_regs when the process ends
Change-Id: I32ff71a0dfcd7196d2c9e6cc1d68210933470bbb
Fujitsu: POSTK_DEBUG_ARCH_DEP_106
Refs: #1354
2019-09-25 06:43:08 +00:00
a4b83dc6d4 eclair: use snprintf instead of sprintf to prevent buffer-overrun
Change-Id: I2a27cffe303201e1738f115258f6e02058dbc63d
Refs: #1356
Fujitsu: POSTK_DEBUG_ARCH_DEP_38
2019-09-25 06:38:55 +00:00
beac6c3e80 make checking write-combine arch-dependent
Change-Id: I4c0fca7d34e69b4774141e115b8ebc03c5c1e8b3
Fujitsu: POSTK_DEBUG_ARCH_DEP_12
Refs: #1355
2019-09-23 16:42:26 +09:00
5d6715078f fix: madvise changes only the first one of vm_ranges
Change-Id: I83248c1162e28c3c24ca5f6b0933e1a8ca434d6b
Fujitsu: POSTK_DEBUG_TEMP_FIX_37
Refs: #1351
2019-09-08 14:22:00 +09:00
0615a0b00b procfs: mem: Change permission to 0600
It's 0400 in RHEL-5 and 6, but changed to 0600 in RHEL-7 and 8.

Change-Id: I9fb229e4c447eaa4570b1e2619c4fe039c07c86d
2019-08-19 01:17:03 +00:00
51cd7cbb6c arm64: rusage: Fix counting contiguous PTEs
Change-Id: I7e89c25d49dc1f6efe1c27c76c66c6fedd22af1f
Refs: #1342
2019-08-16 03:55:29 +00:00
0c1cae45fe coredump: Support signal number
Change-Id: If220bcd0865569a566e08aa53cae748fdc6317d0
Refs: #1340
2019-08-08 13:44:15 +09:00
11ef2f8092 coredump: Support threads
Change-Id: Id75ade6c87b15abcff5d772d90f77950376a32c1
Refs: #1219
2019-08-09 04:00:15 +00:00
12aef0b578 arm64: mcctrl: Fixed to search vdso_offset_sigtramp dynamically.
Change-Id: Iab5459194ca5281a1680a7fc26ae8bfaf1945a13
Refs: #1341
2019-08-08 00:48:22 +00:00
9b3450ee7e syscall offload regardless of mcexec life and death
Change-Id: I7db089993d3ee5ae6032f5085db2b67cef99fdfb
Refs: #1321
2019-08-08 00:39:26 +00:00
0d3ef65092 memory_range_lock: Enable interrupt when trylock fails
Also use read-write-lock

Change-Id: I03150b7208325ec1fe422dcd5f931e4e41c8e40e
Refs: #452
2019-08-08 00:38:55 +00:00
258156b57e support for read/write-lock and read/write-trylock
Change-Id: I609071c0f6234d0d413c8b312d8a8379abf6846e
Refs: #1323
2019-08-08 00:38:55 +00:00
8efced7bf7 mmap: Check if size exceeds available memory when MAP_HUGETLB
If the size exceeds it, mmap fails with -ENOMEM.

Change-Id: I4f0d6e18ee3a7c8e32e251b7ed07ee9f76305603
Refs: #1183
2019-08-08 00:31:36 +00:00
2dd8687974 flush instruction cache at context switch time if necessary
Change-Id: Ic09415ea772a9de6dca43a98168a8346ca86d3e7
2019-08-08 00:29:47 +00:00
f0bc1a6b07 cmake: Add option for "mem: per-CPU allocator cache (ThunderX2 workaround)"
Change-Id: I7156cf433b2081246d1d9b8e4fde489609676ef1
2019-08-08 00:29:34 +00:00
c52370b959 test perf_event: minor fixes(add signal handling. etc.)
Change-Id: I837d962bcaf13d3a523f80ff77f75b7fd51a98b7
2019-08-05 16:00:22 +09:00
9c78d4d249 pmu: define event validation in architecture dependent code.
Change-Id: Ia053af146ba3c89810892271cae93def6d9fd7c8
2019-07-31 16:18:50 +09:00
b6285c9aa9 pmu: Use bitmap instead of index to specify counters / events
At the same time, make the software index (or number) equal to the
hardware index.

Change-Id: I847180e94bf2c57644ae2f8f571cdb4a84eac991
2019-07-31 16:17:20 +09:00
b945367c90 pmu: add ihk_mc_perfctr_value function
Change-Id: I88d25586dd470737a3eac4c3a4f1955ae6e41d64
2019-07-23 16:20:17 +09:00
0f434288e1 pmu: change to atomic register access.
Change-Id: Iebbdb8ca97e7a73f9d74138650ae18ce3a0f2605
2019-07-23 16:20:16 +09:00
b5cd813229 pmu: remove comment
Change-Id: If5819ce6f665c668f1f29724a814770957df0de0
2019-07-23 16:20:16 +09:00
7268942c35 pmu: implement ihk_mc_perf_get_num_counters.
Change-Id: I752103aedd9201fc00bda11228ca0bcf5103f12d
2019-07-23 16:20:16 +09:00
f8cad24a9a pmu: move cpu cycle event type comparison to arch dependent code.
Change-Id: If069f8893fe59e3517569b74b3a27b5267ebac03
2019-07-23 16:20:16 +09:00
2b6b3f31e5 pmu: remove pmc_{init|start|stop|reset} system call
Change-Id: I6eb65ed8c18558418c7aabfee75cd1974f4c03ff
2019-07-23 16:20:16 +09:00
ca19ee434a fix: Bug for perf_event_open error code.(LTP:perf_event_open01)
Change-Id: Ia7c942cb3c94ad5e6a0d8640f321f427cd1cd5f9
2019-07-23 16:20:16 +09:00
bb2589bac4 uti: futex_wait: Use kmalloc area for wait queue
Change-Id: Ida994c87334f9613bbf5cbda45b6b5474fd4c6be
2019-07-23 04:53:51 +00:00
e1c6e17400 uti: Use only general registers in libmck_syscall_intercept.so
Change-Id: I8e8e98bdc7e621aa111c0940d915ebe1775a10c3
2019-07-23 04:53:06 +00:00
207eba93ea uti: syscall_backward: Use kmalloc area to pass syscall arguments
Change-Id: I478a9b40b75f3d1d68c4446810a6236fe2f3a96c
Fujitsu: POSTK_DEBUG_ARCH_DEP_106
Refs: #1320
2019-07-22 03:52:44 +00:00
06af2d62c6 pmu: implement event mapping function.
Change-Id: Iac1ec99152b17a19dba0bf1a35f07724b8abc5a1
2019-07-18 16:39:18 +09:00
3e267e24cb exec: Allocate necessary number of pages to argenv area
Change-Id: I298a0de2f4e34ed774e2db7d90167dbe0d35586e
Refs: #1174
2019-07-17 06:38:35 +00:00
e58e1c6e33 uti: cmake: Add include dir pointing to libsyscall_intercept_hook_point.h
Change-Id: Iaea58725a16722d867cb27ffb4d9347b8756f9f2
2019-07-16 04:25:51 +00:00
fb924ebb9d README.md: update packages and git URL
Change-Id: I895dbece58a0ea69b39d1e07d8a16a22a2fed9a7
2019-07-08 04:24:37 +00:00
ac61577414 test: rusage: Add test private-mapping device file
Change-Id: I8b298ce598c2a5560138a1b694ccc7204d4ebbde
2019-07-05 01:18:35 +00:00
4cee9b1a27 rusage: Add comment on counting COW-source pointed-to by only fileobj
Change-Id: I082f6738dd29257c05e8a0e4b0af23dd8ffab449
2019-07-05 01:15:47 +00:00
b55e164669 page_fault_process_memory_range: Disable COW for VM region with zeroobj
This fixes ostest-mem_limits.001, which tries to anonymous-mmap 95% of
total memory. It reports a failure because of the following steps (fix
sketched after this entry):
(1) McKernel tries to allocate physically contiguous area and
    fails
(2) It turns on demand-paging
(3) It tries to obtain a page from zeroobj and fails
(4) It allocates a new page
(5) It performs COW on the page, which is unnecessary

Change-Id: Iddf0548bb9216f9bf91fb03fa21f890e599bfdad
2019-07-04 13:58:22 +09:00
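
A sketch of the fix in those terms; the helper and variable names are
assumed:

    /* a page freshly allocated in place of zeroobj backing is already
     * private to the faulting process, so COW would be pure overhead */
    if (range->memobj && memobj_is_zeroobj(range->memobj))
        cow_needed = 0;
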
aa66fe2cb1 extend_process_region: Fall back to demand paging when not contiguous enough
This fixes ostest-mem_limits.005, which tries to move brk by 95% of
total memory. It reports a failure because McKernel tries to allocate a
physically contiguous area and fails (fallback sketched below).

Change-Id: I50a61cb7103fdbdbe051f0ae276a79e8e2dcdda3
2019-07-03 07:49:45 +00:00
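
A sketch of the fallback, with assumed allocator and flag names:

    /* try to extend the heap as one physically contiguous block */
    p = allocate_contiguous_pages(npages);
    if (!p) {
        /* not contiguous enough: let page faults populate the region */
        range->flag |= VR_DEMAND_PAGING;
        return 0;
    }
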
3b74b0a093 rusage: Move pgsize_to_pgshift to arch-memory.h
Change-Id: Ia10b6e5c7d078d345347a79a3e98c06c16d28d6a
2019-07-02 09:10:04 +00:00
0267a0c8ea procfs: Fix type of number of threads
Change-Id: I7d5d17ae1e619d789cdb843f183be640efdbe9e2
Refs: #1277
2019-06-11 16:51:31 +00:00
b3b7801d51 overlay: fix /proc/PID/task/ corner cases
Change-Id: I17086c684af4c665d0c228b4a65cdb232eccf602
2019-06-07 01:48:10 +00:00
10f1fe76db ARM: set_range_middle(): fix PT deallocation bug
Change-Id: Ic8c1e1193ae33d1ae81e0df362ae1a6944c6c3b2
2019-06-06 01:11:16 +00:00
089b443aaf mmap()/shmget(): use Linux default huge page size when not specified
Change-Id: I8a9e3bed65ac1902adfaeaa254597dd30f540319
2019-06-06 01:09:38 +00:00
e9955a4bba Make heap and stack private mapping
Change-Id: I4306566b3bbbe27d206c5518a2d36d117ba4ca9f
2019-06-05 15:21:20 +09:00
dc52c8a11a crash: use fix kernel mapping instead of module space on ARM
Change-Id: I2d32dac78fc241a89bc98f8c098d4e63c8593e79
2019-06-05 14:31:48 +09:00
bc4629dfb0 ARM: fix performance counters allocation
Change-Id: Ie6c8beacf268462064f59b063d9c7b635c906dc4
2019-06-05 14:31:43 +09:00
99fba2df1c mem: per-CPU allocator cache (ThunderX2 workaround)
Change-Id: I7694524c5e9674a6f7bfcd911f8b0dbbead7df5a
2019-06-03 01:22:03 +00:00
239c95449b x86: add SMP barriers
Change-Id: I7fb36bd3d26fa272697db7c92495ce5fba34aeba
2019-06-03 01:22:03 +00:00
9dfc139eae cmake: kmod: Fix cross compile decision
Consider "arm64" to be "aarch64".
It mistakenly considers cross-compilation when compiled through spack.

Change-Id: I914df482e21517adc1105512ea3d8919ef1577b1
2019-05-22 02:34:55 +00:00
bc81d362b4 madvise: MADV_HUGEPAGE, MADV_NOHUGEPAGE: Fix error check
* Returns -EINVAL except for hugeobj and shmobj
* Fixes ostest-madvise.012 and ostest-madvise.013

Change-Id: Id1f1d6cc0c81edd204228ce5f75b641985e70cee
2019-05-13 05:54:45 +00:00
90b6aec53d get_one_cpu_topology: Fix error-handling
Fix the error handling of the following two functions:
  ihk_device_get_cpu_topology: Returns NULL when not found,
                               valid non-NULL pointer when found
  get_cache_topology: Returns NULL when not found,
                      valid non-NULL pointer when found,
                      minus error number on error

Change-Id: Ied13a61d4ab0c314477c45ea659ff2b798ad97ee
Fujitsu: POSTK_DEBUG_TEMP_FIX_21
2019-04-25 01:53:30 +00:00
0887e0de6d x86_64: mcexec: Remove "#include <asm/prctl.h>" (again)
Change-Id: Iae78954d5b520907cd6a85058e3a9fc1b842999f
Fujitsu: POSTK_DEBUG_ARCH_DEP_77
2019-04-25 10:33:00 +09:00
2c5c47344d x86_64: mcexec: Remove "#include <asm/prctl.h>"
Change-Id: I441f7a1c2e23b927fcd065fefba3ef3617356c18
Fujitsu: POSTK_DEBUG_ARCH_DEP_77
2019-04-25 10:14:19 +09:00
b9f223ceca crash: mcvtop: print proper page sizes for ARM contiguous pages
Change-Id: I2f677e64c743776de491262613b1014fe2bb7a8e
2019-04-23 08:54:26 +00:00
6297181dcd crash: mcps: print both PID and TID
Change-Id: Iafac099b1d953642509711a972962894b6111984
2019-04-23 08:54:14 +00:00
80f964e44f rus_vm_fault(): cleanup and early exit on NULL access
Change-Id: I90b18988989d4e377ed9c35df6b2e6bcdddd13b6
2019-04-23 08:53:59 +00:00
cc07d6e017 mcctrl_get_per_thread_data: Un-inline
Change-Id: I881db244ca551b3ca232918cb0b4245776f17295
Fujitsu: POSTK_DEBUG_ARCH_DEP_56
2019-04-18 02:35:52 +00:00
07c517828d procfs: add number of threads to stat and status
Change-Id: I98dd0868b20e9a1725c7d6e4f8379a4d86769780
2019-04-18 02:20:27 +00:00
75e42badf4 procfs: pagemap: Return EINVAL for unaligned offset
Change-Id: I2297818b0b31790b5452cb6f80dcba4192a7d120
2019-04-12 20:19:14 +09:00
bdccbf7356 MCS: fix ARM64 issue by using smp_XXX() functions (i.e., barrier()s)
Change-Id: I41470c082308c7c1ac91f88db2229958398d2e68
2019-04-10 20:26:13 +09:00
ad3ee26d36 Fix various issues in McKernel crash extension.
Determine V2PHYS_OFFSET dynamically.
Fix x86 hole handling in 64 bit address space.
Fix ARM64 virtual address handling and support separate user-space
and kernel-space translation tables (i.e., TTBR0 and TTBR1).
Fix page table walker's lookup functionality.

Change-Id: I6b281693cdc88bd1b8fe3f4b8f40a6af3ca95cc0
2019-04-09 01:52:49 +00:00
16f8ccb35b mcreboot: do not embed sudo when run as root
Change-Id: I59ebb4c72c12af8600a6d6d0eb13f6459ccf5bc2
2019-04-09 01:52:49 +00:00
3fda54ece8 IHK: support for using Linux work IRQ as IKC interrupt (optional)
Change-Id: I2a0e59a47c229fd9271866199c3c4d30e1ddd7f9
2019-04-09 01:52:49 +00:00
4d252c2bb2 map_fixed_area(): disable debug msg
Change-Id: Id6b3d001d908432c1adb6bba875e158a1424850d
2019-04-09 01:52:49 +00:00
0cf89c5682 Linux lockless linked list implementation
Change-Id: I8bd6ee989cecac269b55b3a0ff10cf8543629001
2019-04-09 01:52:49 +00:00
0d902872a1 x86: fix xchg() and cmpxchg() macros
Change-Id: I6faf0fff8a8595734fca6247634cdae6b86483b3
2019-04-09 01:52:49 +00:00
9b6a88eeeb x86_64: Move arch-specific interrupt vector number to arch-dependent code
Change-Id: Ie3cc631ec351503a619b019432388a827d75334c
Fujitsu: POSTK_DEBUG_ARCH_DEP_75
2019-04-08 01:48:07 +00:00
96b4729cd5 ihk_mc_map_virtual: Release virtual address range on error
It was telling the vmap allocator to release the wrong address range
(a physical address range).

Change-Id: I82236ac0086b5da24ac49219166abf363672d838
Refs: #985
Fujitsu: #11
2019-04-08 00:43:55 +00:00
3372bbfd23 crash extension: port for ARM64
Change-Id: I47a4f13e96718e94c08ee8bc3e9b0be38d7a8a55
2019-03-29 07:55:28 +00:00
f17c30da07 do_mmap: give addr argument a chance even if not MAP_FIXED
hugectl relies on that to check if a range is free

Change-Id: I97963eef15c866f642e884b063b5caf5d827c776
2019-03-29 07:52:57 +00:00
9a0eb915fb Test "QLMPI (qlmpi_testsuite)" on arm64
Change-Id: I079fda2231ffb19b41fe86436d51ce9f83436c9b
2019-03-29 07:48:05 +00:00
a5ded1fc06 Add KNOWN_BUGS file
Document known major bugs (e.g. Linux crashes) that have not been
fixed downstream and might require workarounds on specific
hardware configurations.

Change-Id: I51e5d23243afd4489ce1ae25e736afc27b2c8202
2019-03-29 07:47:28 +00:00
de042b2cb2 IPI: use logical CPU ids in ihk_mc_interrupt_cpu()
Also make remote TLB invalidation arch independent,
removes POSTK_DEBUG_ARCH_DEP_8.

Change-Id: I2b0fbcfa2bfe5da07607863e3e772d8e892e8525
2019-03-29 07:45:06 +00:00
2cee82673b test: perf_event_open: Fix test program
Change-Id: Ie5af8fb3ab7452078f2c35ec14c6369d86eedec3
2019-03-29 07:42:05 +00:00
dfb3bef96d irqbalance_mck: replace extra service with service drop-in
Using a drop-in instead of an extra service avoids having to juggle
between the two services (especially since irqbalance_mck did not have
a Conflicts=irqbalance.service statement).

That way, we only have a single service to check for (irqbalance.service),
and system administrators should find this less confusing if they normally
rely on irqbalance.

The drop-in is also installed in /run, so it will automatically disappear
in the event of a Linux crash or a reboot without shutting down McKernel.

Change-Id: I004f4f25d9ca037e411e0bc91f4555db138ecfef
2019-03-27 15:54:25 +09:00
2dc51530f3 mcreboot/mcstop+release: support for regular user execution
Change-Id: I9088f9c49bea13826bbab6348aa5560e6d91071b
2019-03-27 14:31:08 +09:00
13758417c5 Make boot scripts arch independent and move them to scripts
Change-Id: I3f4c3e366b325df17208a41d5f842c1a2a888494
2019-03-26 09:47:38 +00:00
c32edff2bb uti: rename x86-specific 'fs' to 'tls' + arm implem
Note: the original Fujitsu implementation didn't rename the various
save_fs functions/descriptors to save_tls for some reason; might as
well go all the way though...

Change-Id: Ic362c15c8b320c4d258d2ead8c5fd4eafd9d0ae9
Fujitsu: POSTK_DEBUG_ARCH_DEP_91
2019-03-22 16:38:29 +09:00
8356ef6c96 arm64: uti: Add arch-dependent helper for context switch
arm64 performs the context switch in kernel space, instead of in user
space as on x86_64.

Change-Id: Ib119b9ff014effb970183ee86cfac67fab773cba
Fujitsu: POSTK_DEBUG_ARCH_DEP_99
2019-03-22 06:52:21 +00:00
63d500515a mcexec: fix printf format warning
An old commit from before -Werror was enabled got merged,
blocking other builds. Quickly fix it before anyone notices.

Change-Id: I5a034cef6f79e3e99b381bb1a5d97088e33a6718
2019-03-22 05:25:34 +00:00
791e8c2114 Remove mcoverlayfs code
The mcoverlayfs code is now unused (technically it should work on top
of the soft emulation, but it is not well tested, and untested unused
code is bad). Remove it.

The unshare/bind_mount_recursive code in mcexec is left in a new
MCEXEC_BIND_MOUNT ifdef (only in config.h.in directly, to discourage
use; it disables the ioctl as well, but the main code is still compiled
to keep up to date with Linux API changes... although it uses kallsyms
lookup, so it does not validate much more than "the symbol still
exists").

I honestly think this should go as well (people who would want to use
it are root and could do it manually), but will give up for now.
Change-Id: I832b6a8ab19e24ed67a1a5044b1c6c32381ae0aa
2019-03-22 05:18:43 +00:00
0bb612caea Fix test of getrusage fixes
* fix: Bug for getrusage return incorrect ru_maxrss
* fix: Bug for getrusage(RUSAGE_CHILDREN) return parent info
       (POSTK_DEBUG_TEIX_72)
* fix: Bug for getrusage often return incorrect ru_stime

Refs: #1032
Refs: #1033
Refs: #1034

Change-Id: Ifba95e4cb48ae551839819eb3abe26b37da4b196
2019-03-22 05:15:00 +00:00
5e992bc195 arm64: test: Add Makefile that was ignored commit.
Target commit:
  Test "Direct access to McKernel memory from Linux." on arm64
  Test "Scalable Vector Extension (SVE) support." on arm64

Change-Id: Ia9dc97c5cf0c4cf223423b4257745ea2101bee1d
2019-03-22 05:08:25 +00:00
08f817a654 page fault: clear writable bit for non-dirtying access to shared ranges
Change-Id: I3f3212b2aac79587f04450dfbdee9cb8a56bee04
Fujitsu: POSTK_DEBUG_ARCH_DEP_21
2019-03-22 05:03:03 +00:00
b87ac8b8c0 reproductible builds: remove most install paths in c code
In order to speed up test bot work it would be helpful to check for
identical build outputs and skip tests if required.

This removes most use of the install path in C code:
 - ql_mpi uses /proc/self/exe and looks for talker/server in same
directory as itself
 - mcexec looks for libihk.so in /proc/self/maps and use that path for
LD_PRELOAD prefix path
 - rootfsdir is not used right now but until a better fix happens just
hardcode it, someone who wants to change it can set it through cmake

There is one last occurrence of the install directory, MCEXEC_PATH in
mcctrl's binfmt code, which the build system simply overwrites to a
constant string at build time instead of trying too hard to remove it.
It would be possible to pass it as a kernel parameter or look for
mcexec in PATH, but this is too much work for now.

Change-Id: I5d1352bc5748a1ea10dcae4be630f30a07609296
2019-03-22 05:01:32 +00:00
a48a2cd3e8 add definition of util_register_desc system call number
Change-Id: I2047d33b5667761ce8399bad78eff6ab668b6ce4
2019-03-22 04:58:24 +00:00
7c238c27c9 uti: Check syscall number definition in hook()
Change-Id: I24d226199d03d23a12710ff1cad9fef29a6feedd
2019-03-22 04:58:04 +00:00
de77d2b061 add syscall_intercept.c to the mck_syscall_intercept
Change-Id: Iff8cfd2868118b6a9db7e24e4f00537251d1346c
2019-03-22 04:55:18 +00:00
52f89cf8fa add system call execution for uti
Change-Id: Ide79726b79964e72596ed78c87ec61d1eaf7e1c7
2019-03-22 04:54:34 +00:00
c96dfb0c68 mcstop: add -k to kill processes using /dev/mcos* before shutdown
Use lsof to check for processes that still have /dev/mcosX open at
shutdown time.
If lsof is not installed, the check is simply not done (empty PROCS
result).

If -k is not passed, print a message listing the PIDs of the users and
exit (taking bets that someone will at some point use that with sed to
kill out of mcstop+release and rerun the stop script instead of
passing -k).

Change-Id: Idba7486fdede4990d9885d23f8077f33839daeed
2019-03-22 04:33:33 +00:00
21c9e57646 page fault: use cow for private device mappings
Private device mappings still need copy-on-write to work, even if
there is no page.

Change-Id: I96e3e1eea81104f6b09bb7fda1105d9eeb489155
Refs: #1254
2019-03-22 04:30:55 +00:00
312b6c171b README.md: update package names
Change-Id: Ie4d37d724e60e8e473cb60db8a77b5b3a9681f4e
2019-03-19 02:20:38 +00:00
2ce695b47b proc: resurrect /proc/PID/stat and fix a few fields
Change-Id: I8ffcfde4db78c66ea10845a0451ae2610261f832
2019-03-18 20:33:29 +09:00
e5c1fdf129 MCS lock: make implementation arch independent
Change-Id: Ie5b2182555bbe1a11a005988db069d4b38f85401
2019-03-18 09:53:30 +00:00
9e3dd53c58 arm64: sve: coredump bug fix in non-sve environment.
Change-Id: I4cba5580b6367c67bef457c0273e9b70ad4a0756
2019-03-18 08:12:37 +00:00
fe53c6e0a5 Test "Process swap (swapout)" on arm64
Change-Id: I1eecb046575480966febbcb55e5f4ade6313275b
2019-03-18 08:12:14 +00:00
e988bfaf50 test: uti: Elaborate descriptions of CT12-20
Change-Id: Idfaa5fc3bfc7b65e24873f0c5e15c31a9d129420
2019-03-18 16:59:07 +09:00
f6f48b1210 Test "Direct access to McKernel memory from Linux." on arm64
Change-Id: I6e862146c3b591e671c526302bb1aad787f6bb83
2019-03-18 06:26:43 +00:00
70b42fde5d arm64: cmake: Add -mgeneral-regs-only option.
Change-Id: I0cbdc65c4b95195831344f4006bfc85b1ea58139
2019-03-12 17:26:18 +09:00
ccb36a5849 cmake: change how warning flags are added
Setting CMAKE_C_FLAGS_DEBUG does not work as first expected:
 - set(... CACHE) didn't do anything because the variables were
initialized previously
 - We could set with FORCE but then users could not change the value
 - There is a way to only do that on initial cmake run but it has the
same problem

Thus, use a new regular cache variable directly instead

Change-Id: I20741fb385c171c6c1088bbd6c25666067e07288
2019-03-08 17:22:20 +09:00
ea7f517e3d arm64: ptrace: Fix overwriting 1st argument with return value
Since arm64 shares the return value with the area of
the first argument, rewriting the return value before
the system call execution completes destroys the first argument.
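
Schematically (struct and field names borrowed from Linux's arm64
pt_regs for illustration, not McKernel's exact definitions):
```c
/* On arm64 the syscall return value and the first argument share x0
 * (unlike x86_64, where rax and rdi are distinct), so a value written
 * into regs[0] before the syscall completes clobbers arg0.  Keeping a
 * pristine copy, as Linux does in orig_x0, lets later code still read
 * the real first argument. */
struct arm64_regs_sketch {
	unsigned long regs[31];	/* regs[0] is both arg0 and the return slot */
	unsigned long orig_x0;	/* untouched copy of the first argument */
};

static unsigned long syscall_arg0(const struct arm64_regs_sketch *r)
{
	return r->orig_x0;	/* not regs[0], which may already hold a result */
}
```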

Change-Id: I959944879254d8dd3a29489a65d8f274d45338e6
Fujitsu: POSTK_DEBUG_ARCH_DEP_110
2019-03-08 08:06:19 +00:00
ac18a24a27 arm64: fix phys_to_virt() calculation to be the same as Linux.
Change-Id: Ibbe17d33fd80eacff990b053fa17d8d320c227f1
2019-03-07 16:51:18 +09:00
8880710fad README.md: few minor updates
Change-Id: I7207ab2cf6ca5b69b464e0c41d2dd0ce3e80b674
2019-03-07 13:12:39 +09:00
03a85825ed copy_user_pte: base memobj copy on range & VR_PRIVATE
Some memobjs (e.g. devobj) will not be considered 'in memobj' by
page_is_in_memobj.
Instead of trying to play whack-a-mole with the non-fileobj memobjs,
base the copy check on range's memobj and VR_PRIVATE (do not copy
MAP_SHARED mappings, so the fault handler will do the right thing™
when required)

Change-Id: Ic32cdc7766754f6559753b34845eb8c5cff6ed13
Refs: #1255
2019-03-06 17:44:11 +09:00
940eeca6f5 x86 spinlock trylock: make next initializer old-gcc friendly
old gcc versions are stupid with nested structs and need us
to initialize .tickets.head and .tickets.tail in one go
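
For illustration, assuming a Linux-style ticket lock layout (types
sketched here, not the exact McKernel definitions):
```c
/* Old gcc mishandles partially-designated nested initializers such as
 * { .tickets.head = 1 }, so initialize the whole nested struct at once. */
typedef struct {
	union {
		unsigned int slock;
		struct {
			unsigned short head;
			unsigned short tail;
		} tickets;
	};
} ticket_spinlock_sketch_t;

/* one designated initializer covering both members "in one go" */
ticket_spinlock_sketch_t next = { .tickets = { .head = 1, .tail = 1 } };
```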

Change-Id: I0d4caf8236066e7edf4a12e3270114132ced9585
2019-03-06 06:30:30 +00:00
19b02cf4ed arm64_cpu_capabilities: flatten struct
The midr_* part of the struct was never used, and confuses older gcc
with partially uninitialized assignments that were not correct.
Just flatten the struct

Change-Id: I7a9cfe064ab97cdcd5ac50ce4fb713c4d7983bd3
2019-03-06 06:30:30 +00:00
76a0cc71fc warnings: fix broken -Wmaybe-uninitialized
These variables cannot be used uninitialized, and newer gcc versions
correctly do not bring the warning up, but this will shut up older ones

Change-Id: I2b2ea9b557196a3e7eea1e04dd1f160bd12d6e54
2019-03-06 06:30:30 +00:00
ab39798181 send_syscall: remove unused variables
Change-Id: I0a350b8c7dbf27960544dd3651941d3905f93fc6
2019-03-06 06:30:30 +00:00
0cc3496747 warnings: fix missing field in initializer
use generic struct zero initializer instead.
Older gcc versions used on arm also seem to have trouble with '{}',
so use '{ 0 }' instead
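
For example, with an illustrative struct:
```c
struct fault_stats { long minor; long major; };

/* '{}' is a GNU extension that some older arm gcc builds mishandle;
 * '{ 0 }' zero-initializes every member portably. */
struct fault_stats stats = { 0 };
```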

Change-Id: I83d43b05f8d1d44e1dd86502b48e28fe242e1db2
2019-03-06 06:30:30 +00:00
10cca81401 arm64 vdso warning: fix non-void function not returning
arch_setup_vdso() needs to return something even on panic to please gcc.
In theory, flagging panic() with __attribute__((noreturn)) should work
just the same and is a much better solution, but for some reason on older
gcc versions setting the flag leads to the weak memset() symbol not
being found!?
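
The chosen workaround looks roughly like this (stubbed, illustrative
code; names are not the real McKernel symbols):
```c
/* panic_sketch() is deliberately NOT marked __attribute__((noreturn)),
 * matching the trade-off described above. */
static void panic_sketch(const char *msg)
{
	(void)msg;
	for (;;)
		;	/* never returns in practice */
}

int arch_setup_vdso_sketch(void)
{
	panic_sketch("vdso: setup failed");
	return -1;	/* unreachable, but keeps gcc's warning quiet */
}
```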

Change-Id: Ifed100df5440ca24bb495817db9afc79f0ba6751
2019-03-06 06:30:30 +00:00
0c79de67b4 warnings: disable override-init for arm perfctr arrays
The arrays first init every field to an invalid op and then override a
few fields. Since this is not something we want to allow everywhere,
use a GCC pragma to ignore the warning only there.
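
The pattern, with an illustrative table (the real perfctr arrays differ):
```c
#define INVALID_OP (-1)

/* Suppress -Woverride-init only around this table instead of
 * disabling the warning globally. */
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Woverride-init"
static const int perfctr_map_sketch[8] = {
	[0 ... 7] = INVALID_OP,	/* default every entry to invalid op */
	[0] = 0x11,		/* then override the implemented events */
	[3] = 0x08,
};
#pragma GCC diagnostic pop
```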

Change-Id: I498546fe60d60d4b000d711e22e04c8c360b5b83
2019-03-06 06:30:30 +00:00
3fbad79afb warnings: init pte in process.c functions
pte_make_fileoff() on arm does not always init the pte, so just
init it to PTE_NULL up front

Change-Id: If195c1aef5b1344f13f6c0c76bb431a5fa339265
2019-03-06 06:30:30 +00:00
1b76aaa7e1 unused function warnings: add inline to static function in header files
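For example, in a header (hypothetical helper):
```c
/* A plain 'static' definition in a header triggers -Wunused-function
 * in every translation unit that never calls it; 'static inline'
 * does not. */
static inline int square_sketch(int x)
{
	return x * x;
}
```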
Change-Id: I5d9bb539712a2b3e51c3ab3433a04fbb0cb0b961
2019-03-06 06:30:30 +00:00
aa3c5e91db arm64: Direct access to McKernel memory from Linux.
Change-Id: I1a096aa5232c56382ae19d8c4e4f41d4e3e9f660
2019-03-06 14:53:16 +09:00
20d5900c35 mcstat: fix ihklib.h location
ihklib.h moved since it is no longer a generated file

Change-Id: I1ad6ff4bb8ae8c536d9ad7ee3cbeaf670ebcd11c
2019-03-01 06:24:39 +00:00
414cffd95b tests: remove calls to ihk_os_create/destroy_pseudofs
Change-Id: I04910c6a258c841437463e098fb8e02116c4f711
2019-03-01 06:24:04 +00:00
9ec0aeeab5 debug.h: merge both instances into ihk/debug.h
We do not need two debug.h files.

Take Fujitsu's STATIC_ASSERT over BUILD_BUG_ON because it is more widely used

Change-Id: If04c17fbb7406ab15fe86267fed8d6da460cec62
Fujitsu: POSTK_DEBUG_ARCH_DEP_9
2019-03-01 05:10:35 +00:00
06e96005a6 mcexec: restore --enable-vdso/disable-vdso for x86
Fujitsu added this ifdef together with ifndef __arch64__ and thus disabled
the option for both archs in practice; it probably does not hurt to restore...

I'm not sure I see the point of disabling the option at mcexec level though,
but who am I to care.

Change-Id: I0d4bffb6ed325edac8ae577773e19c0fff6ca2ed
Fujitsu: POSTK_DEBUG_ARCH_DEP_53
2019-03-01 05:08:45 +00:00
4606714c07 process stack: use PAGE_SIZE in aux vector
Don't ask me why this shares POSTK_DEBUG_ARCH_DEP_50 with the ksym lookups...

Change-Id: Ic3db2cd77ca88be361cefec85d8ed9deb21ffcd8
Fujitsu: POSTK_DEBUG_ARCH_DEP_50
2019-03-01 05:08:16 +00:00
a5d5baf8a8 rus_vm_fault: always use a packet on the stack
There are valid use cases where a remote page fault has no thread
data/packet available to use, e.g. when device driver threads
need to access the data (BXI).

Do the per thread data lookup to use the right channel/tid if available,
and use mcctrl_ikc_send_wait with a new message number directly.

The fault is no longer handled in mckernel syscall forwarding code but
in the ikc handler directly in irq, this should be ok because page
faults are interrupts anyway so the code should be irq-safe.

Change-Id: Ie60f413cdaee6c1a824b4a2c93637899cb9bf9c9
2019-03-01 05:08:03 +00:00
8074445d59 README: fix background link in toc
Change-Id: Ief448fd99fddc310ea7f311798c94d0423ebf93a
2019-03-01 05:00:47 +00:00
6a456f11aa cmake: remove unused build-time symbol lookup
Everything already uses kallsyms_lookup_name or similar, this
was leftover from when the build system was ported ages ago

Change-Id: I09dd0249845df90ab2e0adc28d0eb285c0ebb64b
Fujitsu: POSTK_DEBUG_ARCH_DEP_50
2019-03-01 13:49:01 +09:00
81e665cb48 init_process: add missing initializations to proc struct
Change-Id: I4ea386ba3a8745202745bd8e35cab00c38262f65
Fujitsu: POSTK_DEBUG_ARCH_DEP_63
2019-03-01 04:39:59 +00:00
e0b9c5deec nanosleep: add cpu_pause() in spinwait loop
Probably some energy consumption saving?

Change-Id: I888f50568db8f08751abd0a002137c3b475362dc
Fujitsu: POSTK_DEBUG_ARCH_DEP_43
2019-03-01 04:38:51 +00:00
62772c8a24 gencore: Allocate ELF header to heap instead of stack
coredump() proceeds as follows:

1. coredump() calls gencore()
2. gencore() allocates ELF header to stack
3. gencore() prepares the core table, records the address of the ELF
   header in the table, and returns to coredump()
4. coredump() offloads __NR_coredump with the address of the core
   table

This fix prevents the ELF header from getting destroyed in the 3rd
step.
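
A schematic of the fix (structure and sizes are illustrative, not
McKernel's real types):
```c
#include <stdlib.h>
#include <string.h>

struct core_entry_sketch {
	void *addr;
	size_t len;
};

static int gencore_sketch(struct core_entry_sketch *table)
{
	size_t hdr_len = 64;		/* stand-in for the ELF header size */
	void *ehdr = malloc(hdr_len);	/* heap: survives after return */

	if (!ehdr)
		return -1;
	memset(ehdr, 0, hdr_len);
	/* A stack buffer here would be reused once gencore() returned,
	 * before the offloaded __NR_coredump consumed it. */
	table->addr = ehdr;
	table->len = hdr_len;
	return 0;
}
```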

Change-Id: I770418c1658a6fdb640bb491fc076a31dfd41c22
Fujitsu: POSTK_TEMP_FIX_39
2019-03-01 04:38:28 +00:00
63d15f7dfc CMake Kbuild: fail at cmake time if KERNEL_DIR is missing
Change-Id: I66660718841d05003b87995d68bec728aa0db9ba
2019-03-01 04:38:05 +00:00
fb3f1c58a8 rpm: ignore CMakeCache.txt in dist and relax BuildRequires on cross build
CPack takes the source dir as is, so if it was used to build something
it will incorrectly grab the temporary CMakeCache file and cmake will
complain during rpmbuild later on.

The BuildRequires should be a separate patch, but the logic behind the
change is that the dependencies need to be installed in the sysroot, and
rpmbuild cannot test this, so just move them all to only enforce
BuildRequires for native build.

And while we are here, also add a new kernel_dir specfile option.

Change-Id: Ie67932798f632e6d307f8ead93bdbe043e6e8898
2019-03-01 04:37:46 +00:00
69846345de gencore: Zero-clear ELF header and memory range table
Change-Id: I0ff38c1b0e1e6ef204cb3605c0178848dbe40bfb
Fujitsu: POSTK_TEMP_FIX_63
2019-03-01 04:36:00 +00:00
b8155cc618 ihk submodule update: cpu/mem ioctl user access fix
Change-Id: If230c1012af5c1220e5927efba97a2ae38da42a0
2019-03-01 02:12:39 +00:00
f07e20a381 copy_user_pte: vmap area not owned by McKernel
Refs: #1166
Fujitsu: POSTK_DEBUG_TEMP_FIX_14
Change-Id: Iae0f1145d58ec2c14cecc14409b08a1db3b067b7
2019-02-28 07:50:16 +00:00
764948b51f test: Fix test programs for #1195
Add chmod 666 /dev/mcos0 for fork after setuid()

Refs: #1195
Change-Id: I2bec6a9a8378d246f50a9fc08a345b3235096a06
2019-02-28 00:57:22 +00:00
7da5fede8b Test "Scalable Vector Extension (SVE) support." on arm64
Change-Id: I3abaca932985a06b06887b962e769f2eac96c738
2019-02-27 06:26:00 +00:00
6810506c3d rusage: Fix available page sizes
Change-Id: I418075ff4b5341e0f5c7ff317e96461879a60f87
2019-02-22 14:08:18 +09:00
c82c2c1231 uti: Redirect uti thread futex() to McKernel do_futex()
Change-Id: I8203d0b60236e3ec72e22615a52907e1fff2c73c
2019-02-22 04:14:14 +00:00
5bc54a3bbe Fixed time processing.
- arm64: Get TSC corresponding to boot time from IHK.

- x86_64: Calculate the current time using vdso.

Refs: #1186
Fujitsu: POSTK_DEBUG_ARCH_DEP_52
Change-Id: I293ba4bbe5390d50dea44b8a5b7471f59237daff
2019-02-22 04:13:13 +00:00
07aa96ef95 arm64: Scalable Vector Extension (SVE) support.
Change-Id: I3568687913f583edfaa297d5cf5ac91d319d97e9
2019-02-22 04:07:29 +00:00
dac99f708c test: Add test programs for #1195
Refs: #1195
Change-Id: I21339f2597caf1704cc7d104e4bc5835d5270af6
2019-02-19 16:29:00 +09:00
f3c9fbf4ea rusage: Don't count PF_PATCH change
Fujitsu: POSTK_DEBUG_TEMP_FIX_86

Change-Id: Ia23f2d95c67062be3390acafad3e87f087466cdc
2019-02-18 14:50:56 +09:00
54122360e8 CMake: move CONFIG_ARM64_64K_PAGES and VA_BITS up to main CMakeLists
user code also needs these defines; there was a hard-coded
definition left over from debugging that didn't get cleaned up

Change-Id: I951fcd6a3d6bc1d1f1c3e897058908167520f7bc
2019-02-18 10:09:21 +09:00
734 changed files with 46344 additions and 26355 deletions

.gitignore

@ -13,6 +13,10 @@ old_timestamp
CMakeFiles
CMakeCache.txt
Makefile
!test/*/*/Makefile
!test/signalonfork+wait/Makefile
!test/perf_overflow/Makefile
!test/*/*/*.cmd
Kbuild
cmake_install.cmake
config.h
@ -33,3 +37,8 @@ executer/user/libmcexec.a
executer/user/libldump2mcdump.so
executer/user/eclair
tools/mcstat/mcstat
/_CPack_Packages
/CPackSourceConfig.cmake
CPackConfig.cmake
/build
mckernel-*.tar.gz

.gitmodules

@ -1,3 +1,6 @@
[submodule "ihk"]
path = ihk
url = https://github.com/RIKEN-SysSoft/ihk.git
[submodule "executer/user/lib/libdwarf/libdwarf"]
path = executer/user/lib/libdwarf/libdwarf
url = https://github.com/bgerofi/libdwarf.git

CMakeLists.txt

@ -7,71 +7,157 @@ endif (NOT CMAKE_BUILD_TYPE)
enable_language(C ASM)
project(mckernel C ASM)
set(MCKERNEL_VERSION "1.6.0")
set(MCKERNEL_VERSION "1.7.0")
# See "Fedora Packaging Guidlines -- Versioning"
set(MCKERNEL_RELEASE "0.7")
set(CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake/modules)
# for rpmbuild
if(DEFINED SYSCONF_INSTALL_DIR)
set(CMAKE_INSTALL_SYSCONFDIR "${SYSCONF_INSTALL_DIR}")
endif()
include(GNUInstallDirs)
include(CMakeParseArguments)
include(Kbuild)
include(Ksym)
include(CheckCCompilerFlag)
set(CFLAGS_WARNINGS "-Wall -Wextra -Wno-unused-parameter -Wno-sign-compare -Wno-unused-function")
CHECK_C_COMPILER_FLAG(-Wno-implicit-fallthrough IMPLICIT_FALLTHROUGH)
if(IMPLICIT_FALLTHROUGH)
set(CFLAGS_WARNINGS "${CFLAGS_WARNINGS} -Wno-implicit-fallthrough")
endif(IMPLICIT_FALLTHROUGH)
# C flags need to be set before enabling language?
set(CMAKE_C_FLAGS_DEBUG "-g ${CFLAGS_WARNINGS}" CACHE STRING "Debug compiler flags")
set(CMAKE_C_FLAGS_RELEASE "${CFLAGS_WARNINGS}" CACHE STRING "Release compiler flags")
# build options
option(ENABLE_WERROR "Enable -Werror" OFF)
if (ENABLE_WERROR)
add_compile_options("-Werror")
endif(ENABLE_WERROR)
if (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
set(BUILD_TARGET "smp-x86" CACHE STRING "Build target: smp-x86 | smp-arm64")
elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
set(BUILD_TARGET "smp-arm64" CACHE STRING "Build target: smp-x86 | smp-arm64")
endif()
if (BUILD_TARGET STREQUAL "smp-x86")
set(ARCH "x86_64")
elseif (BUILD_TARGET STREQUAL "smp-arm64")
set(ARCH "arm64")
endif()
include(GNUInstallDirs)
include(CMakeParseArguments)
include(Kbuild)
include(CheckCCompilerFlag)
include(AutoconfHelper)
CHECK_C_COMPILER_FLAG(-Wno-implicit-fallthrough IMPLICIT_FALLTHROUGH)
if(IMPLICIT_FALLTHROUGH)
set(EXTRA_WARNINGS "-Wno-implicit-fallthrough")
endif(IMPLICIT_FALLTHROUGH)
# build options
set(CFLAGS_WARNING "-Wall" "-Wextra" "-Wno-unused-parameter" "-Wno-sign-compare" "-Wno-unused-function" ${EXTRA_WARNINGS} CACHE STRING "Warning flags")
add_compile_options(${CFLAGS_WARNING})
option(ENABLE_WERROR "Enable -Werror" OFF)
if (ENABLE_WERROR)
add_compile_options("-Werror")
endif(ENABLE_WERROR)
option(ENABLE_LINUX_WORK_IRQ_FOR_IKC "Use Linux work IRQ for IKC IPI" ON)
if (ENABLE_LINUX_WORK_IRQ_FOR_IKC)
set(KBUILD_C_FLAGS "${KBUILD_C_FLAGS} -DIHK_IKC_USE_LINUX_WORK_IRQ")
add_definitions(-DIHK_IKC_USE_LINUX_WORK_IRQ)
endif()
if (BUILD_TARGET STREQUAL "smp-arm64")
foreach(i RANGE 1 120)
add_definitions(-DPOSTK_DEBUG_ARCH_DEP_${i} -DPOSTK_DEBUG_TEMP_FIX_${i})
set(KBUILD_C_FLAGS "${KBUILD_C_FLAGS} -DPOSTK_DEBUG_ARCH_DEP_${i} -DPOSTK_DEBUG_TEMP_FIX_${i}")
endforeach()
add_definitions(-DCONFIG_ARM64_64K_PAGES -DCONFIG_ARM64_VA_BITS=48)
execute_process(COMMAND awk -F= "$1 == \"CONFIG_ARM64_64K_PAGES\" { print $2; exit; }" "${KERNEL_DIR}/.config"
OUTPUT_VARIABLE CONFIG_ARM64_64K_PAGES OUTPUT_STRIP_TRAILING_WHITESPACE)
execute_process(COMMAND awk -F= "$1 == \"CONFIG_ARM64_VA_BITS\" { print $2; exit; }" "${KERNEL_DIR}/.config"
OUTPUT_VARIABLE CONFIG_ARM64_VA_BITS OUTPUT_STRIP_TRAILING_WHITESPACE)
message("Host kernel CONFIG_ARM64_64K_PAGES=${CONFIG_ARM64_64K_PAGES}")
message("Host kernel CONFIG_ARM64_VA_BITS=${CONFIG_ARM64_VA_BITS}")
if(CONFIG_ARM64_64K_PAGES STREQUAL "y")
if(CONFIG_ARM64_VA_BITS STREQUAL 42)
add_definitions(-DCONFIG_ARM64_PGTABLE_LEVELS=2 -DCONFIG_ARM64_VA_BITS=42 -DCONFIG_ARM64_64K_PAGES)
set(LINKER_SCRIPT "smp-arm64_type3.lds")
elseif(CONFIG_ARM64_VA_BITS STREQUAL 48)
add_definitions(-DCONFIG_ARM64_PGTABLE_LEVELS=3 -DCONFIG_ARM64_VA_BITS=48 -DCONFIG_ARM64_64K_PAGES)
set(LINKER_SCRIPT "smp-arm64_type4.lds")
endif()
else(CONFIG_ARM64_64K_PAGES STREQUAL "y")
if(CONFIG_ARM64_VA_BITS STREQUAL 39)
add_definitions(-DCONFIG_ARM64_PGTABLE_LEVELS=3 -DCONFIG_ARM64_VA_BITS=39)
set(LINKER_SCRIPT "smp-arm64_type1.lds")
elseif(CONFIG_ARM64_VA_BITS STREQUAL 48)
add_definitions(-DCONFIG_ARM64_PGTABLE_LEVELS=4 -DCONFIG_ARM64_VA_BITS=48)
set(LINKER_SCRIPT "smp-arm64_type2.lds")
endif()
endif(CONFIG_ARM64_64K_PAGES STREQUAL "y")
endif()
set_property(CACHE BUILD_TARGET PROPERTY STRINGS smp-x86 smp-arm64)
# define MAP_KERNEL_START
set(tmpdir ${CMAKE_CURRENT_BINARY_DIR}/tmp.resolve_MODULES_END)
file(REMOVE_RECURSE ${tmpdir})
file(MAKE_DIRECTORY ${tmpdir})
file(WRITE ${tmpdir}/driver.c "#include <linux/module.h>\n")
file(APPEND ${tmpdir}/driver.c "unsigned long MAP_KERNEL_START = MODULES_END - (1UL << 23);\n")
file(WRITE ${tmpdir}/Makefile "obj-m := driver.o\n")
file(APPEND ${tmpdir}/Makefile "all:\n")
file(APPEND ${tmpdir}/Makefile "\tmake ${KBUILD_MAKE_FLAGS_STR} -C ${KERNEL_DIR} M=${tmpdir} modules\n")
execute_process(COMMAND make -C ${tmpdir})
execute_process(COMMAND bash -c "offset=`readelf -S ${tmpdir}/driver.ko | grep .data | sed 's/.* //g'`; echo $((0x$offset))"
OUTPUT_VARIABLE MAP_KERNEL_START_OFFSET OUTPUT_STRIP_TRAILING_WHITESPACE)
execute_process(COMMAND bash -c "dd if=${tmpdir}/driver.ko bs=1 skip=${MAP_KERNEL_START_OFFSET} count=8 2>/dev/null | od -tx8 -Ax | head -1 | sed 's|.* |0x|g'"
OUTPUT_VARIABLE MAP_KERNEL_START OUTPUT_STRIP_TRAILING_WHITESPACE)
set(ENABLE_MEMDUMP ON)
option(ENABLE_PERF "Enable perf support" ON)
option(ENABLE_RUSAGE "Enable rusage support" ON)
option(ENABLE_MCOVERLAYFS "Enable overlay filesystem" OFF)
option(ENABLE_QLMPI "Enable qlmpi programs" OFF)
option(ENABLE_UTI "Enable uti support" OFF)
option(ENABLE_UBSAN "Enable undefined behaviour sanitizer on mckernel side" OFF)
option(ENABLE_PER_CPU_ALLOC_CACHE "Enable per-CPU allocator cache (ThunderX2 workaround)" OFF)
find_package(PkgConfig REQUIRED)
set(PKG_CONFIG_USE_CMAKE_PREFIX_PATH ON)
find_library(LIBRT rt)
if (NOT LIBRT)
message(FATAL_ERROR "error: couldn't find librt")
endif()
find_library(LIBNUMA numa)
if (NOT LIBNUMA)
message(FATAL_ERROR "error: couldn't find libnuma")
endif()
find_library(LIBBFD bfd)
if (NOT LIBBFD)
message(FATAL_ERROR "error: couldn't find libbfd")
endif()
find_library(LIBIBERTY iberty)
if (NOT LIBIBERTY)
message(FATAL_ERROR "error: couldn't find libiberty")
endif()
find_library(LIBDWARF dwarf)
if (NOT LIBDWARF)
if (CMAKE_CROSSCOMPILING)
message(FATAL_ERROR "Could not find libdwarf.so, install libdwarf-devel to ${CMAKE_FIND_ROOT_PATH}")
endif()
message("WARNING: libdwarf will be compiled locally")
enable_language(CXX)
else()
# Note that libdwarf-devel provides /usr/include/libdwarf/dwarf.h
# but elfutils-devel provides /usr/include/dwarf.h
# while mcinspect.c performs "#include <dwarf.h>"
find_path(DWARF_H dwarf.h PATH_SUFFIXES libdwarf)
endif()
if (ENABLE_QLMPI)
find_package(MPI REQUIRED)
endif()
if (ENABLE_UTI)
find_library(LIBSYSCALL_INTERCEPT syscall_intercept)
pkg_check_modules(LIBSYSCALL_INTERCEPT REQUIRED libsyscall_intercept)
link_directories(${LIBSYSCALL_INTERCEPT_LIBRARY_DIRS})
endif()
string(REGEX REPLACE "^([0-9]+)\\.([0-9]+)\\.([0-9]+)(-([0-9]+)(.*))?" "\\1;\\2;\\3;\\5;\\6" LINUX_VERSION ${UNAME_R})
@ -81,29 +167,11 @@ list(GET LINUX_VERSION 2 LINUX_VERSION_PATCH)
list(GET LINUX_VERSION 3 LINUX_VERSION_RELEASE)
math(EXPR LINUX_VERSION_CODE "${LINUX_VERSION_MAJOR} * 65536 + ${LINUX_VERSION_MINOR} * 256 + ${LINUX_VERSION_PATCH}")
ksym(sys_mount PREFIX MCCTRL_)
ksym(sys_umount PREFIX MCCTRL_)
ksym(sys_unshare PREFIX MCCTRL_)
ksym(zap_page_range PREFIX MCCTRL_)
ksym(vdso_image_64 PREFIX MCCTRL_)
ksym(vdso_start PREFIX MCCTRL_)
ksym(vdso_end PREFIX MCCTRL_)
ksym(vdso_pages PREFIX MCCTRL_)
ksym(__vvar_page PREFIX MCCTRL_)
ksym(hpet_address PREFIX MCCTRL_)
# POSTK_DEBUG_ARCH_DEP_50, add:find kernel symbol.
ksym(vdso_spec PREFIX MCCTRL_)
ksym(hv_clock PREFIX MCCTRL_)
ksym(sys_readlink PREFIX MCCTRL_)
ksym(walk_page_range PREFIX MCCTRL_)
# compat with various install paths
set(MCKERNEL_LIBDIR ${CMAKE_INSTALL_FULL_LIBDIR})
set(BINDIR ${CMAKE_INSTALL_FULL_BINDIR})
set(SBINDIR ${CMAKE_INSTALL_FULL_SBINDIR})
set(ETCDIR ${CMAKE_INSTALL_FULL_SYSCONFDIR})
set(ROOTFSDIR "${CMAKE_INSTALL_PREFIX}/rootfs")
set(ETCDIR ${CMAKE_INSTALL_PREFIX}/etc)
set(ROOTFSDIR "/rootfs")
if (CMAKE_INSTALL_PREFIX STREQUAL "/usr")
set(KMODDIR "/lib/modules/${UNAME_R}/extra/mckernel")
set(MCKERNELDIR "${CMAKE_INSTALL_FULL_DATADIR}/mckernel/${BUILD_TARGET}")
@ -138,23 +206,23 @@ configure_file(config.h.in config.h)
# actual build section - just subdirs
add_subdirectory(executer/kernel/mcctrl)
if (ENABLE_MCOVERLAYFS)
add_subdirectory(executer/kernel/mcoverlayfs)
endif()
add_subdirectory(executer/user)
add_subdirectory(kernel)
add_subdirectory(tools/mcstat)
add_subdirectory(tools/crash)
configure_file(arch/x86_64/tools/mcreboot-smp-x86.sh.in mcreboot.sh @ONLY)
configure_file(arch/x86_64/tools/mcstop+release-smp-x86.sh.in mcstop+release.sh @ONLY)
configure_file(arch/x86_64/tools/mcreboot.1in mcreboot.1 @ONLY)
configure_file(scripts/mcreboot-smp.sh.in mcreboot.sh @ONLY)
configure_file(scripts/mcstop+release-smp.sh.in mcstop+release.sh @ONLY)
configure_file(scripts/mcreboot.1in mcreboot.1 @ONLY)
configure_file(scripts/eclair-dump-backtrace.exp.in eclair-dump-backtrace.exp @ONLY)
install(PROGRAMS
"${CMAKE_CURRENT_BINARY_DIR}/mcreboot.sh"
"${CMAKE_CURRENT_BINARY_DIR}/mcstop+release.sh"
DESTINATION "${CMAKE_INSTALL_SBINDIR}")
install(FILES
"arch/x86_64/tools/irqbalance_mck.service"
"arch/x86_64/tools/irqbalance_mck.in"
install(PROGRAMS
"${CMAKE_CURRENT_BINARY_DIR}/eclair-dump-backtrace.exp"
DESTINATION "${CMAKE_INSTALL_BINDIR}")
install(FILES "scripts/irqbalance_mck.in"
DESTINATION "${CMAKE_INSTALL_SYSCONFDIR}")
install(FILES "${CMAKE_CURRENT_BINARY_DIR}/mcreboot.1"
DESTINATION "${CMAKE_INSTALL_MANDIR}/man1")
@ -162,7 +230,7 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/mcreboot.1"
configure_file(scripts/mckernel.spec.in scripts/mckernel.spec @ONLY)
set(CPACK_SOURCE_PACKAGE_FILE_NAME "${CMAKE_PROJECT_NAME}-${MCKERNEL_VERSION}")
set(CPACK_SOURCE_IGNORE_FILES "/.git$")
set(CPACK_SOURCE_IGNORE_FILES "/.git/;/build;/CMakeCache.txt$;/CMakeFiles$;/Makefile$")
set(CPACK_SOURCE_INSTALLED_DIRECTORIES "${CMAKE_SOURCE_DIR};/;${IHK_FULL_SOURCE_DIR};/ihk;${CMAKE_BINARY_DIR}/scripts;/scripts")
set(CPACK_SOURCE_GENERATOR "TGZ")
include(CPack)
@ -181,12 +249,14 @@ message("KERNEL_DIR: ${KERNEL_DIR}")
message("SYSTEM_MAP: ${SYSTEM_MAP}")
message("VMLINUX: ${VMLINUX}")
message("KBUILD_C_FLAGS: ${KBUILD_C_FLAGS}")
message("MAP_KERNEL_START: ${MAP_KERNEL_START}")
message("ENABLE_MEMDUMP: ${ENABLE_MEMDUMP}")
message("ENABLE_PERF: ${ENABLE_PERF}")
message("ENABLE_RUSAGE: ${ENABLE_RUSAGE}")
message("ENABLE_MCOVERLAYFS: ${ENABLE_MCOVERLAYFS}")
message("ENABLE_QLMPI: ${ENABLE_QLMPI}")
message("ENABLE_UTI: ${ENABLE_UTI}")
message("ENABLE_WERROR: ${ENABLE_WERROR}")
message("ENABLE_UBSAN: ${ENABLE_UBSAN}")
message("ENABLE_LINUX_WORK_IRQ_FOR_IKC: ${ENABLE_LINUX_WORK_IRQ_FOR_IKC}")
message("ENABLE_PER_CPU_ALLOC_CACHE: ${ENABLE_PER_CPU_ALLOC_CACHE}")
message("-------------------------------")

KNOWN_BUGS.md (new file)

@ -0,0 +1,70 @@
Linux crash when offlining CPU (el7, hardware-specific)
=========================================================
On some hardware with an el7 kernel, Linux can crash due to a bug in the
irq handling when offlining CPUs (the reserve-CPU part of mcreboot).
Example stack trace:
```
[ 4147.052753] BUG: unable to handle kernel NULL pointer dereference at 0000000000000040
[ 4147.060677] IP: [<ffffffff8102ce26>] check_irq_vectors_for_cpu_disable+0x86/0x1c0
[ 4147.068226] PGD 1057e44067 PUD 105f1e7067 PMD 0
[ 4147.072935] Oops: 0000 [#1] SMP
[ 4147.076230] Modules linked in: mcctrl(OE) ihk_smp_x86_64(OE) ihk(OE) xt_CHECKSUM ipt_MASQUERADE nf_nat_masquerade_ipv4 tun rpcsec_gss_krb5 nfsv4 dns_resolver nfs fscache ip6t_rpfilter ipt_REJECT nf_reject_ipv4 ip6t_REJECT nf_reject_ipv6 xt_conntrack ip_set nfnetlink ebtable_nat ebtable_broute bridge stp llc ip6table_nat nf_conntrack_ipv6 nf_defrag_ipv6 nf_nat_ipv6 ip6table_mangle ip6table_security ip6table_raw iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack iptable_mangle iptable_security iptable_raw ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter rpcrdma ib_isert iscsi_target_mod ib_iser libiscsi scsi_transport_iscsi ib_srpt target_core_mod ib_srp scsi_transport_srp scsi_tgt ib_ipoib rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm mlx4_ib ib_core
[ 4147.148619] dm_mirror dm_region_hash dm_log dm_mod sb_edac edac_core intel_powerclamp coretemp ext4 mbcache jbd2 intel_rapl iosf_mbi kvm_intel kvm irqbypass crc32_pclmul ghash_clmulni_intel aesni_intel lrw gf128mul ipmi_ssif glue_helper ablk_helper joydev iTCO_wdt iTCO_vendor_support cryptd ipmi_si ipmi_devintf ipmi_msghandler pcspkr wmi mei_me mei lpc_ich i2c_i801 sg ioatdma shpchp nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs libcrc32c mlx4_en sd_mod crc_t10dif crct10dif_generic mgag200 drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm isci igb drm mlx4_core libsas ahci libahci scsi_transport_sas libata crct10dif_pclmul ptp crct10dif_common pps_core crc32c_intel dca i2c_algo_bit i2c_core devlink [last unloaded: ihk]
[ 4147.215370] CPU: 6 PID: 38 Comm: migration/6 Tainted: G OE ------------ T 3.10.0-693.2.2.el7.x86_64 #1
[ 4147.225672] Hardware name: SGI.COM C1104G-RP5/X9DRG-HF, BIOS 3.0 10/25/2013
[ 4147.232747] task: ffff880174689fa0 ti: ffff8801746ac000 task.ti: ffff8801746ac000
[ 4147.240278] RIP: 0010:[<ffffffff8102ce26>] [<ffffffff8102ce26>] check_irq_vectors_for_cpu_disable+0x86/0x1c0
[ 4147.250275] RSP: 0018:ffff8801746afd30 EFLAGS: 00010046
[ 4147.255608] RAX: 0000000000000000 RBX: 000000000000004e RCX: 0000000000000000
[ 4147.262770] RDX: 0000000000000020 RSI: 000000000000005f RDI: 0000000000000023
[ 4147.269936] RBP: ffff8801746afd58 R08: 0000000000000001 R09: ffff88017f800490
[ 4147.277103] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000006
[ 4147.284269] R13: 0000000000000000 R14: ffff88085ca82500 R15: 000000000000005f
[ 4147.291429] FS: 0000000000000000(0000) GS:ffff88085fb80000(0000) knlGS:0000000000000000
[ 4147.299556] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 4147.305326] CR2: 0000000000000040 CR3: 0000001059704000 CR4: 00000000001407e0
[ 4147.312490] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 4147.319659] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[ 4147.326827] Stack:
[ 4147.328857] ffff8808f43078c8 ffff8808f4307850 0000000000000286 ffff8808f4307701
[ 4147.336384] 0000000000000000 ffff8801746afd70 ffffffff81052a82 0000000200000000
[ 4147.343915] ffff8801746afd88 ffffffff81693ca3 0000000000000003 ffff8801746afdc0
[ 4147.351447] Call Trace:
[ 4147.353921] [<ffffffff81052a82>] native_cpu_disable+0x12/0x40
[ 4147.359795] [<ffffffff81693ca3>] take_cpu_down+0x13/0x40
[ 4147.365236] [<ffffffff81116899>] multi_cpu_stop+0xd9/0x100
[ 4147.370850] [<ffffffff811167c0>] ? cpu_stop_should_run+0x50/0x50
[ 4147.376983] [<ffffffff81116ab7>] cpu_stopper_thread+0x97/0x150
[ 4147.382942] [<ffffffff816a8fad>] ? __schedule+0x39d/0x8b0
[ 4147.388461] [<ffffffff810b909f>] smpboot_thread_fn+0x12f/0x180
[ 4147.394406] [<ffffffff810b8f70>] ? lg_double_unlock+0x40/0x40
[ 4147.400276] [<ffffffff810b098f>] kthread+0xcf/0xe0
[ 4147.405182] [<ffffffff810b08c0>] ? insert_kthread_work+0x40/0x40
[ 4147.411319] [<ffffffff816b4f58>] ret_from_fork+0x58/0x90
[ 4147.418893] [<ffffffff810b08c0>] ? insert_kthread_work+0x40/0x40
[ 4147.426524] Code: 81 fb 00 01 00 00 0f 84 8a 00 00 00 89 d8 65 44 8b 3c 85 20 c6 00 00 45 85 ff 78 e1 44 89 ff e8 91 31 10 00 48 63 15 7e 10 af 00 <48> 8b 70 40 48 c7 c7 80 71 cf 81 49 89 c6 48 83 c2 3f 48 c1 fa
[ 4147.450352] RIP [<ffffffff8102ce26>] check_irq_vectors_for_cpu_disable+0x86/0x1c0
[ 4147.460135] RSP <ffff8801746afd30>
[ 4147.465154] CR2: 0000000000000040
```
This bug has been fixed upstream, but Red Hat will not backport the fixes.
You can work around the problem with a kpatch by backporting the three
following commits:
x86: irq: Get correct available vectors for cpu disable
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=ac2a55395eddccd6e3e39532df9869d61e97b2ee
x86/irq: Check for valid irq descriptor in check_irq_vectors_for_cpu_disable()
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d97eb8966c91f2c9d05f0a22eb89ed5b76d966d1
x86/irq: Use proper locking in check_irq_vectors_for_cpu_disable()
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=cbb24dc761d95fe39a7a122bb1b298e9604cae15
Alternatively, since it is related to the irq configuration, it might
be possible to mitigate the issue by setting the irq affinities early
on and making sure none of the cpus that will be offlined have any irq
configured.

NEWS.md (new file)

@ -0,0 +1,540 @@
=============================================
What's new in version 1.7.0rc4 (Apr 15, 2020)
=============================================
----------------------
McKernel major updates
----------------------
1. arm64: Contiguous PTE support
2. arm64: Scalable Vector Extension (SVE) support
3. arm64: PMU overflow interrupt support
4. xpmem: Support large page attachment
5. arm64 port: Direct access to McKernel memory from Linux
6. arm64 port: utility thread offloading, which spawns thread onto
Linux CPU
7. eclair: support for live debug
8. Crash utility extension
9. Replace mcoverlayfs with a soft userspace overlay
10. Build system is switched to cmake
11. Core dump includes thread information
------------------------
McKernel major bug fixes
------------------------
1. shmobj: Fix rusage counting for large page
2. mcctrl control: task start_time changed to u64 nsec
3. mcctrl: add handling for one more level of page tables
4. Add kernel argument to turn on/off time sharing
5. flatten_string/process env: realign env and clear trailing bits
6. madvise: Add MADV_HUGEPAGE support
8. mcctrl: remove in-kernel calls to syscalls
9. arch_cpu_read_write_register: error return fix.
10. set_cputime(): interrupt enable/disable fix.
11. set_mempolicy(): Add mode check.
12. mbind(): Fix memory_range_lock deadlock.
13. ihk_ikc_recv: Record channel to packet for release
14. Add set_cputime() kernel to kernel case and mode enum.
15. execve: Call preempt_enable() before error-exit
16. memory/x86_64: fix linux safe_kernel_map
17. do_kill(): fix pids table when nr of threads is larger than num_processors
18. shmget: Use transparent huge pages when page size isn't specified
19. prctl: Add support for PR_SET_THP_DISABLE and PR_GET_THP_DISABLE
20. monitor_init: fix undetected hang on highest numbered core
21. init_process_stack: change premapped stack size based on arch
22. x86 syscalls: add a bunch of XXat() delegated syscalls
23. do_pageout: fix direct kernel-user access
24. stack: add hwcap auxval
25. perf counters: add arch-specific perf counters
26. Added check of nohost to terminate_host().
27. kmalloc: Fix address order in free list
28. sysfs: use nr_cpu_ids for cpumasks (fixes libnuma parsing error on ARM)
29. monitor_init: Use ihk_mc_cpu_info()
30. Fix ThunderX2 write-combined PTE flag insanity
31. ARM: eliminate zero page mapping (i.e, init_low_area())
32. eliminate futex_cmpxchg_enabled check (not used and dereffed a NULL pointer)
33. page_table: Fix return value of lookup_pte when ptl4 is blank
34. sysfs: add missing symlinks for cpu/node
35. Make Linux handler run when mmap to procfs.
36. Separate mmap area from program loading (relocation) area
37. move rusage into kernel ELF image (avoid dynamic alloc before NUMA init)
38. arm: turn off cpu on panic
39. page fault handler: protect thread accesses
40. Register PPD and release_handler at the same time.
41. fix missing exclusive processing between terminate() and
finalize_process().
42. perfctr_stop: add flags to no 'disable_intens'
43. fileobj, shmobj: free pages in object destructor (as opposed to page_unmap())
44. clear_range_l1, clear_range_middle: Fix handling contiguous PTE
45. do_mmap: don't pre-populate the whole file when asked for smaller segment
46. invalidate_one_page: Support shmobj and contiguous PTE
47. ubsan: fix undefined shifts
48. x86: disable zero mapping and add a boot pt for ap trampoline
49. rusage: Don't count PF_PATCH change
50. Fixed time processing.
51. copy_user_pte: vmap area not owned by McKernel
52. gencore: Zero-clear ELF header and memory range table
53. rpm: ignore CMakeCache.txt in dist and relax BuildRequires on cross build
54. gencore: Allocate ELF header to heap instead of stack
55. nanosleep: add cpu_pause() in spinwait loop
56. init_process: add missing initializations to proc struct
57. rus_vm_fault: always use a packet on the stack
58. process stack: use PAGE_SIZE in aux vector
59. copy_user_pte: base memobj copy on range & VR_PRIVATE
60. arm64: ptrace: Fix overwriting 1st argument with return value
61. page fault: use cow for private device mappings
62. reproducible builds: remove most install paths in C code
63. page fault: clear writable bit for non-dirtying access to shared ranges
64. mcreboot/mcstop+release: support for regular user execution
65. irqbalance_mck: replace extra service with service drop-in
66. do_mmap: give addr argument a chance even if not MAP_FIXED
67. x86: fix xchg() and cmpxchg() macros
68. IHK: support for using Linux work IRQ as IKC interrupt (optional)
69. MCS: fix ARM64 issue by using smp_XXX() functions (i.e., barrier()s)
70. procfs: add number of threads to stat and status
71. memory_range_lock: Fix deadlock in procfs/sysfs handler
72. flush instruction cache at context switch time if necessary
73. arm64: Fix PMU related functions
74. page_fault_process_memory_range: Disable COW for VM region with zeroobj
75. extend_process_region: Fall back to demand paging when not contiguous
76. munmap: fix deadlock with remote pagefault on vm range lock
77. procfs: if memory_range_lock fails, process later
78. migrate-cpu: Prevent migration target from calling schedule() twice
79. sched_request_migrate(): fix race condition between migration req and IRQs
80. get_one_cpu_topology: Renumber core_id (physical core id)
81. procfs cpuinfo: use sequence number as processor
82. set_host_vma(): do NOT read protect Linux VMA
===========================================
What's new in V1.6.0 (Nov 11, 2018)
===========================================
-----------------------------------------------
McKernel new features, improvements and changes
-----------------------------------------------
1. McKernel and Linux share one unified kernel virtual address space.
That is, McKernel sections reside in Linux sections set aside for
modules. In this way, Linux can access the McKernel kernel memory
area.
2. hugetlbfs support
3. IHK is now included as a git submodule
4. Debug messages are turned on/off on a per-source-file basis at run-time.
5. It's prohibited for McKernel to access physical memory ranges which
Linux didn't give to McKernel.
6. UTI (capability to spawn a thread on Linux CPU) improvement:
* System calls issued from the thread are hooked by modifying
binary in memory.
---------------------------
McKernel bug fixes (digest)
---------------------------
#<num> below corresponds to the redmine issue number
(https://postpeta.pccluster.org/redmine/).
1. #926: shmget: Hide object with IPC_RMID from shmget
2. #1028: init_process: Inherit parent cpu_set
3. #995: Fix shebang recorded in argv[0]
4. #1024: Fix VMAP virtual address leak
5. #1109: init_process_stack: Support "ulimit -s unlimited"
6. x86 mem init: do not map identity mapping
7. mcexec_wait_syscall: requeue potential request on interrupted wait
8. mcctrl_ikc_send_wait: fix interrupt with do_frees == NULL
9. pager_req_read: handle short read
10. kprintf: only call eventfd() if it is safe to interrupt
11. process_procfs_request: Add Pid to /proc/<PID>/status
12. terminate: fix oversubscribe hang when waiting for other threads on same CPU to die
13. mcexec: Do not close fd returned to mckernel side
14. #976: execve: Clear sigaltstack and fp_regs
15. #1002: perf_event: Specify counter by bit_mask on start/stop
16. #1027: schedule: Don't reschedule immediately when wake up on migrate
17. mcctrl: lookup unexported symbols at runtime
18. __sched_wakeup_thread: Notify interrupt_exit() of re-schedule
19. futex_wait_queue_me: Spin-sleep when timeout and idle_halt is specified
20. #1167: ihk_os_getperfevent,setperfevent: Timeout IKC sent by mcctrl
21. devobj: fix object size (POSTK_DEBUG_TEMP_FIX_36)
22. mcctrl: remove rus page cache
23. #1021: procfs: Support multiple reads of e.g. /proc/*/maps
24. #1006: wait: Delay wake-up parent within switch context
25. #1164: mem: Check if phys-mem is within the range of McKernel memory
26. #1039: page_fault_process_memory_range: Remove ihk_mc_map_virtual for CoW of device map
27. partitioned execution: pass process rank to LWK
28. process/vm: implement access_ok()
29. spinlock: rewrite spinlock to use Linux ticket head/tail format
30. #986: Fix deadlock involving mmap_sem and memory_range_lock
31. Prevent one CPU from getting chosen by concurrent forks
32. #1009: check_signal: system call restart is done only once
33. #1176: syscall: the signal received during system call processing is not processed.
34. #1036 syscall_time: Handle by McKernel
35. #1165 do_syscall: Delegate system calls to the mcexec with the same pid
36. #1194 execve: Fix calling ptrace_report_signal after preemption is disabled
37. #1005 coredump: Exclude special areas
38. #1018 procfs: Fix pread/pwrite to procfs fail when specified size is bigger than 4MB
39. #1180 sched_setaffinity: Check migration after decrementing in_interrupt
40. #771, #1179, #1143 ptrace supports threads
41. #1189 procfs/do_fork: wait until procfs entries are registered
42. #1114 procfs: add '/proc/pid/stat' to mckernel side and fix its comm
43. #1116 mcctrl procfs: check entry was returned before using it
44. #1167 ihk_os_getperfevent,setperfevent: Return -ETIME when IKC timeouts
45. mcexec/execve: fix shebangs handling
46. procfs: handle 'comm' on mckernel side
47. ihk_os_setperfevent: Return number of registered events
48. mcexec: fix terminating zero after readlink()
===========================================
What's new in V1.5.1 (July 9, 2018)
===========================================
-----------------------------------------------
McKernel new features, improvements and changes
-----------------------------------------------
1. Watchdog timer to detect hang of McKernel
mcexec prints out the following line to its stderr when a hang of
McKernel is detected.
mcexec detected hang of McKernel
The watchdog timer is enabled by passing -i <timeout_in_sec> option
to mcreboot.sh. <timeout_in_sec> specifies the interval of checking
if McKernel is alive.
Example: mcreboot.sh -i 600: Detect the hang with 10 minutes interval
The detailed step of the hang detection is as follows.
(1) mcexec acquires eventfd for notification from IHK and perform
epoll() on it.
(2) A daemon called ihkmond monitors the state of McKernel periodically
with the interval specified by the -i option. It judges that
McKernel is hanging and notifies mcexec via the eventfd if its
state hasn't changed since the last check (a sketch of the mcexec
side follows this list).
2. Documentation
man page: Installed directory is changed to <install_dir>/share/man
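A minimal sketch of the mcexec side of this flow, assuming an eventfd
handed over by IHK (function name hypothetical):
```c
#include <stdint.h>
#include <stdio.h>
#include <sys/epoll.h>
#include <unistd.h>

static void watch_mckernel_hang(int hang_efd)	/* eventfd from IHK */
{
	int ep = epoll_create1(0);
	struct epoll_event ev = { .events = EPOLLIN, .data.fd = hang_efd };

	epoll_ctl(ep, EPOLL_CTL_ADD, hang_efd, &ev);
	if (epoll_wait(ep, &ev, 1, -1) == 1) {
		uint64_t cnt;

		(void)read(hang_efd, &cnt, sizeof(cnt));  /* drain counter */
		fprintf(stderr, "mcexec detected hang of McKernel\n");
	}
	close(ep);
}
```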
---------------------------
McKernel bug fixes (digest)
---------------------------
1. #1146: pager_req_map(): do not take mmap_sem if not needed
2. #1135: prepare_process_ranges_args_envs(): fix saving cmdline
3. #1144: fileobj/devobj: record path name
4. #1145: fileobj: use MCS locks for per-file page hash
5. #1076: mcctrl: refactor prepare_image into new generic ikc send&wait
6. #1072: execve: fix execve with oversubscribing
7. #1132: execve: use thread variable instead of cpu_local_var(current)
8. #1117: mprotect: do not set page table writable for cow pages
9. #1143: syscall wait4: add _WALL (POSTK_DEBUG_ARCH_DEP_44)
10. #1064: rusage: Fix initialization of rusage->num_processors
11. #1133: pager_req_unmap: Put per-process data at exit
12. #731: do_fork: Propagate error code returned by mcexec
13. #1149: execve: Reinitialize vm_regions's map area on execve
14. #1065: procfs: Show file names in /proc/<PID>/maps
15. #1112: mremap: Fix type of size arguments (from ssize_t to size_t)
16. #1121: sched_getaffinity: Check arguments in the same order as in Linux
17. #1137: mmap, mremap: Check arguments in the same order as in Linux
18. #1122: fix return value of sched_getaffinity
19. #732: fix: /proc/<PID>/maps outputs a unnecessary NULL character
===================================
What's new in V1.5.0 (Apr 5, 2018)
===================================
--------------------------------------
McKernel new features and improvements
--------------------------------------
1. Aid for Linux version migration: Detect /proc, /sys format change
between two kernel versions
2. Swap out
* Only swap-out anonymous pages for now
3. Improve support of /proc/maps
4. mcstat: Linux tool to show resource usage
---------------------------
McKernel bug fixes (digest)
---------------------------
1. #727: execve: Fix memory leak when receiving SIGKILL
2. #829: perf_event_open: Support PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE
3. #906: mcexec: Check return code of fork()
4. #1038: mcexec: Timeout when incorrect value is given to -n option
5. #943 #945 #946 #960 #961: mcexec: Support strace
6. #1029: struct thread is not released with stress-test involving signal
and futex
7. #863 #870: Respond immediately to terminating signal when
offloading system call
8. #1119: translate_rva_to_rpa(): use 2MB blocks in 1GB pages on x86
11. #898: Shutdown OS only after no in-flight IKC exist
12. #882: release_handler: Destroy objects as the process which opened it
13. #882: mcexec: Make child process exit if the parent is killed during
fork()
14. #925: XPMEM: Don't destroy per-process object of the parent
15. #885: ptrace: Support the case where a process attaches its child
16. #1031: sigaction: Support SA_RESETHAND
17. #923: rus_vm_fault: Return error when a thread not performing
system call offloading causes remote page fault
18. #1032 #1033 #1034: getrusage: Fix ru_maxrss, RUSAGE_CHILDREN,
ru_stime related bugs
19. #1120: getrusage: Fix deadlock on thread->times_update
20. #1123: Fix deadlock related to wait_queue_head_list_node
21. #1124: Fix deadlock of calling terminate() from terminate()
22. #1125: Fix deadlock related to thread status
* Related functions are: hold_thread(), do_kill() and terminate()
23. #1126: uti: Fix uti thread on the McKernel side blocks others in do_syscall()
24. #1066: procfs: Show Linux /proc/self/cgroup
25. #1127: prepare_process_ranges_args_envs(): fix generating saved_cmdline to
avoid PF in strlen()
26. #1128: ihk_mc_map/unmap_virtual(): do proper TLB invalidation
27. #1043: terminate(): fix update_lock and threads_lock order to avoid deadlock
28. #1129: mcreboot.sh: Save /proc/irq/*/smp_affinity to /tmp/mcreboot
29. #1130: mcexec: drop READ_IMPLIES_EXEC from personality
--------------------
McKernel workarounds
--------------------
1. Forbid CPU oversubscription
* It can be turned on by mcreboot.sh -O option
===================================
What's new in V1.4.0 (Oct 30, 2017)
===================================
-----------------------------------------------------------
Feature: Abstracted event type support in perf_event_open()
-----------------------------------------------------------
PERF_TYPE_HARDWARE and PERF_TYPE_CACHE types are supported.
----------------------------------
Clean-up: Direct user-space access
----------------------------------
Code lines using direct user-space access (e.g. passing user-space
pointer to memcpy()) becomes more portable across processor
architectures. The modification follows the following rules.
1. Move the code section as it is to the architecture dependent
directory if it is a part of the critical-path.
2. Otherwise, rewrite the code section by using the portable methods.
The methods include copy_from_user(), copy_to_user(),
pte_get_phys() and phys_to_virt().
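An illustration of rule 2, assuming a copy_from_user() whose signature
mirrors Linux's helper (declaration and error value sketched):
```c
#include <stddef.h>

/* assumed to mirror Linux: returns non-zero on fault */
long copy_from_user(void *dst, const void *src, size_t n);

#define EFAULT 14

int get_user_flags_sketch(const unsigned long *uptr, unsigned long *out)
{
	/* before: *out = *uptr;  -- direct user-space dereference */
	if (copy_from_user(out, uptr, sizeof(*out)))
		return -EFAULT;
	return 0;
}
```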
--------------------------------
Test: MPI and OpenMP micro-bench
--------------------------------
The performance figures of MPI and OpenMP primitives are compared with
those of Linux by using Intel MPI Benchmarks and EPCC OpenMP Micro
Benchmark.
===================================
What's new in V1.3.0 (Sep 30, 2017)
===================================
--------------------
Feature: Kernel dump
--------------------
1. A dump level of "only kernel memory" is added.
The following two levels are available now:
0: Dump all
24: Dump only kernel memory
The dump level can be set by -d option in ihkosctl or the argument
for ihk_os_makedumpfile(), as shown in the following examples:
Command: ihkosctl 0 dump -d 24
Function call: ihk_os_makedumpfile(0, NULL, 24, 0);
2. Dump file is created when Linux panics.
The dump level can be set by dump_level kernel argument, as shown in the
following example:
ihkosctl 0 kargs "hidos dump_level=24"
The IHK dump function is registered to panic_notifier_list when creating
/dev/mcdX and called when Linux panics.
-----------------------------
Feature: Quick Process Launch
-----------------------------
MPI process launch time and some of the initialization time can be
reduced in application consisting of multiple MPI programs which are
launched in turn in the job script.
The following two steps should be performed to use this feature:
1. Replace mpiexec with ql_mpiexec_start and add some lines for
ql_mpiexec_finalize in the job script
2. Modify the app so that it can repeat calculations and wait for the
instructions from ql_mpiexec_{start,finalize} at the end of the
loop
The first step is explained using an example. Assume the original job
script looks like this:
/* Execute ensemble simulation and then data assimilation, and repeat this
ten times */
for i in {1..10}; do
/* Each ensemble simulation execution uses 100 nodes, launch ten of them
in parallel */
for j in {1..10}; do
mpiexec -n 100 -machinefile ./list1_$j p1.out a1 & pids[$i]=$!;
done
/* Wait until the ten ensemble simulation programs finish */
for j in {1..10}; do wait ${pids[$j]}; done
/* Launch one data assimilation program using 1000 nodes */
mpiexec -n 1000 -machinefile ./list2 p2.out a2
done
The job script should be modified like this:
for i in {1..10}; do
for j in {1..10}; do
/* Replace mpiexec with ql_mpiexec_start */
ql_mpiexec_start -n 100 -machinefile ./list1_$j p1.out a1 & pids[$j]=$!;
done
for j in {1..10}; do wait ${pids[$j]}; done
ql_mpiexec_start -n 1000 -machinefile ./list2 p2.out a2
done
/* p1.out and p2.out don't exit but are waiting for the next calculation.
So tell them to exit */
for j in {1..10}; do
ql_mpiexec_finalize -machinefile ./list1_$i p1.out a1;
done
ql_mpiexec_finalize -machinefile ./list2 p2.out a2;
The second step is explained using a pseudo-code.
MPI_Init();
Prepare data exchange with preceding / following MPI programs
loop:
foreach Fortran module
Initialize data using command-line arguments, parameter files,
environment variables
Input data from preceding MPI programs / Read snap-shot
Perform main calculation
Output data to following MPI programs / Write snap-shot
/* ql_client() waits for command of ql_mpiexec_{start,finalize} */
if (ql_client() == QL_CONTINUE) { goto loop; }
MPI_Finalize();
qlmpilib.h should be included in the code and libql{mpi,fort}.so
should be linked to the executable file.
========================
Restrictions on McKernel
========================
1. Pseudo devices such as /dev/mem and /dev/zero are not mmap()ed
correctly even if the mmap() returns a success. An access to their
mapping receives the SIGSEGV signal.
2. clone() supports only the following flags. All the other flags
cause clone() to return error or are simply ignored.
* CLONE_CHILD_CLEARTID
* CLONE_CHILD_SETTID
* CLONE_PARENT_SETTID
* CLONE_SETTLS
* CLONE_SIGHAND
* CLONE_VM
3. PAPI has the following restriction.
* Number of counters a user can use at the same time is up to the
number of the physical counters in the processor.
4. msync writes back only the modified pages mapped by the calling process.
5. The following syscalls always return the ENOSYS error.
* migrate_pages()
* move_pages()
* set_robust_list()
6. The following syscalls always return the EOPNOTSUPP error.
* arch_prctl(ARCH_SET_GS)
* signalfd()
7. signalfd4() returns a fd, but signal is not notified through the
fd.
8. set_rlimit sets the limit values but they are not enforced.
9. Address randomization is not supported.
10. brk() extends the heap more than requested when the -h
(--extend-heap-by=)<step> option of mcexec is used with a value
larger than 4 KiB. syscall_pwrite02 of LTP would fail for this
reason. This is because the test expects that the end of the heap
is set to the same address as the argument of sbrk() and that a
segmentation violation occurs when it tries to access the memory
area right next to the boundary. However, the optimization sets
the end to a value larger than the requested one, so the expected
segmentation violation doesn't occur.
11. setpriority()/getpriority() won't work. They might set/get the
priority of a random mcexec thread. This is because there's no
fixed correspondence between a McKernel thread which issues the
system call and a mcexec thread which handles the offload request.
12. mbind() can set the policy but it is not used when allocating
physical pages.
13. MPOL_F_RELATIVE_NODES and MPOL_INTERLEAVE flags for
set_mempolicy()/mbind() are not supported.
14. The MPOL_BIND policy for set_mempolicy()/mbind() works as the same
as the MPOL_PREFERRED policy. That is, the physical page allocator
doesn't give up the allocation when the specified nodes are
running out of pages but continues to search pages in the other
nodes.
15. Kernel dump on Linux panic requires Linux kernel CentOS-7.4 and
later. In addition, crash_kexec_post_notifiers kernel argument
must be given to Linux kernel.
16. setfsuid()/setfsgid() cannot change the id of the calling thread.
Instead, it changes that of the mcexec worker thread which takes
the system-call offload request.
17. mmap (hugeTLBfs): The physical pages corresponding to a map are
released when no McKernel process exist. The next map gets fresh
physical pages.
18. Sticky bit on executable file has no effect.
19. Linux (RHEL-7 for x86_64) could hang when offlining CPUs in the
process of booting McKernel due to the Linux bug, found in
Linux-3.10 and fixed in the later version. One way to circumvent
this is to always assign the same CPU set to McKernel.
20. madvise:
* MADV_HWPOISON and MADV_SOFT_OFFLINE always return -EPERM.
* MADV_MERGEABLE and MADV_UNMERGEABLE always return -EINVAL.
* MADV_HUGEPAGE and MADV_NOHUGEPAGE on a file map return -EINVAL
(they succeed on RHEL-8 for aarch64).
21. brk() and mmap() don't report out-of-memory through their return
values. Instead, the page fault reports the error.
22. Anonymous mmap pre-maps the requested number of pages when
contiguous pages are available. Demand paging is used when not
available.
23. Mixing page sizes in anonymous shared mapping is not allowed. mmap
creates vm_range with one page size. And munmap or mremap that
needs the reduced page size changes the sizes of all the pages of
the vm_range.
24. ihk_os_getperfevent() could time-out when invoked from Fujitsu TCS
(job-scheduler).
25. The behaviors of madvise and mbind are changed to do nothing and
report success as a workaround for Fugaku.
26. mmap() allows unlimited overcommit. Note that it corresponds to
setting sysctl ``vm.overcommit_memory`` to 1.

README.md

@ -10,7 +10,7 @@ IHK/McKernel is a light-weight multi-kernel operating system designed for high-e
## Contents
- [Background] (#background)
- [Background](#background-and-motivation)
- [Architectural Overview](#architectural-overview)
- [Installation](#installation)
- [The Team](#the-team)
@ -85,7 +85,7 @@ sudo reboot
You will need the following packages installed:
~~~~
sudo yum install kernel-devel binutils-devel libnuma-devel
sudo yum install cmake kernel-devel binutils-devel systemd-devel numactl-devel gcc make nasm git
~~~~
Grant read permission to the System.map file of your kernel version:
@ -96,24 +96,51 @@ sudo chmod a+r /boot/System.map-`uname -r`
##### 4. Obtain sources and compile the kernel
Clone the source code and set up ihk symlink (this is currently required):
Clone the source code:
~~~~
mkdir -p ~/src/ihk+mckernel/
cd ~/src/ihk+mckernel/
git clone -r git@github.com:RIKEN-SysSoft/mckernel.git
git clone --recursive -b development https://github.com/RIKEN-SysSoft/mckernel.git
~~~~
(Optional) Checkout to the specific branch or version:
~~~~
cd mckernel
git checkout <pathspec>
git submodule update
~~~~
For example, if you want to try the development branch, use "development" as the pathspec. If you want to try the prerelease version 1.7.0-0.2, use "1.7.0-0.2".
###### 4.1 Install with cmake
Configure and compile:
~~~~
mkdir -p build && cd build
cmake -DCMAKE_INSTALL_PREFIX=${HOME}/ihk+mckernel $HOME/src/mckernel
cmake -DCMAKE_INSTALL_PREFIX=${HOME}/ihk+mckernel $HOME/src/ihk+mckernel/mckernel
make -j install
~~~~
The IHK kernel modules and McKernel kernel image should be installed under the **ihk+mckernel** folder in your home directory.
###### 4.2 Install with rpm
Configure, compile and build rpm:
~~~~
mkdir -p build && cd build
cmake $HOME/src/ihk+mckernel/mckernel
make dist
cp mckernel-<version>.tar.gz <rpmbuild>/SOURCES
rpmbuild -ba scripts/mckernel.spec
sudo rpm -ivh <rpmbuild>/RPMS/<arch>/mckernel-<version>-<release>_<linux_kernel_ver>_<dist>.<arch>.rpm
~~~~
The IHK kernel modules and McKernel kernel image are installed under the system directory.
##### 5. Boot McKernel
A boot script called mcreboot.sh is provided under sbin in the install folder. To boot on logical CPU 1 with 512MB of memory, use the following invocation:
@ -170,6 +197,71 @@ Finally, to shutdown McKernel and release CPU/memory resources back to Linux use
sudo ./sbin/mcstop+release.sh
~~~~
##### 7. Advanced: Enable Utility Thread offloading Interface (UTI)
UTI enables a runtime such as an MPI runtime to spawn utility threads, such as MPI asynchronous progress threads, onto Linux cores.
1. Install capstone
Install EPEL capstone-devel:
~~~~
sudo yum install epel-release
sudo yum install capstone-devel
~~~~
2. Install syscall_intercept
~~~~
git clone https://github.com/RIKEN-SysSoft/syscall_intercept.git
cmake ../arch/aarch64 -DCMAKE_INSTALL_PREFIX=<syscall-intercept-install> -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=gcc -DTREAT_WARNINGS_AS_ERRORS=OFF
~~~~
3. Install UTI for McKernel
Install:
~~~~
git clone https://github.com/RIKEN-SysSoft/uti.git
mkdir build && cd build
../uti/configure --prefix=<mckernel-install> --with-rm=mckernel
make && make install
~~~~
4. Install McKernel
~~~~
CMAKE_PREFIX_PATH=<syscall-intercept-install> cmake -DCMAKE_INSTALL_PREFIX=${HOME}/ihk+mckernel -DENABLE_UTI=ON $HOME/src/ihk+mckernel/mckernel
~~~~
5. Run executable
~~~~
mcexec --enable-uti <command>
~~~~
6. Install UTI for Linux for performance comparison
Install by make:
~~~~
git clone https://github.com/RIKEN-SysSoft/uti.git
mkdir build && cd build
../uti/configure --prefix=<uti-install> --with-rm=linux
make && make install
~~~~
Install by rpm:
~~~~
git clone https://github.com/RIKEN-SysSoft/uti.git
mkdir build && cd build
../uti/configure --prefix=<uti-install> --with-rm=linux
rm -f ~/rpmbuild/SOURCES/<version>.tar.gz
rpmbuild -ba ./scripts/uti.spec
rpm -Uvh uti-<version>-<release>-<arch>.rpm
~~~~
## The Team
The McKernel project was started at The University of Tokyo and currently it is mainly developed at RIKEN.
@ -184,3 +276,10 @@ Some of our collaborators include:
## License
McKernel is GPL licensed, as found in the LICENSE file.
## Contact
Please send us your feedback via one of the following mailing lists. Subscription via [www.pccluster.org](http://www.pccluster.org/mailman/listinfo/mckernel-users) is needed.
* English: mckernel-users@pccluster.org
* Japanese: mckernel-users-jp@pccluster.org

assert.c

@ -1,4 +1,4 @@
/* assert.c COPYRIGHT FUJITSU LIMITED 2015-2018 */
/* assert.c COPYRIGHT FUJITSU LIMITED 2015-2019 */
#include <process.h>
#include <list.h>
@ -53,4 +53,4 @@ STATIC_ASSERT(SVE_PT_FPSIMD_OFFSET == sizeof(struct user_sve_header));
STATIC_ASSERT(SVE_PT_SVE_OFFSET == sizeof(struct user_sve_header));
/* assert for struct arm64_cpu_local_thread member offset define */
STATIC_ASSERT(offsetof(struct arm64_cpu_local_thread, panic_regs) == 160);
STATIC_ASSERT(offsetof(struct arm64_cpu_local_thread, panic_regs) == 168);

coredump.c

@ -1,9 +1,15 @@
/* coredump.c COPYRIGHT FUJITSU LIMITED 2015-2016 */
/* coredump.c COPYRIGHT FUJITSU LIMITED 2015-2019 */
#include <process.h>
#include <elfcore.h>
#include <string.h>
#include <ptrace.h>
#include <cls.h>
#include <hwcap.h>
void arch_fill_prstatus(struct elf_prstatus64 *prstatus, struct thread *thread, void *regs0)
#define align32(x) ((((x) + 3) / 4) * 4)
void arch_fill_prstatus(struct elf_prstatus64 *prstatus,
struct thread *thread, void *regs0, int sig)
{
struct pt_regs *regs = regs0;
struct elf_prstatus64 tmp_prstatus;
@ -14,8 +20,6 @@ void arch_fill_prstatus(struct elf_prstatus64 *prstatus, struct thread *thread,
short int pr_cursig;
a8_uint64_t pr_sigpend;
a8_uint64_t pr_sighold;
pid_t pr_pid;
pid_t pr_ppid;
pid_t pr_pgrp;
pid_t pr_sid;
struct prstatus64_timeval pr_utime;
@ -23,10 +27,66 @@ void arch_fill_prstatus(struct elf_prstatus64 *prstatus, struct thread *thread,
struct prstatus64_timeval pr_cutime;
struct prstatus64_timeval pr_cstime;
*/
/* copy x0-30, sp, pc, pstate */
memcpy(&tmp_prstatus.pr_reg, &regs->user_regs, sizeof(tmp_prstatus.pr_reg));
tmp_prstatus.pr_fpvalid = 0; /* We assume no fp */
/* copy unaligned prstatus addr */
memcpy(prstatus, &tmp_prstatus, sizeof(*prstatus));
prstatus->pr_pid = thread->tid;
if (thread->proc->parent) {
prstatus->pr_ppid = thread->proc->parent->pid;
}
prstatus->pr_info.si_signo = sig;
prstatus->pr_cursig = sig;
}
int arch_get_thread_core_info_size(void)
{
const struct user_regset_view *view = current_user_regset_view();
const struct user_regset *regset = find_regset(view, NT_ARM_SVE);
if (unlikely(!(elf_hwcap & HWCAP_SVE))) {
return 0;
}
return sizeof(struct note) + align32(sizeof("LINUX"))
+ regset_size(cpu_local_var(current), regset);
}
void arch_fill_thread_core_info(struct note *head,
struct thread *thread, void *regs)
{
const struct user_regset_view *view = current_user_regset_view();
const struct user_regset *regset = find_regset(view, NT_ARM_SVE);
if (unlikely(!(elf_hwcap & HWCAP_SVE))) {
return;
}
/* save the registers beforehand */
save_fp_regs(thread);
if (regset->core_note_type && regset->get &&
(!regset->active || regset->active(thread, regset))) {
int ret;
size_t size = regset_size(thread, regset);
void *namep;
void *descp;
namep = (void *) (head + 1);
descp = namep + align32(sizeof("LINUX"));
ret = regset->get(thread, regset, 0, size, descp, NULL);
if (ret) {
return;
}
head->namesz = sizeof("LINUX");
head->descsz = size;
head->type = NT_ARM_SVE;
memcpy(namep, "LINUX", sizeof("LINUX"));
}
}
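
The NT_ARM_SVE note emitted above follows the standard ELF note layout: a header carrying namesz/descsz/type, the 4-byte-aligned owner name "LINUX", then the register payload, with align32() providing the rounding. A minimal user-space sketch of the same size arithmetic, assuming a hypothetical header struct mirroring `struct note` (32-bit namesz/descsz/type fields) and an example payload size:
~~~~
#include <stdint.h>
#include <stdio.h>

/* Round up to a multiple of 4 bytes, as the align32() macro above does. */
#define ALIGN32(x) ((((x) + 3) / 4) * 4)

struct note_hdr {          /* hypothetical mirror of struct note */
	uint32_t namesz;
	uint32_t descsz;
	uint32_t type;
};

int main(void)
{
	size_t regset_bytes = 1088;  /* example payload size, not a real value */
	size_t total = sizeof(struct note_hdr)
		+ ALIGN32(sizeof("LINUX"))   /* "LINUX\0" padded to 8 bytes */
		+ regset_bytes;

	printf("note size: %zu bytes\n", total);
	return 0;
}
~~~~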

View File

@ -1,6 +1,5 @@
/* cpu.c COPYRIGHT FUJITSU LIMITED 2015-2018 */
/* cpu.c COPYRIGHT FUJITSU LIMITED 2015-2019 */
#include <ihk/cpu.h>
#include <ihk/debug.h>
#include <ihk/mm.h>
#include <types.h>
#include <errno.h>
@ -30,9 +29,11 @@
#include <debug-monitors.h>
#include <sysreg.h>
#include <cpufeature.h>
#include <debug.h>
#include <ihk/debug.h>
#include <hwcap.h>
#include <virt.h>
#include <init.h>
#include <bootparam.h>
//#define DEBUG_PRINT_CPU
@ -67,6 +68,7 @@ void (*gic_dist_init)(unsigned long dist_base_pa, unsigned long size);
void (*gic_cpu_init)(unsigned long cpu_base_pa, unsigned long size);
void (*gic_enable)(void);
void (*arm64_issue_ipi)(unsigned int cpid, unsigned int vector);
void (*arm64_issue_host_ipi)(unsigned int cpid, unsigned int vector);
void (*handle_arch_irq)(struct pt_regs *);
static void gic_init(void)
@ -77,14 +79,18 @@ static void gic_init(void)
gic_cpu_init = gic_cpu_init_gicv3;
gic_enable = gic_enable_gicv3;
arm64_issue_ipi = arm64_issue_ipi_gicv3;
arm64_issue_host_ipi = arm64_issue_host_ipi_gicv3;
handle_arch_irq = handle_interrupt_gicv3;
kprintf("%: GICv3\n", __func__);
} else {
/* Setup functions for GICv2 */
gic_dist_init = gic_dist_init_gicv2;
gic_cpu_init = gic_cpu_init_gicv2;
gic_enable = gic_enable_gicv2;
arm64_issue_ipi = arm64_issue_ipi_gicv2;
arm64_issue_host_ipi = arm64_issue_host_ipi_gicv2;
handle_arch_irq = handle_interrupt_gicv2;
kprintf("%: GICv2\n", __func__);
}
gic_dist_init(ihk_param_gic_dist_base_pa, ihk_param_gic_dist_map_size);
@ -114,42 +120,94 @@ static struct ihk_mc_interrupt_handler cpu_stop_handler = {
};
extern long freeze_thaw(void *nmi_ctx);
static void multi_nm_interrupt_handler(void *priv)
static void multi_interrupt_handler(void *priv)
{
extern int nmi_mode;
struct pt_regs *regs = (struct pt_regs *)priv;
union arm64_cpu_local_variables *clv;
switch (nmi_mode) {
switch (multi_intr_mode) {
case 1:
case 2:
/* mode == 1or2, for FREEZER NMI */
dkprintf("%s: freeze mode NMI catch. (nmi_mode=%d)\n",
__func__, nmi_mode);
case 2: /* mode == 1 or 2, for FREEZER intr */
dkprintf("%s: freeze mode intr catch. (multi_intr_mode=%d)\n",
__func__, multi_intr_mode);
freeze_thaw(NULL);
break;
default:
ekprintf("%s: Unknown multi-intr-mode(%d) detected.\n",
__func__, multi_intr_mode);
break;
}
}
void arch_save_panic_regs(void *irq_regs)
{
struct pt_regs *regs = (struct pt_regs *)irq_regs;
union arm64_cpu_local_variables *clv;
clv = get_arm64_this_cpu_local();
/* For user-space, use saved kernel context */
if (regs->pc < USER_END) {
memset(clv->arm64_cpu_local_thread.panic_regs,
0, sizeof(clv->arm64_cpu_local_thread.panic_regs));
clv->arm64_cpu_local_thread.panic_regs[29] =
current_thread_info()->cpu_context.fp;
clv->arm64_cpu_local_thread.panic_regs[31] =
current_thread_info()->cpu_context.sp;
clv->arm64_cpu_local_thread.panic_regs[32] =
current_thread_info()->cpu_context.pc;
clv->arm64_cpu_local_thread.panic_regs[33] =
PSR_MODE_EL1h;
}
else {
memcpy(clv->arm64_cpu_local_thread.panic_regs,
regs->regs, sizeof(regs->regs));
clv->arm64_cpu_local_thread.panic_regs[31] = regs->sp;
clv->arm64_cpu_local_thread.panic_regs[32] = regs->pc;
clv->arm64_cpu_local_thread.panic_regs[33] =
regs->pstate;
}
clv->arm64_cpu_local_thread.paniced = 1;
}
void arch_clear_panic(void)
{
union arm64_cpu_local_variables *clv;
clv = get_arm64_this_cpu_local();
clv->arm64_cpu_local_thread.paniced = 0;
}
static struct ihk_mc_interrupt_handler multi_intr_handler = {
.func = multi_interrupt_handler,
.priv = NULL,
};
static void multi_nm_interrupt_handler(void *irq_regs)
{
extern int nmi_mode;
dkprintf("%s: ...\n", __func__);
switch (nmi_mode) {
case 0:
/* mode == 0, for MEMDUMP NMI */
clv = get_arm64_this_cpu_local();
if (regs) {
memcpy(clv->arm64_cpu_local_thread.panic_regs,
regs->regs, sizeof(regs->regs));
clv->arm64_cpu_local_thread.panic_regs[31] = regs->sp;
clv->arm64_cpu_local_thread.panic_regs[32] = regs->pc;
clv->arm64_cpu_local_thread.panic_regs[33] =
regs->pstate;
}
clv->arm64_cpu_local_thread.paniced = 1;
/* mode == 0, for MEMDUMP NMI */
arch_save_panic_regs(irq_regs);
ihk_mc_query_mem_areas();
/* memdump-nmi halts McKernel, so break is unnecessary. */
/* fall through */
case 3:
/* mode == 3, for SHUTDOWN-WAIT NMI */
while (1) {
/* mode == 3, for SHUTDOWN-WAIT NMI */
kprintf("%s: STOP\n", __func__);
while (nmi_mode != 4)
cpu_halt();
break;
case 4:
/* mode == 4, continue NMI */
arch_clear_panic();
if (!ihk_mc_get_processor_id()) {
ihk_mc_clear_dump_page_completion();
}
kprintf("%s: RESUME, nmi_mode: %d\n", __func__, nmi_mode);
break;
default:
@ -423,6 +481,8 @@ void ihk_mc_init_ap(void)
ihk_mc_register_interrupt_handler(INTRID_CPU_STOP, &cpu_stop_handler);
ihk_mc_register_interrupt_handler(INTRID_MULTI_NMI, &multi_nmi_handler);
ihk_mc_register_interrupt_handler(INTRID_MULTI_INTR,
&multi_intr_handler);
ihk_mc_register_interrupt_handler(
ihk_mc_get_vector(IHK_TLB_FLUSH_IRQ_VECTOR_START),
&remote_tlb_flush_handler);
@ -776,6 +836,21 @@ unsigned long cpu_disable_interrupt_save(void)
return flags;
}
/* save ICC_PMR_EL1 & enable interrupt (ICC_PMR_EL1 <= ICC_PMR_EL1_UNMASKED) */
unsigned long cpu_enable_interrupt_save(void)
{
unsigned long flags;
unsigned long masked = ICC_PMR_EL1_UNMASKED;
asm volatile(
"mrs_s %0, " __stringify(ICC_PMR_EL1) "\n"
"msr_s " __stringify(ICC_PMR_EL1) ",%1"
: "=&r" (flags)
: "r" (masked)
: "memory");
return flags;
}
#else /* defined(CONFIG_HAS_NMI) */
/* @ref.impl arch/arm64/include/asm/irqflags.h::arch_local_irq_enable */
@ -824,6 +899,20 @@ unsigned long cpu_disable_interrupt_save(void)
: "memory");
return flags;
}
/* save PSTATE.DAIF & enable interrupt (PSTATE.DAIF I bit set) */
unsigned long cpu_enable_interrupt_save(void)
{
unsigned long flags;
asm volatile(
"mrs %0, daif // arch_local_irq_save\n"
"msr daifclr, #2"
: "=r" (flags)
:
: "memory");
return flags;
}
#endif /* defined(CONFIG_HAS_NMI) */
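
The new cpu_enable_interrupt_save() is the mirror image of cpu_disable_interrupt_save(): it returns the previous mask state while *enabling* interrupts, so a caller can open an interruptible window and then restore whatever state it started from. A minimal usage sketch, assuming the cpu_restore_interrupt() helper seen elsewhere in this tree accepts the returned flags:
~~~~
/* Sketch only: run some work with interrupts enabled, then restore. */
void run_with_interrupts_enabled(void (*work)(void *), void *arg)
{
	unsigned long flags;

	flags = cpu_enable_interrupt_save();  /* save old mask, unmask IRQs */
	work(arg);                            /* interruptible section */
	cpu_restore_interrupt(flags);         /* back to the saved state */
}
~~~~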
/* we don't have a "pause" instruction; use "yield" instead */
@ -951,7 +1040,7 @@ void ihk_mc_boot_cpu(int cpuid, unsigned long pc)
setup_cpu_features();
}
init_sve_vl();
sve_setup();
}
/* for ihk_mc_init_context() */
@ -986,6 +1075,9 @@ void ihk_mc_init_context(ihk_mc_kernel_context_t *new_ctx,
/* branch in ret_from_fork */
new_ctx->thread->cpu_context.x19 = (unsigned long)next_function;
sp -= 16;
new_ctx->thread->cpu_context.fp = sp;
/* set stack_pointer */
new_ctx->thread->cpu_context.sp = sp - sizeof(ihk_mc_user_context_t);
@ -1001,9 +1093,10 @@ void ihk_mc_init_context(ihk_mc_kernel_context_t *new_ctx,
const int lcpuid = ihk_mc_get_processor_id();
const unsigned long syscallno = current_pt_regs()->syscallno;
#ifdef CONFIG_ARM64_SVE
const uint16_t orig_sve_vl = current_thread_info()->sve_vl;
const uint16_t orig_sve_vl_onexec = current_thread_info()->sve_vl_onexec;
const uint16_t orig_sve_flags = current_thread_info()->sve_flags;
struct thread_info *ti = current_thread_info();
const unsigned int orig_sve_vl = ti->sve_vl;
const unsigned int orig_sve_vl_onexec = ti->sve_vl_onexec;
const unsigned long orig_sve_flags = ti->sve_flags;
#endif /* CONFIG_ARM64_SVE */
/* get kernel stack address */
@ -1023,6 +1116,9 @@ void ihk_mc_init_context(ihk_mc_kernel_context_t *new_ctx,
/* set stack_pointer */
new_ctx->thread->cpu_context.sp = sp;
/* use the 16-byte padding in ihk_mc_init_user_process()
 * as the closing frame in the frame chain */
new_ctx->thread->cpu_context.fp = sp + sizeof(ihk_mc_user_context_t);
/* clear pt_regs area */
new_uctx = (ihk_mc_user_context_t *)new_ctx->thread->cpu_context.sp;
@ -1183,7 +1279,7 @@ long ihk_mc_show_cpuinfo(char *buf, size_t buf_size, unsigned long read_off, int
/* generate strings */
loff += scnprintf(lbuf + loff, lbuf_size - loff,
"processor\t: %d\n", cpuinfo->hwid);
"processor\t: %d\n", i);
loff += scnprintf(lbuf + loff, lbuf_size - loff, "Features\t:");
for (j = 0; hwcap_str[j]; j++) {
@ -1234,7 +1330,6 @@ err:
}
static int check_and_allocate_fp_regs(struct thread *thread);
void save_fp_regs(struct thread *thread);
void arch_clone_thread(struct thread *othread, unsigned long pc,
unsigned long sp, struct thread *nthread)
@ -1346,11 +1441,15 @@ int ihk_mc_arch_get_special_register(enum ihk_asr_type type,
}
/*@
@ requires \valid_apicid(cpu); // valid APIC ID or not
@ requires \valid_cpuid(cpu); // valid CPU logical ID
@ ensures \result == 0
@*/
int ihk_mc_interrupt_cpu(int cpu, int vector)
{
if (cpu < 0 || cpu >= num_processors) {
kprintf("%s: invalid CPU id: %d\n", __func__, cpu);
return -1;
}
dkprintf("[%d] ihk_mc_interrupt_cpu: %d\n", ihk_mc_get_processor_id(), cpu);
(*arm64_issue_ipi)(cpu, vector);
return 0;
@ -1398,6 +1497,19 @@ struct thread *arch_switch_context(struct thread *prev, struct thread *next)
}
}
#endif /*ENABLE_PERF*/
#ifdef PROFILE_ENABLE
if (prev && prev->profile && prev->profile_start_ts != 0) {
prev->profile_elapsed_ts +=
(rdtsc() - prev->profile_start_ts);
prev->profile_start_ts = 0;
}
if (next->profile && next->profile_start_ts == 0) {
next->profile_start_ts = rdtsc();
}
#endif
if (likely(prev)) {
tls_thread_switch(prev, next);
@ -1471,8 +1583,7 @@ check_and_allocate_fp_regs(struct thread *thread)
if (!thread->fp_regs) {
kprintf("error: allocating fp_regs pages\n");
result = 1;
panic("panic: error allocating fp_regs pages");
result = -ENOMEM;
goto out;
}
@ -1481,37 +1592,51 @@ check_and_allocate_fp_regs(struct thread *thread)
#ifdef CONFIG_ARM64_SVE
if (likely(elf_hwcap & HWCAP_SVE)) {
sve_alloc(thread);
result = sve_alloc(thread);
}
#endif /* CONFIG_ARM64_SVE */
out:
if (result) {
release_fp_regs(thread);
}
return result;
}
/*@
@ requires \valid(thread);
@*/
void
int
save_fp_regs(struct thread *thread)
{
int ret = 0;
if (thread == &cpu_local_var(idle)) {
return;
goto out;
}
if (likely(elf_hwcap & (HWCAP_FP | HWCAP_ASIMD))) {
if (check_and_allocate_fp_regs(thread) != 0) {
// alloc error.
return;
ret = check_and_allocate_fp_regs(thread);
if (ret) {
goto out;
}
thread_fpsimd_save(thread);
}
out:
return ret;
}
void copy_fp_regs(struct thread *from, struct thread *to)
int copy_fp_regs(struct thread *from, struct thread *to)
{
if ((from->fp_regs != NULL) && (check_and_allocate_fp_regs(to) == 0)) {
memcpy(to->fp_regs, from->fp_regs, sizeof(fp_regs_struct));
int ret = 0;
if (from->fp_regs != NULL) {
ret = check_and_allocate_fp_regs(to);
if (!ret) {
memcpy(to->fp_regs,
from->fp_regs,
sizeof(fp_regs_struct));
}
}
return ret;
}
void clear_fp_regs(void)
@ -1626,6 +1751,7 @@ static inline int arch_cpu_mrs(uint32_t sys_reg, uint64_t *val)
SYSREG_READ_S(IMP_PF_INJECTION_DISTANCE5_EL0);
SYSREG_READ_S(IMP_PF_INJECTION_DISTANCE6_EL0);
SYSREG_READ_S(IMP_PF_INJECTION_DISTANCE7_EL0);
SYSREG_READ_S(IMP_PF_PMUSERENR_EL0);
SYSREG_READ_S(IMP_BARRIER_CTRL_EL1);
SYSREG_READ_S(IMP_BARRIER_BST_BIT_EL1);
SYSREG_READ_S(IMP_BARRIER_INIT_SYNC_BB0_EL1);
@ -1696,6 +1822,7 @@ static inline int arch_cpu_msr(uint32_t sys_reg, uint64_t val)
SYSREG_WRITE_S(IMP_PF_INJECTION_DISTANCE5_EL0);
SYSREG_WRITE_S(IMP_PF_INJECTION_DISTANCE6_EL0);
SYSREG_WRITE_S(IMP_PF_INJECTION_DISTANCE7_EL0);
SYSREG_WRITE_S(IMP_PF_PMUSERENR_EL0);
SYSREG_WRITE_S(IMP_BARRIER_CTRL_EL1);
SYSREG_WRITE_S(IMP_BARRIER_BST_BIT_EL1);
SYSREG_WRITE_S(IMP_BARRIER_INIT_SYNC_BB0_EL1);
@ -1753,6 +1880,11 @@ int arch_cpu_read_write_register(
ret = -1;
}
dkprintf("%s: MCCTRL_OS_CPU_%s_REGISTER: reg: 0x%lx, val: 0x%lx\n",
__FUNCTION__,
(op == MCCTRL_OS_CPU_READ_REGISTER ? "READ" : "WRITE"),
desc->addr, desc->val);
return ret;
}
@ -1762,4 +1894,9 @@ int smp_call_func(cpu_set_t *__cpu_set, smp_func_t __func, void *__arg)
return -1;
}
void arch_flush_icache_all(void)
{
asm("ic ialluis");
dsb(ish);
}
/*** end of file ***/

View File

@ -970,7 +970,7 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = {
#ifdef CONFIG_ARM64_SVE
HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_SVE_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_SVE),
#endif
{},
{ 0 },
};
/* @ref.impl arch/arm64/kernel/cpufeature.c */

View File

@ -10,5 +10,5 @@ struct cpu_info cpu_table[] = {
.cpu_name = "AArch64 Processor",
.cpu_setup = __cpu_setup,
},
{ /* Empty */ },
{ 0 },
};

View File

@ -2,7 +2,6 @@
#include <cputype.h>
#include <irqflags.h>
#include <ihk/context.h>
#include <ihk/debug.h>
#include <signal.h>
#include <errno.h>
#include <debug-monitors.h>

View File

@ -1,4 +1,4 @@
/* fpsimd.c COPYRIGHT FUJITSU LIMITED 2016-2018 */
/* fpsimd.c COPYRIGHT FUJITSU LIMITED 2016-2019 */
#include <thread_info.h>
#include <fpsimd.h>
#include <cpuinfo.h>
@ -9,8 +9,9 @@
#include <prctl.h>
#include <cpufeature.h>
#include <kmalloc.h>
#include <debug.h>
#include <ihk/debug.h>
#include <process.h>
#include <bitmap.h>
//#define DEBUG_PRINT_FPSIMD
@ -21,11 +22,87 @@
#ifdef CONFIG_ARM64_SVE
/* Set of available vector lengths, as vq_to_bit(vq): */
static DECLARE_BITMAP(sve_vq_map, SVE_VQ_MAX);
/* Maximum supported vector length across all CPUs (initially poisoned) */
int sve_max_vl = -1;
/* Default VL for tasks that don't set it explicitly: */
int sve_default_vl = -1;
/*
* Helpers to translate bit indices in sve_vq_map to VQ values (and
* vice versa). This allows find_next_bit() to be used to find the
* _maximum_ VQ not exceeding a certain value.
*/
static unsigned int vq_to_bit(unsigned int vq)
{
return SVE_VQ_MAX - vq;
}
static unsigned int bit_to_vq(unsigned int bit)
{
if (bit >= SVE_VQ_MAX) {
bit = SVE_VQ_MAX - 1;
}
return SVE_VQ_MAX - bit;
}
/*
* All vector length selection from userspace comes through here.
* We're on a slow path, so some sanity-checks are included.
* If things go wrong there's a bug somewhere, but try to fall back to a
* safe choice.
*/
static unsigned int find_supported_vector_length(unsigned int vl)
{
int bit;
int max_vl = sve_max_vl;
if (!sve_vl_valid(vl)) {
vl = SVE_VL_MIN;
}
if (!sve_vl_valid(max_vl)) {
max_vl = SVE_VL_MIN;
}
if (vl > max_vl) {
vl = max_vl;
}
bit = find_next_bit(sve_vq_map, SVE_VQ_MAX,
vq_to_bit(sve_vq_from_vl(vl)));
return sve_vl_from_vq(bit_to_vq(bit));
}
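
Storing VQ v at bit SVE_VQ_MAX - v reverses the order, so find_next_bit(), which scans upward for the first set bit, lands on the *largest* supported VQ that does not exceed the requested one. A small self-contained illustration of the trick, assuming hypothetical example values (SVE_VQ_MAX = 512, supported VQs {1, 2, 4}):
~~~~
#include <stdio.h>

#define VQ_MAX 512
static unsigned char map[VQ_MAX + 1];    /* map[bit] != 0 <=> bit set */

static unsigned int vq_to_bit(unsigned int vq) { return VQ_MAX - vq; }
static unsigned int bit_to_vq(unsigned int bit) { return VQ_MAX - bit; }

/* Simplified find_next_bit(): first set bit at index >= from. */
static unsigned int next_bit(unsigned int from)
{
	while (from <= VQ_MAX && !map[from])
		from++;
	return from;
}

int main(void)
{
	map[vq_to_bit(1)] = map[vq_to_bit(2)] = map[vq_to_bit(4)] = 1;

	/* Request VQ 3 (VL 48): the scan starts at vq_to_bit(3) and
	 * falls through to VQ 2, the largest supported VQ <= 3. */
	printf("VQ %u\n", bit_to_vq(next_bit(vq_to_bit(3))));  /* prints 2 */
	return 0;
}
~~~~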
static void sve_probe_vqs(DECLARE_BITMAP(map, SVE_VQ_MAX))
{
unsigned int vq, vl;
unsigned long zcr;
bitmap_zero(map, SVE_VQ_MAX);
zcr = ZCR_EL1_LEN_MASK;
zcr = read_sysreg_s(SYS_ZCR_EL1) & ~zcr;
for (vq = SVE_VQ_MAX; vq >= SVE_VQ_MIN; --vq) {
/* self-syncing */
write_sysreg_s(zcr | (vq - 1), SYS_ZCR_EL1);
vl = sve_get_vl();
/* skip intervening lengths */
vq = sve_vq_from_vl(vl);
set_bit(vq_to_bit(vq), map);
}
}
void sve_init_vq_map(void)
{
sve_probe_vqs(sve_vq_map);
}
size_t sve_state_size(struct thread const *thread)
{
unsigned int vl = thread->ctx.thread->sve_vl;
@ -42,17 +119,19 @@ void sve_free(struct thread *thread)
}
}
void sve_alloc(struct thread *thread)
int sve_alloc(struct thread *thread)
{
if (thread->ctx.thread->sve_state) {
return;
return 0;
}
thread->ctx.thread->sve_state =
kmalloc(sve_state_size(thread), IHK_MC_AP_NOWAIT);
BUG_ON(!thread->ctx.thread->sve_state);
if (thread->ctx.thread->sve_state == NULL) {
return -ENOMEM;
}
memset(thread->ctx.thread->sve_state, 0, sve_state_size(thread));
return 0;
}
static int get_nr_threads(struct process *proc)
@ -75,19 +154,7 @@ int sve_set_vector_length(struct thread *thread,
{
struct thread_info *ti = thread->ctx.thread;
BUG_ON(thread == cpu_local_var(current) && cpu_local_var(no_preempt) == 0);
/*
* To avoid accidents, forbid setting for individual threads of a
* multithreaded process. User code that knows what it's doing can
* pass PR_SVE_SET_VL_THREAD to override this restriction:
*/
if (!(flags & PR_SVE_SET_VL_THREAD) && get_nr_threads(thread->proc) != 1) {
return -EINVAL;
}
flags &= ~(unsigned long)PR_SVE_SET_VL_THREAD;
if (flags & ~(unsigned long)(PR_SVE_SET_VL_INHERIT |
if (flags & ~(unsigned long)(PR_SVE_VL_INHERIT |
PR_SVE_SET_VL_ONEXEC)) {
return -EINVAL;
}
@ -96,13 +163,19 @@ int sve_set_vector_length(struct thread *thread,
return -EINVAL;
}
if (vl > sve_max_vl) {
BUG_ON(!sve_vl_valid(sve_max_vl));
vl = sve_max_vl;
/*
* Clamp to the maximum vector length that VL-agnostic SVE code can
* work with. A flag may be assigned in the future to allow setting
* of larger vector lengths without confusing older software.
*/
if (vl > SVE_VL_ARCH_MAX) {
vl = SVE_VL_ARCH_MAX;
}
if (flags & (PR_SVE_SET_VL_ONEXEC |
PR_SVE_SET_VL_INHERIT)) {
vl = find_supported_vector_length(vl);
if (flags & (PR_SVE_VL_INHERIT |
PR_SVE_SET_VL_ONEXEC)) {
ti->sve_vl_onexec = vl;
} else {
/* Reset VL to system default on next exec: */
@ -114,39 +187,42 @@ int sve_set_vector_length(struct thread *thread,
goto out;
}
if (vl != ti->sve_vl) {
if ((elf_hwcap & HWCAP_SVE)) {
fp_regs_struct fp_regs;
memset(&fp_regs, 0, sizeof(fp_regs));
if (vl == ti->sve_vl) {
goto out;
}
/* for self at prctl syscall */
if (thread == cpu_local_var(current)) {
save_fp_regs(thread);
clear_fp_regs();
thread_sve_to_fpsimd(thread, &fp_regs);
sve_free(thread);
if ((elf_hwcap & HWCAP_SVE)) {
fp_regs_struct fp_regs;
ti->sve_vl = vl;
memset(&fp_regs, 0, sizeof(fp_regs));
sve_alloc(thread);
thread_fpsimd_to_sve(thread, &fp_regs);
restore_fp_regs(thread);
/* for target thread at ptrace */
} else {
thread_sve_to_fpsimd(thread, &fp_regs);
sve_free(thread);
/* for self at prctl syscall */
if (thread == cpu_local_var(current)) {
save_fp_regs(thread);
clear_fp_regs();
thread_sve_to_fpsimd(thread, &fp_regs);
sve_free(thread);
ti->sve_vl = vl;
ti->sve_vl = vl;
sve_alloc(thread);
thread_fpsimd_to_sve(thread, &fp_regs);
}
sve_alloc(thread);
thread_fpsimd_to_sve(thread, &fp_regs);
restore_fp_regs(thread);
/* for target thread at ptrace */
} else {
thread_sve_to_fpsimd(thread, &fp_regs);
sve_free(thread);
ti->sve_vl = vl;
sve_alloc(thread);
thread_fpsimd_to_sve(thread, &fp_regs);
}
}
ti->sve_vl = vl;
out:
ti->sve_flags = flags & PR_SVE_SET_VL_INHERIT;
ti->sve_flags = flags & PR_SVE_VL_INHERIT;
return 0;
}
@ -156,44 +232,53 @@ out:
* Encode the current vector length and flags for return.
* This is only required for prctl(): ptrace has separate fields
*/
static int sve_prctl_status(const struct thread_info *ti)
static int sve_prctl_status(unsigned long flags)
{
int ret = ti->sve_vl;
int ret;
struct thread_info *ti = cpu_local_var(current)->ctx.thread;
ret |= ti->sve_flags << 16;
if (flags & PR_SVE_SET_VL_ONEXEC) {
ret = ti->sve_vl_onexec;
}
else {
ret = ti->sve_vl;
}
if (ti->sve_flags & PR_SVE_VL_INHERIT) {
ret |= PR_SVE_VL_INHERIT;
}
return ret;
}
/* @ref.impl arch/arm64/kernel/fpsimd.c::sve_set_task_vl */
int sve_set_thread_vl(struct thread *thread, const unsigned long vector_length,
const unsigned long flags)
int sve_set_thread_vl(unsigned long arg)
{
unsigned long vl, flags;
int ret;
if (!(elf_hwcap & HWCAP_SVE)) {
vl = arg & PR_SVE_VL_LEN_MASK;
flags = arg & ~vl;
/* Instead of system_supports_sve() */
if (unlikely(!(elf_hwcap & HWCAP_SVE))) {
return -EINVAL;
}
BUG_ON(thread != cpu_local_var(current));
preempt_disable();
ret = sve_set_vector_length(thread, vector_length, flags);
preempt_enable();
ret = sve_set_vector_length(cpu_local_var(current), vl, flags);
if (ret) {
return ret;
}
return sve_prctl_status(thread->ctx.thread);
return sve_prctl_status(flags);
}
/* @ref.impl arch/arm64/kernel/fpsimd.c::sve_get_ti_vl */
int sve_get_thread_vl(const struct thread *thread)
int sve_get_thread_vl(void)
{
if (!(elf_hwcap & HWCAP_SVE)) {
/* Instead of system_supports_sve() */
if (unlikely(!(elf_hwcap & HWCAP_SVE))) {
return -EINVAL;
}
return sve_prctl_status(thread->ctx.thread);
return sve_prctl_status(0);
}
void do_sve_acc(unsigned int esr, struct pt_regs *regs)
@ -203,25 +288,48 @@ void do_sve_acc(unsigned int esr, struct pt_regs *regs)
panic("");
}
void init_sve_vl(void)
void sve_setup(void)
{
extern unsigned long ihk_param_default_vl;
uint64_t zcr;
/* Instead of system_supports_sve() */
if (unlikely(!(elf_hwcap & HWCAP_SVE))) {
return;
}
zcr = read_system_reg(SYS_ZCR_EL1);
BUG_ON(((zcr & ZCR_EL1_LEN_MASK) + 1) * 16 > sve_max_vl);
/* init sve_vq_map bitmap */
sve_init_vq_map();
/*
* The SVE architecture mandates support for 128-bit vectors,
* so sve_vq_map must have at least SVE_VQ_MIN set.
* If something went wrong, at least try to patch it up:
*/
if (!test_bit(vq_to_bit(SVE_VQ_MIN), sve_vq_map)) {
set_bit(vq_to_bit(SVE_VQ_MIN), sve_vq_map);
}
zcr = read_system_reg(SYS_ZCR_EL1);
sve_max_vl = sve_vl_from_vq((zcr & ZCR_EL1_LEN_MASK) + 1);
/*
* Sanity-check that the max VL we determined through CPU features
* corresponds properly to sve_vq_map. If not, do our best:
*/
if (sve_max_vl != find_supported_vector_length(sve_max_vl)) {
sve_max_vl = find_supported_vector_length(sve_max_vl);
}
sve_max_vl = ((zcr & ZCR_EL1_LEN_MASK) + 1) * 16;
sve_default_vl = ihk_param_default_vl;
if (sve_default_vl == 0) {
kprintf("SVE: Getting default VL = 0 from HOST-Linux.\n");
sve_default_vl = sve_max_vl > 64 ? 64 : sve_max_vl;
kprintf("SVE: Using default vl(%d byte).\n", sve_default_vl);
if (ihk_param_default_vl !=
find_supported_vector_length(ihk_param_default_vl)) {
kprintf("SVE: Getting unsupported default VL = %d "
"from HOST-Linux.\n", sve_default_vl);
sve_default_vl = find_supported_vector_length(64);
kprintf("SVE: Using default vl(%d byte).\n",
sve_default_vl);
}
kprintf("SVE: maximum available vector length %u bytes per vector\n",
@ -232,7 +340,7 @@ void init_sve_vl(void)
#else /* CONFIG_ARM64_SVE */
void init_sve_vl(void)
void sve_setup(void)
{
/* nothing to do. */
}

View File

@ -10,7 +10,7 @@
#include <smp.h>
#include <arm-gic-v3.h>
#define KERNEL_RAM_VADDR MAP_KERNEL_START
/* KERNEL_RAM_VADDR is defined by cmake */
//#ifndef CONFIG_SMP
//# define PTE_FLAGS PTE_TYPE_PAGE | PTE_AF

View File

@ -255,90 +255,6 @@ static void __ihk_mc_spinlock_unlock(ihk_spinlock_t *lock, unsigned long flags)
cpu_restore_interrupt(flags);
}
/* An implementation of the Mellor-Crummey Scott (MCS) lock */
typedef struct mcs_lock_node {
unsigned long locked;
struct mcs_lock_node *next;
unsigned long irqsave;
#ifndef ENABLE_UBSAN
} __aligned(64) mcs_lock_node_t;
#else
} mcs_lock_node_t;
#endif
typedef mcs_lock_node_t mcs_lock_t;
static void mcs_lock_init(struct mcs_lock_node *node)
{
node->locked = 0;
node->next = NULL;
}
static void __mcs_lock_lock(struct mcs_lock_node *lock,
struct mcs_lock_node *node)
{
struct mcs_lock_node *pred;
node->next = NULL;
node->locked = 0;
pred = xchg8(&(lock->next), node);
if (pred) {
node->locked = 1;
pred->next = node;
while (node->locked != 0) {
cpu_pause();
}
}
}
static void __mcs_lock_unlock(struct mcs_lock_node *lock,
struct mcs_lock_node *node)
{
if (node->next == NULL) {
struct mcs_lock_node *old = atomic_cmpxchg8(&(lock->next), node, 0);
if (old == node) {
return;
}
while (node->next == NULL) {
cpu_pause();
}
}
node->next->locked = 0;
}
static void mcs_lock_lock_noirq(struct mcs_lock_node *lock,
struct mcs_lock_node *node)
{
preempt_disable();
__mcs_lock_lock(lock, node);
}
static void mcs_lock_unlock_noirq(struct mcs_lock_node *lock,
struct mcs_lock_node *node)
{
__mcs_lock_unlock(lock, node);
preempt_enable();
}
static void mcs_lock_lock(struct mcs_lock_node *lock,
struct mcs_lock_node *node)
{
node->irqsave = cpu_disable_interrupt_save();
mcs_lock_lock_noirq(lock, node);
}
static void mcs_lock_unlock(struct mcs_lock_node *lock,
struct mcs_lock_node *node)
{
mcs_lock_unlock_noirq(lock, node);
cpu_restore_interrupt(node->irqsave);
}
#define SPINLOCK_IN_MCS_RWLOCK
// reader/writer lock
@ -743,5 +659,102 @@ static inline int irqflags_can_interrupt(unsigned long flags)
}
#endif /* CONFIG_HAS_NMI */
struct ihk_rwlock {
unsigned int lock;
};
static inline void ihk_mc_rwlock_init(struct ihk_rwlock *rw)
{
rw->lock = 0;
}
static inline void ihk_mc_read_lock(struct ihk_rwlock *rw)
{
unsigned int tmp, tmp2;
asm volatile(
" sevl\n"
"1: wfe\n"
"2: ldaxr %w0, %2\n"
" add %w0, %w0, #1\n"
" tbnz %w0, #31, 1b\n"
" stxr %w1, %w0, %2\n"
" cbnz %w1, 2b\n"
: "=&r" (tmp), "=&r" (tmp2), "+Q" (rw->lock)
:
: "cc", "memory");
}
static inline int ihk_mc_read_trylock(struct ihk_rwlock *rw)
{
unsigned int tmp, tmp2 = 1;
asm volatile(
" ldaxr %w0, %2\n"
" add %w0, %w0, #1\n"
" tbnz %w0, #31, 1f\n"
" stxr %w1, %w0, %2\n"
"1:\n"
: "=&r" (tmp), "+r" (tmp2), "+Q" (rw->lock)
:
: "cc", "memory");
return !tmp2;
}
static inline void ihk_mc_read_unlock(struct ihk_rwlock *rw)
{
unsigned int tmp, tmp2;
asm volatile(
"1: ldxr %w0, %2\n"
" sub %w0, %w0, #1\n"
" stlxr %w1, %w0, %2\n"
" cbnz %w1, 1b\n"
: "=&r" (tmp), "=&r" (tmp2), "+Q" (rw->lock)
:
: "cc", "memory");
}
static inline void ihk_mc_write_lock(struct ihk_rwlock *rw)
{
unsigned int tmp;
asm volatile(
" sevl\n"
"1: wfe\n"
"2: ldaxr %w0, %1\n"
" cbnz %w0, 1b\n"
" stxr %w0, %w2, %1\n"
" cbnz %w0, 2b\n"
: "=&r" (tmp), "+Q" (rw->lock)
: "r" (0x80000000)
: "cc", "memory");
}
static inline int ihk_mc_write_trylock(struct ihk_rwlock *rw)
{
unsigned int tmp;
asm volatile(
" ldaxr %w0, %1\n"
" cbnz %w0, 1f\n"
" stxr %w0, %w2, %1\n"
"1:\n"
: "=&r" (tmp), "+Q" (rw->lock)
: "r" (0x80000000)
: "cc", "memory");
return !tmp;
}
static inline void ihk_mc_write_unlock(struct ihk_rwlock *rw)
{
asm volatile(
" stlr %w1, %0\n"
: "=Q" (rw->lock) : "r" (0) : "memory");
}
#define ihk_mc_read_can_lock(rw) ((rw)->lock < 0x80000000)
#define ihk_mc_write_can_lock(rw) ((rw)->lock == 0)
#endif /* !__HEADER_ARM64_COMMON_ARCH_LOCK_H */
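
The new ihk_rwlock packs a reader count into the low 31 bits and uses bit 31 as the writer flag, which is why ihk_mc_read_lock() spins whenever `tbnz` sees bit 31 set. A minimal usage sketch of the locking discipline only, assuming interrupt state is managed by the caller:
~~~~
/* Sketch only: guard a shared counter with the new reader/writer lock. */
static struct ihk_rwlock stats_lock;
static unsigned long stats_value;

static void stats_init(void)
{
	ihk_mc_rwlock_init(&stats_lock);
}

static unsigned long stats_read(void)
{
	unsigned long v;

	ihk_mc_read_lock(&stats_lock);   /* many readers may hold this */
	v = stats_value;
	ihk_mc_read_unlock(&stats_lock);
	return v;
}

static void stats_update(unsigned long v)
{
	ihk_mc_write_lock(&stats_lock);  /* exclusive: waits for readers */
	stats_value = v;
	ihk_mc_write_unlock(&stats_lock);
}
~~~~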

View File

@ -34,7 +34,7 @@ void panic(const char *);
*/
/* early alloc area address */
/* START:_end, SIZE:512 pages */
#define MAP_EARLY_ALLOC_SHIFT 9
#define MAP_EARLY_ALLOC_SHIFT 5
#define MAP_EARLY_ALLOC_SIZE (UL(1) << (PAGE_SHIFT + MAP_EARLY_ALLOC_SHIFT))
#ifndef __ASSEMBLY__
@ -55,7 +55,11 @@ extern char _end[];
# define MAP_BOOT_PARAM_END (MAP_BOOT_PARAM + MAP_BOOT_PARAM_SIZE)
#endif /* !__ASSEMBLY__ */
#if (VA_BITS == 39 && GRANULE_SIZE == _SZ4KB)
/*
* MAP_KERNEL_START is HOST MODULES_END - 8MiB.
* It's defined by cmake.
*/
#if (VA_BITS == 39 && GRANULE_SIZE == _SZ4KB) /* ARM64_MEMORY_LAYOUT=1 */
#
# define LD_TASK_UNMAPPED_BASE UL(0x0000000400000000)
# define TASK_UNMAPPED_BASE UL(0x0000000800000000)
@ -64,9 +68,8 @@ extern char _end[];
# define MAP_VMAP_SIZE UL(0x0000000100000000)
# define MAP_FIXED_START UL(0xffffffbffbdfd000)
# define MAP_ST_START UL(0xffffffc000000000)
# define MAP_KERNEL_START UL(0xffffffffff800000)
#
#elif (VA_BITS == 42 && GRANULE_SIZE == _SZ64KB)
#elif (VA_BITS == 42 && GRANULE_SIZE == _SZ64KB) /* ARM64_MEMORY_LAYOUT=3 */
#
# define LD_TASK_UNMAPPED_BASE UL(0x0000002000000000)
# define TASK_UNMAPPED_BASE UL(0x0000004000000000)
@ -75,9 +78,8 @@ extern char _end[];
# define MAP_VMAP_SIZE UL(0x0000000100000000)
# define MAP_FIXED_START UL(0xfffffdfffbdd0000)
# define MAP_ST_START UL(0xfffffe0000000000)
# define MAP_KERNEL_START UL(0xffffffffe0000000)
#
#elif (VA_BITS == 48 && GRANULE_SIZE == _SZ4KB)
#elif (VA_BITS == 48 && GRANULE_SIZE == _SZ4KB) /* ARM64_MEMORY_LAYOUT=2 */
#
# define LD_TASK_UNMAPPED_BASE UL(0x0000080000000000)
# define TASK_UNMAPPED_BASE UL(0x0000100000000000)
@ -86,9 +88,8 @@ extern char _end[];
# define MAP_VMAP_SIZE UL(0x0000000100000000)
# define MAP_FIXED_START UL(0xffff7ffffbdfd000)
# define MAP_ST_START UL(0xffff800000000000)
# define MAP_KERNEL_START UL(0xffffffffff800000)
#
#elif (VA_BITS == 48 && GRANULE_SIZE == _SZ64KB)
#elif (VA_BITS == 48 && GRANULE_SIZE == _SZ64KB) /* ARM64_MEMORY_LAYOUT=4 */
#
# define LD_TASK_UNMAPPED_BASE UL(0x0000080000000000)
# define TASK_UNMAPPED_BASE UL(0x0000100000000000)
@ -97,7 +98,6 @@ extern char _end[];
# define MAP_VMAP_SIZE UL(0x0000000100000000)
# define MAP_FIXED_START UL(0xffff7ffffbdd0000)
# define MAP_ST_START UL(0xffff800000000000)
# define MAP_KERNEL_START UL(0xffffffffe0000000)
#
#else
# error address space is not defined.
@ -583,6 +583,40 @@ static inline int pgsize_to_tbllv(size_t pgsize)
return level;
}
static inline int pgsize_to_pgshift(size_t pgsize)
{
/* We need to use if instead of switch because
* sometimes PTLX_CONT_SIZE == PTLX_SIZE
*/
if (pgsize == PTL4_CONT_SIZE) {
if (CONFIG_ARM64_PGTABLE_LEVELS > 3) {
return PTL4_CONT_SHIFT;
}
} else if (pgsize == PTL4_SIZE) {
if (CONFIG_ARM64_PGTABLE_LEVELS > 3) {
return PTL4_SHIFT;
}
} else if (pgsize == PTL3_CONT_SIZE) {
if (CONFIG_ARM64_PGTABLE_LEVELS > 2) {
return PTL3_CONT_SHIFT;
}
} else if (pgsize == PTL3_SIZE) {
if (CONFIG_ARM64_PGTABLE_LEVELS > 2) {
return PTL3_SHIFT;
}
} else if (pgsize == PTL2_CONT_SIZE) {
return PTL2_CONT_SHIFT;
} else if (pgsize == PTL2_SIZE) {
return PTL2_SHIFT;
} else if (pgsize == PTL1_CONT_SIZE) {
return PTL1_CONT_SHIFT;
} else if (pgsize == PTL1_SIZE) {
return PTL1_SHIFT;
}
return -EINVAL;
}
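
Because pgsize_to_pgshift() returns a negative errno for unsupported sizes, a caller can validate a page size and derive its shift in one step. A minimal sketch with a hypothetical caller, not from this tree:
~~~~
/* Sketch only: count how many pages of size pgsize cover len bytes. */
static long pages_for_range(size_t len, size_t pgsize)
{
	int shift = pgsize_to_pgshift(pgsize);

	if (shift < 0)
		return shift;                    /* -EINVAL: unsupported size */
	return (len + pgsize - 1) >> shift;      /* round up to whole pages */
}
~~~~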
static inline size_t tbllv_to_pgsize(int level)
{
size_t pgsize = 0;

View File

@ -20,17 +20,21 @@ struct arm_pmu {
void (*reset)(void*);
int (*enable_pmu)(void);
void (*disable_pmu)(void);
int (*enable_counter)(int);
int (*disable_counter)(int);
int (*enable_intens)(int);
int (*disable_intens)(int);
int (*enable_counter)(unsigned long counter_mask);
int (*disable_counter)(unsigned long counter_mask);
int (*enable_intens)(unsigned long counter_mask);
int (*disable_intens)(unsigned long counter_mask);
int (*set_event_filter)(unsigned long*, int);
void (*write_evtype)(int, uint32_t);
int (*get_event_idx)(int num_events, unsigned long used_mask,
unsigned long config);
int (*map_event)(uint32_t, uint64_t);
int (*map_hw_event)(uint64_t config);
int (*map_cache_event)(uint64_t config);
int (*map_raw_event)(uint64_t config);
void (*enable_user_access_pmu_regs)(void);
void (*disable_user_access_pmu_regs)(void);
int (*counter_mask_valid)(unsigned long counter_mask);
struct per_cpu_arm_pmu *per_cpu;
};

View File

@ -102,4 +102,6 @@ static inline void cpu_disable_nmi(void)
#endif /* __ASSEMBLY__ */
void arch_flush_icache_all(void);
#endif /* !__HEADER_ARM64_ARCH_CPU_H */

View File

@ -1,60 +0,0 @@
#ifndef ARCH_RUSAGE_H_INCLUDED
#define ARCH_RUSAGE_H_INCLUDED
#define DEBUG_RUSAGE
#define IHK_OS_PGSIZE_4KB 0
#define IHK_OS_PGSIZE_2MB 1
#define IHK_OS_PGSIZE_1GB 2
extern struct ihk_os_monitor *monitor;
extern int sprintf(char * buf, const char *fmt, ...);
#define DEBUG_ARCH_RUSAGE
#ifdef DEBUG_ARCH_RUSAGE
#define dprintf(...) \
do { \
char msg[1024]; \
sprintf(msg, __VA_ARGS__); \
kprintf("%s,%s", __FUNCTION__, msg); \
} while (0);
#define eprintf(...) \
do { \
char msg[1024]; \
sprintf(msg, __VA_ARGS__); \
kprintf("%s,%s", __FUNCTION__, msg); \
} while (0);
#else
#define dprintf(...) do { } while (0)
#define eprintf(...) \
do { \
char msg[1024]; \
sprintf(msg, __VA_ARGS__); \
kprintf("%s,%s", __FUNCTION__, msg); \
} while (0);
#endif
static inline int rusage_pgsize_to_pgtype(size_t pgsize)
{
int ret = IHK_OS_PGSIZE_4KB;
#if 0 /* postk-TODO */
switch (pgsize) {
case PTL1_SIZE:
ret = IHK_OS_PGSIZE_4KB;
break;
case PTL2_SIZE:
ret = IHK_OS_PGSIZE_2MB;
break;
case PTL3_SIZE:
ret = IHK_OS_PGSIZE_1GB;
break;
default:
eprintf("unknown pgsize=%ld\n", pgsize);
break;
}
#endif
return ret;
}
#endif /* !defined(ARCH_RUSAGE_H_INCLUDED) */

View File

@ -1,33 +0,0 @@
#ifndef ARCH_RUSAGE_H_INCLUDED
#define ARCH_RUSAGE_H_INCLUDED
#include <arch-memory.h>
#define DEBUG_RUSAGE
#define IHK_OS_PGSIZE_4KB 0
#define IHK_OS_PGSIZE_2MB 1
#define IHK_OS_PGSIZE_1GB 2
extern struct rusage_global rusage;
static inline int rusage_pgsize_to_pgtype(size_t pgsize)
{
int ret = IHK_OS_PGSIZE_4KB;
if (pgsize == PTL1_SIZE) {
ret = IHK_OS_PGSIZE_4KB;
}
else if (pgsize == PTL2_SIZE) {
ret = IHK_OS_PGSIZE_2MB;
}
else if (pgsize == PTL3_SIZE) {
ret = IHK_OS_PGSIZE_1GB;
}
else {
kprintf("%s: Error: Unknown pgsize=%ld\n", __FUNCTION__, pgsize);
}
return ret;
}
#endif /* !defined(ARCH_RUSAGE_H_INCLUDED) */

View File

@ -8,6 +8,7 @@
#define SYSCALL_HANDLED(number, name) DECLARATOR(number, name)
#define SYSCALL_DELEGATED(number, name) DECLARATOR(number, name)
#include <config.h>
#include <syscall_list.h>
#undef DECLARATOR

View File

@ -67,21 +67,12 @@ struct arm64_cpu_capabilities {
int def_scope;/* default scope */
int (*matches)(const struct arm64_cpu_capabilities *caps, int scope);
int (*enable)(void *);/* Called on all active CPUs */
union {
struct {/* To be used for erratum handling only */
uint32_t midr_model;
uint32_t midr_range_min, midr_range_max;
};
struct {/* Feature register checking */
uint32_t sys_reg;
uint8_t field_pos;
uint8_t min_field_value;
uint8_t hwcap_type;
int sign;
unsigned long hwcap;
};
};
uint32_t sys_reg;
uint8_t field_pos;
uint8_t min_field_value;
uint8_t hwcap_type;
int sign;
unsigned long hwcap;
};
/* @ref.impl include/linux/bitops.h */

View File

@ -1,4 +1,4 @@
/* fpsimd.h COPYRIGHT FUJITSU LIMITED 2016-2017 */
/* fpsimd.h COPYRIGHT FUJITSU LIMITED 2016-2019 */
#ifndef __HEADER_ARM64_COMMON_FPSIMD_H
#define __HEADER_ARM64_COMMON_FPSIMD_H
@ -42,16 +42,19 @@ extern void thread_sve_to_fpsimd(struct thread *thread, fp_regs_struct *fp_regs)
extern size_t sve_state_size(struct thread const *thread);
extern void sve_free(struct thread *thread);
extern void sve_alloc(struct thread *thread);
extern int sve_alloc(struct thread *thread);
extern void sve_save_state(void *state, unsigned int *pfpsr);
extern void sve_load_state(void const *state, unsigned int const *pfpsr, unsigned long vq_minus_1);
extern unsigned int sve_get_vl(void);
extern int sve_set_thread_vl(struct thread *thread, const unsigned long vector_length, const unsigned long flags);
extern int sve_get_thread_vl(const struct thread *thread);
extern int sve_set_thread_vl(unsigned long arg);
extern int sve_get_thread_vl(void);
extern int sve_set_vector_length(struct thread *thread, unsigned long vl, unsigned long flags);
#define SVE_SET_VL(thread, vector_length, flags) sve_set_thread_vl(thread, vector_length, flags)
#define SVE_GET_VL(thread) sve_get_thread_vl(thread)
#define SVE_SET_VL(arg) sve_set_thread_vl(arg)
#define SVE_GET_VL() sve_get_thread_vl()
/* Maximum VL that SVE VL-agnostic software can transparently support */
#define SVE_VL_ARCH_MAX 0x100
#else /* CONFIG_ARM64_SVE */
@ -80,12 +83,12 @@ static int sve_set_vector_length(struct thread *thread, unsigned long vl, unsign
}
/* for prctl syscall */
#define SVE_SET_VL(a,b,c) (-EINVAL)
#define SVE_GET_VL(a) (-EINVAL)
#define SVE_SET_VL(a) (-EINVAL)
#define SVE_GET_VL() (-EINVAL)
#endif /* CONFIG_ARM64_SVE */
extern void init_sve_vl(void);
extern void sve_setup(void);
extern void fpsimd_save_state(struct fpsimd_state *state);
extern void fpsimd_load_state(struct fpsimd_state *state);
extern void thread_fpsimd_save(struct thread *thread);

View File

@ -124,7 +124,7 @@ static inline long ihk_atomic64_read(const ihk_atomic64_t *v)
return *(volatile long *)&(v)->counter64;
}
static inline void ihk_atomic64_set(ihk_atomic64_t *v, int i)
static inline void ihk_atomic64_set(ihk_atomic64_t *v, long i)
{
v->counter64 = i;
}
@ -147,6 +147,8 @@ static inline void ihk_atomic64_add(long i, ihk_atomic64_t *v)
/* @ref.impl arch/arm64/include/asm/atomic.h::atomic64_inc */
#define ihk_atomic64_inc(v) ihk_atomic64_add(1LL, (v))
#define ihk_atomic64_cmpxchg(p, o, n) cmpxchg(&((p)->counter64), o, n)
/***********************************************************************
* others
*/

View File

@ -29,6 +29,7 @@
#define IMP_PF_INJECTION_DISTANCE5_EL0 sys_reg(3, 3, 11, 7, 5)
#define IMP_PF_INJECTION_DISTANCE6_EL0 sys_reg(3, 3, 11, 7, 6)
#define IMP_PF_INJECTION_DISTANCE7_EL0 sys_reg(3, 3, 11, 7, 7)
#define IMP_PF_PMUSERENR_EL0 sys_reg(3, 3, 9, 14, 0)
#define IMP_BARRIER_CTRL_EL1 sys_reg(3, 0, 11, 12, 0)
#define IMP_BARRIER_BST_BIT_EL1 sys_reg(3, 0, 11, 12, 4)
#define IMP_BARRIER_INIT_SYNC_BB0_EL1 sys_reg(3, 0, 15, 13, 0)

View File

@ -1,4 +1,4 @@
/* irq.h COPYRIGHT FUJITSU LIMITED 2015-2018 */
/* irq.h COPYRIGHT FUJITSU LIMITED 2015-2019 */
#ifndef __HEADER_ARM64_IRQ_H
#define __HEADER_ARM64_IRQ_H
@ -14,7 +14,8 @@
#define INTRID_QUERY_FREE_MEM 2
#define INTRID_CPU_STOP 3
#define INTRID_TLB_FLUSH 4
#define INTRID_STACK_TRACE 6
#define INTRID_STACK_TRACE 5
#define INTRID_MULTI_INTR 6
#define INTRID_MULTI_NMI 7
/* use PPI interrupt number */
@ -29,6 +30,7 @@ extern void gic_dist_init_gicv2(unsigned long dist_base_pa, unsigned long size);
extern void gic_cpu_init_gicv2(unsigned long cpu_base_pa, unsigned long size);
extern void gic_enable_gicv2(void);
extern void arm64_issue_ipi_gicv2(unsigned int cpuid, unsigned int vector);
extern void arm64_issue_host_ipi_gicv2(uint32_t cpuid, uint32_t vector);
extern void handle_interrupt_gicv2(struct pt_regs *regs);
/* Functions for GICv3 */
@ -36,6 +38,7 @@ extern void gic_dist_init_gicv3(unsigned long dist_base_pa, unsigned long size);
extern void gic_cpu_init_gicv3(unsigned long cpu_base_pa, unsigned long size);
extern void gic_enable_gicv3(void);
extern void arm64_issue_ipi_gicv3(unsigned int cpuid, unsigned int vector);
extern void arm64_issue_host_ipi_gicv3(uint32_t cpuid, uint32_t vector);
extern void handle_interrupt_gicv3(struct pt_regs *regs);
void handle_IPI(unsigned int vector, struct pt_regs *regs);

View File

@ -1,4 +1,4 @@
/* prctl.h COPYRIGHT FUJITSU LIMITED 2017 */
/* prctl.h COPYRIGHT FUJITSU LIMITED 2017-2019 */
#ifndef __HEADER_ARM64_COMMON_PRCTL_H
#define __HEADER_ARM64_COMMON_PRCTL_H
@ -6,15 +6,12 @@
#define PR_GET_THP_DISABLE 42
/* arm64 Scalable Vector Extension controls */
#define PR_SVE_SET_VL 48 /* set task vector length */
#define PR_SVE_SET_VL_THREAD (1 << 1) /* set just this thread */
#define PR_SVE_SET_VL_INHERIT (1 << 2) /* inherit across exec */
#define PR_SVE_SET_VL_ONEXEC (1 << 3) /* defer effect until exec */
#define PR_SVE_GET_VL 49 /* get task vector length */
/* Decode helpers for the return value from PR_SVE_GET_VL: */
#define PR_SVE_GET_VL_LEN(ret) ((ret) & 0x3fff) /* vector length */
#define PR_SVE_GET_VL_INHERIT (PR_SVE_SET_VL_INHERIT << 16)
/* For convenience, PR_SVE_SET_VL returns the result in the same encoding */
/* Flag values must be kept in sync with ptrace NT_ARM_SVE interface */
#define PR_SVE_SET_VL 50 /* set task vector length */
# define PR_SVE_SET_VL_ONEXEC (1 << 18) /* defer effect until exec */
#define PR_SVE_GET_VL 51 /* get task vector length */
/* Bits common to PR_SVE_SET_VL and PR_SVE_GET_VL */
# define PR_SVE_VL_LEN_MASK 0xffff
# define PR_SVE_VL_INHERIT (1 << 17) /* inherit across exec */
#endif /* !__HEADER_ARM64_COMMON_PRCTL_H */
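
With the new numbering, a PR_SVE_SET_VL argument carries the vector length in the low 16 bits (PR_SVE_VL_LEN_MASK) and the control flags above it, which is exactly how the reworked sve_set_thread_vl() splits `arg`. A sketch of composing such an argument, with illustrative values only:
~~~~
/* Sketch only: build a PR_SVE_SET_VL argument requesting a 64-byte VL
 * that is inherited across exec; assumes the defines above are in scope.
 */
static unsigned long make_sve_vl_arg(void)
{
	unsigned long vl = 64;           /* must fit in PR_SVE_VL_LEN_MASK */

	return vl | PR_SVE_VL_INHERIT;   /* flags sit above the VL field */
}
~~~~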

View File

@ -1,4 +1,4 @@
/* ptrace.h COPYRIGHT FUJITSU LIMITED 2015-2017 */
/* ptrace.h COPYRIGHT FUJITSU LIMITED 2015-2019 */
#ifndef __HEADER_ARM64_COMMON_PTRACE_H
#define __HEADER_ARM64_COMMON_PTRACE_H
@ -46,6 +46,7 @@
#ifndef __ASSEMBLY__
#include <lwk/compiler.h>
#include <ihk/types.h>
struct user_hwdebug_state {
@ -78,6 +79,70 @@ struct user_sve_header {
uint16_t __reserved;
};
enum aarch64_regset {
REGSET_GPR,
REGSET_FPR,
REGSET_TLS,
REGSET_HW_BREAK,
REGSET_HW_WATCH,
REGSET_SYSTEM_CALL,
#ifdef CONFIG_ARM64_SVE
REGSET_SVE,
#endif /* CONFIG_ARM64_SVE */
};
struct thread;
struct user_regset;
typedef int user_regset_active_fn(struct thread *target,
const struct user_regset *regset);
typedef long user_regset_get_fn(struct thread *target,
const struct user_regset *regset,
unsigned int pos, unsigned int count,
void *kbuf, void __user *ubuf);
typedef long user_regset_set_fn(struct thread *target,
const struct user_regset *regset,
unsigned int pos, unsigned int count,
const void *kbuf, const void __user *ubuf);
typedef int user_regset_writeback_fn(struct thread *target,
const struct user_regset *regset,
int immediate);
typedef unsigned int user_regset_get_size_fn(struct thread *target,
const struct user_regset *regset);
struct user_regset {
user_regset_get_fn *get;
user_regset_set_fn *set;
user_regset_active_fn *active;
user_regset_writeback_fn *writeback;
user_regset_get_size_fn *get_size;
unsigned int n;
unsigned int size;
unsigned int align;
unsigned int bias;
unsigned int core_note_type;
};
struct user_regset_view {
const char *name;
const struct user_regset *regsets;
unsigned int n;
uint32_t e_flags;
uint16_t e_machine;
uint8_t ei_osabi;
};
extern const struct user_regset_view *current_user_regset_view(void);
extern const struct user_regset *find_regset(
const struct user_regset_view *view,
unsigned int type);
extern unsigned int regset_size(struct thread *target,
const struct user_regset *regset);
/* Definitions for user_sve_header.flags: */
#define SVE_PT_REGS_MASK (1 << 0)
@ -85,7 +150,7 @@ struct user_sve_header {
#define SVE_PT_REGS_SVE SVE_PT_REGS_MASK
#define SVE_PT_VL_THREAD PR_SVE_SET_VL_THREAD
#define SVE_PT_VL_INHERIT PR_SVE_SET_VL_INHERIT
#define SVE_PT_VL_INHERIT PR_SVE_VL_INHERIT
#define SVE_PT_VL_ONEXEC PR_SVE_SET_VL_ONEXEC
/*
@ -99,7 +164,9 @@ struct user_sve_header {
*/
/* Offset from the start of struct user_sve_header to the register data */
#define SVE_PT_REGS_OFFSET ((sizeof(struct sve_context) + 15) / 16 * 16)
#define SVE_PT_REGS_OFFSET \
((sizeof(struct sve_context) + (SVE_VQ_BYTES - 1)) \
/ SVE_VQ_BYTES * SVE_VQ_BYTES)
/*
* The register data content and layout depends on the value of the
@ -174,8 +241,10 @@ struct user_sve_header {
#define SVE_PT_SVE_FFR_OFFSET(vq) \
__SVE_SIG_TO_PT(SVE_SIG_FFR_OFFSET(vq))
#define SVE_PT_SVE_FPSR_OFFSET(vq) \
((SVE_PT_SVE_FFR_OFFSET(vq) + SVE_PT_SVE_FFR_SIZE(vq) + 15) / 16 * 16)
#define SVE_PT_SVE_FPSR_OFFSET(vq) \
((SVE_PT_SVE_FFR_OFFSET(vq) + SVE_PT_SVE_FFR_SIZE(vq) + \
(SVE_VQ_BYTES - 1)) \
/ SVE_VQ_BYTES * SVE_VQ_BYTES)
#define SVE_PT_SVE_FPCR_OFFSET(vq) \
(SVE_PT_SVE_FPSR_OFFSET(vq) + SVE_PT_SVE_FPSR_SIZE)
@ -184,9 +253,10 @@ struct user_sve_header {
* 128-bit boundary.
*/
#define SVE_PT_SVE_SIZE(vq, flags) \
((SVE_PT_SVE_FPCR_OFFSET(vq) + SVE_PT_SVE_FPCR_SIZE - \
SVE_PT_SVE_OFFSET + 15) / 16 * 16)
#define SVE_PT_SVE_SIZE(vq, flags) \
((SVE_PT_SVE_FPCR_OFFSET(vq) + SVE_PT_SVE_FPCR_SIZE \
- SVE_PT_SVE_OFFSET + (SVE_VQ_BYTES - 1)) \
/ SVE_VQ_BYTES * SVE_VQ_BYTES)
#define SVE_PT_SIZE(vq, flags) \
(((flags) & SVE_PT_REGS_MASK) == SVE_PT_REGS_SVE ? \

View File

@ -85,7 +85,11 @@ enum __rlimit_resource
__RLIMIT_RTPRIO = 14,
#define RLIMIT_RTPRIO __RLIMIT_RTPRIO
__RLIMIT_NLIMITS = 15,
/* timeout for RT tasks in us */
__RLIMIT_RTTIME = 15,
#define RLIMIT_RTTIME __RLIMIT_RTTIME
__RLIMIT_NLIMITS = 16,
__RLIM_NLIMITS = __RLIMIT_NLIMITS
#define RLIMIT_NLIMITS __RLIMIT_NLIMITS
#define RLIM_NLIMITS __RLIM_NLIMITS

View File

@ -1,4 +1,4 @@
/* signal.h COPYRIGHT FUJITSU LIMITED 2015-2018 */
/* signal.h COPYRIGHT FUJITSU LIMITED 2015-2019 */
#ifndef __HEADER_ARM64_COMMON_SIGNAL_H
#define __HEADER_ARM64_COMMON_SIGNAL_H
@ -298,6 +298,7 @@ struct extra_context {
struct _aarch64_ctx head;
void *data; /* 16-byte aligned pointer to the extra space */
uint32_t size; /* size in bytes of the extra space */
uint32_t __reserved[3];
};
#define SVE_MAGIC 0x53564501
@ -318,19 +319,25 @@ struct sve_context {
* The SVE architecture leaves space for future expansion of the
* vector length beyond its initial architectural limit of 2048 bits
* (16 quadwords).
*
* See linux/Documentation/arm64/sve.txt for a description of the VL/VQ
* terminology.
*/
#define SVE_VQ_MIN 1
#define SVE_VQ_MAX 0x200
#define SVE_VQ_BYTES 16 /* number of bytes per quadword */
#define SVE_VL_MIN (SVE_VQ_MIN * 0x10)
#define SVE_VL_MAX (SVE_VQ_MAX * 0x10)
#define SVE_VQ_MIN 1
#define SVE_VQ_MAX 512
#define SVE_VL_MIN (SVE_VQ_MIN * SVE_VQ_BYTES)
#define SVE_VL_MAX (SVE_VQ_MAX * SVE_VQ_BYTES)
#define SVE_NUM_ZREGS 32
#define SVE_NUM_PREGS 16
#define sve_vl_valid(vl) \
((vl) % 0x10 == 0 && (vl) >= SVE_VL_MIN && (vl) <= SVE_VL_MAX)
#define sve_vq_from_vl(vl) ((vl) / 0x10)
((vl) % SVE_VQ_BYTES == 0 && (vl) >= SVE_VL_MIN && (vl) <= SVE_VL_MAX)
#define sve_vq_from_vl(vl) ((vl) / SVE_VQ_BYTES)
#define sve_vl_from_vq(vq) ((vq) * SVE_VQ_BYTES)
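
A VQ counts 128-bit quadwords, so VL in bytes is always VQ times SVE_VQ_BYTES. A tiny standalone check of the round-trip, duplicating the macros above for self-containment:
~~~~
#include <assert.h>

#define SVE_VQ_BYTES 16  /* bytes per 128-bit quadword */
#define sve_vq_from_vl(vl) ((vl) / SVE_VQ_BYTES)
#define sve_vl_from_vq(vq) ((vq) * SVE_VQ_BYTES)

int main(void)
{
	/* A 256-bit vector is 32 bytes, i.e. 2 quadwords. */
	assert(sve_vq_from_vl(32) == 2);
	assert(sve_vl_from_vq(2) == 32);
	return 0;
}
~~~~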
/*
* The total size of meaningful data in the SVE context in bytes,
@ -365,11 +372,13 @@ struct sve_context {
* Additional data might be appended in the future.
*/
#define SVE_SIG_ZREG_SIZE(vq) ((uint32_t)(vq) * 16)
#define SVE_SIG_PREG_SIZE(vq) ((uint32_t)(vq) * 2)
#define SVE_SIG_ZREG_SIZE(vq) ((uint32_t)(vq) * SVE_VQ_BYTES)
#define SVE_SIG_PREG_SIZE(vq) ((uint32_t)(vq) * (SVE_VQ_BYTES / 8))
#define SVE_SIG_FFR_SIZE(vq) SVE_SIG_PREG_SIZE(vq)
#define SVE_SIG_REGS_OFFSET ((sizeof(struct sve_context) + 15) / 16 * 16)
#define SVE_SIG_REGS_OFFSET \
((sizeof(struct sve_context) + (SVE_VQ_BYTES - 1)) \
/ SVE_VQ_BYTES * SVE_VQ_BYTES)
#define SVE_SIG_ZREGS_OFFSET SVE_SIG_REGS_OFFSET
#define SVE_SIG_ZREG_OFFSET(vq, n) \

View File

@ -2,7 +2,7 @@
SYSCALL_DELEGATED(4, io_getevents)
SYSCALL_DELEGATED(17, getcwd)
SYSCALL_DELEGATED(22, epoll_pwait)
SYSCALL_HANDLED(22, epoll_pwait)
SYSCALL_DELEGATED(25, fcntl)
SYSCALL_HANDLED(29, ioctl)
SYSCALL_DELEGATED(35, unlinkat)
@ -17,8 +17,8 @@ SYSCALL_DELEGATED(64, write)
SYSCALL_DELEGATED(66, writev)
SYSCALL_DELEGATED(67, pread64)
SYSCALL_DELEGATED(68, pwrite64)
SYSCALL_DELEGATED(72, pselect6)
SYSCALL_DELEGATED(73, ppoll)
SYSCALL_HANDLED(72, pselect6)
SYSCALL_HANDLED(73, ppoll)
SYSCALL_HANDLED(74, signalfd4)
SYSCALL_DELEGATED(78, readlinkat)
SYSCALL_DELEGATED(80, fstat)
@ -83,6 +83,7 @@ SYSCALL_HANDLED(175, geteuid)
SYSCALL_HANDLED(176, getgid)
SYSCALL_HANDLED(177, getegid)
SYSCALL_HANDLED(178, gettid)
SYSCALL_HANDLED(179, sysinfo)
SYSCALL_DELEGATED(188, msgrcv)
SYSCALL_DELEGATED(189, msgsnd)
SYSCALL_DELEGATED(192, semtimedop)
@ -111,20 +112,16 @@ SYSCALL_HANDLED(236, get_mempolicy)
SYSCALL_HANDLED(237, set_mempolicy)
SYSCALL_HANDLED(238, migrate_pages)
SYSCALL_HANDLED(239, move_pages)
#ifdef PERF_ENABLE
#ifdef ENABLE_PERF
SYSCALL_HANDLED(241, perf_event_open)
#else // PERF_ENABLE
SYSCALL_DELEGATED(241, perf_event_open)
#endif // PERF_ENABLE
SYSCALL_HANDLED(260, wait4)
SYSCALL_HANDLED(261, prlimit64)
SYSCALL_HANDLED(270, process_vm_readv)
SYSCALL_HANDLED(271, process_vm_writev)
#ifdef PERF_ENABLE
SYSCALL_HANDLED(601, pmc_init)
SYSCALL_HANDLED(602, pmc_start)
SYSCALL_HANDLED(603, pmc_stop)
SYSCALL_HANDLED(604, pmc_reset)
#endif // PERF_ENABLE
SYSCALL_HANDLED(281, execveat)
SYSCALL_HANDLED(700, get_cpu_id)
#ifdef PROFILE_ENABLE
SYSCALL_HANDLED(__NR_profile, profile)
@ -132,6 +129,7 @@ SYSCALL_HANDLED(__NR_profile, profile)
SYSCALL_HANDLED(730, util_migrate_inter_kernel)
SYSCALL_HANDLED(731, util_indicate_clone)
SYSCALL_HANDLED(732, get_system)
SYSCALL_HANDLED(733, util_register_desc)
/* McKernel Specific */
SYSCALL_HANDLED(801, swapout)
@ -146,3 +144,9 @@ SYSCALL_HANDLED(1045, signalfd)
SYSCALL_DELEGATED(1049, stat)
SYSCALL_DELEGATED(1060, getpgrp)
SYSCALL_HANDLED(1062, time)
SYSCALL_DELEGATED(1069, epoll_wait)
/* Do not edit the lines including this comment and
* EOF just after it because those are used as a
* robust marker for the autotest patch.
*/

View File

@ -1,4 +1,4 @@
/* thread_info.h COPYRIGHT FUJITSU LIMITED 2015-2018 */
/* thread_info.h COPYRIGHT FUJITSU LIMITED 2015-2019 */
#ifndef __HEADER_ARM64_COMMON_THREAD_INFO_H
#define __HEADER_ARM64_COMMON_THREAD_INFO_H
@ -46,9 +46,9 @@ struct thread_info {
int cpu; /* cpu */
struct cpu_context cpu_context; /* kernel_context */
void *sve_state; /* SVE registers, if any */
uint16_t sve_vl; /* SVE vector length */
uint16_t sve_vl_onexec; /* SVE vl after next exec */
uint16_t sve_flags; /* SVE related flags */
unsigned int sve_vl; /* SVE vector length */
unsigned int sve_vl_onexec; /* SVE vl after next exec */
unsigned long sve_flags; /* SVE related flags */
unsigned long fault_address; /* fault info */
unsigned long fault_code; /* ESR_EL1 value */
};
@ -56,7 +56,7 @@ struct thread_info {
/* Flags for sve_flags (intentionally defined to match the prctl flags) */
/* Inherit sve_vl and sve_flags across execve(): */
#define THREAD_VL_INHERIT PR_SVE_SET_VL_INHERIT
#define THREAD_VL_INHERIT PR_SVE_VL_INHERIT
struct arm64_cpu_local_thread {
struct thread_info thread_info;

View File

@ -4,6 +4,7 @@
#define __ASM_TRAP_H
#include <types.h>
#include <arch-lock.h>
struct pt_regs;

View File

@ -7,7 +7,7 @@
#include <memory.h>
#include <affinity.h>
#include <syscall.h>
#include <debug.h>
#include <ihk/debug.h>
#include <arch-timer.h>
#include <cls.h>
@ -31,10 +31,9 @@ void *cpu_base;
* function, it is not necessary to disable/enable
* interrupts in this function as gic_raise_softirq() does.
*/
static void arm64_raise_sgi_gicv2(unsigned int cpuid, unsigned int vector)
static void __arm64_raise_sgi_gicv2(unsigned int hw_cpuid, unsigned int vector)
{
/* Build interrupt destination of the target cpu */
unsigned int hw_cpuid = ihk_mc_get_cpu_info()->hw_ids[cpuid];
uint8_t cpu_target_list = gic_hwid_to_affinity(hw_cpuid);
/*
@ -50,6 +49,23 @@ static void arm64_raise_sgi_gicv2(unsigned int cpuid, unsigned int vector)
);
}
static void arm64_raise_sgi_gicv2(uint32_t cpuid, uint32_t vector)
{
/* Build interrupt destination of the target CPU */
uint32_t hw_cpuid = ihk_mc_get_cpu_info()->hw_ids[cpuid];
__arm64_raise_sgi_gicv2(hw_cpuid, vector);
}
static void arm64_raise_sgi_to_host_gicv2(uint32_t cpuid, uint32_t vector)
{
/* Build interrupt destination of the target Linux/host CPU */
uint32_t hw_cpuid = ihk_mc_get_apicid(cpuid);
__arm64_raise_sgi_gicv2(hw_cpuid, vector);
}
/**
* arm64_raise_spi_gicv2
* @ref.impl nothing.
@ -77,6 +93,11 @@ static void arm64_raise_spi_gicv2(unsigned int cpuid, unsigned int vector)
);
}
void arm64_issue_host_ipi_gicv2(uint32_t cpuid, uint32_t vector)
{
arm64_raise_sgi_to_host_gicv2(cpuid, vector);
}
/**
* arm64_issue_ipi_gicv2
* @param cpuid : hardware cpu id

View File

@ -6,7 +6,7 @@
#include <cputype.h>
#include <process.h>
#include <syscall.h>
#include <debug.h>
#include <ihk/debug.h>
#include <arch-timer.h>
#include <cls.h>
@ -195,15 +195,12 @@ static inline void gic_write_bpr1(uint32_t val)
}
#endif
static void arm64_raise_sgi_gicv3(uint32_t cpuid, uint32_t vector)
static void __arm64_raise_sgi_gicv3(uint32_t hw_cpuid, uint32_t vector)
{
uint64_t mpidr, cluster_id;
uint16_t tlist;
uint64_t val;
/* Build interrupt destination of the target cpu */
uint32_t hw_cpuid = ihk_mc_get_cpu_info()->hw_ids[cpuid];
/*
* Ensure that stores to Normal memory are visible to the
* other CPUs before issuing the IPI.
@ -239,6 +236,22 @@ static void arm64_raise_sgi_gicv3(uint32_t cpuid, uint32_t vector)
}
}
static void arm64_raise_sgi_gicv3(uint32_t cpuid, uint32_t vector)
{
/* Build interrupt destination of the target CPU */
uint32_t hw_cpuid = ihk_mc_get_cpu_info()->hw_ids[cpuid];
__arm64_raise_sgi_gicv3(hw_cpuid, vector);
}
static void arm64_raise_sgi_to_host_gicv3(uint32_t cpuid, uint32_t vector)
{
/* Build interrupt destination of the target Linux/host CPU */
uint32_t hw_cpuid = ihk_mc_get_apicid(cpuid);
__arm64_raise_sgi_gicv3(hw_cpuid, vector);
}
static void arm64_raise_spi_gicv3(uint32_t cpuid, uint32_t vector)
{
uint64_t spi_reg_offset;
@ -268,6 +281,11 @@ static void arm64_raise_lpi_gicv3(uint32_t cpuid, uint32_t vector)
ekprintf("%s called.\n", __func__);
}
void arm64_issue_host_ipi_gicv3(uint32_t cpuid, uint32_t vector)
{
arm64_raise_sgi_to_host_gicv3(cpuid, vector);
}
void arm64_issue_ipi_gicv3(uint32_t cpuid, uint32_t vector)
{
dkprintf("Send irq#%d to cpuid=%d\n", vector, cpuid);
@ -292,6 +310,9 @@ void handle_interrupt_gicv3(struct pt_regs *regs)
{
uint64_t irqnr;
const int from_user = interrupt_from_user(regs);
struct cpu_local_var *v = get_this_cpu_local_var();
//unsigned long irqflags;
int do_check = 0;
irqnr = gic_read_iar();
cpu_enable_nmi();
@ -305,10 +326,18 @@ void handle_interrupt_gicv3(struct pt_regs *regs)
}
set_cputime(from_user ? CPUTIME_MODE_K2U : CPUTIME_MODE_K2K_OUT);
/* for migration by IPI */
if (get_this_cpu_local_var()->flags & CPU_FLAG_NEED_MIGRATE) {
schedule();
//irqflags = ihk_mc_spinlock_lock(&v->runq_lock);
/* For migration by IPI or by timesharing */
if (v->flags &
(CPU_FLAG_NEED_MIGRATE | CPU_FLAG_NEED_RESCHED)) {
v->flags &= ~CPU_FLAG_NEED_RESCHED;
do_check = 1;
}
//ihk_mc_spinlock_unlock(&v->runq_lock, irqflags);
if (do_check) {
check_signal(0, regs, 0);
schedule();
}
}
@ -344,9 +373,11 @@ static void init_spi_routing(uint32_t irq, uint32_t linux_cpu)
void gic_dist_init_gicv3(unsigned long dist_base_pa, unsigned long size)
{
#ifndef IHK_IKC_USE_LINUX_WORK_IRQ
extern int spi_table[];
extern int nr_spi_table;
int i;
#endif // !IHK_IKC_USE_LINUX_WORK_IRQ
dist_base = map_fixed_area(dist_base_pa, size, 1 /* non-cacheable */);
@ -357,6 +388,7 @@ void gic_dist_init_gicv3(unsigned long dist_base_pa, unsigned long size)
}
#endif
#ifndef IHK_IKC_USE_LINUX_WORK_IRQ
/* initialize spi routing */
for (i = 0; i < nr_spi_table; i++) {
if (spi_table[i] == -1) {
@ -364,6 +396,7 @@ void gic_dist_init_gicv3(unsigned long dist_base_pa, unsigned long size)
}
init_spi_routing(spi_table[i], i);
}
#endif // !IHK_IKC_USE_LINUX_WORK_IRQ
}
void gic_cpu_init_gicv3(unsigned long cpu_base_pa, unsigned long size)

View File

@ -1,6 +1,5 @@
/* memory.c COPYRIGHT FUJITSU LIMITED 2015-2018 */
#include <ihk/cpu.h>
#include <ihk/debug.h>
#include <ihk/mm.h>
#include <types.h>
#include <memory.h>
@ -14,7 +13,7 @@
#include <context.h>
#include <kmalloc.h>
#include <vdso.h>
#include <debug.h>
#include <ihk/debug.h>
#include <rusage_private.h>
#include <cputype.h>
@ -2672,17 +2671,28 @@ int set_range_l1(void *args0, pte_t *ptep, uintptr_t base, uintptr_t start,
}
phys = args->phys + (base - start);
if (__page_offset(base, PTL1_CONT_SIZE) == 0) { //check head pte
/* Check if we can begin / end a series of contiguous PTEs */
if (__page_offset(base, PTL1_CONT_SIZE) == 0) {
uintptr_t next_addr = base + PTL1_CONT_SIZE;
if (end < next_addr) {
next_addr = end;
}
// set contiguous bit until the next head pte
// if phys is aligned and range does not end early.
/* Begin the series if physical address is also aligned and
* the range covers the series. Don't start or end it if
* physical address is not aligned or the range ends early.
*/
if (__page_offset(phys | next_addr, PTL1_CONT_SIZE) == 0) {
args->attr[0] |= PTE_CONT;
if (rusage_memory_stat_add(args->range, phys,
PTL1_CONT_SIZE,
PTL1_CONT_SIZE)) {
dkprintf("%lx+,%s: calling memory_stat_rss_add(),base=%lx,phys=%lx,size=%ld,pgsize=%ld\n",
phys, __func__, base, phys,
PTL1_CONT_SIZE, PTL1_CONT_SIZE);
}
} else {
args->attr[0] &= ~PTE_CONT;
}
@ -2692,12 +2702,13 @@ int set_range_l1(void *args0, pte_t *ptep, uintptr_t base, uintptr_t start,
error = 0;
// call memory_stat_rss_add() here because pgshift is resolved here
if (rusage_memory_stat_add(args->range, phys, PTL1_SIZE, PTL1_SIZE)) {
dkprintf("%lx+,%s: calling memory_stat_rss_add(),base=%lx,phys=%lx,size=%ld,pgsize=%ld\n",
phys, __func__, base, phys, PTL1_SIZE, PTL1_SIZE);
} else {
dkprintf("%s: !calling memory_stat_rss_add(),base=%lx,phys=%lx,size=%ld,pgsize=%ld\n",
__func__, base, phys, PTL1_SIZE, PTL1_SIZE);
if (!(args->attr[0] & PTE_CONT)) {
if (rusage_memory_stat_add(args->range, phys,
PTL1_SIZE, PTL1_SIZE)) {
dkprintf("%lx+,%s: calling memory_stat_rss_add(),base=%lx,phys=%lx,size=%ld,pgsize=%ld\n",
phys, __func__, base, phys,
PTL1_SIZE, PTL1_SIZE);
}
}
out:
@ -2761,7 +2772,9 @@ retry:
phys = args->phys + (base - start);
//check head pte
/* Check if we can begin / end a series of
* contiguous PTEs
*/
if (__page_offset(base, tbl.cont_pgsize) == 0) {
uintptr_t next_addr = base +
tbl.cont_pgsize;
@ -2770,11 +2783,24 @@ retry:
next_addr = end;
}
// set contiguous bit until the
// next head pte if phys is aligned
// and range does not end early.
/* Begin the series if physical address
* is also aligned and the range covers
* the series. Don't start or end it if
* physical address is not aligned or
* the range ends early.
*/
if (__page_offset(phys | next_addr, tbl.cont_pgsize) == 0) {
args->attr[level-1] |= PTE_CONT;
if (rusage_memory_stat_add(args->range,
phys,
tbl.cont_pgsize,
tbl.cont_pgsize)) {
dkprintf("%lx+,%s: calling memory_stat_rss_add(),base=%lx,phys=%lx,size=%ld,pgsize=%ld\n",
phys, __func__,
base, phys,
tbl.cont_pgsize,
tbl.cont_pgsize);
}
} else {
args->attr[level-1] &= ~PTE_CONT;
}
@ -2782,21 +2808,23 @@ retry:
ptl_set(ptep, phys | args->attr[level-1],
level);
error = 0;
dkprintf("set_range_middle(%lx,%lx,%lx,%d):"
"large page. %d %lx\n",
base, start, end, level, error, *ptep);
// Call memory_stat_rss_add() here because pgshift is resolved here
if (rusage_memory_stat_add(args->range, phys,
tbl.pgsize,
tbl.pgsize)) {
dkprintf("%lx+,%s: calling memory_stat_rss_add(),base=%lx,phys=%lx,size=%ld,pgsize=%ld\n",
phys, __func__, base, phys,
tbl.pgsize, tbl.pgsize);
} else {
dkprintf("%s: !calling memory_stat_rss_add(),base=%lx,phys=%lx,size=%ld,pgsize=%ld\n",
__func__, base, phys,
tbl.pgsize, tbl.pgsize);
if (!(args->attr[level-1] & PTE_CONT)) {
if (rusage_memory_stat_add(args->range,
phys,
tbl.pgsize,
tbl.pgsize)) {
dkprintf("%lx+,%s: calling memory_stat_rss_add(),base=%lx,phys=%lx,size=%ld,pgsize=%ld\n",
phys, __func__, base,
phys,
tbl.pgsize,
tbl.pgsize);
}
}
goto out;
}
@ -2848,7 +2876,7 @@ retry:
error = 0;
out:
if (tt_pa) {
ihk_mc_free_pages(tt_pa, 1);
ihk_mc_free_pages(phys_to_virt((unsigned long)tt_pa), 1);
}
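ihk_mc_free_pages() expects a kernel virtual address, while tt_pa holds the physical address of the translation table, so the phys_to_virt() conversion frees the page through the straight map instead of through a bogus pointer.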
dkprintf("set_range_middle(%lx,%lx,%lx,%d): %d %lx\n",
base, start, end, level, error, *ptep);
@ -3200,6 +3228,7 @@ void load_page_table(struct page_table *pt)
{
if (pt == NULL) {
// load page table for idle(EL1) process.
switch_mm(init_pt);
return;
}
// load page table for user(EL0) thread.
@ -3259,7 +3288,7 @@ void *map_fixed_area(unsigned long phys, unsigned long size, int uncachable)
attr |= PTATTR_UNCACHABLE;
}
kprintf("map_fixed: phys: 0x%lx => 0x%lx (%d pages)\n",
dkprintf("map_fixed: phys: 0x%lx => 0x%lx (%d pages)\n",
paligned, v, npages);
pt = get_init_page_table();
@ -3335,15 +3364,15 @@ unsigned long virt_to_phys(void *v)
{
unsigned long va = (unsigned long)v;
if (MAP_KERNEL_START <= va) {
return va - MAP_KERNEL_START + arm64_kernel_phys_base;
if (va >= MAP_ST_START) {
return va - MAP_ST_START + arm64_st_phys_base;
}
return va - MAP_ST_START;
return va - MAP_KERNEL_START + arm64_kernel_phys_base;
}
void *phys_to_virt(unsigned long p)
{
return (void *)(p | MAP_ST_START);
return (void *)((p - arm64_st_phys_base) | MAP_ST_START);
}
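A minimal self-check sketch of the new round trip (the helper below is illustrative, not part of the patch; it relies only on the two functions above and the arm64_st_phys_base they use):
/* Sketch: the straight map must be the identity on RAM. */
static inline int straight_map_roundtrip_ok(void)
{
	unsigned long p = arm64_st_phys_base + 0x1000; /* any RAM address */
	return virt_to_phys(phys_to_virt(p)) == p;
}
Both directions now offset by arm64_st_phys_base, so the straight-map window works on platforms whose physical RAM base is nonzero.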
int copy_from_user(void *dst, const void *src, size_t siz)
@ -3716,44 +3745,6 @@ translation_table_t* get_translation_table_as_paddr(const struct page_table *pt)
return pt->tt_pa;
}
#ifdef POSTK_DEBUG_ARCH_DEP_8
void remote_flush_tlb_cpumask(struct process_vm *vm,
unsigned long addr, int cpu_id)
{
unsigned long cpu;
cpu_set_t _cpu_set;
int flush_ind;
if (addr) {
flush_ind = (addr >> PAGE_SHIFT) % IHK_TLB_FLUSH_IRQ_VECTOR_SIZE;
}
/* Zero address denotes full TLB flush */
else {
/* Random.. */
flush_ind = (rdtsc()) % IHK_TLB_FLUSH_IRQ_VECTOR_SIZE;
}
/* Take a copy of the cpu set so that we don't hold the lock
* all the way while interrupting other cores */
ihk_mc_spinlock_lock_noirq(&vm->address_space->cpu_set_lock);
memcpy(&_cpu_set, &vm->address_space->cpu_set, sizeof(cpu_set_t));
ihk_mc_spinlock_unlock_noirq(&vm->address_space->cpu_set_lock);
/* Loop through CPUs in this address space and interrupt them for
* TLB flush on the specified address */
for_each_set_bit(cpu, (const unsigned long*)&_cpu_set.__bits, CPU_SETSIZE) {
if (ihk_mc_get_processor_id() == cpu)
continue;
dkprintf("remote_flush_tlb_cpumask: flush_ind: %d, addr: 0x%lX, interrupting cpu: %d\n",
flush_ind, addr, cpu);
ihk_mc_interrupt_cpu(cpu,
ihk_mc_get_vector(flush_ind + IHK_TLB_FLUSH_IRQ_VECTOR_START));
}
}
#endif /* POSTK_DEBUG_ARCH_DEP_8 */
void arch_adjust_allocate_page_size(struct page_table *pt,
uintptr_t fault_addr,
pte_t *ptep,


@ -19,7 +19,7 @@ int ihk_mc_ikc_init_first_local(struct ihk_ikc_channel_desc *channel,
memset(channel, 0, sizeof(struct ihk_ikc_channel_desc));
mikc_queue_pages = ((4 * num_processors * MASTER_IKCQ_PKTSIZE)
mikc_queue_pages = ((8 * num_processors * MASTER_IKCQ_PKTSIZE)
+ (PAGE_SIZE - 1)) / PAGE_SIZE;
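For example, assuming PAGE_SIZE = 4096 and MASTER_IKCQ_PKTSIZE = 128 (an illustrative value), 48 processors now need (8 * 48 * 128 + 4095) / 4096 = 12 pages, double the 6 pages of the previous factor.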
/* Place both sides in this side */


@ -8,6 +8,7 @@
#include <string.h>
#include <ihk/mm.h>
#include <irq.h>
#include <process.h>
/*
* @ref.impl arch/arm64/kernel/perf_event.c
@ -85,25 +86,17 @@ void arm64_disable_user_access_pmu_regs(void)
cpu_pmu.disable_user_access_pmu_regs();
}
extern unsigned int *arm64_march_perfmap;
static int __ihk_mc_perfctr_init(int counter, uint32_t type, uint64_t config, int mode)
{
int ret = -1;
unsigned long config_base = 0;
int mapping;
mapping = cpu_pmu.map_event(type, config);
if (mapping < 0) {
return mapping;
}
ret = cpu_pmu.disable_counter(counter);
ret = cpu_pmu.disable_counter(1UL << counter);
if (ret < 0) {
return ret;
}
ret = cpu_pmu.enable_intens(counter);
ret = cpu_pmu.enable_intens(1UL << counter);
if (ret < 0) {
return ret;
}
@ -112,7 +105,7 @@ static int __ihk_mc_perfctr_init(int counter, uint32_t type, uint64_t config, in
if (ret) {
return ret;
}
config_base |= (unsigned long)mapping;
config_base |= config;
cpu_pmu.write_evtype(counter, config_base);
return ret;
}
@ -124,68 +117,24 @@ int ihk_mc_perfctr_init_raw(int counter, uint64_t config, int mode)
return ret;
}
int ihk_mc_perfctr_init(int counter, uint64_t config, int mode)
{
int ret;
ret = __ihk_mc_perfctr_init(counter, PERF_TYPE_RAW, config, mode);
return ret;
}
int ihk_mc_perfctr_start(unsigned long counter_mask)
{
int ret = 0, i;
for (i = 0; i < sizeof(counter_mask) * BITS_PER_BYTE; i++) {
if (counter_mask & (1UL << i)) {
ret = cpu_pmu.enable_counter(i);
if (ret < 0) {
kprintf("%s: enable failed(idx=%d)\n",
__func__, i);
break;
}
}
}
return ret;
return cpu_pmu.enable_counter(counter_mask);
}
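A usage sketch of the mask-based interface (counter numbers illustrative):
/* Enable event counters 0 and 2 with a single call, i.e. one
 * PMCNTENSET_EL0 write instead of a per-index loop:
 *   ihk_mc_perfctr_start((1UL << 0) | (1UL << 2));
 */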
int ihk_mc_perfctr_stop(unsigned long counter_mask, int flags)
{
int i = 0;
for (i = 0; i < sizeof(counter_mask) * BITS_PER_BYTE; i++) {
if (!(counter_mask & (1UL << i)))
continue;
int ret = 0;
ret = cpu_pmu.disable_counter(i);
if (ret < 0) {
continue;
}
if (flags & IHK_MC_PERFCTR_DISABLE_INTERRUPT) {
// when ihk_mc_perfctr_start is called,
// ihk_mc_perfctr_init is also called so disable
// interrupt
ret = cpu_pmu.disable_intens(i);
if (ret < 0) {
continue;
}
}
}
return 0;
return cpu_pmu.disable_counter(counter_mask);
}
int ihk_mc_perfctr_reset(int counter)
{
// TODO[PMU]: As with ihk_mc_perfctr_set, implement fully once the common code's handling of the sampling rate is settled.
cpu_pmu.write_counter(counter, 0);
return 0;
}
int ihk_mc_perfctr_set(int counter, long val)
{
// TODO[PMU]: The common code is expected to compute the sampling rate and pass the counter value to set via val. Implement fully once the sampling-rate handling is settled.
uint32_t v = val;
cpu_pmu.write_counter(counter, v);
return 0;
@ -198,6 +147,15 @@ int ihk_mc_perfctr_read_mask(unsigned long counter_mask, unsigned long *value)
return 0;
}
int ihk_mc_perfctr_alloc(struct thread *thread, struct mc_perf_event *event)
{
const int counters = ihk_mc_perf_get_num_counters();
return cpu_pmu.get_event_idx(counters,
thread->pmc_alloc_map,
event->hw_config);
}
unsigned long ihk_mc_perfctr_read(int counter)
{
unsigned long count;
@ -205,6 +163,14 @@ unsigned long ihk_mc_perfctr_read(int counter)
return count;
}
unsigned long ihk_mc_perfctr_value(int counter, unsigned long correction)
{
unsigned long count = ihk_mc_perfctr_read(counter) + correction;
count &= ((1UL << 32) - 1);
return count;
}
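The mask confines the corrected value to the 32-bit counter width; for example, a raw read of 0xffffffff plus a correction of 1 wraps to 0, matching what the hardware counter itself would show.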
int ihk_mc_perfctr_alloc_counter(unsigned int *type, unsigned long *config,
unsigned long pmc_status)
{
@ -234,12 +200,14 @@ int ihk_mc_perfctr_alloc_counter(unsigned int *type, unsigned long *config,
int ihk_mc_perf_counter_mask_check(unsigned long counter_mask)
{
return 1;
return cpu_pmu.counter_mask_valid(counter_mask);
}
int ihk_mc_perf_get_num_counters(void)
{
return cpu_pmu.per_cpu[ihk_mc_get_processor_id()].num_events;
const struct per_cpu_arm_pmu *per_cpu_arm_pmu = get_per_cpu_pmu();
return per_cpu_arm_pmu->num_events;
}
int ihk_mc_perfctr_set_extra(struct mc_perf_event *event)
@ -247,3 +215,83 @@ int ihk_mc_perfctr_set_extra(struct mc_perf_event *event)
/* Nothing to do. */
return 0;
}
static inline uint64_t arm_pmu_event_max_period(struct mc_perf_event *event)
{
return 0xFFFFFFFF;
}
int hw_perf_event_init(struct mc_perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
if (!is_sampling_event(event)) {
hwc->sample_period = arm_pmu_event_max_period(event) >> 1;
hwc->last_period = hwc->sample_period;
ihk_atomic64_set(&hwc->period_left, hwc->sample_period);
}
return 0;
}
int ihk_mc_event_set_period(struct mc_perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
int64_t left = ihk_atomic64_read(&hwc->period_left);
int64_t period = hwc->sample_period;
uint64_t max_period;
int ret = 0;
max_period = arm_pmu_event_max_period(event);
if (unlikely(left <= -period)) {
left = period;
ihk_atomic64_set(&hwc->period_left, left);
hwc->last_period = period;
ret = 1;
}
if (unlikely(left <= 0)) {
left += period;
ihk_atomic64_set(&hwc->period_left, left);
hwc->last_period = period;
ret = 1;
}
/*
* Limit the maximum period to prevent the counter value
* from overtaking the one we are about to program. In
* effect we are reducing max_period to account for
* interrupt latency (and we are being very conservative).
*/
if (left > (max_period >> 1))
left = (max_period >> 1);
ihk_atomic64_set(&hwc->prev_count, (uint64_t)-left);
cpu_pmu.write_counter(event->counter_id,
(uint64_t)(-left) & max_period);
return ret;
}
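A worked example of programming the negated period, assuming the 32-bit max_period above:
/* left = 1000:
 *   (uint64_t)(-left) & 0xffffffff = 0xfffffc18
 * The counter counts up from 0xfffffc18 and overflows, raising
 * the PMU interrupt, after exactly 1000 events. */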
uint64_t ihk_mc_event_update(struct mc_perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
int64_t delta;
uint64_t prev_raw_count, new_raw_count;
uint64_t max_period = arm_pmu_event_max_period(event);
again:
prev_raw_count = ihk_atomic64_read(&hwc->prev_count);
new_raw_count = cpu_pmu.read_counter(event->counter_id);
if (ihk_atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
new_raw_count) != prev_raw_count)
goto again;
delta = (new_raw_count - prev_raw_count) & max_period;
ihk_atomic64_add(delta, &event->count);
ihk_atomic64_add(-delta, &hwc->period_left);
return new_raw_count;
}
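Masking the difference with max_period keeps the delta correct across a counter wrap; for example:
/* prev_raw_count = 0xfffffff0, new_raw_count = 0x00000010:
 *   (0x00000010 - 0xfffffff0) & 0xffffffff = 0x20
 * so 32 events are accounted even though the 32-bit counter
 * wrapped between the two reads. */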


@ -4,7 +4,6 @@
#include <ihk/perfctr.h>
#include <errno.h>
#include <ihk/debug.h>
#include <debug.h>
#include <sysreg.h>
#include <virt.h>
#include <bitops.h>
@ -21,29 +20,174 @@
#define DDEBUG_DEFAULT DDEBUG_PRINT
#endif
/*
* read pmevcntr<n>_el0 functions
*/
#define read_pmevcntrN_el0(N) \
static uint32_t read_pmevcntr##N##_el0(void) \
{ \
return read_sysreg(pmevcntr##N##_el0); \
}
read_pmevcntrN_el0(0)
read_pmevcntrN_el0(1)
read_pmevcntrN_el0(2)
read_pmevcntrN_el0(3)
read_pmevcntrN_el0(4)
read_pmevcntrN_el0(5)
read_pmevcntrN_el0(6)
read_pmevcntrN_el0(7)
read_pmevcntrN_el0(8)
read_pmevcntrN_el0(9)
read_pmevcntrN_el0(10)
read_pmevcntrN_el0(11)
read_pmevcntrN_el0(12)
read_pmevcntrN_el0(13)
read_pmevcntrN_el0(14)
read_pmevcntrN_el0(15)
read_pmevcntrN_el0(16)
read_pmevcntrN_el0(17)
read_pmevcntrN_el0(18)
read_pmevcntrN_el0(19)
read_pmevcntrN_el0(20)
read_pmevcntrN_el0(21)
read_pmevcntrN_el0(22)
read_pmevcntrN_el0(23)
read_pmevcntrN_el0(24)
read_pmevcntrN_el0(25)
read_pmevcntrN_el0(26)
read_pmevcntrN_el0(27)
read_pmevcntrN_el0(28)
read_pmevcntrN_el0(29)
read_pmevcntrN_el0(30)
static uint32_t (* const read_pmevcntr_el0[])(void) = {
read_pmevcntr0_el0, read_pmevcntr1_el0, read_pmevcntr2_el0,
read_pmevcntr3_el0, read_pmevcntr4_el0, read_pmevcntr5_el0,
read_pmevcntr6_el0, read_pmevcntr7_el0, read_pmevcntr8_el0,
read_pmevcntr9_el0, read_pmevcntr10_el0, read_pmevcntr11_el0,
read_pmevcntr12_el0, read_pmevcntr13_el0, read_pmevcntr14_el0,
read_pmevcntr15_el0, read_pmevcntr16_el0, read_pmevcntr17_el0,
read_pmevcntr18_el0, read_pmevcntr19_el0, read_pmevcntr20_el0,
read_pmevcntr21_el0, read_pmevcntr22_el0, read_pmevcntr23_el0,
read_pmevcntr24_el0, read_pmevcntr25_el0, read_pmevcntr26_el0,
read_pmevcntr27_el0, read_pmevcntr28_el0, read_pmevcntr29_el0,
read_pmevcntr30_el0,
};
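The per-register accessors exist because mrs/msr encode the system-register name as an immediate in the instruction, so read_sysreg() cannot take a run-time counter index; dispatching through this table replaces the old pmselr_el0 select-then-read sequence and its isb barrier.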
/*
* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c
* Perf Events' indices
* write pmevcntr<n>_el0 functions
*/
#define ARMV8_IDX_CYCLE_COUNTER 0
#define ARMV8_IDX_COUNTER0 1
#define ARMV8_IDX_COUNTER_LAST (ARMV8_IDX_CYCLE_COUNTER + get_per_cpu_pmu()->num_events - 1)
#define write_pmevcntrN_el0(N) \
static void write_pmevcntr##N##_el0(uint32_t v) \
{ \
write_sysreg(v, pmevcntr##N##_el0); \
}
/* @ref.impl linux-v4.15-rc3 arch/arm64/include/asm/perf_event.h */
#define ARMV8_PMU_MAX_COUNTERS 32
#define ARMV8_PMU_COUNTER_MASK (ARMV8_PMU_MAX_COUNTERS - 1)
write_pmevcntrN_el0(0)
write_pmevcntrN_el0(1)
write_pmevcntrN_el0(2)
write_pmevcntrN_el0(3)
write_pmevcntrN_el0(4)
write_pmevcntrN_el0(5)
write_pmevcntrN_el0(6)
write_pmevcntrN_el0(7)
write_pmevcntrN_el0(8)
write_pmevcntrN_el0(9)
write_pmevcntrN_el0(10)
write_pmevcntrN_el0(11)
write_pmevcntrN_el0(12)
write_pmevcntrN_el0(13)
write_pmevcntrN_el0(14)
write_pmevcntrN_el0(15)
write_pmevcntrN_el0(16)
write_pmevcntrN_el0(17)
write_pmevcntrN_el0(18)
write_pmevcntrN_el0(19)
write_pmevcntrN_el0(20)
write_pmevcntrN_el0(21)
write_pmevcntrN_el0(22)
write_pmevcntrN_el0(23)
write_pmevcntrN_el0(24)
write_pmevcntrN_el0(25)
write_pmevcntrN_el0(26)
write_pmevcntrN_el0(27)
write_pmevcntrN_el0(28)
write_pmevcntrN_el0(29)
write_pmevcntrN_el0(30)
static void (* const write_pmevcntr_el0[])(uint32_t) = {
write_pmevcntr0_el0, write_pmevcntr1_el0, write_pmevcntr2_el0,
write_pmevcntr3_el0, write_pmevcntr4_el0, write_pmevcntr5_el0,
write_pmevcntr6_el0, write_pmevcntr7_el0, write_pmevcntr8_el0,
write_pmevcntr9_el0, write_pmevcntr10_el0, write_pmevcntr11_el0,
write_pmevcntr12_el0, write_pmevcntr13_el0, write_pmevcntr14_el0,
write_pmevcntr15_el0, write_pmevcntr16_el0, write_pmevcntr17_el0,
write_pmevcntr18_el0, write_pmevcntr19_el0, write_pmevcntr20_el0,
write_pmevcntr21_el0, write_pmevcntr22_el0, write_pmevcntr23_el0,
write_pmevcntr24_el0, write_pmevcntr25_el0, write_pmevcntr26_el0,
write_pmevcntr27_el0, write_pmevcntr28_el0, write_pmevcntr29_el0,
write_pmevcntr30_el0,
};
/*
* ARMv8 low level PMU access
* write pmevtyper<n>_el0 functions
*/
#define write_pmevtyperN_el0(N) \
static void write_pmevtyper##N##_el0(uint32_t v) \
{ \
write_sysreg(v, pmevtyper##N##_el0); \
}
/*
* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c
* Perf Event to low level counters mapping
*/
#define ARMV8_IDX_TO_COUNTER(x) \
(((x) - ARMV8_IDX_COUNTER0) & ARMV8_PMU_COUNTER_MASK)
write_pmevtyperN_el0(0)
write_pmevtyperN_el0(1)
write_pmevtyperN_el0(2)
write_pmevtyperN_el0(3)
write_pmevtyperN_el0(4)
write_pmevtyperN_el0(5)
write_pmevtyperN_el0(6)
write_pmevtyperN_el0(7)
write_pmevtyperN_el0(8)
write_pmevtyperN_el0(9)
write_pmevtyperN_el0(10)
write_pmevtyperN_el0(11)
write_pmevtyperN_el0(12)
write_pmevtyperN_el0(13)
write_pmevtyperN_el0(14)
write_pmevtyperN_el0(15)
write_pmevtyperN_el0(16)
write_pmevtyperN_el0(17)
write_pmevtyperN_el0(18)
write_pmevtyperN_el0(19)
write_pmevtyperN_el0(20)
write_pmevtyperN_el0(21)
write_pmevtyperN_el0(22)
write_pmevtyperN_el0(23)
write_pmevtyperN_el0(24)
write_pmevtyperN_el0(25)
write_pmevtyperN_el0(26)
write_pmevtyperN_el0(27)
write_pmevtyperN_el0(28)
write_pmevtyperN_el0(29)
write_pmevtyperN_el0(30)
static void (* const write_pmevtyper_el0[])(uint32_t) = {
write_pmevtyper0_el0, write_pmevtyper1_el0, write_pmevtyper2_el0,
write_pmevtyper3_el0, write_pmevtyper4_el0, write_pmevtyper5_el0,
write_pmevtyper6_el0, write_pmevtyper7_el0, write_pmevtyper8_el0,
write_pmevtyper9_el0, write_pmevtyper10_el0, write_pmevtyper11_el0,
write_pmevtyper12_el0, write_pmevtyper13_el0, write_pmevtyper14_el0,
write_pmevtyper15_el0, write_pmevtyper16_el0, write_pmevtyper17_el0,
write_pmevtyper18_el0, write_pmevtyper19_el0, write_pmevtyper20_el0,
write_pmevtyper21_el0, write_pmevtyper22_el0, write_pmevtyper23_el0,
write_pmevtyper24_el0, write_pmevtyper25_el0, write_pmevtyper26_el0,
write_pmevtyper27_el0, write_pmevtyper28_el0, write_pmevtyper29_el0,
write_pmevtyper30_el0,
};
#define ARMV8_IDX_CYCLE_COUNTER 31
#define ARMV8_IDX_COUNTER0 0
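With these values a counter's logical index equals its bit position in the PMU control registers: event counters occupy bits 0..30 and the cycle counter (PMCCNTR_EL0) is bit 31, so masks can be written to PMCNTENSET_EL0 and friends directly, without the old ARMV8_IDX_TO_COUNTER translation.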
/*
* @ref.impl linux-v4.15-rc3 arch/arm64/include/asm/perf_event.h
@ -175,6 +319,10 @@
/* PMUv3 HW events mapping. */
/* disable -Woverride-init for the following initializations */
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Woverride-init"
/*
* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c
* ARMv8 Architectural defined events, not all of these may
@ -220,6 +368,9 @@ static const unsigned armv8_pmuv3_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
[C(BPU)][C(OP_WRITE)][C(RESULT_MISS)] = ARMV8_PMUV3_PERFCTR_BR_MIS_PRED,
};
/* restore warnings */
#pragma GCC diagnostic pop
/* @ref.impl linux-v4.15-rc3 drivers/perf/arm_pmu.c */
static int
armpmu_map_cache_event(const unsigned (*cache_map)
@ -298,11 +449,25 @@ armpmu_map_event(uint32_t type, uint64_t config,
return -ENOENT;
}
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
static inline int armv8pmu_counter_mask_valid(unsigned long counter_mask)
{
int num;
unsigned long event;
unsigned long cycle;
unsigned long invalid_mask;
num = get_per_cpu_pmu()->num_events;
num--; /* Sub the CPU cycles counter */
event = ((1UL << num) - 1) << ARMV8_IDX_COUNTER0;
cycle = 1UL << ARMV8_IDX_CYCLE_COUNTER;
invalid_mask = ~(event | cycle);
return !(counter_mask & invalid_mask);
}
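A worked example, assuming num_events = 7 (six event counters plus the cycle counter):
/* num          = 7 - 1 = 6
 * event        = ((1UL << 6) - 1) << ARMV8_IDX_COUNTER0 = 0x3f
 * cycle        = 1UL << 31 = 0x80000000
 * invalid_mask = ~0x8000003f
 * => 0x80000001 (cycle + counter 0) is accepted,
 *    0x00000040 (nonexistent counter 6) is rejected. */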
static inline int armv8pmu_counter_valid(int idx)
{
return idx >= ARMV8_IDX_CYCLE_COUNTER &&
idx <= ARMV8_IDX_COUNTER_LAST;
return armv8pmu_counter_mask_valid(1UL << idx);
}
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
@ -326,6 +491,11 @@ static inline int armv8pmu_has_overflowed(uint32_t pmovsr)
return pmovsr & ARMV8_PMU_OVERFLOWED_MASK;
}
static inline int armv8pmu_counter_has_overflowed(uint32_t pmnc, int idx)
{
return pmnc & BIT(idx);
}
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
static int __armv8_pmuv3_map_event(uint32_t type, uint64_t config,
const unsigned int (*extra_event_map)
@ -357,6 +527,23 @@ static int armv8_pmuv3_map_event(uint32_t type, uint64_t config)
return __armv8_pmuv3_map_event(type, config, NULL, NULL);
}
static int armv8_pmuv3_map_hw_event(uint64_t config)
{
return __armv8_pmuv3_map_event(PERF_TYPE_HARDWARE, config, NULL, NULL);
}
static int armv8_pmuv3_map_cache_event(uint64_t config)
{
return __armv8_pmuv3_map_event(PERF_TYPE_HW_CACHE, config, NULL, NULL);
}
static int armv8_pmuv3_map_raw_event(uint64_t config)
{
return __armv8_pmuv3_map_event(PERF_TYPE_RAW, config, NULL, NULL);
}
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
static inline uint32_t armv8pmu_pmcr_read(void)
{
@ -371,24 +558,6 @@ static inline void armv8pmu_pmcr_write(uint32_t val)
write_sysreg(val, pmcr_el0);
}
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
static inline int armv8pmu_select_counter(int idx)
{
uint32_t counter;
if (!armv8pmu_counter_valid(idx)) {
ekprintf("%s: The count_register#%d is not implemented.\n",
__func__, idx);
return -EINVAL;
}
counter = ARMV8_IDX_TO_COUNTER(idx);
write_sysreg(counter, pmselr_el0);
isb();
return idx;
}
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
static inline uint32_t armv8pmu_read_counter(int idx)
{
@ -401,8 +570,8 @@ static inline uint32_t armv8pmu_read_counter(int idx)
else if (idx == ARMV8_IDX_CYCLE_COUNTER) {
value = read_sysreg(pmccntr_el0);
}
else if (armv8pmu_select_counter(idx) == idx) {
value = read_sysreg(pmxevcntr_el0);
else {
value = read_pmevcntr_el0[idx]();
}
return value;
@ -421,43 +590,42 @@ static inline void armv8pmu_write_counter(int idx, uint32_t value)
* count using the lower 32bits and we want an interrupt when
* it overflows.
*/
uint64_t value64 = 0xffffffff00000000ULL | value;
uint64_t value64 = (int32_t)value;
write_sysreg(value64, pmccntr_el0);
}
else if (armv8pmu_select_counter(idx) == idx) {
write_sysreg(value, pmxevcntr_el0);
else {
write_pmevcntr_el0[idx](value);
}
}
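The cast sign-extends the 32-bit value, so the 64-bit cycle counter overflows in step with its low word; for example:
/* value = 0xfffffc18 (-1000 as int32_t):
 *   value64 = 0xfffffffffffffc18
 * PMCCNTR_EL0 then overflows after 1000 cycles, just like a
 * 32-bit event counter programmed with the same value. */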
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
static inline int armv8pmu_enable_intens(int idx)
static inline int armv8pmu_enable_intens(unsigned long counter_mask)
{
uint32_t counter;
if (!armv8pmu_counter_valid(idx)) {
ekprintf("%s: The count_register#%d is not implemented.\n",
__func__, idx);
if (!armv8pmu_counter_mask_valid(counter_mask)) {
ekprintf("%s: invalid counter mask(%#lx)\n",
__func__, counter_mask);
return -EINVAL;
}
counter = ARMV8_IDX_TO_COUNTER(idx);
write_sysreg(BIT(counter), pmintenset_el1);
return idx;
write_sysreg(counter_mask, pmintenset_el1);
return 0;
}
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
static inline int armv8pmu_disable_intens(int idx)
static inline int armv8pmu_disable_intens(unsigned long counter_mask)
{
uint32_t counter = ARMV8_IDX_TO_COUNTER(idx);
write_sysreg(BIT(counter), pmintenclr_el1);
if (!armv8pmu_counter_mask_valid(counter_mask)) {
ekprintf("%s: invalid counter mask(%#lx)\n",
__func__, counter_mask);
return -EINVAL;
}
write_sysreg(counter_mask, pmintenclr_el1);
isb();
/* Clear the overflow flag in case an interrupt is pending. */
write_sysreg(BIT(counter), pmovsclr_el0);
write_sysreg(counter_mask, pmovsclr_el0);
isb();
return idx;
return 0;
}
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
@ -492,42 +660,37 @@ static int armv8pmu_set_event_filter(unsigned long *config_base, int mode)
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
static inline void armv8pmu_write_evtype(int idx, uint32_t val)
{
if (armv8pmu_select_counter(idx) == idx) {
val &= ARMV8_PMU_EVTYPE_MASK;
write_sysreg(val, pmxevtyper_el0);
if (!armv8pmu_counter_valid(idx)) {
ekprintf("%s: The count_register#%d is not implemented.\n",
__func__, idx);
return;
} else if (idx != ARMV8_IDX_CYCLE_COUNTER) {
write_pmevtyper_el0[idx](val);
}
}
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
static inline int armv8pmu_enable_counter(int idx)
static inline int armv8pmu_enable_counter(unsigned long counter_mask)
{
uint32_t counter;
if (!armv8pmu_counter_valid(idx)) {
ekprintf("%s: The count_register#%d is not implemented.\n",
__func__, idx);
if (!armv8pmu_counter_mask_valid(counter_mask)) {
ekprintf("%s: invalid counter mask 0x%lx.\n",
__func__, counter_mask);
return -EINVAL;
}
counter = ARMV8_IDX_TO_COUNTER(idx);
write_sysreg(BIT(counter), pmcntenset_el0);
return idx;
write_sysreg(counter_mask, pmcntenset_el0);
return 0;
}
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
static inline int armv8pmu_disable_counter(int idx)
static inline int armv8pmu_disable_counter(unsigned long counter_mask)
{
uint32_t counter;
if (!armv8pmu_counter_valid(idx)) {
ekprintf("%s: The count_register#%d is not implemented.\n",
__func__, idx);
if (!armv8pmu_counter_mask_valid(counter_mask)) {
ekprintf("%s: invalid counter mask 0x%lx.\n",
__func__, counter_mask);
return -EINVAL;
}
counter = ARMV8_IDX_TO_COUNTER(idx);
write_sysreg(BIT(counter), pmcntenclr_el0);
return idx;
write_sysreg(counter_mask, pmcntenclr_el0);
return 0;
}
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
@ -555,41 +718,20 @@ static void armv8pmu_stop(void)
ihk_mc_spinlock_unlock(&pmu_lock, flags);
}
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
static void armv8pmu_disable_event(int idx)
{
unsigned long flags;
/*
* Disable counter and interrupt
*/
flags = ihk_mc_spinlock_lock(&pmu_lock);
/*
* Disable counter
*/
armv8pmu_disable_counter(idx);
/*
* Disable interrupt for this counter
*/
armv8pmu_disable_intens(idx);
ihk_mc_spinlock_unlock(&pmu_lock, flags);
}
/* @ref.impl linux-v4.15-rc3 arch/arm64/kernel/perf_event.c */
static void armv8pmu_reset(void *info)
{
struct arm_pmu *cpu_pmu = (struct arm_pmu *)info;
uint32_t idx, nb_cnt =
uint32_t nb_cnt =
cpu_pmu->per_cpu[ihk_mc_get_processor_id()].num_events;
nb_cnt--; /* Sub the CPU cycles counter */
unsigned long event = ((1UL << nb_cnt) - 1) << ARMV8_IDX_COUNTER0;
unsigned long cycle = 1UL << ARMV8_IDX_CYCLE_COUNTER;
unsigned long valid_mask = event | cycle;
/* The counter and interrupt enable registers are unknown at reset. */
for (idx = ARMV8_IDX_CYCLE_COUNTER; idx < nb_cnt; ++idx) {
armv8pmu_disable_counter(idx);
armv8pmu_disable_intens(idx);
}
armv8pmu_disable_counter(valid_mask);
armv8pmu_disable_intens(valid_mask);
/*
* Initialize & Reset PMNC. Request overflow interrupt for
@ -603,7 +745,7 @@ static void armv8pmu_reset(void *info)
static int armv8pmu_get_event_idx(int num_events, unsigned long used_mask,
unsigned long config)
{
int idx;
int idx, end;
unsigned long evtype = config & ARMV8_PMU_EVTYPE_EVENT;
/* Always prefer to place a cycle counter into the cycle counter. */
@ -615,7 +757,9 @@ static int armv8pmu_get_event_idx(int num_events, unsigned long used_mask,
/*
* Otherwise use events counters
*/
for (idx = ARMV8_IDX_COUNTER0; idx < num_events; ++idx) {
end = ARMV8_IDX_COUNTER0 + num_events;
end--; /* Sub the CPU cycles counter */
for (idx = ARMV8_IDX_COUNTER0; idx < end; ++idx) {
if (!(used_mask & (1UL << idx)))
return idx;
}
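Allocation sketch: with num_events = 7 the loop scans indices 0..5 only, since end excludes the cycle-counter slot; given used_mask = 0x5, index 0 is taken and the first free event counter returned is idx 1.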
@ -642,13 +786,11 @@ static uint32_t armv8pmu_read_num_pmnc_events(void)
static void armv8pmu_handle_irq(void *priv)
{
struct siginfo info;
uint32_t pmovsr;
struct thread *thread = cpu_local_var(current);
struct process *proc = thread->proc;
long irqstate;
struct mckfd *fdp;
struct pt_regs *regs = (struct pt_regs *)priv;
const struct per_cpu_arm_pmu *cpu_pmu = get_per_cpu_pmu();
int idx;
/*
* Get and reset the IRQ flags
@ -661,27 +803,40 @@ static void armv8pmu_handle_irq(void *priv)
if (!armv8pmu_has_overflowed(pmovsr))
return;
if (!proc->monitoring_event) {
return;
}
/*
* Handle the counter(s) overflow(s)
*/
/* same as x86_64 mckernel */
irqstate = ihk_mc_spinlock_lock(&proc->mckfd_lock);
for (fdp = proc->mckfd; fdp; fdp = fdp->next) {
if (fdp->sig_no > 0)
break;
}
ihk_mc_spinlock_unlock(&proc->mckfd_lock, irqstate);
for (idx = 0; idx < cpu_pmu->num_events; idx++) {
struct mc_perf_event *event = NULL;
struct mc_perf_event *sub;
if (fdp) {
memset(&info, '\0', sizeof(info));
info.si_signo = fdp->sig_no;
info._sifields._sigfault.si_addr = (void *)regs->pc;
info._sifields._sigpoll.si_fd = fdp->fd;
set_signal(fdp->sig_no, regs, &info);
}
else {
set_signal(SIGIO, regs, NULL);
if (!armv8pmu_counter_has_overflowed(pmovsr, idx)) {
continue;
}
if (proc->monitoring_event->counter_id == idx) {
event = proc->monitoring_event;
} else {
list_for_each_entry(sub,
&proc->monitoring_event->sibling_list,
group_entry) {
if (sub->counter_id == idx) {
event = sub;
break;
}
}
}
if (!event) {
continue;
}
ihk_mc_event_update(event);
ihk_mc_event_set_period(event);
}
return;
}
static void armv8pmu_enable_user_access_pmu_regs(void)
@ -735,11 +890,15 @@ int armv8pmu_init(struct arm_pmu* cpu_pmu)
cpu_pmu->write_evtype = armv8pmu_write_evtype;
cpu_pmu->get_event_idx = armv8pmu_get_event_idx;
cpu_pmu->map_event = armv8_pmuv3_map_event;
cpu_pmu->map_hw_event = armv8_pmuv3_map_hw_event;
cpu_pmu->map_cache_event = armv8_pmuv3_map_cache_event;
cpu_pmu->map_raw_event = armv8_pmuv3_map_raw_event;
cpu_pmu->enable_user_access_pmu_regs =
armv8pmu_enable_user_access_pmu_regs;
cpu_pmu->disable_user_access_pmu_regs =
armv8pmu_disable_user_access_pmu_regs;
cpu_pmu->handler = &armv8pmu_handler;
cpu_pmu->counter_mask_valid = &armv8pmu_counter_mask_valid;
return 0;
}


@ -18,10 +18,9 @@
#include <psci.h>
#include <errno.h>
#include <ihk/types.h>
#include <ihk/debug.h>
#include <compiler.h>
#include <lwk/compiler.h>
#include <debug.h>
#include <ihk/debug.h>
//#define DEBUG_PRINT_PSCI


@ -1,4 +1,4 @@
/* ptrace.c COPYRIGHT FUJITSU LIMITED 2016-2018 */
/* ptrace.c COPYRIGHT FUJITSU LIMITED 2016-2019 */
#include <errno.h>
#include <debug-monitors.h>
#include <hw_breakpoint.h>
@ -11,7 +11,8 @@
#include <hwcap.h>
#include <string.h>
#include <thread_info.h>
#include <debug.h>
#include <ptrace.h>
#include <ihk/debug.h>
//#define DEBUG_PRINT_SC
@ -25,37 +26,6 @@
extern void save_debugreg(unsigned long *debugreg);
extern int interrupt_from_user(void *);
enum aarch64_regset {
REGSET_GPR,
REGSET_FPR,
REGSET_TLS,
REGSET_HW_BREAK,
REGSET_HW_WATCH,
REGSET_SYSTEM_CALL,
#ifdef CONFIG_ARM64_SVE
REGSET_SVE,
#endif /* CONFIG_ARM64_SVE */
};
struct user_regset;
typedef long user_regset_get_fn(struct thread *target,
const struct user_regset *regset,
unsigned int pos, unsigned int count,
void *kbuf, void __user *ubuf);
typedef long user_regset_set_fn(struct thread *target,
const struct user_regset *regset,
unsigned int pos, unsigned int count,
const void *kbuf, const void __user *ubuf);
struct user_regset {
user_regset_get_fn *get;
user_regset_set_fn *set;
unsigned int n;
unsigned int size;
unsigned int core_note_type;
};
long ptrace_read_user(struct thread *thread, long addr, unsigned long *value)
{
return -EIO;
@ -273,6 +243,17 @@ static inline long copy_regset_from_user(struct thread *target,
return regset->set(target, regset, offset, size, NULL, data);
}
unsigned int regset_size(struct thread *target,
const struct user_regset *regset)
{
if (!regset->get_size) {
return regset->n * regset->size;
}
else {
return regset->get_size(target, regset);
}
}
/*
* Bits which are always architecturally RES0 per ARM DDI 0487A.h
* Userspace cannot use these until they have an architectural meaning.
@ -624,6 +605,48 @@ out:
#ifdef CONFIG_ARM64_SVE
static void sve_init_header_from_thread(struct user_sve_header *header,
struct thread *target)
{
unsigned int vq;
memset(header, 0, sizeof(*header));
/* McKernel processes always enable SVE. */
header->flags = SVE_PT_REGS_SVE;
if (target->ctx.thread->sve_flags & SVE_PT_VL_INHERIT) {
header->flags |= SVE_PT_VL_INHERIT;
}
header->vl = target->ctx.thread->sve_vl;
vq = sve_vq_from_vl(header->vl);
header->max_vl = sve_max_vl;
header->size = SVE_PT_SIZE(vq, header->flags);
header->max_size = SVE_PT_SIZE(sve_vq_from_vl(header->max_vl),
SVE_PT_REGS_SVE);
}
static unsigned int sve_size_from_header(struct user_sve_header const *header)
{
return ALIGN(header->size, SVE_VQ_BYTES);
}
static unsigned int sve_get_size(struct thread *target,
const struct user_regset *regset)
{
struct user_sve_header header;
/* Instead of system_supports_sve() */
if (unlikely(!(elf_hwcap & HWCAP_SVE))) {
return 0;
}
sve_init_header_from_thread(&header, target);
return sve_size_from_header(&header);
}
/* read NT_ARM_SVE */
static long sve_get(struct thread *target,
const struct user_regset *regset,
@ -646,23 +669,9 @@ static long sve_get(struct thread *target,
}
/* Header */
memset(&header, 0, sizeof(header));
header.vl = target->ctx.thread->sve_vl;
BUG_ON(!sve_vl_valid(header.vl));
sve_init_header_from_thread(&header, target);
vq = sve_vq_from_vl(header.vl);
BUG_ON(!sve_vl_valid(sve_max_vl));
header.max_vl = sve_max_vl;
/* McKernel processes always enable SVE. */
header.flags = SVE_PT_REGS_SVE;
header.size = SVE_PT_SIZE(vq, header.flags);
header.max_size = SVE_PT_SIZE(sve_vq_from_vl(header.max_vl),
SVE_PT_REGS_SVE);
ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &header,
0, sizeof(header));
if (ret) {
@ -676,11 +685,9 @@ static long sve_get(struct thread *target,
*/
/* Otherwise: full SVE case */
start = SVE_PT_SVE_OFFSET;
end = SVE_PT_SVE_FFR_OFFSET(vq) + SVE_PT_SVE_FFR_SIZE(vq);
BUG_ON(end < start);
BUG_ON(end - start > sve_state_size(target));
ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
target->ctx.thread->sve_state,
start, end);
@ -690,24 +697,18 @@ static long sve_get(struct thread *target,
start = end;
end = SVE_PT_SVE_FPSR_OFFSET(vq);
BUG_ON(end < start);
ret = user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf,
start, end);
if (ret) {
goto out;
}
/*
* Copy fpsr, and fpcr which must follow contiguously in
* struct fpsimd_state:
*/
start = end;
end = SVE_PT_SVE_FPCR_OFFSET(vq) + SVE_PT_SVE_FPCR_SIZE;
BUG_ON((char *)(&target->fp_regs->fpcr + 1) <
(char *)&target->fp_regs->fpsr);
BUG_ON(end < start);
BUG_ON((char *)(&target->fp_regs->fpcr + 1) -
(char *)&target->fp_regs->fpsr !=
end - start);
ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
&target->fp_regs->fpsr,
start, end);
@ -716,9 +717,7 @@ static long sve_get(struct thread *target,
}
start = end;
end = (SVE_PT_SIZE(SVE_VQ_MAX, SVE_PT_REGS_SVE) + 15) / 16 * 16;
BUG_ON(end < start);
end = sve_size_from_header(&header);
ret = user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf,
start, end);
out:
@ -762,13 +761,12 @@ static long sve_set(struct thread *target,
* sve_set_vector_length(), which will also validate them for us:
*/
ret = sve_set_vector_length(target, header.vl,
header.flags & ~SVE_PT_REGS_MASK);
((unsigned long)header.flags & ~SVE_PT_REGS_MASK) << 16);
if (ret) {
goto out;
}
/* Actual VL set may be less than the user asked for: */
BUG_ON(!sve_vl_valid(target->ctx.thread->sve_vl));
vq = sve_vq_from_vl(target->ctx.thread->sve_vl);
/* Registers: FPSIMD-only case */
@ -779,11 +777,19 @@ static long sve_set(struct thread *target,
}
/* Otherwise: full SVE case */
/*
* If setting a different VL from the requested VL and there is
* register data, the data layout will be wrong: don't even
* try to set the registers in this case.
*/
if (count && vq != sve_vq_from_vl(header.vl)) {
ret = -EIO;
goto out;
}
start = SVE_PT_SVE_OFFSET;
end = SVE_PT_SVE_FFR_OFFSET(vq) + SVE_PT_SVE_FFR_SIZE(vq);
BUG_ON(end < start);
BUG_ON(end - start > sve_state_size(target));
ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
target->ctx.thread->sve_state,
start, end);
@ -793,27 +799,21 @@ static long sve_set(struct thread *target,
start = end;
end = SVE_PT_SVE_FPSR_OFFSET(vq);
BUG_ON(end < start);
ret = user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
start, end);
if (ret) {
goto out;
}
/*
* Copy fpsr, and fpcr which must follow contiguously in
* struct fpsimd_state:
*/
start = end;
end = SVE_PT_SVE_FPCR_OFFSET(vq) + SVE_PT_SVE_FPCR_SIZE;
BUG_ON((char *)(&target->fp_regs->fpcr + 1) <
(char *)&target->fp_regs->fpsr);
BUG_ON(end < start);
BUG_ON((char *)(&target->fp_regs->fpcr + 1) -
(char *)&target->fp_regs->fpsr !=
end - start);
user_regset_copyin(&pos, &count, &kbuf, &ubuf,
&target->fp_regs->fpsr,
start, end);
ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
&target->fp_regs->fpsr,
start, end);
out:
return ret;
}
@ -825,8 +825,9 @@ static const struct user_regset aarch64_regsets[] = {
.core_note_type = NT_PRSTATUS,
.n = sizeof(struct user_pt_regs) / sizeof(uint64_t),
.size = sizeof(uint64_t),
.align = sizeof(uint64_t),
.get = gpr_get,
.set = gpr_set
.set = gpr_set,
},
[REGSET_FPR] = {
.core_note_type = NT_PRFPREG,
@ -836,56 +837,75 @@ static const struct user_regset aarch64_regsets[] = {
* fpcr are 32-bits wide.
*/
.size = sizeof(uint32_t),
.align = sizeof(uint32_t),
.get = fpr_get,
.set = fpr_set
.set = fpr_set,
},
[REGSET_TLS] = {
.core_note_type = NT_ARM_TLS,
.n = 1,
.size = sizeof(void *),
.align = sizeof(void *),
.get = tls_get,
.set = tls_set
.set = tls_set,
},
[REGSET_HW_BREAK] = {
.core_note_type = NT_ARM_HW_BREAK,
.n = sizeof(struct user_hwdebug_state) / sizeof(uint32_t),
.size = sizeof(uint32_t),
.align = sizeof(uint32_t),
.get = hw_break_get,
.set = hw_break_set
.set = hw_break_set,
},
[REGSET_HW_WATCH] = {
.core_note_type = NT_ARM_HW_WATCH,
.n = sizeof(struct user_hwdebug_state) / sizeof(uint32_t),
.size = sizeof(uint32_t),
.align = sizeof(uint32_t),
.get = hw_break_get,
.set = hw_break_set
.set = hw_break_set,
},
[REGSET_SYSTEM_CALL] = {
.core_note_type = NT_ARM_SYSTEM_CALL,
.n = 1,
.size = sizeof(int),
.align = sizeof(int),
.get = system_call_get,
.set = system_call_set
.set = system_call_set,
},
#ifdef CONFIG_ARM64_SVE
[REGSET_SVE] = { /* Scalable Vector Extension */
.core_note_type = NT_ARM_SVE,
.n = (SVE_PT_SIZE(SVE_VQ_MAX, SVE_PT_REGS_SVE) + 15) / 16,
.size = 16,
.n = (SVE_PT_SIZE(SVE_VQ_MAX, SVE_PT_REGS_SVE) +
(SVE_VQ_BYTES - 1)) / SVE_VQ_BYTES,
.size = SVE_VQ_BYTES,
.align = SVE_VQ_BYTES,
.get = sve_get,
.set = sve_set
.set = sve_set,
.get_size = sve_get_size,
},
#endif /* CONFIG_ARM64_SVE */
};
static const struct user_regset *
find_regset(const struct user_regset *regset, unsigned int type, int n)
static const struct user_regset_view user_aarch64_view = {
.name = "aarch64", .e_machine = EM_AARCH64,
.regsets = aarch64_regsets,
.n = sizeof(aarch64_regsets) / sizeof(aarch64_regsets[0])
};
const struct user_regset_view *current_user_regset_view(void)
{
return &user_aarch64_view;
}
const struct user_regset *find_regset(const struct user_regset_view *view,
unsigned int type)
{
int i = 0;
for (i = 0; i < n; i++) {
if (regset[i].core_note_type == type) {
return &regset[i];
for (i = 0; i < view->n; i++) {
if (view->regsets[i].core_note_type == type) {
return &view->regsets[i];
}
}
return NULL;
@ -894,8 +914,8 @@ find_regset(const struct user_regset *regset, unsigned int type, int n)
static long ptrace_regset(struct thread *thread, int req, long type, struct iovec *iov)
{
long rc = -EINVAL;
const struct user_regset *regset = find_regset(aarch64_regsets, type,
sizeof(aarch64_regsets) / sizeof(aarch64_regsets[0]));
const struct user_regset *regset =
find_regset(&user_aarch64_view, type);
if (!regset) {
kprintf("%s: not supported type 0x%x\n", __FUNCTION__, type);
@ -944,6 +964,7 @@ void ptrace_report_signal(struct thread *thread, int sig)
/* save thread_info, if called by ptrace_report_exec() */
if (sig == ((SIGTRAP | (PTRACE_EVENT_EXEC << 8)))) {
memcpy(&tinfo, thread->ctx.thread, sizeof(struct thread_info));
thread->uctx->user_regs.regs[0] = 0;
}
mcs_rwlock_writer_lock(&proc->update_lock, &lock);
@ -956,6 +977,13 @@ void ptrace_report_signal(struct thread *thread, int sig)
thread->exit_status = sig;
thread->status = PS_TRACED;
thread->ptrace &= ~PT_TRACE_SYSCALL;
if (sig == ((SIGTRAP | (PTRACE_EVENT_EXEC << 8))) &&
thread->ptrace & PTRACE_O_TRACEEXEC) {
/* PTRACE_O_TRACEEXEC: since Linux 3.0, the former
* thread ID can be retrieved with PTRACE_GETEVENTMSG.
* Report no change. */
thread->ptrace_eventmsg = thread->tid;
}
save_debugreg(thread->ptrace_debugreg);
if (sig == SIGSTOP || sig == SIGTSTP ||
sig == SIGTTIN || sig == SIGTTOU) {
@ -991,6 +1019,7 @@ void ptrace_report_signal(struct thread *thread, int sig)
if (sig == ((SIGTRAP | (PTRACE_EVENT_EXEC << 8)))) {
memcpy(thread->ctx.thread, &tinfo, sizeof(struct thread_info));
}
arch_flush_icache_all();
}
long


@ -1,4 +1,4 @@
/* syscall.c COPYRIGHT FUJITSU LIMITED 2015-2018 */
/* syscall.c COPYRIGHT FUJITSU LIMITED 2015-2019 */
#include <cpulocal.h>
#include <string.h>
#include <kmalloc.h>
@ -15,7 +15,8 @@
#include <limits.h>
#include <uio.h>
#include <syscall.h>
#include <debug.h>
#include <rusage_private.h>
#include <ihk/debug.h>
void terminate_mcexec(int, int);
extern void ptrace_report_signal(struct thread *thread, int sig);
@ -42,7 +43,7 @@ uintptr_t debug_constants[] = {
offsetof(struct cpu_local_var, runq),
offsetof(struct cpu_local_var, status),
offsetof(struct cpu_local_var, idle),
offsetof(struct thread, ctx) + offsetof(struct thread_info, cpu_context),
offsetof(struct thread, ctx),
offsetof(struct thread, sched_list),
offsetof(struct thread, proc),
offsetof(struct thread, status),
@ -56,13 +57,34 @@ extern int num_processors;
int obtain_clone_cpuid(cpu_set_t *cpu_set, int use_last)
{
int min_queue_len = -1;
int cpu, min_cpu = -1, uti_cpu = -1;
unsigned long irqstate;
int cpu, min_cpu = -1;
#if 0
int uti_cpu = -1;
#endif
unsigned long irqstate = 0;
irqstate = ihk_mc_spinlock_lock(&runq_reservation_lock);
int start, end, step;
if (use_last) {
start = num_processors - 1;
end = -1;
step = -1;
}
else {
start = 0;
end = num_processors;
step = 1;
}
if (!cpu_local_var(current)->proc->nr_processes) {
irqstate = ihk_mc_spinlock_lock(&runq_reservation_lock);
}
else {
irqstate = cpu_disable_interrupt_save();
}
/* Find the first allowed core with the shortest run queue */
for (cpu = 0; cpu < num_processors; ++cpu) {
for (cpu = start; cpu != end; cpu += step) {
struct cpu_local_var *v;
if (!CPU_ISSET(cpu, cpu_set))
@ -73,11 +95,14 @@ int obtain_clone_cpuid(cpu_set_t *cpu_set, int use_last)
dkprintf("%s: cpu=%d,runq_len=%d,runq_reserved=%d\n",
__func__, cpu, v->runq_len, v->runq_reserved);
if (min_queue_len == -1 ||
v->runq_len + v->runq_reserved < min_queue_len) {
min_queue_len = v->runq_len + v->runq_reserved;
//v->runq_len + v->runq_reserved < min_queue_len) {
v->runq_len < min_queue_len) {
//min_queue_len = v->runq_len + v->runq_reserved;
min_queue_len = v->runq_len;
min_cpu = cpu;
}
#if 0
/* Record the last tie CPU */
if (min_cpu != cpu &&
v->runq_len + v->runq_reserved == min_queue_len) {
@ -86,14 +111,15 @@ int obtain_clone_cpuid(cpu_set_t *cpu_set, int use_last)
dkprintf("%s: cpu=%d,runq_len=%d,runq_reserved=%d,min_cpu=%d,uti_cpu=%d\n",
__func__, cpu, v->runq_len, v->runq_reserved,
min_cpu, uti_cpu);
#else
ihk_mc_spinlock_unlock_noirq(&v->runq_lock);
#if 0
if (min_queue_len == 0)
break;
#endif
}
#if 0
min_cpu = use_last ? uti_cpu : min_cpu;
if (min_cpu != -1) {
if (get_cpu_local_var(min_cpu)->status != CPU_STATUS_RESERVED)
@ -102,22 +128,20 @@ int obtain_clone_cpuid(cpu_set_t *cpu_set, int use_last)
__sync_fetch_and_add(&get_cpu_local_var(min_cpu)->runq_reserved,
1);
}
ihk_mc_spinlock_unlock(&runq_reservation_lock, irqstate);
#else
__sync_fetch_and_add(&get_cpu_local_var(min_cpu)->runq_reserved, 1);
#endif
if (!cpu_local_var(current)->proc->nr_processes) {
ihk_mc_spinlock_unlock(&runq_reservation_lock, irqstate);
}
else {
cpu_restore_interrupt(irqstate);
}
return min_cpu;
}
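For example, with allowed CPUs {0, 1, 2} and run-queue lengths {2, 0, 1}, the forward scan returns CPU 1; with use_last set, the same state is scanned from CPU 2 downward and settles on the same unique minimum.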
int
arch_clear_host_user_space()
{
struct thread *th = cpu_local_var(current);
/* XXX: might be unnecessary */
clear_host_pte(th->vm->region.user_start,
(th->vm->region.user_end - th->vm->region.user_start));
return 0;
}
/* architecture-dependent syscall handlers */
extern unsigned long do_fork(int clone_flags, unsigned long newsp,
unsigned long parent_tidptr, unsigned long child_tidptr,
@ -126,10 +150,18 @@ extern unsigned long do_fork(int clone_flags, unsigned long newsp,
SYSCALL_DECLARE(clone)
{
struct process *proc = cpu_local_var(current)->proc;
struct mcs_rwlock_node_irqsave lock_dump;
unsigned long ret;
/* mutex coredump */
mcs_rwlock_reader_lock(&proc->coredump_lock, &lock_dump);
if ((int)ihk_mc_syscall_arg0(ctx) & CLONE_VFORK) {
return do_fork(CLONE_VFORK|SIGCHLD, 0, 0, 0, 0, ihk_mc_syscall_pc(ctx), ihk_mc_syscall_sp(ctx));
ret = do_fork(CLONE_VFORK|SIGCHLD, 0, 0, 0, 0,
ihk_mc_syscall_pc(ctx), ihk_mc_syscall_sp(ctx));
} else {
return do_fork((int)ihk_mc_syscall_arg0(ctx), /* clone_flags */
ret = do_fork((int)ihk_mc_syscall_arg0(ctx), /* clone_flags */
ihk_mc_syscall_arg1(ctx), /* newsp */
ihk_mc_syscall_arg2(ctx), /* parent_tidptr */
ihk_mc_syscall_arg4(ctx), /* child_tidptr (swap arg3) */
@ -137,33 +169,9 @@ SYSCALL_DECLARE(clone)
ihk_mc_syscall_pc(ctx), /* curpc */
ihk_mc_syscall_sp(ctx)); /* cursp */
}
}
mcs_rwlock_reader_unlock(&proc->coredump_lock, &lock_dump);
SYSCALL_DECLARE(rt_sigaction)
{
int sig = ihk_mc_syscall_arg0(ctx);
const struct sigaction *act = (const struct sigaction *)ihk_mc_syscall_arg1(ctx);
struct sigaction *oact = (struct sigaction *)ihk_mc_syscall_arg2(ctx);
size_t sigsetsize = ihk_mc_syscall_arg3(ctx);
struct k_sigaction new_sa, old_sa;
int rc;
if (sigsetsize != sizeof(sigset_t))
return -EINVAL;
if(act)
if(copy_from_user(&new_sa.sa, act, sizeof new_sa.sa)){
goto fault;
}
rc = do_sigaction(sig, act? &new_sa: NULL, oact? &old_sa: NULL);
if(rc == 0 && oact)
if(copy_to_user(oact, &old_sa.sa, sizeof old_sa.sa)){
goto fault;
}
return rc;
fault:
return -EFAULT;
return ret;
}
SYSCALL_DECLARE(prctl)
@ -178,11 +186,10 @@ SYSCALL_DECLARE(prctl)
switch (option) {
case PR_SVE_SET_VL:
error = SVE_SET_VL(cpu_local_var(current),
ihk_mc_syscall_arg1(ctx), ihk_mc_syscall_arg2(ctx));
error = SVE_SET_VL(ihk_mc_syscall_arg1(ctx));
break;
case PR_SVE_GET_VL:
error = SVE_GET_VL(cpu_local_var(current));
error = SVE_GET_VL();
break;
case PR_SET_THP_DISABLE:
if (arg3 || arg4 || arg5) {
@ -657,7 +664,7 @@ void set_single_step(struct thread *thread)
set_regs_spsr_ss(thread->uctx);
}
extern void coredump(struct thread *thread, void *regs);
extern int coredump(struct thread *thread, void *regs, int sig);
static int
isrestart(int syscallno, unsigned long rc, int sig, int restart)
@ -1096,6 +1103,7 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
struct mcs_rwlock_node_irqsave lock;
struct mcs_rwlock_node_irqsave mcs_rw_node;
int restart = 0;
int ret;
for(w = pending->sigmask.__val[0], sig = 0; w; sig++, w >>= 1);
dkprintf("do_signal(): tid=%d, pid=%d, sig=%d\n", thread->tid, proc->pid, sig);
@ -1270,15 +1278,6 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
dkprintf("SIGTRAP(): woken up\n");
break;
case SIGCONT:
memset(&info, '\0', sizeof info);
info.si_signo = SIGCHLD;
info.si_code = CLD_CONTINUED;
info._sifields._sigchld.si_pid = proc->pid;
info._sifields._sigchld.si_status = 0x0000ffff;
do_kill(cpu_local_var(current), proc->parent->pid, -1, SIGCHLD, &info, 0);
proc->main_thread->signal_flags = SIGNAL_STOP_CONTINUED;
proc->status = PS_RUNNING;
dkprintf("do_signal,SIGCONT,do nothing\n");
break;
case SIGQUIT:
case SIGILL:
@ -1290,9 +1289,31 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
case SIGXCPU:
case SIGXFSZ:
core:
dkprintf("do_signal,default,core,sig=%d\n", sig);
coredump(thread, regs);
coredumped = 0x80;
thread->coredump_regs =
kmalloc(sizeof(struct pt_regs),
IHK_MC_AP_NOWAIT);
if (!thread->coredump_regs) {
kprintf("%s: Out of memory\n", __func__);
goto skip;
}
memcpy(thread->coredump_regs, regs,
sizeof(struct pt_regs));
ret = coredump(thread, regs, sig);
switch (ret) {
case -EBUSY:
kprintf("%s: INFO: coredump not performed, try ulimit -c <non-zero>\n",
__func__);
break;
case 0:
coredumped = 0x80;
break;
default:
kprintf("%s: ERROR: coredump failed (%d)\n",
__func__, ret);
break;
}
skip:
terminate(0, sig | coredumped);
break;
case SIGCHLD:
@ -1309,70 +1330,6 @@ out:
return restart;
}
static struct sig_pending *
getsigpending(struct thread *thread, int delflag){
struct list_head *head;
mcs_rwlock_lock_t *lock;
struct mcs_rwlock_node_irqsave mcs_rw_node;
struct sig_pending *next;
struct sig_pending *pending;
__sigset_t w;
w = thread->sigmask.__val[0];
lock = &thread->sigcommon->lock;
head = &thread->sigcommon->sigpending;
for(;;) {
if (delflag) {
mcs_rwlock_writer_lock(lock, &mcs_rw_node);
}
else {
mcs_rwlock_reader_lock(lock, &mcs_rw_node);
}
list_for_each_entry_safe(pending, next, head, list){
if(!(pending->sigmask.__val[0] & w)){
if(delflag)
list_del(&pending->list);
if (delflag) {
mcs_rwlock_writer_unlock(lock, &mcs_rw_node);
}
else {
mcs_rwlock_reader_unlock(lock, &mcs_rw_node);
}
return pending;
}
}
if (delflag) {
mcs_rwlock_writer_unlock(lock, &mcs_rw_node);
}
else {
mcs_rwlock_reader_unlock(lock, &mcs_rw_node);
}
if(lock == &thread->sigpendinglock)
return NULL;
lock = &thread->sigpendinglock;
head = &thread->sigpending;
}
return NULL;
}
struct sig_pending *
hassigpending(struct thread *thread)
{
if (list_empty(&thread->sigpending) &&
list_empty(&thread->sigcommon->sigpending)) {
return NULL;
}
return getsigpending(thread, 0);
}
int
interrupt_from_user(void *regs0)
{
@ -1396,185 +1353,6 @@ void save_syscall_return_value(int num, unsigned long rc)
}
}
void
check_signal(unsigned long rc, void *regs0, int num)
{
__check_signal(rc, regs0, num, 0);
}
void
check_signal_irq_disabled(unsigned long rc, void *regs0, int num)
{
__check_signal(rc, regs0, num, 1);
}
static void
__check_signal(unsigned long rc, void *regs0, int num, int irq_disabled)
{
ihk_mc_user_context_t *regs = regs0;
struct thread *thread;
struct sig_pending *pending;
int irqstate;
if(clv == NULL)
return;
thread = cpu_local_var(current);
if(thread == NULL || thread->proc->pid == 0){
struct thread *t;
irqstate = ihk_mc_spinlock_lock(&(cpu_local_var(runq_lock)));
list_for_each_entry(t, &(cpu_local_var(runq)), sched_list){
if(t->proc->pid <= 0)
continue;
if(t->status == PS_INTERRUPTIBLE &&
hassigpending(t)){
t->status = PS_RUNNING;
break;
}
}
ihk_mc_spinlock_unlock(&(cpu_local_var(runq_lock)), irqstate);
goto out;
}
if(regs != NULL && !interrupt_from_user(regs)) {
goto out;
}
if (list_empty(&thread->sigpending) &&
list_empty(&thread->sigcommon->sigpending)) {
goto out;
}
for(;;){
/* When this function called from check_signal_irq_disabled,
* return with interrupt invalid.
* This is to eliminate signal loss.
*/
if (irq_disabled == 1) {
irqstate = cpu_disable_interrupt_save();
}
pending = getsigpending(thread, 1);
if(!pending) {
dkprintf("check_signal,queue is empty\n");
goto out;
}
if (irq_disabled == 1) {
cpu_restore_interrupt(irqstate);
}
if (do_signal(rc, regs, thread, pending, num)) {
num = -1;
}
}
out:
return;
}
static int
check_sig_pending_thread(struct thread *thread)
{
int found = 0;
struct list_head *head;
mcs_rwlock_lock_t *lock;
struct mcs_rwlock_node_irqsave mcs_rw_node;
struct sig_pending *next;
struct sig_pending *pending;
__sigset_t w;
__sigset_t x;
int sig = 0;
struct k_sigaction *k;
struct cpu_local_var *v;
v = get_this_cpu_local_var();
w = thread->sigmask.__val[0];
lock = &thread->sigcommon->lock;
head = &thread->sigcommon->sigpending;
for (;;) {
mcs_rwlock_reader_lock(lock, &mcs_rw_node);
list_for_each_entry_safe(pending, next, head, list) {
for (x = pending->sigmask.__val[0], sig = 0; x;
sig++, x >>= 1)
;
k = thread->sigcommon->action + sig - 1;
if ((sig != SIGCHLD && sig != SIGURG) ||
(k->sa.sa_handler != SIG_IGN &&
k->sa.sa_handler != NULL)) {
if (!(pending->sigmask.__val[0] & w)) {
if (pending->interrupted == 0) {
pending->interrupted = 1;
found = 1;
if (sig != SIGCHLD &&
sig != SIGURG &&
!k->sa.sa_handler) {
found = 2;
break;
}
}
}
}
}
mcs_rwlock_reader_unlock(lock, &mcs_rw_node);
if (found == 2) {
break;
}
if (lock == &thread->sigpendinglock) {
break;
}
lock = &thread->sigpendinglock;
head = &thread->sigpending;
}
if (found == 2) {
ihk_mc_spinlock_unlock(&v->runq_lock, v->runq_irqstate);
terminate_mcexec(0, sig);
return 1;
}
else if (found == 1) {
ihk_mc_spinlock_unlock(&v->runq_lock, v->runq_irqstate);
interrupt_syscall(thread, 0);
return 1;
}
return 0;
}
void
check_sig_pending(void)
{
struct thread *thread;
struct cpu_local_var *v;
if (clv == NULL)
return;
v = get_this_cpu_local_var();
repeat:
v->runq_irqstate = ihk_mc_spinlock_lock(&v->runq_lock);
list_for_each_entry(thread, &(v->runq), sched_list) {
if (thread == NULL || thread == &cpu_local_var(idle)) {
continue;
}
if (thread->in_syscall_offload == 0) {
continue;
}
if (thread->proc->group_exit_status & 0x0000000100000000L) {
continue;
}
if (check_sig_pending_thread(thread))
goto repeat;
}
ihk_mc_spinlock_unlock(&v->runq_lock, v->runq_irqstate);
}
unsigned long
do_kill(struct thread * thread, int pid, int tid, int sig, siginfo_t *info, int ptracecont)
{
@ -1590,7 +1368,6 @@ do_kill(struct thread * thread, int pid, int tid, int sig, siginfo_t *info, int
struct list_head *head = NULL;
int rc;
unsigned long irqstate = 0;
struct k_sigaction *k;
int doint;
int found = 0;
siginfo_t info0;
@ -1600,6 +1377,7 @@ do_kill(struct thread * thread, int pid, int tid, int sig, siginfo_t *info, int
struct process_hash *phash = rset->process_hash;
struct mcs_rwlock_node lock;
struct mcs_rwlock_node updatelock;
struct sig_pending *pending = NULL;
if(sig > SIGRTMAX || sig < 0)
return -EINVAL;
@ -1786,47 +1564,61 @@ done:
mcs_rwlock_writer_lock_noirq(savelock, &mcs_rw_node);
/* Put signal event even when handler is SIG_IGN or SIG_DFL
because target ptraced thread must call ptrace_report_signal
in check_signal */
rc = 0;
k = tthread->sigcommon->action + sig - 1;
if ((sig != SIGKILL && (tthread->ptrace & PT_TRACED)) ||
(k->sa.sa_handler != SIG_IGN &&
(k->sa.sa_handler != NULL ||
(sig != SIGCHLD && sig != SIGURG)))) {
struct sig_pending *pending = NULL;
if (sig < SIGRTMIN) { // SIGRTMIN - SIGRTMAX
list_for_each_entry(pending, head, list){
if(pending->sigmask.__val[0] == mask &&
pending->ptracecont == ptracecont)
break;
}
if(&pending->list == head)
pending = NULL;
if (sig < SIGRTMIN) { // SIGRTMIN - SIGRTMAX
list_for_each_entry(pending, head, list) {
if (pending->sigmask.__val[0] == mask &&
pending->ptracecont == ptracecont)
break;
}
if(pending == NULL){
doint = 1;
pending = kmalloc(sizeof(struct sig_pending), IHK_MC_AP_NOWAIT);
if(!pending){
rc = -ENOMEM;
}
else{
memset(pending, 0, sizeof(struct sig_pending));
pending->sigmask.__val[0] = mask;
memcpy(&pending->info, info, sizeof(siginfo_t));
pending->ptracecont = ptracecont;
if(sig == SIGKILL || sig == SIGSTOP)
list_add(&pending->list, head);
else
list_add_tail(&pending->list, head);
tthread->sigevent = 1;
}
if (&pending->list == head)
pending = NULL;
}
if (pending == NULL) {
doint = 1;
pending = kmalloc(sizeof(struct sig_pending), IHK_MC_AP_NOWAIT);
if (!pending) {
rc = -ENOMEM;
}
else {
memset(pending, 0, sizeof(struct sig_pending));
pending->sigmask.__val[0] = mask;
memcpy(&pending->info, info, sizeof(siginfo_t));
pending->ptracecont = ptracecont;
if (sig == SIGKILL || sig == SIGSTOP)
list_add(&pending->list, head);
else
list_add_tail(&pending->list, head);
tthread->sigevent = 1;
}
}
mcs_rwlock_writer_unlock_noirq(savelock, &mcs_rw_node);
cpu_restore_interrupt(irqstate);
if (sig == SIGCONT || ptracecont == 1) {
/* Wake up the target only when stopped by SIGSTOP */
if (sched_wakeup_thread(tthread, PS_STOPPED) == 0) {
struct siginfo info;
tthread->proc->main_thread->signal_flags =
SIGNAL_STOP_CONTINUED;
tthread->proc->status = PS_RUNNING;
memset(&info, '\0', sizeof(info));
info.si_signo = SIGCHLD;
info.si_code = CLD_CONTINUED;
info._sifields._sigchld.si_pid = tthread->proc->pid;
info._sifields._sigchld.si_status = 0x0000ffff;
do_kill(tthread, tthread->proc->parent->pid, -1,
SIGCHLD, &info, 0);
if (thread != tthread) {
ihk_mc_interrupt_cpu(tthread->cpu_id,
ihk_mc_get_vector(IHK_GV_IKC));
}
doint = 0;
}
}
if (doint && !(mask & tthread->sigmask.__val[0])) {
int status = tthread->status;
@ -1841,11 +1633,6 @@ done:
/* Wake up the target only when stopped by ptrace-reporting */
sched_wakeup_thread(tthread, PS_TRACED | PS_STOPPED | PS_INTERRUPTIBLE);
}
else if(sig == SIGCONT || ptracecont == 1){
/* Wake up the target only when stopped by SIGSTOP */
sched_wakeup_thread(tthread, PS_STOPPED);
tthread->proc->status = PS_RUNNING;
}
else {
sched_wakeup_thread(tthread, PS_INTERRUPTIBLE);
}
@ -1870,7 +1657,7 @@ set_signal(int sig, void *regs0, siginfo_t *info)
}
if ((__sigmask(sig) & thread->sigmask.__val[0])) {
coredump(thread, regs0);
coredump(thread, regs0, sig);
terminate(0, sig | 0x80);
}
do_kill(thread, thread->proc->pid, thread->tid, sig, info, 0);
@ -1900,7 +1687,7 @@ SYSCALL_DECLARE(mmap)
;
const uintptr_t addr0 = ihk_mc_syscall_arg0(ctx);
const size_t len0 = ihk_mc_syscall_arg1(ctx);
size_t len0 = ihk_mc_syscall_arg1(ctx);
const int prot = ihk_mc_syscall_arg2(ctx);
const int flags0 = ihk_mc_syscall_arg3(ctx);
const int fd = ihk_mc_syscall_arg4(ctx);
@ -1941,7 +1728,8 @@ SYSCALL_DECLARE(mmap)
if (hugeshift == 0) {
/* default hugepage size */
flags |= MAP_HUGE_SECOND_BLOCK;
flags |= ihk_mc_get_linux_default_huge_page_shift() <<
MAP_HUGE_SHIFT;
} else if ((first_level_block_support &&
hugeshift == MAP_HUGE_FIRST_BLOCK) ||
(first_level_block_support &&
@ -1958,6 +1746,14 @@ SYSCALL_DECLARE(mmap)
goto out;
}
pgsize = (size_t)1 << ((flags >> MAP_HUGE_SHIFT) & 0x3F);
/* Round-up map length by pagesize */
len0 = ALIGN(len0, pgsize);
if (rusage_check_overmap(len0,
(flags >> MAP_HUGE_SHIFT) & 0x3F)) {
error = -ENOMEM;
goto out;
}
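/* Example: a 3 MiB request with a 2 MiB huge page (page shift 21
 * in the MAP_HUGE_SHIFT field) gives pgsize = 1 << 21, so len0 is
 * rounded up to 4 MiB before the overcommit check accounts it. */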
}
#define VALID_DUMMY_ADDR ((region->user_start + PTL3_SIZE - 1) & ~(PTL3_SIZE - 1))
@ -2018,7 +1814,8 @@ SYSCALL_DECLARE(shmget)
if (hugeshift == 0) {
/* default hugepage size */
shmflg |= SHM_HUGE_SECOND_BLOCK;
shmflg |= ihk_mc_get_linux_default_huge_page_shift() <<
MAP_HUGE_SHIFT;
} else if ((first_level_block_support &&
hugeshift == SHM_HUGE_FIRST_BLOCK) ||
(first_level_block_support &&
@ -2082,11 +1879,13 @@ int do_process_vm_read_writev(int pid,
struct process *rproc;
struct process *lproc = lthread->proc;
struct process_vm *rvm = NULL;
unsigned long rphys;
unsigned long rpage_left;
unsigned long psize;
void *rva;
unsigned long lphys, rphys;
unsigned long lpage_left, rpage_left;
unsigned long lpsize, rpsize;
void *rva, *lva;
#if 0
struct vm_range *range;
#endif
struct mcs_rwlock_node_irqsave lock;
struct mcs_rwlock_node update_lock;
@ -2099,8 +1898,9 @@ int do_process_vm_read_writev(int pid,
return -EINVAL;
}
#if 0
/* Check if parameters are okay */
ihk_mc_spinlock_lock_noirq(&lthread->vm->memory_range_lock);
ihk_rwspinlock_read_lock_noirq(&lthread->vm->memory_range_lock);
range = lookup_process_memory_range(lthread->vm,
(uintptr_t)local_iov,
@ -2122,11 +1922,12 @@ int do_process_vm_read_writev(int pid,
ret = 0;
arg_out:
ihk_mc_spinlock_unlock_noirq(&lthread->vm->memory_range_lock);
ihk_rwspinlock_read_unlock_noirq(&lthread->vm->memory_range_lock);
if (ret != 0) {
goto out;
}
#endif
for (li = 0; li < liovcnt; ++li) {
llen += local_iov[li].iov_len;
@ -2191,7 +1992,7 @@ arg_out:
if (pli != li) {
struct vm_range *range;
ihk_mc_spinlock_lock_noirq(&lthread->vm->memory_range_lock);
ihk_rwspinlock_read_lock_noirq(&lthread->vm->memory_range_lock);
/* Is base valid? */
range = lookup_process_memory_range(lthread->vm,
@ -2221,7 +2022,7 @@ arg_out:
ret = 0;
pli_out:
ihk_mc_spinlock_unlock_noirq(&lthread->vm->memory_range_lock);
ihk_rwspinlock_read_unlock_noirq(&lthread->vm->memory_range_lock);
if (ret != 0) {
goto out;
@ -2234,7 +2035,7 @@ pli_out:
if (pri != ri) {
struct vm_range *range;
ihk_mc_spinlock_lock_noirq(&rvm->memory_range_lock);
ihk_rwspinlock_read_lock_noirq(&rvm->memory_range_lock);
/* Is base valid? */
range = lookup_process_memory_range(rvm,
@ -2264,7 +2065,7 @@ pli_out:
ret = 0;
pri_out:
ihk_mc_spinlock_unlock_noirq(&rvm->memory_range_lock);
ihk_rwspinlock_read_unlock_noirq(&rvm->memory_range_lock);
if (ret != 0) {
goto out;
@ -2279,10 +2080,53 @@ pri_out:
to_copy = remote_iov[ri].iov_len - roff;
}
retry_lookup:
retry_llookup:
/* Figure out local physical */
/* TODO: remember page and do this only if necessary */
ret = ihk_mc_pt_virt_to_phys_size(lthread->vm->address_space->page_table,
local_iov[li].iov_base + loff, &lphys, &lpsize);
if (ret) {
uint64_t reason = PF_POPULATE | PF_WRITE | PF_USER;
void *addr;
if (faulted) {
ret = -EFAULT;
goto out;
}
/* Fault in pages */
for (addr = (void *)
(((unsigned long)local_iov[li].iov_base + loff)
& PAGE_MASK);
addr < (local_iov[li].iov_base + loff + to_copy);
addr += PAGE_SIZE) {
ret = page_fault_process_vm(lthread->vm, addr, reason);
if (ret) {
ret = -EFAULT;
goto out;
}
}
faulted = 1;
goto retry_llookup;
}
lpage_left = ((((unsigned long)local_iov[li].iov_base + loff +
lpsize) & ~(lpsize - 1)) -
((unsigned long)local_iov[li].iov_base + loff));
if (lpage_left < to_copy) {
to_copy = lpage_left;
}
lva = phys_to_virt(lphys);
retry_rlookup:
/* Figure out remote physical */
/* TODO: remember page and do this only if necessary */
ret = ihk_mc_pt_virt_to_phys_size(rvm->address_space->page_table,
remote_iov[ri].iov_base + roff, &rphys, &psize);
remote_iov[ri].iov_base + roff, &rphys, &rpsize);
if (ret) {
uint64_t reason = PF_POPULATE | PF_WRITE | PF_USER;
@ -2308,11 +2152,11 @@ retry_lookup:
}
faulted = 1;
goto retry_lookup;
goto retry_rlookup;
}
rpage_left = ((((unsigned long)remote_iov[ri].iov_base + roff +
psize) & ~(psize - 1)) -
rpsize) & ~(rpsize - 1)) -
((unsigned long)remote_iov[ri].iov_base + roff));
if (rpage_left < to_copy) {
to_copy = rpage_left;
@ -2321,16 +2165,16 @@ retry_lookup:
rva = phys_to_virt(rphys);
fast_memcpy(
(op == PROCESS_VM_READ) ? local_iov[li].iov_base + loff : rva,
(op == PROCESS_VM_READ) ? rva : local_iov[li].iov_base + loff,
(op == PROCESS_VM_READ) ? lva : rva,
(op == PROCESS_VM_READ) ? rva : lva,
to_copy);
copied += to_copy;
dkprintf("local_iov[%d]: 0x%lx %s remote_iov[%d]: 0x%lx, %lu copied, psize: %lu, rpage_left: %lu\n",
dkprintf("local_iov[%d]: 0x%lx %s remote_iov[%d]: 0x%lx, %lu copied, rpsize: %lu, rpage_left: %lu\n",
li, local_iov[li].iov_base + loff,
(op == PROCESS_VM_READ) ? "<-" : "->",
ri, remote_iov[ri].iov_base + roff, to_copy,
psize, rpage_left);
rpsize, rpage_left);
loff += to_copy;
roff += to_copy;
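
The lpage_left/rpage_left expressions above measure the distance from the current address to the end of its enclosing page, capping each chunk so a single copy never crosses a page boundary. A standalone sketch of that arithmetic, assuming power-of-two page sizes:

#include <stdint.h>
#include <stdio.h>

/* Bytes from addr to the end of its enclosing page of size psize
 * (psize must be a power of two). */
static uint64_t page_left(uint64_t addr, uint64_t psize)
{
	return ((addr + psize) & ~(psize - 1)) - addr;
}

int main(void)
{
	/* 0x1234 into a 4 KiB page: 0x2000 - 0x1234 = 0xdcc bytes left */
	printf("%#llx\n", (unsigned long long)page_left(0x1234, 4096));
	return 0;
}
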
@ -2700,4 +2544,48 @@ SYSCALL_DECLARE(time)
return time();
}
void calculate_time_from_tsc(struct timespec *ts)
{
long ver;
unsigned long current_tsc;
time_t sec_delta;
long ns_delta;
for (;;) {
while ((ver = ihk_atomic64_read(&tod_data.version)) & 1) {
/* settimeofday() is in progress */
cpu_pause();
}
rmb(); /* fetch version before time */
*ts = tod_data.origin;
rmb(); /* fetch time before checking version */
if (ver == ihk_atomic64_read(&tod_data.version)) {
break;
}
/* settimeofday() has intervened */
cpu_pause();
}
current_tsc = rdtsc();
sec_delta = current_tsc / tod_data.clocks_per_sec;
ns_delta = NS_PER_SEC * (current_tsc % tod_data.clocks_per_sec)
/ tod_data.clocks_per_sec;
/* calc. of ns_delta overflows if clocks_per_sec exceeds 18.44 GHz */
ts->tv_sec += sec_delta;
ts->tv_nsec += ns_delta;
if (ts->tv_nsec >= NS_PER_SEC) {
ts->tv_nsec -= NS_PER_SEC;
++ts->tv_sec;
}
}
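
The loop above is a seqlock-style reader: an odd version means settimeofday() is mid-update, and a version change across the two reads means the snapshot is torn and must be retried. A minimal sketch of the same reader protocol, assuming C11 atomics in place of the kernel's ihk_atomic64 helpers:

#include <stdatomic.h>

struct tod_sketch {
	_Atomic long version;	/* odd while a writer is updating */
	long sec;
	long nsec;
};

static void tod_read(struct tod_sketch *t, long *sec, long *nsec)
{
	long ver;

	do {
		while ((ver = atomic_load(&t->version)) & 1)
			;	/* writer in progress */
		atomic_thread_fence(memory_order_acquire);
		*sec = t->sec;
		*nsec = t->nsec;
		atomic_thread_fence(memory_order_acquire);
	} while (ver != atomic_load(&t->version));
}
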
extern void ptrace_syscall_event(struct thread *thread);
long arch_ptrace_syscall_event(struct thread *thread,
ihk_mc_user_context_t *ctx, long setret)
{
ptrace_syscall_event(thread);
return setret;
}
/*** End of File ***/


@ -8,7 +8,7 @@
#include <cputype.h>
#include <irq.h>
#include <arch-timer.h>
#include <debug.h>
#include <ihk/debug.h>
//#define DEBUG_PRINT_TIMER
@ -111,6 +111,8 @@ static void timer_handler(void *priv)
/* set timer re-enable for periodic */
arch_timer_reg_write(ARCH_TIMER_REG_TVAL, clocks);
arch_timer_reg_write(ARCH_TIMER_REG_CTRL, ctrl);
do_backlog();
}
}


@ -11,10 +11,9 @@
#include <process.h>
#include <string.h>
#include <syscall.h>
#include <ihk/debug.h>
#include <ikc/queue.h>
#include <vdso.h>
#include <debug.h>
#include <ihk/debug.h>
//#define DEBUG_PRINT_VDSO
@ -23,7 +22,6 @@
#define DDEBUG_DEFAULT DDEBUG_PRINT
#endif
#ifdef POSTK_DEBUG_ARCH_DEP_52
#define VDSO_MAXPAGES 1
struct vdso {
long busy;
@ -34,7 +32,6 @@ struct vdso {
long lbase;
long offset_sigtramp;
};
#endif /*POSTK_DEBUG_ARCH_DEP_52*/
extern char vdso_start, vdso_end;
static struct vdso vdso;
@ -90,6 +87,7 @@ int arch_setup_vdso(void)
}
panic("Only support host mapping vDSO");
return -1;
}
static int get_free_area(struct process_vm *vm, size_t len, intptr_t hint,


@ -18,7 +18,7 @@ extern char data_start[], data_end[];
#define LARGE_PAGE_MASK (~((unsigned long)LARGE_PAGE_SIZE - 1))
#define MAP_ST_START 0xffff800000000000UL
#define MAP_KERNEL_START 0xffffffff80000000UL
/* MAP_KERNEL_START is defined by cmake */
#define PTL4_SHIFT 39
#define PTL3_SHIFT 30


@ -1,8 +1,9 @@
/* coredump.c COPYRIGHT FUJITSU LIMITED 2018 */
/* coredump.c COPYRIGHT FUJITSU LIMITED 2018-2019 */
#include <process.h>
#include <elfcore.h>
void arch_fill_prstatus(struct elf_prstatus64 *prstatus, struct thread *thread, void *regs0)
void arch_fill_prstatus(struct elf_prstatus64 *prstatus,
struct thread *thread, void *regs0, int sig)
{
struct x86_user_context *uctx = regs0;
struct x86_basic_regs *regs = &uctx->gpr;
@ -18,8 +19,6 @@ void arch_fill_prstatus(struct elf_prstatus64 *prstatus, struct thread *thread,
short int pr_cursig;
a8_uint64_t pr_sigpend;
a8_uint64_t pr_sighold;
pid_t pr_pid;
pid_t pr_ppid;
pid_t pr_pgrp;
pid_t pr_sid;
struct prstatus64_timeval pr_utime;
@ -28,6 +27,14 @@ void arch_fill_prstatus(struct elf_prstatus64 *prstatus, struct thread *thread,
struct prstatus64_timeval pr_cstime;
*/
prstatus->pr_pid = thread->tid;
if (thread->proc->parent) {
prstatus->pr_ppid = thread->proc->parent->pid;
}
prstatus->pr_info.si_signo = sig;
prstatus->pr_cursig = sig;
prstatus->pr_reg[0] = _r15;
prstatus->pr_reg[1] = _r14;
prstatus->pr_reg[2] = _r13;
@ -55,3 +62,13 @@ void arch_fill_prstatus(struct elf_prstatus64 *prstatus, struct thread *thread,
prstatus->pr_fpvalid = 0; /* We assume no fp */
}
void arch_fill_thread_core_info(struct note *head,
struct thread *thread, void *regs)
{
}
int arch_get_thread_core_info_size(void)
{
return 0;
}


@ -1,4 +1,4 @@
/* cpu.c COPYRIGHT FUJITSU LIMITED 2018 */
/* cpu.c COPYRIGHT FUJITSU LIMITED 2018-2019 */
/**
* \file cpu.c
* License details are found in the file LICENSE.
@ -16,7 +16,6 @@
*/
#include <ihk/cpu.h>
#include <ihk/debug.h>
#include <ihk/mm.h>
#include <types.h>
#include <errno.h>
@ -32,7 +31,7 @@
#include <prctl.h>
#include <page.h>
#include <kmalloc.h>
#include <debug.h>
#include <ihk/debug.h>
#define LAPIC_ID 0x020
#define LAPIC_TIMER 0x320
@ -45,11 +44,9 @@
#define LAPIC_ICR0 0x300
#define LAPIC_ICR2 0x310
#define LAPIC_ESR 0x280
#ifdef POSTK_DEBUG_ARCH_DEP_75 /* x86 depend hide */
#define LOCAL_TIMER_VECTOR 0xef
#define LOCAL_PERF_VECTOR 0xf0
#define LOCAL_SMP_FUNC_CALL_VECTOR 0xf1
#endif /* POSTK_DEBUG_ARCH_DEP_75 */
#define APIC_INT_LEVELTRIG 0x08000
#define APIC_INT_ASSERT 0x04000
@ -148,7 +145,7 @@ void reload_idt(void)
}
static struct list_head handlers[256 - 32];
extern char nmi[];
extern char nmi_handler[];
extern char page_fault[], general_protection_exception[];
extern char debug_exception[], int3_exception[];
@ -175,7 +172,7 @@ static void init_idt(void)
set_idt_entry(i, generic_common_handlers[i]);
}
set_idt_entry(2, (uintptr_t)nmi);
set_idt_entry(2, (uintptr_t)nmi_handler);
set_idt_entry(13, (unsigned long)general_protection_exception);
set_idt_entry(14, (unsigned long)page_fault);
@ -955,6 +952,8 @@ void handle_interrupt(int vector, struct x86_user_context *regs)
v->flags |= CPU_FLAG_NEED_RESCHED;
ihk_mc_spinlock_unlock(&v->runq_lock, irqstate);
dkprintf("timer[%lu]: CPU_FLAG_NEED_RESCHED \n", rdtsc());
do_backlog();
}
else if (vector == LOCAL_PERF_VECTOR) {
struct siginfo info;
@ -1206,6 +1205,15 @@ unsigned long cpu_disable_interrupt_save(void)
return flags;
}
unsigned long cpu_enable_interrupt_save(void)
{
unsigned long flags;
asm volatile("pushf; pop %0; sti" : "=r"(flags) : : "memory", "cc");
return flags;
}
/*@
@ behavior valid_vector:
@ assumes 32 <= vector <= 255;
@ -1602,14 +1610,18 @@ int ihk_mc_arch_get_special_register(enum ihk_asr_type type,
}
/*@
@ requires \valid_apicid(cpu); // valid APIC ID or not
@ requires \valid_cpuid(cpu); // valid CPU logical ID
@ ensures \result == 0
@*/
int ihk_mc_interrupt_cpu(int cpu, int vector)
{
if (cpu < 0 || cpu >= num_processors) {
kprintf("%s: invalid CPU id: %d\n", __func__, cpu);
return -1;
}
dkprintf("[%d] ihk_mc_interrupt_cpu: %d\n", ihk_mc_get_processor_id(), cpu);
x86_issue_ipi(cpu, vector);
x86_issue_ipi(get_x86_cpu_local_variable(cpu)->apic_id, vector);
return 0;
}
@ -1624,6 +1636,7 @@ struct thread *arch_switch_context(struct thread *prev, struct thread *next)
/* Set up new TLS.. */
ihk_mc_init_user_tlsbase(next->uctx, next->tlsblock_base);
#ifdef ENABLE_PERF
/* Inherit performance monitoring state */
if(next->proc->monitoring_event) {
if(next->proc->perf_status == PP_RESET)
@ -1633,6 +1646,7 @@ struct thread *arch_switch_context(struct thread *prev, struct thread *next)
perf_start(next->proc->monitoring_event);
}
}
#endif
#ifdef PROFILE_ENABLE
if (prev && prev->profile && prev->profile_start_ts != 0) {
@ -1708,7 +1722,7 @@ check_and_allocate_fp_regs(struct thread *thread)
if (!thread->fp_regs) {
kprintf("error: allocating fp_regs pages\n");
result = 1;
result = -ENOMEM;
goto out;
}
@ -1721,12 +1735,14 @@ out:
/*@
@ requires \valid(thread);
@*/
void
int
save_fp_regs(struct thread *thread)
{
if (check_and_allocate_fp_regs(thread) != 0) {
// alloc error
return;
int ret = 0;
ret = check_and_allocate_fp_regs(thread);
if (ret) {
goto out;
}
if (xsave_available) {
@ -1741,13 +1757,23 @@ save_fp_regs(struct thread *thread)
dkprintf("fp_regs for TID %d saved\n", thread->tid);
}
out:
return ret;
}
void copy_fp_regs(struct thread *from, struct thread *to)
int copy_fp_regs(struct thread *from, struct thread *to)
{
if ((from->fp_regs != NULL) && (check_and_allocate_fp_regs(to) == 0)) {
memcpy(to->fp_regs, from->fp_regs, sizeof(fp_regs_struct));
int ret = 0;
if (from->fp_regs != NULL) {
ret = check_and_allocate_fp_regs(to);
if (!ret) {
memcpy(to->fp_regs,
from->fp_regs,
sizeof(fp_regs_struct));
}
}
return ret;
}
/*@
@ -1820,6 +1846,10 @@ ihk_mc_init_user_tlsbase(ihk_mc_user_context_t *ctx,
do_arch_prctl(ARCH_SET_FS, tls_base_addr);
}
void arch_flush_icache_all(void)
{
return;
}
/*@
@ assigns \nothing;
@ -1973,6 +2003,92 @@ mod_nmi_ctx(void *nmi_ctx, void (*func)())
l[i++] = 0x28; // KERNEL DS
}
void arch_save_panic_regs(void *irq_regs)
{
struct thread *current = cpu_local_var(current);
struct x86_user_context *regs =
(struct x86_user_context *)irq_regs;
struct x86_cpu_local_variables *x86v =
get_x86_cpu_local_variable(ihk_mc_get_processor_id());
struct segment_regs {
uint32_t rflags;
uint32_t cs;
uint32_t ss;
uint32_t ds;
uint32_t es;
uint32_t fs;
uint32_t gs;
} *sregs;
/* Kernel space? */
if (regs->gpr.rip > USER_END) {
x86v->panic_regs[0] = regs->gpr.rax;
x86v->panic_regs[1] = regs->gpr.rbx;
x86v->panic_regs[2] = regs->gpr.rcx;
x86v->panic_regs[3] = regs->gpr.rdx;
x86v->panic_regs[4] = regs->gpr.rsi;
x86v->panic_regs[5] = regs->gpr.rdi;
x86v->panic_regs[6] = regs->gpr.rbp;
x86v->panic_regs[7] = regs->gpr.rsp;
x86v->panic_regs[8] = regs->gpr.r8;
x86v->panic_regs[9] = regs->gpr.r9;
x86v->panic_regs[10] = regs->gpr.r10;
x86v->panic_regs[11] = regs->gpr.r11;
x86v->panic_regs[12] = regs->gpr.r12;
x86v->panic_regs[13] = regs->gpr.r13;
x86v->panic_regs[14] = regs->gpr.r14;
x86v->panic_regs[15] = regs->gpr.r15;
x86v->panic_regs[16] = regs->gpr.rip;
sregs = (struct segment_regs *)&x86v->panic_regs[17];
sregs->rflags = regs->gpr.rflags;
sregs->cs = regs->gpr.cs;
sregs->ss = regs->gpr.ss;
sregs->ds = regs->sr.ds;
sregs->es = regs->sr.es;
sregs->fs = regs->sr.fs;
sregs->gs = regs->sr.gs;
}
/* User-space, show kernel context */
else {
kprintf("%s: in user-space: %p\n", __func__, regs->gpr.rip);
x86v->panic_regs[0] = 0;
x86v->panic_regs[1] = current->ctx.rbx;
x86v->panic_regs[2] = 0;
x86v->panic_regs[3] = 0;
x86v->panic_regs[4] = current->ctx.rsi;
x86v->panic_regs[5] = current->ctx.rdi;
x86v->panic_regs[6] = current->ctx.rbp;
x86v->panic_regs[7] = current->ctx.rsp;
x86v->panic_regs[8] = 0;
x86v->panic_regs[9] = 0;
x86v->panic_regs[10] = 0;
x86v->panic_regs[11] = 0;
x86v->panic_regs[12] = regs->gpr.r12;
x86v->panic_regs[13] = regs->gpr.r13;
x86v->panic_regs[14] = regs->gpr.r14;
x86v->panic_regs[15] = regs->gpr.r15;
x86v->panic_regs[16] = (unsigned long)enter_user_mode;
sregs = (struct segment_regs *)&x86v->panic_regs[17];
sregs->rflags = regs->gpr.rflags;
sregs->cs = regs->gpr.cs;
sregs->ss = regs->gpr.ss;
sregs->ds = regs->sr.ds;
sregs->es = regs->sr.es;
sregs->fs = regs->sr.fs;
sregs->gs = regs->sr.gs;
}
x86v->paniced = 1;
}
void arch_clear_panic(void)
{
struct x86_cpu_local_variables *x86v =
get_x86_cpu_local_variable(ihk_mc_get_processor_id());
x86v->paniced = 0;
}
int arch_cpu_read_write_register(
struct ihk_os_cpu_register *desc,
enum mcctrl_os_cpu_operation op)
@ -2096,9 +2212,7 @@ int smp_call_func(cpu_set_t *__cpu_set, smp_func_t __func, void *__arg)
ihk_mc_spinlock_unlock(&get_cpu_local_var(cpu)->smp_func_req_lock,
irq_flags);
ihk_mc_interrupt_cpu(
get_x86_cpu_local_variable(cpu)->apic_id,
LOCAL_SMP_FUNC_CALL_VECTOR);
ihk_mc_interrupt_cpu(cpu, LOCAL_SMP_FUNC_CALL_VECTOR);
++cpu_index;
}
@ -2130,4 +2244,48 @@ free_out:
return ret;
}
extern int nmi_mode;
extern long freeze_thaw(void *nmi_ctx);
void multi_nm_interrupt_handler(void *irq_regs)
{
dkprintf("%s: ...\n", __func__);
switch (nmi_mode) {
case 1:
case 2:
/* mode == 1 or 2, for FREEZER NMI */
dkprintf("%s: freeze mode NMI catch. (nmi_mode=%d)\n",
__func__, nmi_mode);
freeze_thaw(NULL);
break;
case 0:
/* mode == 0, for MEMDUMP NMI */
arch_save_panic_regs(irq_regs);
ihk_mc_query_mem_areas();
/* The memdump NMI halts McKernel, so no break is needed. */
/* fall through */
case 3:
/* mode == 3, for SHUTDOWN-WAIT NMI */
kprintf("%s: STOP\n", __func__);
while (nmi_mode != 4)
cpu_halt();
break;
case 4:
/* mode == 4, continue NMI */
arch_clear_panic();
if (!ihk_mc_get_processor_id()) {
ihk_mc_clear_dump_page_completion();
}
kprintf("%s: RESUME, nmi_mode: %d\n", __func__, nmi_mode);
break;
default:
ekprintf("%s: Unknown nmi-mode(%d) detected.\n",
__func__, nmi_mode);
break;
}
}
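
The nmi_mode values dispatched above arrive from the host as bare integers. A hypothetical enum (the names are invented here for readability and do not appear in the source) summarizes the protocol:

/* Illustrative names only; the source dispatches on bare integers. */
enum nmi_mode_sketch {
	NMI_MODE_MEMDUMP       = 0,	/* save panic regs, dump memory, then halt */
	NMI_MODE_FREEZE        = 1,	/* freeze/thaw request */
	NMI_MODE_THAW          = 2,	/* freeze/thaw request */
	NMI_MODE_SHUTDOWN_WAIT = 3,	/* halt until the mode becomes 4 */
	NMI_MODE_RESUME        = 4,	/* clear panic state and continue */
};
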
/*** end of file ***/


@ -33,7 +33,7 @@ extern void preempt_disable(void);
#define IHK_STATIC_SPINLOCK_FUNCS
static void ihk_mc_spinlock_init(ihk_spinlock_t *lock)
static inline void ihk_mc_spinlock_init(ihk_spinlock_t *lock)
{
lock->head_tail = 0;
}
@ -50,10 +50,13 @@ rc = __ihk_mc_spinlock_trylock_noirq(l); \
#define ihk_mc_spinlock_trylock_noirq __ihk_mc_spinlock_trylock_noirq
#endif
static int __ihk_mc_spinlock_trylock_noirq(ihk_spinlock_t *lock)
static inline int __ihk_mc_spinlock_trylock_noirq(ihk_spinlock_t *lock)
{
ihk_spinlock_t cur = { .head_tail = lock->head_tail };
ihk_spinlock_t next = { .tickets.head = cur.tickets.head, .tickets.tail = cur.tickets.tail + 2 };
ihk_spinlock_t next = { .tickets = {
.head = cur.tickets.head,
.tail = cur.tickets.tail + 2
} };
int success;
if (cur.tickets.head != cur.tickets.tail) {
@ -80,7 +83,8 @@ __kprintf("[%d] ret ihk_mc_spinlock_trylock\n", ihk_mc_get_processor_id()); rc;\
#else
#define ihk_mc_spinlock_trylock __ihk_mc_spinlock_trylock
#endif
static unsigned long __ihk_mc_spinlock_trylock(ihk_spinlock_t *lock, int *result)
static inline unsigned long __ihk_mc_spinlock_trylock(ihk_spinlock_t *lock,
int *result)
{
unsigned long flags;
@ -101,7 +105,7 @@ __kprintf("[%d] ret ihk_mc_spinlock_lock_noirq\n", ihk_mc_get_processor_id()); \
#define ihk_mc_spinlock_lock_noirq __ihk_mc_spinlock_lock_noirq
#endif
static void __ihk_mc_spinlock_lock_noirq(ihk_spinlock_t *lock)
static inline void __ihk_mc_spinlock_lock_noirq(ihk_spinlock_t *lock)
{
register struct __raw_tickets inc = { .tail = 0x0002 };
@ -132,7 +136,7 @@ __kprintf("[%d] ret ihk_mc_spinlock_lock\n", ihk_mc_get_processor_id()); rc;\
#else
#define ihk_mc_spinlock_lock __ihk_mc_spinlock_lock
#endif
static unsigned long __ihk_mc_spinlock_lock(ihk_spinlock_t *lock)
static inline unsigned long __ihk_mc_spinlock_lock(ihk_spinlock_t *lock)
{
unsigned long flags;
@ -152,7 +156,7 @@ __kprintf("[%d] ret ihk_mc_spinlock_unlock_noirq\n", ihk_mc_get_processor_id());
#else
#define ihk_mc_spinlock_unlock_noirq __ihk_mc_spinlock_unlock_noirq
#endif
static void __ihk_mc_spinlock_unlock_noirq(ihk_spinlock_t *lock)
static inline void __ihk_mc_spinlock_unlock_noirq(ihk_spinlock_t *lock)
{
__ticket_t inc = 0x0002;
@ -171,100 +175,14 @@ __kprintf("[%d] ret ihk_mc_spinlock_unlock\n", ihk_mc_get_processor_id()); \
#else
#define ihk_mc_spinlock_unlock __ihk_mc_spinlock_unlock
#endif
static void __ihk_mc_spinlock_unlock(ihk_spinlock_t *lock, unsigned long flags)
static inline void __ihk_mc_spinlock_unlock(ihk_spinlock_t *lock,
unsigned long flags)
{
__ihk_mc_spinlock_unlock_noirq(lock);
cpu_restore_interrupt(flags);
}
/* An implementation of the Mellor-Crummey and Scott (MCS) lock */
typedef struct mcs_lock_node {
unsigned long locked;
struct mcs_lock_node *next;
unsigned long irqsave;
#ifndef ENABLE_UBSAN
} __aligned(64) mcs_lock_node_t;
#else
} mcs_lock_node_t;
#endif
typedef mcs_lock_node_t mcs_lock_t;
static void mcs_lock_init(struct mcs_lock_node *node)
{
node->locked = 0;
node->next = NULL;
}
static void __mcs_lock_lock(struct mcs_lock_node *lock,
struct mcs_lock_node *node)
{
struct mcs_lock_node *pred;
node->next = NULL;
node->locked = 0;
pred = (struct mcs_lock_node *)xchg8((unsigned long *)&lock->next,
(unsigned long)node);
if (pred) {
node->locked = 1;
pred->next = node;
while (node->locked != 0) {
cpu_pause();
}
}
}
static void __mcs_lock_unlock(struct mcs_lock_node *lock,
struct mcs_lock_node *node)
{
if (node->next == NULL) {
struct mcs_lock_node *old = (struct mcs_lock_node *)
atomic_cmpxchg8((unsigned long *)&lock->next,
(unsigned long)node, (unsigned long)0);
if (old == node) {
return;
}
while (node->next == NULL) {
cpu_pause();
}
}
node->next->locked = 0;
}
static void mcs_lock_lock_noirq(struct mcs_lock_node *lock,
struct mcs_lock_node *node)
{
preempt_disable();
__mcs_lock_lock(lock, node);
}
static void mcs_lock_unlock_noirq(struct mcs_lock_node *lock,
struct mcs_lock_node *node)
{
__mcs_lock_unlock(lock, node);
preempt_enable();
}
static void mcs_lock_lock(struct mcs_lock_node *lock,
struct mcs_lock_node *node)
{
node->irqsave = cpu_disable_interrupt_save();
mcs_lock_lock_noirq(lock, node);
}
static void mcs_lock_unlock(struct mcs_lock_node *lock,
struct mcs_lock_node *node)
{
mcs_lock_unlock_noirq(lock, node);
cpu_restore_interrupt(node->irqsave);
}
#define SPINLOCK_IN_MCS_RWLOCK
// reader/writer lock
@ -310,7 +228,7 @@ typedef struct mcs_rwlock_lock {
} mcs_rwlock_lock_t;
#endif
static void
static inline void
mcs_rwlock_init(struct mcs_rwlock_lock *lock)
{
#ifdef SPINLOCK_IN_MCS_RWLOCK
@ -331,7 +249,7 @@ __kprintf("[%d] ret mcs_rwlock_writer_lock_noirq\n", ihk_mc_get_processor_id());
#else
#define mcs_rwlock_writer_lock_noirq __mcs_rwlock_writer_lock_noirq
#endif
static void
static inline void
__mcs_rwlock_writer_lock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node *node)
{
#ifdef SPINLOCK_IN_MCS_RWLOCK
@ -358,7 +276,7 @@ __mcs_rwlock_writer_lock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_n
}
#ifndef SPINLOCK_IN_MCS_RWLOCK
static void
static inline void
mcs_rwlock_unlock_readers(struct mcs_rwlock_lock *lock)
{
struct mcs_rwlock_node *p;
@ -425,7 +343,7 @@ __kprintf("[%d] ret mcs_rwlock_writer_unlock_noirq\n", ihk_mc_get_processor_id()
#else
#define mcs_rwlock_writer_unlock_noirq __mcs_rwlock_writer_unlock_noirq
#endif
static void
static inline void
__mcs_rwlock_writer_unlock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node *node)
{
#ifdef SPINLOCK_IN_MCS_RWLOCK
@ -485,7 +403,7 @@ atomic_inc_ifnot0(ihk_atomic_t *v)
return old;
}
static void
static inline void
__mcs_rwlock_reader_lock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node *node)
{
#ifdef SPINLOCK_IN_MCS_RWLOCK
@ -551,7 +469,7 @@ __kprintf("[%d] ret mcs_rwlock_reader_unlock_noirq\n", ihk_mc_get_processor_id()
#else
#define mcs_rwlock_reader_unlock_noirq __mcs_rwlock_reader_unlock_noirq
#endif
static void
static inline void
__mcs_rwlock_reader_unlock_noirq(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node *node)
{
#ifdef SPINLOCK_IN_MCS_RWLOCK
@ -598,7 +516,7 @@ __kprintf("[%d] ret mcs_rwlock_writer_lock\n", ihk_mc_get_processor_id()); \
#else
#define mcs_rwlock_writer_lock __mcs_rwlock_writer_lock
#endif
static void
static inline void
__mcs_rwlock_writer_lock(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node_irqsave *node)
{
#ifdef SPINLOCK_IN_MCS_RWLOCK
@ -618,7 +536,7 @@ __kprintf("[%d] ret mcs_rwlock_writer_unlock\n", ihk_mc_get_processor_id()); \
#else
#define mcs_rwlock_writer_unlock __mcs_rwlock_writer_unlock
#endif
static void
static inline void
__mcs_rwlock_writer_unlock(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node_irqsave *node)
{
#ifdef SPINLOCK_IN_MCS_RWLOCK
@ -638,7 +556,7 @@ __kprintf("[%d] ret mcs_rwlock_reader_lock\n", ihk_mc_get_processor_id()); \
#else
#define mcs_rwlock_reader_lock __mcs_rwlock_reader_lock
#endif
static void
static inline void
__mcs_rwlock_reader_lock(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node_irqsave *node)
{
#ifdef SPINLOCK_IN_MCS_RWLOCK
@ -658,7 +576,7 @@ __kprintf("[%d] ret mcs_rwlock_reader_unlock\n", ihk_mc_get_processor_id()); \
#else
#define mcs_rwlock_reader_unlock __mcs_rwlock_reader_unlock
#endif
static void
static inline void
__mcs_rwlock_reader_unlock(struct mcs_rwlock_lock *lock, struct mcs_rwlock_node_irqsave *node)
{
#ifdef SPINLOCK_IN_MCS_RWLOCK
@ -674,4 +592,90 @@ static inline int irqflags_can_interrupt(unsigned long flags)
return !!(flags & 0x200);
}
struct ihk_rwlock {
union {
long lock;
struct {
unsigned int read;
int write;
};
} lock;
};
static inline void ihk_mc_rwlock_init(struct ihk_rwlock *rw)
{
rw->lock.read = 0;
rw->lock.write = 1;
}
static inline void ihk_mc_read_lock(struct ihk_rwlock *rw)
{
asm volatile("1:\t"
"lock; decq %0\n\t"
"jns 3f\n\t"
"lock incq %0\n\t"
"2:\t"
"pause\n\t"
"cmpq $0x1, %0\n\t"
"jns 1b\n\t"
"jmp 2b\n\t"
"3:"
: "+m" (rw->lock.lock) : : "memory");
}
static inline void ihk_mc_write_lock(struct ihk_rwlock *rw)
{
asm volatile("1:\t"
"lock; decl %0\n\t"
"je 3f\n\t"
"lock; incl %0\n\t"
"2:\t"
"pause\n\t"
"cmpl $0x1,%0\n\t"
"je 1b\n\t"
"jmp 2b\n\t"
"3:"
: "+m" (rw->lock.write) : "i" (((1L) << 32)) : "memory");
}
static inline int ihk_mc_read_trylock(struct ihk_rwlock *rw)
{
ihk_atomic64_t *count = (ihk_atomic64_t *)rw;
if (ihk_atomic64_sub_return(1, count) >= 0)
return 1;
ihk_atomic64_inc(count);
return 0;
}
static inline int ihk_mc_write_trylock(struct ihk_rwlock *rw)
{
ihk_atomic_t *count = (ihk_atomic_t *)&rw->lock.write;
if (ihk_atomic_dec_and_test(count))
return 1;
ihk_atomic_inc(count);
return 0;
}
static inline void ihk_mc_read_unlock(struct ihk_rwlock *rw)
{
asm volatile("lock; incq %0" : "+m" (rw->lock.lock) : : "memory");
}
static inline void ihk_mc_write_unlock(struct ihk_rwlock *rw)
{
asm volatile("lock; incl %0"
: "+m" (rw->lock.write) : "i" (((1L) << 32)) : "memory");
}
static inline int ihk_mc_write_can_lock(struct ihk_rwlock *rw)
{
return rw->lock.write == 1;
}
static inline int ihk_mc_read_can_lock(struct ihk_rwlock *rw)
{
return rw->lock.lock > 0;
}
#endif
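
The ihk_rwlock above packs a 32-bit reader count and a 32-bit writer slot into a single quadword: the writer slot starts at 1, readers decrement the whole 64-bit word and succeed while it stays nonnegative, and a writer takes the slot from 1 to 0. A portable sketch of the same layout, assuming C11 atomics and simplifying the writer path to a full-word compare-and-swap:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

/* High half: writer slot (1 = free); low half: reader count. */
#define RW_UNLOCKED ((int64_t)1 << 32)

static bool rw_try_read(_Atomic int64_t *l)
{
	/* A reader keeps the word nonnegative only while no writer holds it. */
	if (atomic_fetch_sub(l, 1) - 1 >= 0)
		return true;
	atomic_fetch_add(l, 1);	/* back out */
	return false;
}

static bool rw_try_write(_Atomic int64_t *l)
{
	int64_t expected = RW_UNLOCKED;	/* free and no readers */
	return atomic_compare_exchange_strong(l, &expected, 0);
}

static void rw_read_unlock(_Atomic int64_t *l)  { atomic_fetch_add(l, 1); }
static void rw_write_unlock(_Atomic int64_t *l) { atomic_store(l, RW_UNLOCKED); }
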


@ -17,6 +17,7 @@
#define __HEADER_X86_COMMON_ARCH_MEMORY_H
#include <ihk/types.h>
#include <errno.h>
#define KERNEL_CS_ENTRY 4
#define KERNEL_DS_ENTRY 5
@ -66,8 +67,8 @@
* Placing the LWK image in the virtual address space at the end of
* the Linux modules section enables us to map the LWK TEXT in Linux
* as well, so that Linux can also call into LWK text.
* It's defined by cmake.
*/
#define MAP_KERNEL_START 0xFFFFFFFFFE800000UL
#define STACK_TOP(region) ((region)->user_end)
#define MAP_VMAP_SIZE 0x0000000100000000UL
@ -183,12 +184,10 @@ enum ihk_mc_pt_attribute {
enum ihk_mc_pt_attribute attr_mask;
#ifdef POSTK_DEBUG_ARCH_DEP_12
static inline int pfn_is_write_combined(uintptr_t pfn)
{
return ((pfn & PFL1_PWT) && !(pfn & PFL1_PCD));
}
#endif /* #ifdef POSTK_DEBUG_ARCH_DEP_12 */
static inline int pte_is_null(pte_t *ptep)
{
@ -365,6 +364,17 @@ static inline int pgsize_to_tbllv(size_t pgsize)
return 0;
}
static inline int pgsize_to_pgshift(size_t pgsize)
{
switch (pgsize) {
case PTL1_SIZE: return PTL1_SHIFT;
case PTL2_SIZE: return PTL2_SHIFT;
case PTL3_SIZE: return PTL3_SHIFT;
case PTL4_SIZE: return PTL4_SHIFT;
default: return -EINVAL;
}
}
static inline size_t tbllv_to_pgsize(int level)
{
switch (level) {


@ -13,19 +13,17 @@
#ifndef ARCH_CPU_H
#define ARCH_CPU_H
#define mb() asm volatile("mfence":::"memory")
#define rmb() asm volatile("lfence":::"memory")
#define wmb() asm volatile("sfence" ::: "memory")
#define smp_mb() mb()
#define smp_rmb() rmb()
#define smp_wmb() barrier()
#define arch_barrier() asm volatile("" : : : "memory")
static inline void rmb(void)
{
arch_barrier();
}
static inline void wmb(void)
{
arch_barrier();
}
static unsigned long read_tsc(void)
static inline unsigned long read_tsc(void)
{
unsigned int low, high;
@ -34,4 +32,21 @@ static unsigned long read_tsc(void)
return (low | ((unsigned long)high << 32));
}
#define smp_load_acquire(p) \
({ \
typeof(*p) ___p1 = ACCESS_ONCE(*p); \
compiletime_assert_atomic_type(*p); \
barrier(); \
___p1; \
})
#define smp_store_release(p, v) \
({ \
compiletime_assert_atomic_type(*p); \
barrier(); \
WRITE_ONCE(*p, v); \
})
void arch_flush_icache_all(void);
#endif /* ARCH_CPU_H */
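
The acquire/release pair above gives one-way ordering for message passing. A typical use, sketched with a hypothetical producer/consumer (the names are illustrative, not from the source): the release store publishes payload before ready, and the acquire load on ready makes payload visible to the reader.

static int payload;
static int ready;

static void publish(int v)
{
	payload = v;
	smp_store_release(&ready, 1);	/* order payload before the flag */
}

static int consume(void)
{
	while (!smp_load_acquire(&ready))
		cpu_pause();		/* order the flag before payload */
	return payload;
}
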


@ -1,32 +0,0 @@
#ifndef ARCH_RUSAGE_H_INCLUDED
#define ARCH_RUSAGE_H_INCLUDED
#define DEBUG_RUSAGE
#define IHK_OS_PGSIZE_4KB 0
#define IHK_OS_PGSIZE_2MB 1
#define IHK_OS_PGSIZE_1GB 2
extern struct rusage_global rusage;
static inline int rusage_pgsize_to_pgtype(size_t pgsize)
{
int ret = IHK_OS_PGSIZE_4KB;
switch (pgsize) {
case PTL1_SIZE:
ret = IHK_OS_PGSIZE_4KB;
break;
case PTL2_SIZE:
ret = IHK_OS_PGSIZE_2MB;
break;
case PTL3_SIZE:
ret = IHK_OS_PGSIZE_1GB;
break;
default:
kprintf("%s: Error: Unknown pgsize=%ld\n", __FUNCTION__, pgsize);
break;
}
return ret;
}
#endif /* !defined(ARCH_RUSAGE_H_INCLUDED) */


@ -13,6 +13,8 @@
#ifndef HEADER_X86_COMMON_IHK_ATOMIC_H
#define HEADER_X86_COMMON_IHK_ATOMIC_H
#include <lwk/compiler.h>
/***********************************************************************
* ihk_atomic_t
*/
@ -114,7 +116,7 @@ static inline long ihk_atomic64_read(const ihk_atomic64_t *v)
return *(volatile long *)&(v)->counter64;
}
static inline void ihk_atomic64_set(ihk_atomic64_t *v, int i)
static inline void ihk_atomic64_set(ihk_atomic64_t *v, long i)
{
v->counter64 = i;
}
@ -124,6 +126,22 @@ static inline void ihk_atomic64_inc(ihk_atomic64_t *v)
asm volatile ("lock incq %0" : "+m"(v->counter64));
}
static inline long ihk_atomic64_add_return(long i, ihk_atomic64_t *v)
{
long __i;
__i = i;
asm volatile("lock xaddq %0, %1"
: "+r" (i), "+m" (v->counter64)
: : "memory");
return i + __i;
}
static inline long ihk_atomic64_sub_return(long i, ihk_atomic64_t *v)
{
return ihk_atomic64_add_return(-i, v);
}
/***********************************************************************
* others
*/
@ -156,43 +174,55 @@ static inline unsigned long xchg8(unsigned long *ptr, unsigned long x)
return __x;
}
#define __xchg(x, ptr, size) \
({ \
__typeof(*(ptr)) __x = (x); \
switch (size) { \
case 1: \
asm volatile("xchgb %b0,%1" \
: "=q" (__x) \
: "m" (*__xg(ptr)), "0" (__x) \
: "memory"); \
break; \
case 2: \
asm volatile("xchgw %w0,%1" \
: "=r" (__x) \
: "m" (*__xg(ptr)), "0" (__x) \
: "memory"); \
break; \
case 4: \
asm volatile("xchgl %k0,%1" \
: "=r" (__x) \
: "m" (*__xg(ptr)), "0" (__x) \
: "memory"); \
break; \
case 8: \
asm volatile("xchgq %0,%1" \
: "=r" (__x) \
: "m" (*__xg(ptr)), "0" (__x) \
: "memory"); \
break; \
default: \
panic("xchg for wrong size"); \
} \
__x; \
})
#define __X86_CASE_B 1
#define __X86_CASE_W 2
#define __X86_CASE_L 4
#define __X86_CASE_Q 8
extern void __xchg_wrong_size(void)
__compiletime_error("Bad argument size for xchg");
#define xchg(ptr, v) \
__xchg((v), (ptr), sizeof(*ptr))
/*
* An exchange-type operation, which takes a value and a pointer, and
* returns the old value.
*/
#define __xchg_op(ptr, arg, op, lock) \
({ \
__typeof__(*(ptr)) __ret = (arg); \
switch (sizeof(*(ptr))) { \
case __X86_CASE_B: \
asm volatile (lock #op "b %b0, %1\n" \
: "+q" (__ret), "+m" (*(ptr)) \
: : "memory", "cc"); \
break; \
case __X86_CASE_W: \
asm volatile (lock #op "w %w0, %1\n" \
: "+r" (__ret), "+m" (*(ptr)) \
: : "memory", "cc"); \
break; \
case __X86_CASE_L: \
asm volatile (lock #op "l %0, %1\n" \
: "+r" (__ret), "+m" (*(ptr)) \
: : "memory", "cc"); \
break; \
case __X86_CASE_Q: \
asm volatile (lock #op "q %q0, %1\n" \
: "+r" (__ret), "+m" (*(ptr)) \
: : "memory", "cc"); \
break; \
default: \
__xchg_wrong_size(); \
} \
__ret; \
})
/*
* Note: no "lock" prefix even on SMP: xchg always implies lock anyway.
* Since this is generally used to protect other memory information, we
* use "asm volatile" and "memory" clobbers to prevent gcc from moving
* information around.
*/
#define xchg(ptr, v) __xchg_op((ptr), (v), xchg, "")
static inline unsigned long atomic_cmpxchg8(unsigned long *addr,
unsigned long oldval,
@ -241,4 +271,66 @@ static inline unsigned long ihk_atomic_add_long_return(long i, long *v) {
return i + __i;
}
extern void __cmpxchg_wrong_size(void)
__compiletime_error("Bad argument size for cmpxchg");
/*
* Atomic compare and exchange. Compare OLD with MEM, if identical,
* store NEW in MEM. Return the initial value in MEM. Success is
* indicated by comparing RETURN with OLD.
*/
#define __raw_cmpxchg(ptr, old, new, size, lock) \
({ \
__typeof__(*(ptr)) __ret; \
__typeof__(*(ptr)) __old = (old); \
__typeof__(*(ptr)) __new = (new); \
switch (size) { \
case __X86_CASE_B: \
{ \
volatile uint8_t *__ptr = (volatile uint8_t *)(ptr);\
asm volatile(lock "cmpxchgb %2,%1" \
: "=a" (__ret), "+m" (*__ptr) \
: "q" (__new), "0" (__old) \
: "memory"); \
break; \
} \
case __X86_CASE_W: \
{ \
volatile uint16_t *__ptr = (volatile uint16_t *)(ptr);\
asm volatile(lock "cmpxchgw %2,%1" \
: "=a" (__ret), "+m" (*__ptr) \
: "r" (__new), "0" (__old) \
: "memory"); \
break; \
} \
case __X86_CASE_L: \
{ \
volatile uint32_t *__ptr = (volatile uint32_t *)(ptr);\
asm volatile(lock "cmpxchgl %2,%1" \
: "=a" (__ret), "+m" (*__ptr) \
: "r" (__new), "0" (__old) \
: "memory"); \
break; \
} \
case __X86_CASE_Q: \
{ \
volatile uint64_t *__ptr = (volatile uint64_t *)(ptr);\
asm volatile(lock "cmpxchgq %2,%1" \
: "=a" (__ret), "+m" (*__ptr) \
: "r" (__new), "0" (__old) \
: "memory"); \
break; \
} \
default: \
__cmpxchg_wrong_size(); \
} \
__ret; \
})
#define __cmpxchg(ptr, old, new, size) \
__raw_cmpxchg((ptr), (old), (new), (size), "lock; ")
#define cmpxchg(ptr, old, new) \
__cmpxchg(ptr, old, new, sizeof(*(ptr)))
#endif
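
A short usage sketch of the size-dispatched xchg()/cmpxchg() helpers defined above; the spinlock-flavored use is illustrative only:

static unsigned int word;

static void sketch(void)
{
	unsigned int prev;

	/* Atomically swap in 1; the old value tells us who got there first. */
	prev = xchg(&word, 1);
	if (prev == 0) {
		/* Release only if the word is still what we set. */
		(void)cmpxchg(&word, 1, 0);
	}
}
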


@ -71,7 +71,7 @@
#define MSR_PERF_CTL_0 0xc0010000
#define MSR_PERF_CTR_0 0xc0010004
static unsigned long xgetbv(unsigned int index)
static inline unsigned long xgetbv(unsigned int index)
{
unsigned int low, high;
@ -80,7 +80,7 @@ static unsigned long xgetbv(unsigned int index)
return low | ((unsigned long)high << 32);
}
static void xsetbv(unsigned int index, unsigned long val)
static inline void xsetbv(unsigned int index, unsigned long val)
{
unsigned int low, high;
@ -90,7 +90,8 @@ static void xsetbv(unsigned int index, unsigned long val)
asm volatile("xsetbv" : : "a" (low), "d" (high), "c" (index));
}
static void wrmsr(unsigned int idx, unsigned long value){
static inline void wrmsr(unsigned int idx, unsigned long value)
{
unsigned int high, low;
high = value >> 32;
@ -99,7 +100,7 @@ static void wrmsr(unsigned int idx, unsigned long value){
asm volatile("wrmsr" : : "c" (idx), "a" (low), "d" (high) : "memory");
}
static unsigned long rdpmc(unsigned int counter)
static inline unsigned long rdpmc(unsigned int counter)
{
unsigned int high, low;
@ -108,7 +109,7 @@ static unsigned long rdpmc(unsigned int counter)
return (unsigned long)high << 32 | low;
}
static unsigned long rdmsr(unsigned int index)
static inline unsigned long rdmsr(unsigned int index)
{
unsigned int high, low;
@ -117,7 +118,7 @@ static unsigned long rdmsr(unsigned int index)
return (unsigned long)high << 32 | low;
}
static unsigned long rdtsc(void)
static inline unsigned long rdtsc(void)
{
unsigned int high, low;
@ -126,7 +127,7 @@ static unsigned long rdtsc(void)
return (unsigned long)high << 32 | low;
}
static void set_perfctl(int counter, int event, int mask)
static inline void set_perfctl(int counter, int event, int mask)
{
unsigned long value;
@ -137,7 +138,7 @@ static void set_perfctl(int counter, int event, int mask)
wrmsr(MSR_PERF_CTL_0 + counter, value);
}
static void start_perfctr(int counter)
static inline void start_perfctr(int counter)
{
unsigned long value;
@ -145,7 +146,7 @@ static void start_perfctr(int counter)
value |= (1 << 22);
wrmsr(MSR_PERF_CTL_0 + counter, value);
}
static void stop_perfctr(int counter)
static inline void stop_perfctr(int counter)
{
unsigned long value;
@ -154,17 +155,17 @@ static void stop_perfctr(int counter)
wrmsr(MSR_PERF_CTL_0 + counter, value);
}
static void clear_perfctl(int counter)
static inline void clear_perfctl(int counter)
{
wrmsr(MSR_PERF_CTL_0 + counter, 0);
}
static void set_perfctr(int counter, unsigned long value)
static inline void set_perfctr(int counter, unsigned long value)
{
wrmsr(MSR_PERF_CTR_0 + counter, value);
}
static unsigned long read_perfctr(int counter)
static inline unsigned long read_perfctr(int counter)
{
return rdpmc(counter);
}


@ -84,7 +84,11 @@ enum __rlimit_resource
__RLIMIT_RTPRIO = 14,
#define RLIMIT_RTPRIO __RLIMIT_RTPRIO
__RLIMIT_NLIMITS = 15,
/* timeout for RT tasks in us */
__RLIMIT_RTTIME = 15,
#define RLIMIT_RTTIME __RLIMIT_RTTIME
__RLIMIT_NLIMITS = 16,
__RLIM_NLIMITS = __RLIMIT_NLIMITS
#define RLIMIT_NLIMITS __RLIMIT_NLIMITS
#define RLIM_NLIMITS __RLIM_NLIMITS


@ -74,6 +74,7 @@ SYSCALL_DELEGATED(89, readlink)
SYSCALL_HANDLED(96, gettimeofday)
SYSCALL_HANDLED(97, getrlimit)
SYSCALL_HANDLED(98, getrusage)
SYSCALL_HANDLED(99, sysinfo)
SYSCALL_HANDLED(100, times)
SYSCALL_HANDLED(101, ptrace)
SYSCALL_HANDLED(102, getuid)
@ -147,24 +148,24 @@ SYSCALL_DELEGATED(266, symlinkat)
SYSCALL_DELEGATED(267, readlinkat)
SYSCALL_DELEGATED(268, fchmodat)
SYSCALL_DELEGATED(269, faccessat)
SYSCALL_DELEGATED(270, pselect6)
SYSCALL_DELEGATED(271, ppoll)
SYSCALL_HANDLED(270, pselect6)
SYSCALL_HANDLED(271, ppoll)
SYSCALL_HANDLED(273, set_robust_list)
SYSCALL_HANDLED(279, move_pages)
SYSCALL_DELEGATED(281, epoll_pwait)
SYSCALL_HANDLED(281, epoll_pwait)
SYSCALL_HANDLED(282, signalfd)
SYSCALL_HANDLED(289, signalfd4)
#ifdef ENABLE_PERF
SYSCALL_HANDLED(298, perf_event_open)
#endif
SYSCALL_HANDLED(302, prlimit64)
#ifdef DCFA_KMOD
SYSCALL_HANDLED(303, mod_call)
#endif
SYSCALL_HANDLED(309, getcpu)
SYSCALL_HANDLED(310, process_vm_readv)
SYSCALL_HANDLED(311, process_vm_writev)
SYSCALL_HANDLED(601, pmc_init)
SYSCALL_HANDLED(602, pmc_start)
SYSCALL_HANDLED(603, pmc_stop)
SYSCALL_HANDLED(604, pmc_reset)
SYSCALL_HANDLED(322, execveat)
SYSCALL_HANDLED(700, get_cpu_id)
#ifdef PROFILE_ENABLE
SYSCALL_HANDLED(__NR_profile, profile)
@ -180,4 +181,8 @@ SYSCALL_HANDLED(802, linux_mlock)
SYSCALL_HANDLED(803, suspend_threads)
SYSCALL_HANDLED(804, resume_threads)
SYSCALL_HANDLED(811, linux_spawn)
/**** End of File ****/
/* Do not edit the lines from this comment through the
 * EOF just after it; they are used as a robust marker
 * for the autotest patch.
 */


@ -1,3 +1,4 @@
/* interrupt.S COPYRIGHT FUJITSU LIMITED 2019 */
/**
* \file interrupt.S
* License details are found in the file LICENSE.
@ -91,6 +92,9 @@ vector=vector+1
.endr
common_interrupt:
#define MULT_INTR_VECTOR 242
cmp $(MULT_INTR_VECTOR),%rdi
je 1f
PUSH_ALL_REGS
movq ERROR_OFFSET(%rsp), %rdi
movq %rsp, %rsi
@ -99,6 +103,19 @@ common_interrupt:
addq $8, %rsp
iretq
.globl nmi_handler
nmi_handler:
cld
pushq $0 /* error field of x86_basic_regs */
PUSH_ALL_REGS
movq %rsp, %rdi
call multi_nm_interrupt_handler /* Enter C code */
POP_ALL_REGS
addq $8, %rsp
iretq
.globl __page_fault_handler_address
__page_fault_handler_address:
.quad 0
@ -137,74 +154,6 @@ __freeze:
POP_ALL_REGS
iretq
.globl nmi
nmi:
#define PANICED 232
#define PANIC_REGS 240
movq %rax,%gs:PANIC_REGS+0x00
movq %rsp,%gs:PANIC_REGS+0x08
movl nmi_mode(%rip),%eax
cmp $3,%rax
je 4f
cmp $1,%rax
je 1f
cmp $2,%rax
jne 3f
1:
cld
movq %gs:PANIC_REGS+0x00,%rax
PUSH_ALL_REGS
subq $40, %rsp
movq %rsp,%gs:PANIC_REGS+0x10
movq %rsp, %rdi
call freeze_thaw
cmpq $0, %rax
jnz 2f
addq $40, %rsp
2:
POP_ALL_REGS
iretq
3:
movq %rbx,%gs:PANIC_REGS+0x08
movq %rcx,%gs:PANIC_REGS+0x10
movq %rdx,%gs:PANIC_REGS+0x18
movq %rsi,%gs:PANIC_REGS+0x20
movq %rdi,%gs:PANIC_REGS+0x28
movq %rbp,%gs:PANIC_REGS+0x30
movq 0x18(%rsp),%rax /* rsp */
movq %rax,%gs:PANIC_REGS+0x38
movq %r8, %gs:PANIC_REGS+0x40
movq %r9, %gs:PANIC_REGS+0x48
movq %r10,%gs:PANIC_REGS+0x50
movq %r11,%gs:PANIC_REGS+0x58
movq %r12,%gs:PANIC_REGS+0x60
movq %r13,%gs:PANIC_REGS+0x68
movq %r14,%gs:PANIC_REGS+0x70
movq %r15,%gs:PANIC_REGS+0x78
movq 0x00(%rsp),%rax /* rip */
movq %rax,%gs:PANIC_REGS+0x80
movq 0x10(%rsp),%rax /* rflags */
movl %eax,%gs:PANIC_REGS+0x88
movq 0x08(%rsp),%rax /* cs */
movl %eax,%gs:PANIC_REGS+0x8C
movq 0x20(%rsp),%rax /* ss */
movl %eax,%gs:PANIC_REGS+0x90
xorq %rax,%rax
movw %ds,%ax
movl %eax,%gs:PANIC_REGS+0x94
movw %es,%ax
movl %eax,%gs:PANIC_REGS+0x98
movw %fs,%ax
movl %eax,%gs:PANIC_REGS+0x9C
movw %gs,%ax
movl %eax,%gs:PANIC_REGS+0xA0
movq $1,%gs:PANICED
call ihk_mc_query_mem_areas
4:
hlt
jmp 4b
.globl x86_syscall
x86_syscall:
cld


@ -14,7 +14,6 @@
*/
#include <ihk/cpu.h>
#include <ihk/debug.h>
#include <ihk/mm.h>
#include <types.h>
#include <memory.h>
@ -26,7 +25,7 @@
#include <cls.h>
#include <kmalloc.h>
#include <rusage_private.h>
#include <debug.h>
#include <ihk/debug.h>
//#define DEBUG
@ -38,6 +37,7 @@
static char *last_page;
extern char _head[], _end[];
extern unsigned long linux_page_offset_base;
extern unsigned long x86_kernel_phys_base;
/* Arch specific early allocation routine */
@ -1355,109 +1355,6 @@ struct clear_range_args {
int max_nr_addr;
};
#ifdef POSTK_DEBUG_ARCH_DEP_8
void remote_flush_tlb_cpumask(struct process_vm *vm,
unsigned long addr, int cpu_id)
{
unsigned long __addr = addr;
return remote_flush_tlb_array_cpumask(vm, &__addr, 1, cpu_id);
}
void remote_flush_tlb_array_cpumask(struct process_vm *vm,
unsigned long *addr,
int nr_addr,
int cpu_id)
{
unsigned long cpu;
int flush_ind;
struct tlb_flush_entry *flush_entry;
cpu_set_t _cpu_set;
if (addr[0]) {
flush_ind = (addr[0] >> PAGE_SHIFT) % IHK_TLB_FLUSH_IRQ_VECTOR_SIZE;
}
/* Zero address denotes full TLB flush */
else {
/* Random.. */
flush_ind = (rdtsc()) % IHK_TLB_FLUSH_IRQ_VECTOR_SIZE;
}
flush_entry = &tlb_flush_vector[flush_ind];
/* Take a copy of the cpu set so that we don't hold the lock
* all the way while interrupting other cores */
ihk_mc_spinlock_lock_noirq(&vm->address_space->cpu_set_lock);
memcpy(&_cpu_set, &vm->address_space->cpu_set, sizeof(cpu_set_t));
ihk_mc_spinlock_unlock_noirq(&vm->address_space->cpu_set_lock);
dkprintf("trying to aquire flush_entry->lock flush_ind: %d\n", flush_ind);
ihk_mc_spinlock_lock_noirq(&flush_entry->lock);
flush_entry->vm = vm;
flush_entry->addr = addr;
flush_entry->nr_addr = nr_addr;
ihk_atomic_set(&flush_entry->pending, 0);
dkprintf("lock aquired, iterating cpu mask.. flush_ind: %d\n", flush_ind);
/* Loop through CPUs in this address space and interrupt them for
* TLB flush on the specified address */
for_each_set_bit(cpu, (const unsigned long*)&_cpu_set.__bits, CPU_SETSIZE) {
if (ihk_mc_get_processor_id() == cpu)
continue;
ihk_atomic_inc(&flush_entry->pending);
dkprintf("remote_flush_tlb_cpumask: flush_ind: %d, addr: 0x%lX, interrupting cpu: %d\n",
flush_ind, addr, cpu);
#ifdef POSTK_DEBUG_ARCH_DEP_8 /* arch depend hide */
/* TODO(pka_idke) Interim support */
ihk_mc_interrupt_cpu(cpu,
ihk_mc_get_vector(flush_ind + IHK_TLB_FLUSH_IRQ_VECTOR_START));
#else /* POSTK_DEBUG_ARCH_DEP_8 */
ihk_mc_interrupt_cpu(get_x86_cpu_local_variable(cpu)->apic_id,
flush_ind + IHK_TLB_FLUSH_IRQ_VECTOR_START);
#endif /* POSTK_DEBUG_ARCH_DEP_8 */
}
#ifdef DEBUG_IC_TLB
{
unsigned long tsc;
tsc = rdtsc() + 12884901888; /* 1.2GHz =>10 sec */
#endif
if (flush_entry->addr[0]) {
int i;
for (i = 0; i < flush_entry->nr_addr; ++i) {
flush_tlb_single(flush_entry->addr[i] & PAGE_MASK);
}
}
/* Zero address denotes full TLB flush */
else {
flush_tlb();
}
/* Wait for all cores */
while (ihk_atomic_read(&flush_entry->pending) != 0) {
cpu_pause();
#ifdef DEBUG_IC_TLB
if (rdtsc() > tsc) {
kprintf("waited 10 secs for remote TLB!! -> panic_all()\n");
panic_all_cores("waited 10 secs for remote TLB!!\n");
}
#endif
}
#ifdef DEBUG_IC_TLB
}
#endif
ihk_mc_spinlock_unlock_noirq(&flush_entry->lock);
}
#endif /* POSTK_DEBUG_ARCH_DEP_8 */
static void remote_flush_tlb_add_addr(struct clear_range_args *args,
unsigned long addr)
{
@ -1622,7 +1519,7 @@ static int clear_range_l3(void *args0, pte_t *ptep, uint64_t base,
{
struct clear_range_args *args = args0;
int error;
uint64_t phys;
uint64_t phys = 0;
pte_t old;
struct page *page;
struct page_table *pt;
@ -2572,10 +2469,10 @@ static void init_linux_kernel_mapping(struct page_table *pt)
map_start = 0;
map_end = 0x20000000000;
virt = (void *)LINUX_PAGE_OFFSET;
virt = (void *)linux_page_offset_base;
kprintf("Linux kernel virtual: 0x%lx - 0x%lx -> 0x%lx - 0x%lx\n",
LINUX_PAGE_OFFSET, LINUX_PAGE_OFFSET + map_end, 0, map_end);
virt, virt + map_end, 0, map_end);
for (phys = map_start; phys < map_end; phys += LARGE_PAGE_SIZE) {
if (set_pt_large_page(pt, virt, phys, PTATTR_WRITABLE) != 0) {
@ -2599,9 +2496,11 @@ static void init_linux_kernel_mapping(struct page_table *pt)
}
dkprintf("Linux kernel virtual: 0x%lx - 0x%lx -> 0x%lx - 0x%lx\n",
LINUX_PAGE_OFFSET + map_start, LINUX_PAGE_OFFSET + map_end, map_start, map_end);
linux_page_offset_base + map_start,
linux_page_offset_base + map_end,
map_start, map_end);
virt = (void *)(LINUX_PAGE_OFFSET + map_start);
virt = (void *)(linux_page_offset_base + map_start);
for (phys = map_start; phys < map_end; phys += LARGE_PAGE_SIZE, virt += LARGE_PAGE_SIZE) {
if (set_pt_large_page(pt, virt, phys, PTATTR_WRITABLE) != 0) {
kprintf("%s: set_pt_large_page() failed for 0x%lx\n", __FUNCTION__, virt);
@ -2652,7 +2551,7 @@ void *map_fixed_area(unsigned long phys, unsigned long size, int uncachable)
attr |= PTATTR_UNCACHABLE;
}
kprintf("map_fixed: phys: 0x%lx => 0x%lx (%d pages)\n",
dkprintf("map_fixed: phys: 0x%lx => 0x%lx (%d pages)\n",
paligned, v, npages);
for (i = 0; i < npages; i++) {
@ -2745,12 +2644,12 @@ unsigned long virt_to_phys(void *v)
unsigned long va = (unsigned long)v;
if (va >= MAP_KERNEL_START) {
dkprintf("%s: MAP_KERNEL_START <= 0x%lx <= LINUX_PAGE_OFFSET\n",
dkprintf("%s: MAP_KERNEL_START <= 0x%lx <= linux_page_offset_base\n",
__FUNCTION__, va);
return va - MAP_KERNEL_START + x86_kernel_phys_base;
}
else if (va >= LINUX_PAGE_OFFSET) {
return va - LINUX_PAGE_OFFSET;
else if (va >= linux_page_offset_base) {
return va - linux_page_offset_base;
}
else if (va >= MAP_FIXED_START) {
return va - MAP_FIXED_START;
@ -2769,7 +2668,7 @@ void *phys_to_virt(unsigned long p)
return (void *)(p + MAP_ST_START);
}
return (void *)(p + LINUX_PAGE_OFFSET);
return (void *)(p + linux_page_offset_base);
}
int copy_from_user(void *dst, const void *src, size_t siz)


@ -12,12 +12,12 @@
#include <march.h>
#include <errno.h>
#include <cls.h>
#include <ihk/debug.h>
#include <ihk/cpu.h>
#include <registers.h>
#include <mc_perf_event.h>
#include <config.h>
#include <debug.h>
#include <ihk/debug.h>
#include <process.h>
extern unsigned int *x86_march_perfmap;
extern int running_on_kvm(void);
@ -223,41 +223,6 @@ int ihk_mc_perfctr_init_raw(int counter, unsigned int code, int mode)
#endif /*POSTK_DEBUG_TEMP_FIX_29*/
}
#ifdef POSTK_DEBUG_TEMP_FIX_29
int ihk_mc_perfctr_init(int counter, uint64_t config, int mode)
#else
int ihk_mc_perfctr_init(int counter, enum ihk_perfctr_type type, int mode)
#endif /*POSTK_DEBUG_TEMP_FIX_29*/
{
#ifdef POSTK_DEBUG_TEMP_FIX_29
enum ihk_perfctr_type type;
switch (config) {
case PERF_COUNT_HW_CPU_CYCLES :
type = APT_TYPE_CYCLE;
break;
case PERF_COUNT_HW_INSTRUCTIONS :
type = APT_TYPE_INSTRUCTIONS;
break;
default :
// Not supported config.
type = PERFCTR_MAX_TYPE;
}
#endif /*POSTK_DEBUG_TEMP_FIX_29*/
if (counter < 0 || counter >= NUM_PERF_COUNTERS) {
return -EINVAL;
}
if (type < 0 || type >= PERFCTR_MAX_TYPE) {
return -EINVAL;
}
if (!x86_march_perfmap[type]) {
return -EINVAL;
}
return set_perfctr_x86_direct(counter, mode, x86_march_perfmap[type]);
}
int ihk_mc_perfctr_set_extra(struct mc_perf_event *event)
{
struct thread *thread = cpu_local_var(current);
@ -412,6 +377,23 @@ int ihk_mc_perfctr_read_mask(unsigned long counter_mask, unsigned long *value)
return 0;
}
int ihk_mc_perfctr_alloc(struct thread *thread, struct mc_perf_event *event)
{
int ret = -EINVAL;
int i = 0;
const int counters = ihk_mc_perf_get_num_counters();
// find an available generic counter
for (i = 0; i < counters; i++) {
if (!(thread->pmc_alloc_map & (1 << i))) {
ret = i;
break;
}
}
return ret;
}
unsigned long ihk_mc_perfctr_read(int counter)
{
unsigned long retval = 0;
@ -439,6 +421,14 @@ unsigned long ihk_mc_perfctr_read(int counter)
return retval;
}
unsigned long ihk_mc_perfctr_value(int counter, unsigned long correction)
{
unsigned long count = ihk_mc_perfctr_read(counter) + correction;
count &= 0x000000ffffffffffL;
return count;
}
// read by rdmsr
unsigned long ihk_mc_perfctr_read_msr(int counter)
{
@ -513,3 +503,18 @@ int ihk_mc_perf_get_num_counters(void)
{
return NUM_PERF_COUNTERS;
}
int hw_perf_event_init(struct mc_perf_event *event)
{
return 0;
}
int ihk_mc_event_set_period(struct mc_perf_event *event)
{
return 0;
}
uint64_t ihk_mc_event_update(struct mc_perf_event *event)
{
return 0;
}


@ -16,7 +16,6 @@
*/
#include <ihk/cpu.h>
#include <ihk/debug.h>
#include <cls.h>
#include <cpulocal.h>
#include <syscall.h>
@ -32,7 +31,8 @@
#include <page.h>
#include <limits.h>
#include <syscall.h>
#include <debug.h>
#include <rusage_private.h>
#include <ihk/debug.h>
void terminate_mcexec(int, int);
extern long do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact);
@ -64,7 +64,6 @@ uintptr_t debug_constants[] = {
-1,
};
#ifdef POSTK_DEBUG_ARCH_DEP_52
#define VDSO_MAXPAGES 2
struct vdso {
long busy;
@ -80,8 +79,24 @@ struct vdso {
long hpet_phys;
void *pvti_virt;
long pvti_phys;
void *vgtod_virt;
};
struct vsyscall_gtod_data {
int seq;
struct {
int vclock_mode;
unsigned long cycle_last;
unsigned long mask;
unsigned int mult;
unsigned int shift;
} clock;
/* open coded 'struct timespec' */
time_t wall_time_sec;
unsigned long wall_time_snsec;
};
#endif /*POSTK_DEBUG_ARCH_DEP_52*/
static struct vdso vdso;
static size_t container_size = 0;
@ -132,44 +147,6 @@ int obtain_clone_cpuid(cpu_set_t *cpu_set, int use_last) {
return min_cpu;
}
int
arch_clear_host_user_space()
{
struct thread *th = cpu_local_var(current);
/* XXX: might be unnecessary */
clear_host_pte(th->vm->region.user_start,
(th->vm->region.user_end - th->vm->region.user_start));
return 0;
}
SYSCALL_DECLARE(rt_sigaction)
{
int sig = ihk_mc_syscall_arg0(ctx);
const struct sigaction *act = (const struct sigaction *)ihk_mc_syscall_arg1(ctx);
struct sigaction *oact = (struct sigaction *)ihk_mc_syscall_arg2(ctx);
size_t sigsetsize = ihk_mc_syscall_arg3(ctx);
struct k_sigaction new_sa, old_sa;
int rc;
if (sigsetsize != sizeof(sigset_t))
return -EINVAL;
if(act)
if(copy_from_user(&new_sa.sa, act, sizeof new_sa.sa)){
goto fault;
}
rc = do_sigaction(sig, act? &new_sa: NULL, oact? &old_sa: NULL);
if(rc == 0 && oact)
if(copy_to_user(oact, &old_sa.sa, sizeof old_sa.sa)){
goto fault;
}
return rc;
fault:
return -EFAULT;
}
SYSCALL_DECLARE(prctl)
{
struct process *proc = cpu_local_var(current)->proc;
@ -558,7 +535,7 @@ long ptrace_write_regset(struct thread *thread, long type, struct iovec *iov)
return rc;
}
extern void coredump(struct thread *thread, void *regs);
extern int coredump(struct thread *thread, void *regs, int sig);
void ptrace_report_signal(struct thread *thread, int sig)
{
@ -726,6 +703,7 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
struct mcs_rwlock_node_irqsave lock;
struct mcs_rwlock_node_irqsave mcs_rw_node;
int restart = 0;
int ret;
for(w = pending->sigmask.__val[0], sig = 0; w; sig++, w >>= 1);
dkprintf("do_signal(): tid=%d, pid=%d, sig=%d\n", thread->tid, proc->pid, sig);
@ -971,15 +949,6 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
dkprintf("SIGTRAP(): woken up\n");
break;
case SIGCONT:
memset(&info, '\0', sizeof info);
info.si_signo = SIGCHLD;
info.si_code = CLD_CONTINUED;
info._sifields._sigchld.si_pid = proc->pid;
info._sifields._sigchld.si_status = 0x0000ffff;
do_kill(cpu_local_var(current), proc->parent->pid, -1, SIGCHLD, &info, 0);
proc->main_thread->signal_flags = SIGNAL_STOP_CONTINUED;
proc->status = PS_RUNNING;
dkprintf("do_signal,SIGCONT,do nothing\n");
break;
case SIGQUIT:
case SIGILL:
@ -991,9 +960,31 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
case SIGXCPU:
case SIGXFSZ:
core:
dkprintf("do_signal,default,core,sig=%d\n", sig);
coredump(thread, regs);
coredumped = 0x80;
thread->coredump_regs =
kmalloc(sizeof(struct x86_user_context),
IHK_MC_AP_NOWAIT);
if (!thread->coredump_regs) {
kprintf("%s: Out of memory\n", __func__);
goto skip;
}
memcpy(thread->coredump_regs, regs,
sizeof(struct x86_user_context));
ret = coredump(thread, regs, sig);
switch (ret) {
case -EBUSY:
kprintf("%s: INFO: coredump not performed, try ulimit -c <non-zero>\n",
__func__);
break;
case 0:
coredumped = 0x80;
break;
default:
kprintf("%s: ERROR: coredump failed (%d)\n",
__func__, ret);
break;
}
skip:
terminate(0, sig | coredumped);
break;
case SIGCHLD:
@ -1010,80 +1001,6 @@ out:
return restart;
}
static struct sig_pending *
getsigpending(struct thread *thread, int delflag){
struct list_head *head;
mcs_rwlock_lock_t *lock;
struct mcs_rwlock_node_irqsave mcs_rw_node;
struct sig_pending *next;
struct sig_pending *pending;
__sigset_t w;
__sigset_t x;
int sig;
struct k_sigaction *k;
w = thread->sigmask.__val[0];
lock = &thread->sigcommon->lock;
head = &thread->sigcommon->sigpending;
for(;;) {
if (delflag) {
mcs_rwlock_writer_lock(lock, &mcs_rw_node);
}
else {
mcs_rwlock_reader_lock(lock, &mcs_rw_node);
}
list_for_each_entry_safe(pending, next, head, list){
for(x = pending->sigmask.__val[0], sig = 0; x; sig++, x >>= 1);
k = thread->sigcommon->action + sig - 1;
if(delflag ||
(sig != SIGCHLD && sig != SIGURG) ||
(k->sa.sa_handler != (void *)1 &&
k->sa.sa_handler != NULL)){
if(!(pending->sigmask.__val[0] & w)){
if(delflag)
list_del(&pending->list);
if (delflag) {
mcs_rwlock_writer_unlock(lock, &mcs_rw_node);
}
else {
mcs_rwlock_reader_unlock(lock, &mcs_rw_node);
}
return pending;
}
}
}
if (delflag) {
mcs_rwlock_writer_unlock(lock, &mcs_rw_node);
}
else {
mcs_rwlock_reader_unlock(lock, &mcs_rw_node);
}
if(lock == &thread->sigpendinglock)
return NULL;
lock = &thread->sigpendinglock;
head = &thread->sigpending;
}
return NULL;
}
struct sig_pending *
hassigpending(struct thread *thread)
{
if (list_empty(&thread->sigpending) &&
list_empty(&thread->sigcommon->sigpending)) {
return NULL;
}
return getsigpending(thread, 0);
}
int
interrupt_from_user(void *regs0)
{
@ -1098,170 +1015,6 @@ void save_syscall_return_value(int num, unsigned long rc)
return;
}
/** \brief check arrived signals and processing
*
* @param rc return value of syscall
* @param regs0 context
* @param num syscall number (-1: Not called on exiting system call)
*/
void
check_signal(unsigned long rc, void *regs0, int num)
{
struct x86_user_context *regs = regs0;
struct thread *thread;
struct sig_pending *pending;
int irqstate;
if(clv == NULL)
return;
thread = cpu_local_var(current);
if(thread == NULL || thread == &cpu_local_var(idle)){
struct thread *t;
irqstate = ihk_mc_spinlock_lock(&(cpu_local_var(runq_lock)));
list_for_each_entry(t, &(cpu_local_var(runq)), sched_list){
if(t == &cpu_local_var(idle))
continue;
if(t->status == PS_INTERRUPTIBLE &&
hassigpending(t)){
t->status = PS_RUNNING;
break;
}
}
ihk_mc_spinlock_unlock(&(cpu_local_var(runq_lock)), irqstate);
goto out;
}
if(regs != NULL && !interrupt_from_user(regs)) {
goto out;
}
if (list_empty(&thread->sigpending) &&
list_empty(&thread->sigcommon->sigpending)) {
goto out;
}
for(;;){
pending = getsigpending(thread, 1);
if(!pending) {
dkprintf("check_signal,queue is empty\n");
goto out;
}
if (do_signal(rc, regs, thread, pending, num)) {
num = -1;
}
}
out:
return;
}
static int
check_sig_pending_thread(struct thread *thread)
{
int found = 0;
struct list_head *head;
mcs_rwlock_lock_t *lock;
struct mcs_rwlock_node_irqsave mcs_rw_node;
struct sig_pending *next;
struct sig_pending *pending;
__sigset_t w;
__sigset_t x;
int sig = 0;
struct k_sigaction *k;
struct cpu_local_var *v;
v = get_this_cpu_local_var();
w = thread->sigmask.__val[0];
lock = &thread->sigcommon->lock;
head = &thread->sigcommon->sigpending;
for (;;) {
mcs_rwlock_reader_lock(lock, &mcs_rw_node);
list_for_each_entry_safe(pending, next, head, list){
for (x = pending->sigmask.__val[0], sig = 0; x;
sig++, x >>= 1);
k = thread->sigcommon->action + sig - 1;
if ((sig != SIGCHLD && sig != SIGURG) ||
(k->sa.sa_handler != (void *)1 &&
k->sa.sa_handler != NULL)) {
if (!(pending->sigmask.__val[0] & w)) {
if (pending->interrupted == 0) {
pending->interrupted = 1;
found = 1;
if (sig != SIGCHLD &&
sig != SIGURG &&
!k->sa.sa_handler) {
found = 2;
break;
}
}
}
}
}
mcs_rwlock_reader_unlock(lock, &mcs_rw_node);
if (found == 2) {
break;
}
if (lock == &thread->sigpendinglock) {
break;
}
lock = &thread->sigpendinglock;
head = &thread->sigpending;
}
if (found == 2) {
ihk_mc_spinlock_unlock(&v->runq_lock, v->runq_irqstate);
terminate_mcexec(0, sig);
return 1;
}
else if (found == 1) {
ihk_mc_spinlock_unlock(&v->runq_lock, v->runq_irqstate);
interrupt_syscall(thread, 0);
return 1;
}
return 0;
}
void
check_sig_pending(void)
{
struct thread *thread;
struct cpu_local_var *v;
if (clv == NULL)
return;
v = get_this_cpu_local_var();
repeat:
v->runq_irqstate = ihk_mc_spinlock_lock(&v->runq_lock);
list_for_each_entry(thread, &(v->runq), sched_list) {
if (thread == NULL || thread == &cpu_local_var(idle)) {
continue;
}
if (thread->in_syscall_offload == 0) {
continue;
}
if (thread->proc->group_exit_status & 0x0000000100000000L) {
continue;
}
if (check_sig_pending_thread(thread))
goto repeat;
}
ihk_mc_spinlock_unlock(&v->runq_lock, v->runq_irqstate);
}
unsigned long
do_kill(struct thread *thread, int pid, int tid, int sig, siginfo_t *info,
int ptracecont)
@ -1278,7 +1031,6 @@ do_kill(struct thread *thread, int pid, int tid, int sig, siginfo_t *info,
struct list_head *head = NULL;
int rc;
unsigned long irqstate = 0;
struct k_sigaction *k;
int doint;
int found = 0;
siginfo_t info0;
@ -1288,6 +1040,7 @@ do_kill(struct thread *thread, int pid, int tid, int sig, siginfo_t *info,
struct process_hash *phash = rset->process_hash;
struct mcs_rwlock_node lock;
struct mcs_rwlock_node updatelock;
struct sig_pending *pending = NULL;
if(sig > 64 || sig < 0)
return -EINVAL;
@ -1509,54 +1262,70 @@ done:
mcs_rwlock_writer_lock_noirq(savelock, &mcs_rw_node);
/* Queue the signal event even when the handler is SIG_IGN or
SIG_DFL, because a ptraced target thread must still call
ptrace_report_signal in check_signal */
rc = 0;
k = tthread->sigcommon->action + sig - 1;
if ((sig != SIGKILL && (tthread->ptrace & PT_TRACED)) ||
(k->sa.sa_handler != (void *)1 &&
(k->sa.sa_handler != NULL ||
(sig != SIGCHLD && sig != SIGURG)))) {
struct sig_pending *pending = NULL;
if (sig < 33) { // SIGRTMIN - SIGRTMAX
list_for_each_entry(pending, head, list){
if(pending->sigmask.__val[0] == mask &&
pending->ptracecont == ptracecont)
break;
}
if(&pending->list == head)
pending = NULL;
if (sig < 33) { // SIGRTMIN - SIGRTMAX
list_for_each_entry(pending, head, list) {
if (pending->sigmask.__val[0] == mask &&
pending->ptracecont == ptracecont)
break;
}
if(pending == NULL){
doint = 1;
pending = kmalloc(sizeof(struct sig_pending), IHK_MC_AP_NOWAIT);
if(!pending){
rc = -ENOMEM;
}
else{
memset(pending, 0, sizeof(struct sig_pending));
pending->sigmask.__val[0] = mask;
memcpy(&pending->info, info, sizeof(siginfo_t));
pending->ptracecont = ptracecont;
if(sig == SIGKILL || sig == SIGSTOP)
list_add(&pending->list, head);
else
list_add_tail(&pending->list, head);
tthread->sigevent = 1;
}
if (&pending->list == head)
pending = NULL;
}
if (pending == NULL) {
doint = 1;
pending = kmalloc(sizeof(struct sig_pending), IHK_MC_AP_NOWAIT);
if (!pending) {
rc = -ENOMEM;
}
else {
memset(pending, 0, sizeof(struct sig_pending));
pending->sigmask.__val[0] = mask;
memcpy(&pending->info, info, sizeof(siginfo_t));
pending->ptracecont = ptracecont;
if (sig == SIGKILL || sig == SIGSTOP)
list_add(&pending->list, head);
else
list_add_tail(&pending->list, head);
tthread->sigevent = 1;
}
}
mcs_rwlock_writer_unlock_noirq(savelock, &mcs_rw_node);
cpu_restore_interrupt(irqstate);
if (sig == SIGCONT || ptracecont == 1) {
/* Wake up the target only when stopped by SIGSTOP */
if (sched_wakeup_thread(tthread, PS_STOPPED) == 0) {
struct siginfo info;
tthread->proc->main_thread->signal_flags =
SIGNAL_STOP_CONTINUED;
tthread->proc->status = PS_RUNNING;
memset(&info, '\0', sizeof(info));
info.si_signo = SIGCHLD;
info.si_code = CLD_CONTINUED;
info._sifields._sigchld.si_pid = tthread->proc->pid;
info._sifields._sigchld.si_status = 0x0000ffff;
do_kill(tthread, tthread->proc->parent->pid, -1,
SIGCHLD, &info, 0);
tthread->proc->status = PS_RUNNING;
if (thread != tthread) {
ihk_mc_interrupt_cpu(tthread->cpu_id,
ihk_mc_get_vector(IHK_GV_IKC));
}
doint = 0;
}
}
if (doint && !(mask & tthread->sigmask.__val[0])) {
int status = tthread->status;
if (thread != tthread) {
dkprintf("do_kill,ipi,pid=%d,cpu_id=%d\n",
tproc->pid, tthread->cpu_id);
ihk_mc_interrupt_cpu(get_x86_cpu_local_variable(tthread->cpu_id)->apic_id, 0xd0);
ihk_mc_interrupt_cpu(tthread->cpu_id,
ihk_mc_get_vector(IHK_GV_IKC));
}
if (status != PS_RUNNING) {
@ -1564,11 +1333,6 @@ done:
/* Wake up the target only when stopped by ptrace-reporting */
sched_wakeup_thread(tthread, PS_TRACED | PS_STOPPED | PS_INTERRUPTIBLE);
}
else if(sig == SIGCONT || ptracecont == 1){
/* Wake up the target only when stopped by SIGSTOP */
sched_wakeup_thread(tthread, PS_STOPPED);
tthread->proc->status = PS_RUNNING;
}
else {
sched_wakeup_thread(tthread, PS_INTERRUPTIBLE);
}
@ -1593,7 +1357,7 @@ set_signal(int sig, void *regs0, siginfo_t *info)
}
if ((__sigmask(sig) & thread->sigmask.__val[0])) {
coredump(thread, regs0);
coredump(thread, regs0, sig);
terminate(0, sig | 0x80);
}
do_kill(thread, thread->proc->pid, thread->tid, sig, info, 0);
@ -1629,7 +1393,7 @@ SYSCALL_DECLARE(mmap)
;
const uintptr_t addr0 = ihk_mc_syscall_arg0(ctx);
const size_t len0 = ihk_mc_syscall_arg1(ctx);
size_t len0 = ihk_mc_syscall_arg1(ctx);
const int prot = ihk_mc_syscall_arg2(ctx);
const int flags0 = ihk_mc_syscall_arg3(ctx);
const int fd = ihk_mc_syscall_arg4(ctx);
@ -1668,7 +1432,9 @@ SYSCALL_DECLARE(mmap)
if (flags & MAP_HUGETLB) {
switch (flags & (0x3F << MAP_HUGE_SHIFT)) {
case 0:
flags |= MAP_HUGE_2MB; /* default hugepage size */
/* default hugepage size */
flags |= ihk_mc_get_linux_default_huge_page_shift() <<
MAP_HUGE_SHIFT;
break;
case MAP_HUGE_2MB:
@ -1684,16 +1450,29 @@ SYSCALL_DECLARE(mmap)
}
pgsize = (size_t)1 << ((flags >> MAP_HUGE_SHIFT) & 0x3F);
/* Round-up map length by pagesize */
len0 = ALIGN(len0, pgsize);
if (rusage_check_overmap(len0,
(flags >> MAP_HUGE_SHIFT) & 0x3F)) {
error = -ENOMEM;
goto out;
}
}
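
For reference, the hugepage size travels inside the flags word itself: six bits starting at MAP_HUGE_SHIFT hold log2 of the page size, which is why the code above recovers pgsize with a shift and a 0x3F mask. A small standalone sketch of the encode/decode round trip (the constants match Linux's uapi values; this is not McKernel code):

#include <stdio.h>

#define MAP_HUGE_SHIFT 26
#define MAP_HUGE_MASK  0x3f

int main(void)
{
	/* Encode: log2(2 MiB) = 21 goes into the size bits of the flags */
	int flags = (21 & MAP_HUGE_MASK) << MAP_HUGE_SHIFT;

	/* Decode, exactly as sys_mmap() does above */
	size_t pgsize = (size_t)1 << ((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);

	/* Round a length up to the page size, as the ALIGN() call does */
	size_t len = (4096 + pgsize - 1) & ~(pgsize - 1);

	printf("pgsize=%zu len=%zu\n", pgsize, len);  /* 2097152 2097152 */
	return 0;
}
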
#define VALID_DUMMY_ADDR ((region->user_start + PTL3_SIZE - 1) & ~(PTL3_SIZE - 1))
addr = (flags & MAP_FIXED)? addr0: VALID_DUMMY_ADDR;
addr = addr0;
len = (len0 + pgsize - 1) & ~(pgsize - 1);
recheck:
if ((addr & (pgsize - 1))
|| (len == 0)
|| !(flags & (MAP_SHARED | MAP_PRIVATE))
|| ((flags & MAP_SHARED) && (flags & MAP_PRIVATE))
|| (off0 & (pgsize - 1))) {
if (!(flags & MAP_FIXED) && addr != VALID_DUMMY_ADDR) {
addr = VALID_DUMMY_ADDR;
goto recheck;
}
ekprintf("sys_mmap(%lx,%lx,%x,%x,%x,%lx):EINVAL\n",
addr0, len0, prot, flags0, fd, off0);
error = -EINVAL;
@ -1703,6 +1482,10 @@ SYSCALL_DECLARE(mmap)
if (addr < region->user_start
|| region->user_end <= addr
|| len > (region->user_end - region->user_start)) {
if (!(flags & MAP_FIXED) && addr != VALID_DUMMY_ADDR) {
addr = VALID_DUMMY_ADDR;
goto recheck;
}
ekprintf("sys_mmap(%lx,%lx,%x,%x,%x,%lx):ENOMEM\n",
addr0, len0, prot, flags0, fd, off0);
error = -ENOMEM;
@ -1730,10 +1513,20 @@ out:
SYSCALL_DECLARE(clone)
{
return do_fork((int)ihk_mc_syscall_arg0(ctx), ihk_mc_syscall_arg1(ctx),
struct process *proc = cpu_local_var(current)->proc;
struct mcs_rwlock_node_irqsave lock_dump;
unsigned long ret;
/* mutex coredump */
mcs_rwlock_reader_lock(&proc->coredump_lock, &lock_dump);
ret = do_fork((int)ihk_mc_syscall_arg0(ctx), ihk_mc_syscall_arg1(ctx),
ihk_mc_syscall_arg2(ctx), ihk_mc_syscall_arg3(ctx),
ihk_mc_syscall_arg4(ctx), ihk_mc_syscall_pc(ctx),
ihk_mc_syscall_sp(ctx));
mcs_rwlock_reader_unlock(&proc->coredump_lock, &lock_dump);
return ret;
}
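
The reader lock around do_fork() lets any number of clone() calls proceed concurrently while still excluding a core dump, which presumably takes coredump_lock as a writer so it sees a stable set of threads (the writer side is not shown in this hunk). A userspace analogue of the pairing with a pthread rwlock:

#include <pthread.h>

static pthread_rwlock_t coredump_lock = PTHREAD_RWLOCK_INITIALIZER;

/* clone()/fork() side: readers do not block each other */
void fork_side(void)
{
	pthread_rwlock_rdlock(&coredump_lock);
	/* ... create the new thread; a dump cannot start meanwhile ... */
	pthread_rwlock_unlock(&coredump_lock);
}

/* coredump side: the writer excludes all thread creation while dumping */
void coredump_side(void)
{
	pthread_rwlock_wrlock(&coredump_lock);
	/* ... walk a now-stable thread list and write the core ... */
	pthread_rwlock_unlock(&coredump_lock);
}
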
SYSCALL_DECLARE(fork)
@ -1761,7 +1554,9 @@ SYSCALL_DECLARE(shmget)
int hugeshift = shmflg & (0x3F << SHM_HUGE_SHIFT);
if (hugeshift == 0) {
shmflg |= SHM_HUGE_2MB; /* default hugepage size */
/* default hugepage size */
shmflg |= ihk_mc_get_linux_default_huge_page_shift() <<
MAP_HUGE_SHIFT;
} else if (hugeshift == SHM_HUGE_2MB ||
hugeshift == SHM_HUGE_1GB) {
/*nop*/
@ -2210,7 +2005,7 @@ int do_process_vm_read_writev(int pid,
}
/* Check if parameters are okay */
ihk_mc_spinlock_lock_noirq(&lthread->vm->memory_range_lock);
ihk_rwspinlock_read_lock_noirq(&lthread->vm->memory_range_lock);
range = lookup_process_memory_range(lthread->vm,
(uintptr_t)local_iov,
@ -2232,7 +2027,7 @@ int do_process_vm_read_writev(int pid,
ret = 0;
arg_out:
ihk_mc_spinlock_unlock_noirq(&lthread->vm->memory_range_lock);
ihk_rwspinlock_read_unlock_noirq(&lthread->vm->memory_range_lock);
if (ret != 0) {
goto out;
@ -2301,7 +2096,7 @@ arg_out:
if (pli != li) {
struct vm_range *range;
ihk_mc_spinlock_lock_noirq(&lthread->vm->memory_range_lock);
ihk_rwspinlock_read_lock_noirq(&lthread->vm->memory_range_lock);
/* Is base valid? */
range = lookup_process_memory_range(lthread->vm,
@ -2331,7 +2126,7 @@ arg_out:
ret = 0;
pli_out:
ihk_mc_spinlock_unlock_noirq(&lthread->vm->memory_range_lock);
ihk_rwspinlock_read_unlock_noirq(&lthread->vm->memory_range_lock);
if (ret != 0) {
goto out;
@ -2344,7 +2139,7 @@ pli_out:
if (pri != ri) {
struct vm_range *range;
ihk_mc_spinlock_lock_noirq(&rvm->memory_range_lock);
ihk_rwspinlock_read_lock_noirq(&rvm->memory_range_lock);
/* Is base valid? */
range = lookup_process_memory_range(rvm,
@ -2374,7 +2169,7 @@ pli_out:
ret = 0;
pri_out:
ihk_mc_spinlock_unlock_noirq(&rvm->memory_range_lock);
ihk_rwspinlock_read_unlock_noirq(&rvm->memory_range_lock);
if (ret != 0) {
goto out;
@ -2811,4 +2606,46 @@ time_t time(void) {
return ret;
}
void calculate_time_from_tsc(struct timespec *ts)
{
unsigned long seq;
unsigned long seq2;
unsigned long ns;
unsigned long delta;
struct vsyscall_gtod_data *gtod = vdso.vgtod_virt;
do {
for (;;) {
seq = ACCESS_ONCE(gtod->seq);
if (unlikely(seq & 1)) {
cpu_pause();
continue;
}
break;
}
rmb(); /* fetch sequence before time */
ts->tv_sec = gtod->wall_time_sec;
ns = gtod->wall_time_snsec;
delta = rdtsc() - gtod->clock.cycle_last;
ns += delta * gtod->clock.mult;
ns >>= gtod->clock.shift;
seq2 = ACCESS_ONCE(gtod->seq);
rmb(); /* fetch time before checking sequence */
} while (seq != seq2);
ts->tv_nsec = ns;
if (ts->tv_nsec >= NS_PER_SEC) {
ts->tv_nsec -= NS_PER_SEC;
++ts->tv_sec;
}
}
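
calculate_time_from_tsc() is a seqcount reader: it spins while the sequence is odd (a writer is mid-update), reads the gtod fields, then re-reads the sequence and retries if it moved, so it never returns a torn time value. A compact C11 analogue of the same loop (illustrative types; a production version would also make the payload reads atomic):

#include <stdatomic.h>

struct gtod_data { _Atomic unsigned long seq; long sec; long nsec; };

void read_time(struct gtod_data *g, long *sec, long *nsec)
{
	unsigned long s1, s2;

	do {
		while ((s1 = atomic_load(&g->seq)) & 1)
			;	/* writer active: wait for an even sequence */
		atomic_thread_fence(memory_order_acquire);	/* rmb() */
		*sec  = g->sec;
		*nsec = g->nsec;
		atomic_thread_fence(memory_order_acquire);	/* rmb() */
		s2 = atomic_load(&g->seq);
	} while (s1 != s2);	/* sequence moved: the reads raced a writer */
}
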
extern void ptrace_syscall_event(struct thread *thread);
long arch_ptrace_syscall_event(struct thread *thread,
ihk_mc_user_context_t *ctx, long setret)
{
ihk_mc_syscall_ret(ctx) = setret;
ptrace_syscall_event(thread);
return ihk_mc_syscall_ret(ctx);
}
/*** End of File ***/

@ -1,10 +0,0 @@
[Unit]
Description=irqbalance daemon
After=syslog.target
[Service]
EnvironmentFile=/tmp/irqbalance_mck
ExecStart=/usr/sbin/irqbalance --foreground $IRQBALANCE_ARGS
[Install]
WantedBy=multi-user.target

@ -1,150 +0,0 @@
# mcoverlay-create-smp-x86.sh.in COPYRIGHT FUJITSU LIMITED 2018
# Overlay /proc, /sys with McKernel specific contents
#
# Revert any state that has been initialized before the error occurred.
#
if [ -z "$(declare -f error_exit)" ]; then
error_exit() {
local status=$1
case $status in
mcos_sys_mounted)
if [ "$enable_mcoverlay" == "yes" ]; then
umount /tmp/mcos/mcos0_sys
fi
;&
mcos_proc_mounted)
if [ "$enable_mcoverlay" == "yes" ]; then
umount /tmp/mcos/mcos0_proc
fi
;&
mcoverlayfs_loaded)
if [ "$enable_mcoverlay" == "yes" ]; then
rmmod mcoverlay 2>/dev/null
fi
;&
linux_proc_bind_mounted)
if [ "$enable_mcoverlay" == "yes" ]; then
umount /tmp/mcos/linux_proc
fi
;&
tmp_mcos_mounted)
if [ "$enable_mcoverlay" == "yes" ]; then
umount /tmp/mcos
fi
;&
tmp_mcos_created)
if [ "$enable_mcoverlay" == "yes" ]; then
rm -rf /tmp/mcos
fi
;&
initial)
# Nothing more to revert
;;
esac
# Return -EINVAL
exit -22
}
fi
if [ ! -e /tmp/mcos ]; then
mkdir -p /tmp/mcos;
fi
if ! mount -t tmpfs tmpfs /tmp/mcos; then
echo "error: mount /tmp/mcos" >&2
error_exit "tmp_mcos_created"
fi
if [ ! -e /tmp/mcos/linux_proc ]; then
mkdir -p /tmp/mcos/linux_proc;
fi
if ! mount --bind /proc /tmp/mcos/linux_proc; then
echo "error: mount /tmp/mcos/linux_proc" >&2
error_exit "tmp_mcos_mounted"
fi
if ! taskset -c 0 insmod @KMODDIR@/mcoverlay.ko 2>/dev/null; then
echo "error: inserting mcoverlay.ko" >&2
error_exit "linux_proc_bind_mounted"
fi
while [ ! -e /proc/mcos0 ]
do
sleep 0.1
done
if [ ! -e /tmp/mcos/mcos0_proc ]; then
mkdir -p /tmp/mcos/mcos0_proc;
fi
if [ ! -e /tmp/mcos/mcos0_proc_upper ]; then
mkdir -p /tmp/mcos/mcos0_proc_upper;
fi
if [ ! -e /tmp/mcos/mcos0_proc_work ]; then
mkdir -p /tmp/mcos/mcos0_proc_work;
fi
if ! mount -t mcoverlay mcoverlay -o lowerdir=/proc/mcos0:/proc,upperdir=/tmp/mcos/mcos0_proc_upper,workdir=/tmp/mcos/mcos0_proc_work,nocopyupw,nofscheck /tmp/mcos/mcos0_proc; then
echo "error: mounting /tmp/mcos/mcos0_proc" >&2
error_exit "mcoverlayfs_loaded"
fi
# TODO: How do we revert this in case of failure?
mount --make-rprivate /proc
while [ ! -e /sys/devices/virtual/mcos/mcos0/sys/setup_complete ]
do
sleep 0.1
done
if [ ! -e /tmp/mcos/mcos0_sys ]; then
mkdir -p /tmp/mcos/mcos0_sys;
fi
if [ ! -e /tmp/mcos/mcos0_sys_upper ]; then
mkdir -p /tmp/mcos/mcos0_sys_upper;
fi
if [ ! -e /tmp/mcos/mcos0_sys_work ]; then
mkdir -p /tmp/mcos/mcos0_sys_work;
fi
if ! mount -t mcoverlay mcoverlay -o lowerdir=/sys/devices/virtual/mcos/mcos0/sys:/sys,upperdir=/tmp/mcos/mcos0_sys_upper,workdir=/tmp/mcos/mcos0_sys_work,nocopyupw,nofscheck /tmp/mcos/mcos0_sys; then
echo "error: mount /tmp/mcos/mcos0_sys" >&2
error_exit "mcos_proc_mounted"
fi
# TODO: How do we revert this in case of failure?
mount --make-rprivate /sys
touch /tmp/mcos/mcos0_proc/mckernel
rm -rf /tmp/mcos/mcos0_sys/setup_complete
# Hide NUMA related files which are outside the LWK partition
for cpuid in `find /sys/devices/system/cpu/* -maxdepth 0 -name "cpu[0123456789]*" -printf "%f "`; do
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/devices/system/cpu/$cpuid" ]; then
rm -rf /tmp/mcos/mcos0_sys/devices/system/cpu/$cpuid
rm -rf /tmp/mcos/mcos0_sys/bus/cpu/devices/$cpuid
rm -rf /tmp/mcos/mcos0_sys/bus/cpu/drivers/processor/$cpuid
else
for nodeid in `find /sys/devices/system/cpu/$cpuid/* -maxdepth 0 -name "node[0123456789]*" -printf "%f "`; do
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/devices/system/cpu/$cpuid/$nodeid" ]; then
rm -f /tmp/mcos/mcos0_sys/devices/system/cpu/$cpuid/$nodeid
fi
done
fi
done
for nodeid in `find /sys/devices/system/node/* -maxdepth 0 -name "node[0123456789]*" -printf "%f "`; do
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/devices/system/node/$nodeid" ]; then
rm -rf /tmp/mcos/mcos0_sys/devices/system/node/$nodeid/*
rm -rf /tmp/mcos/mcos0_sys/bus/node/devices/$nodeid
else
# Delete non-existent symlinks
for cpuid in `find /sys/devices/system/node/$nodeid/* -maxdepth 0 -name "cpu[0123456789]*" -printf "%f "`; do
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/devices/system/node/$nodeid/$cpuid" ]; then
rm -f /tmp/mcos/mcos0_sys/devices/system/node/$nodeid/$cpuid
fi
done
rm -f /tmp/mcos/mcos0_sys/devices/system/node/$nodeid/memory*
fi
done
rm -f /tmp/mcos/mcos0_sys/devices/system/node/has_*
for cpuid in `find /sys/bus/cpu/devices/* -maxdepth 0 -name "cpu[0123456789]*" -printf "%f "`; do
if [ ! -e "/sys/devices/virtual/mcos/mcos0/sys/bus/cpu/devices/$cpuid" ]; then
rm -rf /tmp/mcos/mcos0_sys/bus/cpu/devices/$cpuid
fi
done
exit 0

@ -1,16 +0,0 @@
# Remove mcoverlay if loaded
if grep mcoverlay /proc/modules &>/dev/null; then
if [ "`cat /proc/mounts | grep /tmp/mcos/mcos0_sys`" != "" ]; then umount -l /tmp/mcos/mcos0_sys; fi
if [ "`cat /proc/mounts | grep /tmp/mcos/mcos0_proc`" != "" ]; then umount -l /tmp/mcos/mcos0_proc; fi
if [ "`cat /proc/mounts | grep /tmp/mcos/linux_proc`" != "" ]; then umount -l /tmp/mcos/linux_proc; fi
if [ "`cat /proc/mounts | grep /tmp/mcos`" != "" ]; then umount -l /tmp/mcos; fi
if [ -e /tmp/mcos ]; then rm -rf /tmp/mcos; fi
if ! rmmod mcoverlay 2>/dev/null; then
echo "error: removing mcoverlay" >&2
# Return -EINVAL
exit -22
fi
fi
exit 0

@ -0,0 +1,383 @@
# Helper functions for translating autoconf projects. Several functions
# are lifted from the Mono sources
include (CheckCSourceCompiles)
include (CheckIncludeFile)
include (TestBigEndian)
include (CheckFunctionExists)
include (CheckTypeSize)
include (CheckCSourceRuns)
# Function to get the version information from the configure.ac file in the
# current directory. Its argument is the name of the library as passed to
# AC_INIT. It will set the variables ${LIBNAME}_VERSION and ${LIBNAME}_SOVERSION
function (ac_get_version libname)
string(TOUPPER "${libname}" libname_upper)
# Read the relevant content from configure.ac
file (STRINGS configure.ac tmp_configure_ac
REGEX "${libname_upper}_[_A-Z]+=[ \\t]*[0-9]+")
# Product version
string (REGEX REPLACE ".+MAJOR[_A-Z]+=([0-9]+).+MINOR[_A-Z]+=([0-9]+).+MICRO[_A-Z]+=([0-9]+).*"
"\\1.\\2.\\3" ${libname_upper}_VERSION "${tmp_configure_ac}")
# Library version for libtool
string (REGEX REPLACE ".+CURRENT=([0-9]+).+REVISION=([0-9]+).+AGE=([0-9]+).*"
"\\1.\\2.\\3" ${libname_upper}_SOVERSION "${tmp_configure_ac}")
# Checks if the string needs to be displayed
set (${libname_upper}_DISPLAYSTR_AUX
"Found ${libname} version ${${libname_upper}_VERSION}, soversion ${${libname_upper}_SOVERSION} from configure.ac"
)
if ((NOT ${libname_upper}_DISPLAYSTR) OR (NOT ${libname_upper}_DISPLAYSTR STREQUAL ${libname_upper}_DISPLAYSTR_AUX))
set (${libname_upper}_DISPLAYSTR ${${libname_upper}_DISPLAYSTR_AUX}
CACHE INTERNAL "Version string from ${libname}" FORCE)
message (STATUS ${${libname_upper}_DISPLAYSTR})
endif ()
# Export the result to the caller
set(${libname_upper}_VERSION "${${libname_upper}_VERSION}" PARENT_SCOPE)
set(${libname_upper}_SOVERSION "${${libname_upper}_SOVERSION}" PARENT_SCOPE)
endfunction()
# Also from mono's source code
# Implementation of AC_CHECK_HEADERS
# In addition, it also records the list of variables in the variable
# 'autoheader_vars', and for each variable, a documentation string in the
# variable ${var}_doc
function(ac_check_headers)
foreach (header ${ARGV})
string(TOUPPER ${header} header_var)
string(REPLACE "." "_" header_var ${header_var})
string(REPLACE "/" "_" header_var ${header_var})
set(header_var "HAVE_${header_var}")
check_include_file (${header} ${header_var})
set("${header_var}_doc" "Define to 1 if you have the <${header}> header file." PARENT_SCOPE)
if (${header_var})
set("${header_var}_defined" "1" PARENT_SCOPE)
endif()
set("${header_var}_val" "1" PARENT_SCOPE)
set (autoheader_vars ${autoheader_vars} ${header_var})
endforeach()
set (autoheader_vars ${autoheader_vars} PARENT_SCOPE)
endfunction()
# Function taken from mono's source code
function (ac_check_funcs)
foreach (func ${ARGV})
string(TOUPPER ${func} var)
set(var "HAVE_${var}")
set(${var})
check_function_exists (${func} ${var})
set("${var}_doc" "Define to 1 if you have the '${func}' function." PARENT_SCOPE)
if (${var})
set("${var}_defined" "1" PARENT_SCOPE)
set(${var} yes PARENT_SCOPE)
endif()
set("${var}_val" "1" PARENT_SCOPE)
set (autoheader_vars ${autoheader_vars} ${var})
endforeach()
set (autoheader_vars ${autoheader_vars} PARENT_SCOPE)
endfunction()
# Specifically, this macro checks for 'stdlib.h', 'stdarg.h',
# 'string.h', and 'float.h'; if the system has those, it probably
# has the rest of the ANSI C header files. This macro also checks
# whether 'string.h' declares 'memchr' (and thus presumably the
# other 'mem' functions), whether 'stdlib.h' declares 'free' (and
# thus presumably 'malloc' and other related functions), and whether
# the 'ctype.h' macros work on characters with the high bit set, as
# ANSI C requires.
function (ac_header_stdc)
if (STDC_HEADERS)
return()
endif()
message(STATUS "Looking for ANSI-C headers")
set(code "
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>
#include <float.h>
int main(int argc, char **argv)
{
void *ptr;
free((void*)1);
ptr = memchr((void*)1, 0, 0);
return (int)ptr;
}
")
# FIXME Check the ctype.h high bit
CHECK_C_SOURCE_COMPILES("${code}" STDC_HEADERS)
if (STDC_HEADERS)
set(STDC_HEADERS 1 PARENT_SCOPE)
message(STATUS "Looking for ANSI-C headers - found")
else()
message(STATUS "Looking for ANSI-C headers - not found")
endif()
endfunction()
# Also from the mono sources, kind of implements AC_SYS_LARGEFILE
function (ac_sys_largefile)
CHECK_C_SOURCE_RUNS("
#include <sys/types.h>
#define BIG_OFF_T (((off_t)1<<62)-1+((off_t)1<<62))
int main (int argc, char **argv) {
int big_off_t=((BIG_OFF_T%2147483629==721) &&
(BIG_OFF_T%2147483647==1));
return big_off_t ? 0 : 1;
}
" HAVE_LARGE_FILE_SUPPORT)
# Check if it makes sense to define _LARGE_FILES or _FILE_OFFSET_BITS
if (HAVE_LARGE_FILE_SUPPORT)
return()
endif()
set (_LARGE_FILE_EXTRA_SRC "
#include <sys/types.h>
int main (int argc, char **argv) {
return sizeof(off_t) == 8 ? 0 : 1;
}
")
CHECK_C_SOURCE_RUNS ("#define _LARGE_FILES\n${_LARGE_FILE_EXTRA_SRC}"
HAVE_USEFUL_D_LARGE_FILES)
if (NOT HAVE_USEFUL_D_LARGE_FILES)
if (NOT DEFINED HAVE_USEFUL_D_FILE_OFFSET_BITS)
set (SHOW_LARGE_FILE_WARNING TRUE)
endif ()
CHECK_C_SOURCE_RUNS ("#define _FILE_OFFSET_BITS 64\n${_LARGE_FILE_EXTRA_SRC}"
HAVE_USEFUL_D_FILE_OFFSET_BITS)
if (HAVE_USEFUL_D_FILE_OFFSET_BITS)
set (_FILE_OFFSET_BITS 64 PARENT_SCOPE)
elseif (SHOW_LARGE_FILE_WARNING)
message (WARNING "No 64 bit file support through off_t available.")
endif ()
else ()
set (_LARGE_FILES 1 PARENT_SCOPE)
endif ()
endfunction ()
# Quick way to set some basic variables
# FIXME add support for variable number of arguments: only package and version are mandatory
# arguments are package version bug_report tarname url
function (ac_init)
set(package ${ARGV0})
set(version ${ARGV1})
set(bug_report ${ARGV2})
set(tarname ${ARGV3})
set(url ${ARGV4})
set(PACKAGE_NAME "\"${package}\"" PARENT_SCOPE)
set(PACKAGE_VERSION "\"${version}\"" PARENT_SCOPE)
set(VERSION "\"${version}\"" PARENT_SCOPE)
if(version)
set(PACKAGE_STRING "\"${package} ${version}\"" PARENT_SCOPE)
else()
set(PACKAGE_STRING "\"${package}\"" PARENT_SCOPE)
endif()
set(PACKAGE_BUGREPORT "\"${bug_report}\"" PARENT_SCOPE)
if(NOT tarname)
string(REGEX REPLACE "[^a-zA-Z0-9_]" "-" tarname "${package}")
endif()
set(PACKAGE_TARNAME "\"${tarname}\"" PARENT_SCOPE)
set(PACKAGE_URL "\"${url}\"" PARENT_SCOPE)
endfunction()
# Checks for the const keyword, defining "HAS_CONST_SUPPORT"
# If it does not have support, defines "const" to 0 in the parent scope
function (ac_c_const)
CHECK_C_SOURCE_COMPILES(
"int main(int argc, char **argv){const int r = 0;return r;}"
HAS_CONST_SUPPORT)
if (NOT HAS_CONST_SUPPORT)
set(const 0 PARENT_SCOPE)
endif()
endfunction()
# Inline keyword support. Defines "inline" in the parent scope to the
# compiler internal keyword for inline in C
# TODO write a better test!
function (ac_c_inline)
if (MSVC)
set (inline __inline)
elseif(CMAKE_COMPILER_IS_GNUCC)
set (inline __inline__)
endif()
set(inline "${inline}" PARENT_SCOPE)
endfunction()
# Test if you can safely include both <sys/time.h> and <time.h>
function (ac_header_time)
CHECK_C_SOURCE_COMPILES(
"#include <sys/time.h>\n#include <time.h>\nint main(int argc, char **argv) { return 0; }"
TIME_WITH_SYS_TIME)
set(TIME_WITH_SYS_TIME ${TIME_WITH_SYS_TIME} PARENT_SCOPE)
endfunction()
# Native cpu byte order: 1 if big-endian (Motorola) or 0 if little-endian
# (Intel), setting "WORDS_BIGENDIAN" to 1 if big endian
function (ac_c_bigendian)
TEST_BIG_ENDIAN(HOST_BIGENDIAN)
if (HOST_BIGENDIAN)
set(WORDS_BIGENDIAN 1 PARENT_SCOPE)
endif()
endfunction()
# Check for off_t, setting "off_t" in the parent scope
function(ac_type_off_t)
CHECK_TYPE_SIZE("off_t" SIZEOF_OFF_T)
if (NOT SIZEOF_OFF_T)
set(off_t "long int")
endif()
set(off_t ${off_t} PARENT_SCOPE)
endfunction()
# Check for size_t, setting "size_t" in the parent scope
function(ac_type_size_t)
CHECK_TYPE_SIZE("size_t" SIZEOF_SIZE_T)
if (NOT SIZEOF_SIZE_T)
set(size_t "unsigned int")
endif()
set(size_t ${size_t} PARENT_SCOPE)
endfunction()
# Define "TM_IN_SYS_TIME" to 1 if <sys/time.h> declares "struct tm"
function(ac_struct_tm)
CHECK_C_SOURCE_COMPILES(
"#include <sys/time.h>\nint main(int argc, char **argv) { struct tm x; return 0; }"
TM_IN_SYS_TIME
)
if (TM_IN_SYS_TIME)
set (TM_IN_SYS_TIME 1 PARENT_SCOPE)
endif()
endfunction()
# Obtain the size of a 'type' and define it as SIZEOF_TYPE
function (ac_check_sizeof typename)
string(TOUPPER "SIZEOF_${typename}" varname)
string(REPLACE " " "_" varname "${varname}")
string(REPLACE "*" "p" varname "${varname}")
CHECK_TYPE_SIZE("${typename}" ${varname} BUILTIN_TYPES_ONLY)
if(NOT ${varname})
set(${varname} 0 PARENT_SCOPE)
endif()
endfunction()
# Check if the type exists, defines HAVE_<type>
function (ac_check_type typename)
string(TOUPPER "${typename}" varname)
string(REPLACE " " "_" varname "${varname}")
string(REPLACE "*" "p" varname "${varname}")
CHECK_TYPE_SIZE("${typename}" ${varname})
if (NOT "${varname}" STREQUAL "")
set("HAVE_${varname}" 1 PARENT_SCOPE)
set("${varname}" "${typename}" PARENT_SCOPE)
else()
set("${varname}" "unknown" PARENT_SCOPE)
endif()
endfunction()
# Verifies if each type on the list exists, using the given prelude
function (ac_check_types type_list prelude)
foreach(typename ${type_list})
string(TOUPPER "HAVE_${typename}" varname)
string(REPLACE " " "_" varname "${varname}")
string(REPLACE "*" "p" varname "${varname}")
CHECK_C_SOURCE_COMPILES("${prelude}\n ${typename} foo;" ${varname})
endforeach()
endfunction()
function(ac_path_prog variable prog_to_check_for value_if_not_found env_var)
find_program(${variable} NAMES ${prog_to_check_for} PATHS ENV ${env_var} NO_DEFAULT_PATH)
if(NOT ${variable})
message(STATUS "Looking for ${prog_to_check_for} - not found")
set(${variable} ${value_if_not_found} PARENT_SCOPE)
else()
message(STATUS "Looking for ${prog_to_check_for} - ${variable}")
set(${variable} ${${variable}} PARENT_SCOPE)
endif()
endfunction()
# check if function func exists in library lib
function(ac_check_lib lib func)
string(TOUPPER "HAVE_${func}" varname)
set(CMAKE_REQUIRED_LIBRARIES ${lib})
check_function_exists(${func} ${varname})
set(CMAKE_REQUIRED_LIBRARIES)
endfunction()
# check if source compiles without linking
function(ac_try_compile SOURCE VAR)
set(CMAKE_TMP_DIR ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeTmp)
if(NOT DEFINED "${VAR}")
file(WRITE
"${CMAKE_TMP_DIR}/src.c"
"${SOURCE}\n"
)
if(NOT CMAKE_REQUIRED_QUIET)
message(STATUS "Performing Test ${VAR}")
endif()
# Set up CMakeLists.txt for static library:
file(WRITE
${CMAKE_TMP_DIR}/CMakeLists.txt
"add_library(compile STATIC src.c)"
)
# Configure:
execute_process(
COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
WORKING_DIRECTORY ${CMAKE_TMP_DIR}
)
# Build:
execute_process(
COMMAND ${CMAKE_COMMAND} --build ${CMAKE_TMP_DIR}
RESULT_VARIABLE RESVAR
OUTPUT_VARIABLE OUTPUT
ERROR_VARIABLE OUTPUT
)
# Set up result:
if(RESVAR EQUAL 0)
set(${VAR} 1 CACHE INTERNAL "Test ${VAR}")
if(NOT CMAKE_REQUIRED_QUIET)
message(STATUS "Performing Test ${VAR} - Success")
endif()
file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeOutput.log
"Performing C SOURCE FILE Test ${VAR} succeded with the following output:\n"
"${OUTPUT}\n"
"Source file was:\n${SOURCE}\n")
else()
if(NOT CMAKE_REQUIRED_QUIET)
message(STATUS "Performing Test ${VAR} - Failed")
endif()
set(${VAR} "" CACHE INTERNAL "Test ${VAR}")
file(APPEND ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/CMakeError.log
"Performing C SOURCE FILE Test ${VAR} failed with the following output:\n"
"${OUTPUT}\n"
"Source file was:\n${SOURCE}\n")
endif()
endif()
endfunction()

@ -0,0 +1,64 @@
# - Try to find libelf
# Once done this will define
#
# LIBELF_FOUND - system has libelf
# LIBELF_INCLUDE_DIRS - the libelf include directory
# LIBELF_LIBRARIES - Link these to use libelf
# LIBELF_DEFINITIONS - Compiler switches required for using libelf
#
# This module reads hints about search locations from variables:
#
# LIBELF_ROOT - Preferred installation prefix
#
# Copyright (c) 2008 Bernhard Walle <bernhard.walle@gmx.de>
#
# Redistribution and use is allowed according to the terms of the New
# BSD license.
# For details see the accompanying COPYING-CMAKE-SCRIPTS file.
#
if (LIBELF_LIBRARIES AND LIBELF_INCLUDE_DIRS)
set (LibElf_FIND_QUIETLY TRUE)
endif (LIBELF_LIBRARIES AND LIBELF_INCLUDE_DIRS)
find_path (LIBELF_INCLUDE_DIRS
NAMES
libelf/libelf.h libelf.h
HINTS
${LIBELF_ROOT}
PATH_SUFFIXES
include
libelf/include
)
find_library (LIBELF_LIBRARIES
NAMES
elf libelf
HINTS
${LIBELF_ROOT}
PATH_SUFFIXES
lib
libelf/lib
)
include (FindPackageHandleStandardArgs)
# handle the QUIETLY and REQUIRED arguments and set LIBELF_FOUND to TRUE if all listed variables are TRUE
FIND_PACKAGE_HANDLE_STANDARD_ARGS(LibElf DEFAULT_MSG
LIBELF_LIBRARIES
LIBELF_INCLUDE_DIRS)
set(CMAKE_REQUIRED_LIBRARIES elf)
include(CheckCXXSourceCompiles)
check_cxx_source_compiles("#include <libelf.h>
int main() {
Elf *e = (Elf*)0;
size_t sz;
elf_getshdrstrndx(e, &sz);
return 0;
}" ELF_GETSHDRSTRNDX)
unset(CMAKE_REQUIRED_LIBRARIES)
mark_as_advanced(LIBELF_INCLUDE_DIRS LIBELF_LIBRARIES ELF_GETSHDRSTRNDX)

@ -14,6 +14,28 @@ mark_as_advanced(
KBUILD_MAKE_FLAGS
)
if (${CMAKE_GENERATOR} STREQUAL Ninja)
set(MAKE "make")
list(APPEND KBUILD_MAKE_FLAGS "-j")
else ()
set(MAKE "$(MAKE)")
endif ()
# Convert McKernel "arm64" into Linux "aarch64"
if ("${ARCH}" STREQUAL "arm64")
set(LINUX_ARCH "aarch64")
else ()
set(LINUX_ARCH "${ARCH}")
endif ()
if (NOT "${LINUX_ARCH}" STREQUAL "${CMAKE_HOST_SYSTEM_PROCESSOR}")
string(REGEX REPLACE "ld$" "" CROSS_COMPILE "${CMAKE_LINKER}")
list(APPEND KBUILD_MAKE_FLAGS "ARCH=${ARCH}")
list(APPEND KBUILD_MAKE_FLAGS "CROSS_COMPILE=${CROSS_COMPILE}")
endif()
string(REPLACE ";" " " KBUILD_MAKE_FLAGS_STR "${KBUILD_MAKE_FLAGS}")
function(kmod MODULE_NAME)
cmake_parse_arguments(KMOD "" "INSTALL_DEST" "C_FLAGS;SOURCES;EXTRA_SYMBOLS;DEPENDS" ${ARGN})
@ -33,17 +55,6 @@ endif(ENABLE_WERROR)
configure_file(${CMAKE_SOURCE_DIR}/cmake/modules/Kbuild.in
${CMAKE_CURRENT_BINARY_DIR}/Kbuild)
if (${CMAKE_GENERATOR} STREQUAL Ninja)
set(MAKE "make")
list(APPEND KBUILD_MAKE_FLAGS "-j")
else ()
set(MAKE "$(MAKE)")
endif ()
if (NOT "${ARCH}" STREQUAL "${CMAKE_HOST_SYSTEM_PROCESSOR}")
string(REGEX REPLACE "ld$" "" CROSS_COMPILE "${CMAKE_LINKER}")
list(APPEND KBUILD_MAKE_FLAGS "ARCH=${ARCH};CROSS_COMPILE=${CROSS_COMPILE}")
endif()
string(REGEX REPLACE "\\.c(;|$)" ".o.cmd\\1" KMOD_O_CMD "${KMOD_SOURCES}")
string(REGEX REPLACE "[^/;]+(;|$)" ".\\0" KMOD_O_CMD "${KMOD_O_CMD}")
@ -78,6 +89,10 @@ endif(ENABLE_WERROR)
# the native build system do these checks, if possible at all...
add_custom_command(OUTPUT kmod_always_rebuild COMMAND touch kmod_always_rebuild)
if (NOT EXISTS "${KERNEL_DIR}/Makefile")
message(FATAL_ERROR "${KERNEL_DIR} does not contain a Makefile and is probably missing. install kernel development package or set the KERNEL_DIR variable")
endif()
add_custom_command(
OUTPUT "${MODULE_NAME}.ko"
"Module.symvers"

@ -6,8 +6,9 @@
/* version number */
#define MCKERNEL_VERSION "${MCKERNEL_VERSION}"
/* whether mcoverlayfs is enabled */
#cmakedefine ENABLE_MCOVERLAYFS 1
/* enable the required code for mcexec to be able to use bind mount
* there is no config option as its use is discouraged */
// #define MCEXEC_BIND_MOUNT 1
/* whether memdump feature is enabled */
#cmakedefine ENABLE_MEMDUMP 1
@ -27,18 +28,12 @@
/* whether undefined behaviour sanitizer is enabled */
#cmakedefine ENABLE_UBSAN 1
/* whether per-CPU allocator cache (ThunderX2 workaround) is enabled */
#cmakedefine ENABLE_PER_CPU_ALLOC_CACHE 1
/* Path of bind-mount source directory */
#cmakedefine ROOTFSDIR "${ROOTFSDIR}"
/* Path of install directory for libraries */
#cmakedefine MCKERNEL_LIBDIR "${MCKERNEL_LIBDIR}"
/* Path of install directory for binary */
#cmakedefine BINDIR "${BINDIR}"
/* Path of install directory for system binary */
#cmakedefine SBINDIR "${SBINDIR}"
/* for non-RHEL kernels */
#ifndef RHEL_RELEASE_VERSION
#define RHEL_RELEASE_VERSION(a,b) (((a) << 8) + (b))

@ -1,27 +0,0 @@
#ifndef IHKLIB_RUSAGE_H_INCLUDED
#define IHKLIB_RUSAGE_H_INCLUDED
#define IHK_MAX_NUM_PGSIZES 4
#define IHK_MAX_NUM_NUMA_NODES 1024
#define IHK_MAX_NUM_CPUS 1024
#define IHK_OS_PGSIZE_4KB 0
#define IHK_OS_PGSIZE_2MB 1
#define IHK_OS_PGSIZE_1GB 2
struct mckernel_rusage {
unsigned long memory_stat_rss[IHK_MAX_NUM_PGSIZES];
unsigned long memory_stat_mapped_file[IHK_MAX_NUM_PGSIZES];
unsigned long memory_max_usage;
unsigned long memory_kmem_usage;
unsigned long memory_kmem_max_usage;
unsigned long memory_numa_stat[IHK_MAX_NUM_NUMA_NODES];
unsigned long cpuacct_stat_system;
unsigned long cpuacct_stat_user;
unsigned long cpuacct_usage;
unsigned long cpuacct_usage_percpu[IHK_MAX_NUM_CPUS];
int num_threads;
int max_num_threads;
};
#endif /* !defined(IHKLIB_RUSAGE_H_INCLUDED) */

@ -55,7 +55,7 @@
#define MCEXEC_UP_SYS_UNSHARE 0x30a02916
#define MCEXEC_UP_UTI_GET_CTX 0x30a02920
#define MCEXEC_UP_UTI_SAVE_FS 0x30a02921
#define MCEXEC_UP_UTI_SWITCH_CTX 0x30a02921
#define MCEXEC_UP_SIG_THREAD 0x30a02922
#define MCEXEC_UP_SYSCALL_THREAD 0x30a02924
#define MCEXEC_UP_TERMINATE_THREAD 0x30a02925
@ -91,7 +91,10 @@ struct program_image_section {
struct get_cpu_set_arg {
int nr_processes;
char *req_cpu_list; // Requested by user-space
int req_cpu_list_len; // Length of request string
int *process_rank;
pid_t ppid;
void *cpu_set;
size_t cpu_set_size; // Size in bytes
int *target_core;
@ -193,7 +196,6 @@ struct syscall_response {
unsigned long req_thread_status;
long ret;
unsigned long fault_address;
unsigned long fault_reason;
};
struct syscall_ret_desc {
@ -359,7 +361,7 @@ struct uti_get_ctx_desc {
unsigned long key; /* OUT: struct task_struct* of mcexec thread, used to search struct host_thread */
};
struct uti_save_fs_desc {
struct uti_switch_ctx_desc {
void *rctx; /* Remote context */
void *lctx; /* Local context */
};

@ -4,6 +4,8 @@ if(ARCH STREQUAL "x86_64")
set(ARCH_C_FLAGS "-mno-red-zone -mcmodel=kernel")
endif()
set(MCEXEC_PATH "${CMAKE_INSTALL_FULL_BINDIR}/mcexec" CACHE STRING "mcexec path for binfmt")
kmod(mcctrl
C_FLAGS
-I${IHK_FULL_SOURCE_DIR}/linux/include
@ -16,7 +18,7 @@ kmod(mcctrl
-I${CMAKE_CURRENT_SOURCE_DIR}/arch/${ARCH}/include
-I${PROJECT_BINARY_DIR}
-I${PROJECT_SOURCE_DIR}/kernel/include
-DMCEXEC_PATH=\\"${CMAKE_INSTALL_FULL_BINDIR}/mcexec\\"
-DMCEXEC_PATH=\\"${MCEXEC_PATH}\\"
${ARCH_C_FLAGS}
SOURCES
driver.c control.c ikc.c syscall.c procfs.c binfmt_mcexec.c

@ -1,7 +1,12 @@
/* archdeps.c COPYRIGHT FUJITSU LIMITED 2016 */
/* archdeps.c COPYRIGHT FUJITSU LIMITED 2016-2019 */
#include <linux/version.h>
#include <linux/mm_types.h>
#include <linux/kallsyms.h>
#if KERNEL_VERSION(4, 11, 0) <= LINUX_VERSION_CODE
#include <linux/sched/task_stack.h>
#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0) */
#include <linux/ptrace.h>
#include <linux/uaccess.h>
#include <asm/vdso.h>
#include "config.h"
#include "../../mcctrl.h"
@ -42,7 +47,6 @@ int arch_symbols_init(void)
}
#ifdef POSTK_DEBUG_ARCH_DEP_52
#define VDSO_MAXPAGES 1
struct vdso {
long busy;
@ -53,7 +57,6 @@ struct vdso {
long lbase;
long offset_sigtramp;
};
#endif /*POSTK_DEBUG_ARCH_DEP_52*/
unsigned long
reserve_user_space_common(struct mcctrl_usrdata *usrdata, unsigned long start, unsigned long end);
@ -95,6 +98,74 @@ reserve_user_space(struct mcctrl_usrdata *usrdata, unsigned long *startp, unsign
return 0;
}
#if KERNEL_VERSION(4, 0, 0) <= LINUX_VERSION_CODE
static long elf_search_vdso_sigtramp(void)
{
int i = 0;
long ans = -1;
char *shstr = NULL, *dynstr = NULL;
Elf64_Ehdr *eh = NULL;
Elf64_Shdr *tmp_sh = NULL, *sym_sh = NULL;
Elf64_Sym *sym = NULL;
/* ELF header */
eh = (Elf64_Ehdr *)vdso_start;
if (eh == NULL) {
D("vdso_start is NULL.\n");
goto out;
}
/* ELF magic check */
if (eh->e_ident[EI_MAG0] != ELFMAG0 &&
eh->e_ident[EI_MAG1] != ELFMAG1 &&
eh->e_ident[EI_MAG2] != ELFMAG2 &&
eh->e_ident[EI_MAG3] != ELFMAG3) {
D("vdso_start ELF MAGIC Mismatch.\n"
"e_ident[EI_MAG0 - EI_MAG3]: %02x %02x %02x %02x\n",
eh->e_ident[EI_MAG0], eh->e_ident[EI_MAG1],
eh->e_ident[EI_MAG2], eh->e_ident[EI_MAG3]);
goto out;
}
/* Search dynsym-table and dynstr-table offset
* from section header table
*/
tmp_sh = (Elf64_Shdr *)(vdso_start + eh->e_shoff);
shstr = vdso_start + (tmp_sh + eh->e_shstrndx)->sh_offset;
for (i = 0; i < eh->e_shnum; i++, tmp_sh++) {
if (tmp_sh->sh_type == SHT_DYNSYM) {
sym_sh = tmp_sh;
}
if (tmp_sh->sh_type == SHT_STRTAB &&
!strcmp(&shstr[tmp_sh->sh_name], ".dynstr")) {
dynstr = vdso_start + tmp_sh->sh_offset;
}
}
if (sym_sh == NULL) {
D("dynsym-table not found.\n");
goto out;
}
if (dynstr == 0) {
D("dynstr-table not found.\n");
goto out;
}
/* Search __kernel_rt_sigreturn offset from dynsym-table */
sym = (Elf64_Sym *)(vdso_start + sym_sh->sh_offset);
for (i = 0; (i * sym_sh->sh_entsize) < sym_sh->sh_size; i++, sym++) {
if (!strcmp(dynstr + sym->st_name, "__kernel_rt_sigreturn")) {
ans = sym->st_value;
}
}
out:
return ans;
}
#endif /*LINUX_VERSION_CODE >= KERNEL_VERSION(4,0,0)*/
void get_vdso_info(ihk_os_t os, long vdso_rpa)
{
ihk_device_t dev = ihk_os_to_dev(os);
@ -128,7 +199,12 @@ void get_vdso_info(ihk_os_t os, long vdso_rpa)
/* offsets */
vdso->lbase = VDSO_LBASE;
vdso->offset_sigtramp = vdso_offset_sigtramp;
vdso->offset_sigtramp = elf_search_vdso_sigtramp();
if (unlikely(vdso->offset_sigtramp == -1)) {
D("Use vdso_offset_sigtramp in header-file.\n");
vdso->offset_sigtramp = vdso_offset_sigtramp;
}
#endif /*LINUX_VERSION_CODE >= KERNEL_VERSION(4,0,0)*/
out:
wmb();
@ -142,59 +218,61 @@ out:
void *
get_user_sp(void)
{
/* TODO; skeleton for UTI */
return NULL;
return (void *)current_pt_regs()->sp;
}
void
set_user_sp(void *usp)
{
/* TODO; skeleton for UTI */
current_pt_regs()->sp = (unsigned long)usp;
}
/* TODO; skeleton for UTI */
struct trans_uctx {
volatile int cond;
int fregsize;
unsigned long rax;
unsigned long rbx;
unsigned long rcx;
unsigned long rdx;
unsigned long rsi;
unsigned long rdi;
unsigned long rbp;
unsigned long r8;
unsigned long r9;
unsigned long r10;
unsigned long r11;
unsigned long r12;
unsigned long r13;
unsigned long r14;
unsigned long r15;
unsigned long rflags;
unsigned long rip;
unsigned long rsp;
unsigned long fs;
struct user_pt_regs regs;
unsigned long tls_baseaddr;
};
void
restore_fs(unsigned long fs)
restore_tls(unsigned long addr)
{
/* TODO; skeleton for UTI */
const unsigned long tpidrro = 0;
asm volatile(
" msr tpidr_el0, %0\n"
" msr tpidrro_el0, %1"
: : "r" (addr), "r" (tpidrro));
}
void
save_fs_ctx(void *ctx)
save_tls_ctx(void __user *ctx)
{
/* TODO; skeleton for UTI */
struct trans_uctx __user *tctx = ctx;
unsigned long baseaddr;
asm volatile(
" mrs %0, tpidr_el0"
: "=r" (baseaddr));
if (copy_to_user(&tctx->tls_baseaddr, &baseaddr,
sizeof(tctx->tls_baseaddr))) {
pr_err("%s: copy_to_user failed.\n", __func__);
return;
}
}
unsigned long
get_fs_ctx(void *ctx)
get_tls_ctx(void __user *ctx)
{
/* TODO; skeleton for UTI */
return 0;
struct trans_uctx __user *tctx = ctx;
struct trans_uctx kctx;
if (copy_from_user(&kctx, tctx, sizeof(struct trans_uctx))) {
pr_err("%s: copy_from_user failed.\n", __func__);
return 0;
}
return kctx.tls_baseaddr;
}
#if LINUX_VERSION_CODE < KERNEL_VERSION(4,2,0)
@ -304,3 +382,38 @@ out:
error, rva, rpa, pgsize);
return error;
}
/*
* The assembler switch_ctx only issues the ioctl; context registers
* are saved/loaded on the Linux side (taken from current_pt_regs).
* The ioctl does the TLS save/load and registers the host_thread.
*/
long arch_switch_ctx(struct uti_switch_ctx_desc *desc)
{
int rc = 0;
struct trans_uctx *__user rctx = NULL;
struct trans_uctx *__user lctx = NULL;
struct trans_uctx klctx = {
.regs = current_pt_regs()->user_regs,
};
rctx = desc->rctx;
lctx = desc->lctx;
if (copy_to_user(lctx, &klctx, sizeof(klctx))) {
pr_err("%s: Error: copy_to_user failed\n", __func__);
rc = -EFAULT;
goto out;
}
if (copy_from_user(&current_pt_regs()->user_regs,
&rctx->regs, sizeof(rctx->regs))) {
pr_err("%s: Error: copy_from_user failed\n", __func__);
rc = -EFAULT;
goto out;
}
restore_tls(get_tls_ctx(rctx));
out:
return rc;
}
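
On AArch64 the user TLS base lives in the tpidr_el0 system register (with tpidrro_el0 as its read-only sibling), so the x86 wrmsrl(MSR_FS_BASE, ...) path becomes the msr/mrs pair used above. A minimal pair of accessors mirroring save_tls_ctx()/restore_tls(); this compiles only for an AArch64 target:

/* Minimal AArch64 TLS-base accessors; illustrative, not McKernel's API. */
static inline unsigned long read_tls_base(void)
{
	unsigned long base;

	asm volatile("mrs %0, tpidr_el0" : "=r" (base));
	return base;
}

static inline void write_tls_base(unsigned long base)
{
	asm volatile("msr tpidr_el0, %0" : : "r" (base));
}
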

@ -9,8 +9,6 @@
extern int translate_rva_to_rpa(ihk_os_t os, unsigned long rpt, unsigned long rva,
unsigned long *rpap, unsigned long *pgsizep);
#ifdef POSTK_DEBUG_ARCH_DEP_12
#define PFN_WRITE_COMBINED PTE_ATTRINDX(MT_NORMAL_NC)
static inline bool pte_is_write_combined(pte_t pte)
{
@ -31,9 +29,8 @@ static inline bool pte_is_write_combined(pte_t pte)
#endif
return ((pte_val(pte) & PTE_ATTRINDX_MASK) == PFN_WRITE_COMBINED);
}
#endif /* POSTK_DEBUG_ARCH_DEP_12 */
#define ARMV8_IDX_COUNTER0 1
#define ARMV8_IDX_COUNTER0 0
#define ARCH_PERF_COUNTER_START ARMV8_IDX_COUNTER0
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,7,0)

@ -2,9 +2,13 @@
#include <linux/version.h>
#include <linux/kallsyms.h>
#include <linux/uaccess.h>
#include <asm/vsyscall.h>
#include <asm/vgtod.h>
#include "config.h"
#include "../../mcctrl.h"
#define gtod (&VVAR(vsyscall_gtod_data))
//#define SC_DEBUG
#ifdef SC_DEBUG
@ -54,7 +58,6 @@ int arch_symbols_init(void)
}
#ifdef POSTK_DEBUG_ARCH_DEP_52
#define VDSO_MAXPAGES 2
struct vdso {
long busy;
@ -70,8 +73,8 @@ struct vdso {
long hpet_phys;
void *pvti_virt;
long pvti_phys;
void *vgtod_virt;
};
#endif /*POSTK_DEBUG_ARCH_DEP_52*/
unsigned long
reserve_user_space_common(struct mcctrl_usrdata *usrdata, unsigned long start, unsigned long end);
@ -207,6 +210,7 @@ void get_vdso_info(ihk_os_t os, long vdso_rpa)
#endif
}
vdso->vgtod_virt = (void *)gtod;
out:
wmb();
vdso->busy = 0;
@ -257,25 +261,35 @@ struct trans_uctx {
};
void
restore_fs(unsigned long fs)
restore_tls(unsigned long addr)
{
wrmsrl(MSR_FS_BASE, fs);
wrmsrl(MSR_FS_BASE, addr);
}
void
save_fs_ctx(void *ctx)
save_tls_ctx(void __user *ctx)
{
struct trans_uctx *tctx = ctx;
struct trans_uctx __user *tctx = ctx;
struct trans_uctx kctx;
rdmsrl(MSR_FS_BASE, tctx->fs);
if (copy_from_user(&kctx, tctx, sizeof(struct trans_uctx))) {
pr_err("%s: copy_from_user failed.\n", __func__);
return;
}
rdmsrl(MSR_FS_BASE, kctx.fs);

/* write the saved TLS base back to the caller's context; without
 * this the value read above would be discarded with kctx */
if (copy_to_user(tctx, &kctx, sizeof(struct trans_uctx))) {
pr_err("%s: copy_to_user failed.\n", __func__);
return;
}
}
unsigned long
get_fs_ctx(void *ctx)
get_tls_ctx(void __user *ctx)
{
struct trans_uctx *tctx = ctx;
struct trans_uctx __user *tctx = ctx;
struct trans_uctx kctx;
return tctx->fs;
if (copy_from_user(&kctx, tctx, sizeof(struct trans_uctx))) {
pr_err("%s: copy_from_user failed.\n", __func__);
return 0;
}
return kctx.fs;
}
unsigned long
@ -356,11 +370,17 @@ out:
return error;
}
#ifdef POSTK_DEBUG_ARCH_DEP_12
#define PFN_WRITE_COMBINED _PAGE_PWT
static inline bool pte_is_write_combined(pte_t pte)
{
return ((pte_flags(pte) & _PAGE_PWT) && !(pte_flags(pte) & _PAGE_PCD));
}
#endif /* POSTK_DEBUG_ARCH_DEP_12 */
/*
* On x86, the assembler switch_ctx itself saves/loads the registers
* in the context; the ioctl only does the TLS save/load and registers
* the host_thread.
*/
long arch_switch_ctx(struct uti_switch_ctx_desc *desc)
{
return 0;
}

@ -9,14 +9,12 @@
extern int translate_rva_to_rpa(ihk_os_t os, unsigned long rpt, unsigned long rva,
unsigned long *rpap, unsigned long *pgsizep);
#ifdef POSTK_DEBUG_ARCH_DEP_12
#define PFN_WRITE_COMBINED _PAGE_PWT
static inline bool pte_is_write_combined(pte_t pte)
{
return ((pte_flags(pte) & _PAGE_PWT) && !(pte_flags(pte) & _PAGE_PCD));
}
#endif /* POSTK_DEBUG_ARCH_DEP_12 */
#define ARCH_PERF_COUNTER_START 0

@ -44,7 +44,6 @@
#include <config.h>
#include "mcctrl.h"
#include <ihk/ihk_host_user.h>
#include <ihklib_rusage.h>
#include <rusage.h>
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
#include <uapi/linux/sched/types.h>
@ -87,8 +86,19 @@ int syscall_backward(struct mcctrl_usrdata *, int, unsigned long, unsigned long,
unsigned long, unsigned long, unsigned long,
unsigned long, unsigned long *);
struct mcos_handler_info {
int pid;
int cpu;
struct mcctrl_usrdata *ud;
struct file *file;
unsigned long user_start;
unsigned long user_end;
unsigned long prepare_thread;
};
static long mcexec_prepare_image(ihk_os_t os,
struct program_load_desc * __user udesc)
struct program_load_desc * __user udesc,
struct file *file)
{
struct program_load_desc *desc = NULL;
struct program_load_desc *pdesc = NULL;
@ -100,6 +110,7 @@ static long mcexec_prepare_image(ihk_os_t os,
struct mcctrl_per_proc_data *ppd = NULL;
int num_sections;
int free_ikc_pointers = 1;
struct mcos_handler_info *info;
if (!usrdata) {
pr_err("%s: error: mcctrl_usrdata not found\n", __func__);
@ -122,6 +133,14 @@ static long mcexec_prepare_image(ihk_os_t os,
goto free_out;
}
info = ihk_os_get_mcos_private_data(file);
if (!info) {
ret = -EFAULT;
goto free_out;
}
/* To serialize SCD_MSG_SCHEDULE_PROCESS and SCD_MSG_CLEANUP_PROCESS */
info->cpu = desc->cpu;
ppd = mcctrl_get_per_proc_data(usrdata, desc->pid);
if (!ppd) {
printk("%s: ERROR: no per process data for PID %d\n",
@ -193,6 +212,11 @@ static long mcexec_prepare_image(ihk_os_t os,
/* either send or remote prepare_process failed */
goto put_and_free_out;
}
/*
* Used as SCD_MSG_CLEANUP_PROCESS target which isn't scheduled
* with SCD_MSG_SCHEDULE_PROCESS
*/
info->prepare_thread = pdesc->rprocess;
/* Update rpgtable */
ppd->rpgtable = pdesc->rpgtable;
@ -307,30 +331,10 @@ int mcexec_transfer_image(ihk_os_t os, struct remote_transfer *__user upt)
#endif
}
struct mcos_handler_info {
int pid;
int cpu;
struct mcctrl_usrdata *ud;
struct file *file;
unsigned long user_start;
unsigned long user_end;
};
struct mcos_handler_info;
static LIST_HEAD(host_threads); /* Used for TLS switch */
DEFINE_RWLOCK(host_thread_lock);
/* Info of Linux counterpart of migrated-to-Linux thread */
struct host_thread {
struct list_head list;
struct mcos_handler_info *handler;
int pid;
int tid;
unsigned long usp;
unsigned long lfs;
unsigned long rfs;
};
struct mcos_handler_info *new_mcos_handler_info(ihk_os_t os, struct file *file)
{
struct mcos_handler_info *info;
@ -391,6 +395,7 @@ static void release_handler(ihk_os_t os, void *param)
memset(&isp, '\0', sizeof isp);
isp.msg = SCD_MSG_CLEANUP_PROCESS;
isp.pid = info->pid;
isp.arg = info->prepare_thread;
dprintk("%s: SCD_MSG_CLEANUP_PROCESS, info: %p, cpu: %d\n",
__FUNCTION__, info, info->cpu);
@ -426,6 +431,7 @@ static long mcexec_start_image(ihk_os_t os,
struct mcctrl_channel *c;
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
struct mcos_handler_info *info;
struct mcos_handler_info *prev_info;
int ret = 0;
if (!usrdata) {
@ -446,6 +452,7 @@ static long mcexec_start_image(ihk_os_t os,
goto out;
}
prev_info = ihk_os_get_mcos_private_data(file);
info = new_mcos_handler_info(os, file);
if (info == NULL) {
ret = -ENOMEM;
@ -456,6 +463,7 @@ static long mcexec_start_image(ihk_os_t os,
info->cpu = desc->cpu;
info->user_start = desc->user_start;
info->user_end = desc->user_end;
info->prepare_thread = prev_info->prepare_thread;
ihk_os_register_release_handler(file, release_handler, info);
ihk_os_set_mcos_private_data(file, info);
@ -472,8 +480,10 @@ static long mcexec_start_image(ihk_os_t os,
ret = mcctrl_ikc_send(os, desc->cpu, &isp);
if (ret < 0) {
printk("%s: error: sending IKC msg\n", __FUNCTION__);
goto out;
}
/* clear prepared thread struct */
info->prepare_thread = 0;
out:
kfree(desc);
return ret;
@ -577,17 +587,14 @@ extern int mckernel_cpu_2_linux_cpu(struct mcctrl_usrdata *udp, int cpu_id);
static long mcexec_get_cpuset(ihk_os_t os, unsigned long arg)
{
struct mcctrl_usrdata *udp = ihk_host_os_get_usrdata(os);
struct mcctrl_part_exec *pe;
struct mcctrl_part_exec *pe = NULL, *pe_itr;
struct get_cpu_set_arg req;
#ifdef POSTK_DEBUG_ARCH_DEP_40 /* cpu_topology name change */
struct mcctrl_cpu_topology *cpu_top, *cpu_top_i;
#else /* POSTK_DEBUG_ARCH_DEP_40 */
struct cpu_topology *cpu_top, *cpu_top_i;
#endif /* POSTK_DEBUG_ARCH_DEP_40 */
struct cache_topology *cache_top;
int cpu, cpus_assigned, cpus_to_assign, cpu_prev;
int ret = 0;
int mcexec_linux_numa;
int pe_list_len = 0;
cpumask_t *mcexec_cpu_set = NULL;
cpumask_t *cpus_used = NULL;
cpumask_t *cpus_to_use = NULL;
@ -607,24 +614,126 @@ static long mcexec_get_cpuset(ihk_os_t os, unsigned long arg)
return -EINVAL;
}
pe = &udp->part_exec;
mutex_lock(&pe->lock);
if (copy_from_user(&req, (void *)arg, sizeof(req))) {
printk("%s: error copying user request\n", __FUNCTION__);
pr_err("%s: error copying user request\n", __func__);
ret = -EINVAL;
goto put_and_unlock_out;
goto put_out;
}
/* First process to enter CPU partitioning */
if (pe->nr_processes == -1) {
/* User requested CPU mask? */
if (req.req_cpu_list && req.req_cpu_list_len) {
char *cpu_list = NULL;
cpu_list = kmalloc(req.req_cpu_list_len, GFP_KERNEL);
if (!cpu_list) {
printk("%s: error: allocating CPU list\n", __FUNCTION__);
ret = -ENOMEM;
goto put_out;
}
if (copy_from_user(cpu_list,
req.req_cpu_list, req.req_cpu_list_len)) {
printk("%s: error copying CPU list request\n", __FUNCTION__);
kfree(cpu_list);
ret = -EINVAL;
goto put_out;
}
cpus_used = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
cpus_to_use = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
if (!cpus_to_use || !cpus_used) {
printk("%s: error: allocating CPU mask\n", __FUNCTION__);
ret = -ENOMEM;
kfree(cpu_list);
goto put_out;
}
memset(cpus_used, 0, sizeof(cpumask_t));
memset(cpus_to_use, 0, sizeof(cpumask_t));
/* Parse CPU list */
if (cpulist_parse(cpu_list, cpus_to_use) < 0) {
printk("%s: invalid CPUs requested: %s\n",
__FUNCTION__, cpu_list);
ret = -EINVAL;
kfree(cpu_list);
goto put_out;
}
memcpy(cpus_used, cpus_to_use, sizeof(cpumask_t));
/* Copy mask to user-space */
if (copy_to_user(req.cpu_set, cpus_used,
(req.cpu_set_size < sizeof(cpumask_t) ?
req.cpu_set_size : sizeof(cpumask_t)))) {
printk("%s: error copying mask to user\n", __FUNCTION__);
ret = -EINVAL;
kfree(cpu_list);
goto put_out;
}
/* Copy IKC target core */
cpu = cpumask_next(-1, cpus_used);
if (copy_to_user(req.target_core, &cpu, sizeof(cpu))) {
printk("%s: error copying target core to user\n",
__FUNCTION__);
ret = -EINVAL;
kfree(cpu_list);
goto put_out;
}
/* Save in per-process structure */
memcpy(&ppd->cpu_set, cpus_used, sizeof(cpumask_t));
ppd->ikc_target_cpu = cpu;
printk("%s: %s -> target McKernel CPU: %d\n",
__func__, cpu_list, cpu);
ret = 0;
kfree(cpu_list);
goto put_out;
}
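
cpulist_parse() accepts the usual Linux CPU-list syntax, e.g. "0-3,8" for CPUs 0 through 3 plus CPU 8. A reduced userspace analogue of the parsing the branch above relies on (error handling trimmed to the essentials; this is a sketch, not the kernel routine):

#include <stdlib.h>

/* Turn "0-3,8" into a 64-bit mask; returns 0 on success, -1 on error. */
int parse_cpu_list(const char *s, unsigned long *mask)
{
	*mask = 0;
	while (*s) {
		char *end;
		long a = strtol(s, &end, 10), b = a;

		if (end == s || a < 0 || a >= 64)
			return -1;
		if (*end == '-') {		/* range, e.g. "0-3" */
			b = strtol(end + 1, &end, 10);
			if (b < a || b >= 64)
				return -1;
		}
		for (long i = a; i <= b; i++)
			*mask |= 1UL << i;
		if (*end == ',')
			end++;
		else if (*end)
			return -1;
		s = end;
	}
	return 0;
}
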
mutex_lock(&udp->part_exec_lock);
/* Find part_exec having same node_proxy */
list_for_each_entry_reverse(pe_itr, &udp->part_exec_list, chain) {
pe_list_len++;
if (pe_itr->node_proxy_pid == req.ppid) {
pe = pe_itr;
break;
}
}
if (!pe) {
/* First process to enter CPU partitioning */
pr_debug("%s: pe_list_len:%d\n", __func__, pe_list_len);
if (pe_list_len >= PE_LIST_MAXLEN) {
/* delete head entry of pe_list */
pe_itr = list_first_entry(&udp->part_exec_list,
struct mcctrl_part_exec, chain);
list_del(&pe_itr->chain);
kfree(pe_itr);
}
pe = kzalloc(sizeof(struct mcctrl_part_exec), GFP_KERNEL);
if (!pe) {
mutex_unlock(&udp->part_exec_lock);
ret = -ENOMEM;
goto put_out;
}
/* Init part_exec */
mutex_init(&pe->lock);
INIT_LIST_HEAD(&pe->pli_list);
pe->nr_processes = req.nr_processes;
pe->nr_processes_left = req.nr_processes;
pe->nr_processes_joined = 0;
pe->node_proxy_pid = req.ppid;
list_add_tail(&pe->chain, &udp->part_exec_list);
dprintk("%s: nr_processes: %d (partitioned exec starts)\n",
__FUNCTION__,
pe->nr_processes);
__func__, pe->nr_processes);
}
mutex_unlock(&udp->part_exec_lock);
mutex_lock(&pe->lock);
if (pe->nr_processes != req.nr_processes) {
printk("%s: error: requested number of processes"
@ -634,7 +743,15 @@ static long mcexec_get_cpuset(ihk_os_t os, unsigned long arg)
goto put_and_unlock_out;
}
if (pe->nr_processes_joined >= pe->nr_processes) {
printk("%s: too many processes have joined to the group of %d\n",
__func__, req.ppid);
ret = -EINVAL;
goto put_and_unlock_out;
}
--pe->nr_processes_left;
++pe->nr_processes_joined;
dprintk("%s: nr_processes: %d, nr_processes_left: %d\n",
__FUNCTION__,
pe->nr_processes,
@ -720,8 +837,6 @@ static long mcexec_get_cpuset(ihk_os_t os, unsigned long arg)
wake_up_interruptible(&pli_next->pli_wq);
}
/* Reset process counter to start state */
pe->nr_processes = -1;
ret = -ETIMEDOUT;
goto put_and_unlock_out;
}
@ -969,16 +1084,8 @@ next_cpu:
/* Commit used cores to OS structure */
memcpy(&pe->cpus_used, cpus_used, sizeof(*cpus_used));
/* Reset if last process */
if (pe->nr_processes_left == 0) {
dprintk("%s: nr_processes: %d (partitioned exec ends)\n",
__FUNCTION__,
pe->nr_processes);
pe->nr_processes = -1;
memset(&pe->cpus_used, 0, sizeof(pe->cpus_used));
}
/* Otherwise wake up next process in list */
else {
/* If not last process, wake up next process in list */
if (pe->nr_processes_left != 0) {
++pe->process_rank;
pli_next = list_first_entry(&pe->pli_list,
struct process_list_item, list);
@ -991,11 +1098,14 @@ next_cpu:
ret = 0;
put_and_unlock_out:
mutex_unlock(&pe->lock);
put_out:
mcctrl_put_per_proc_data(ppd);
kfree(cpus_to_use);
kfree(cpus_used);
kfree(mcexec_cpu_set);
mcctrl_put_per_proc_data(ppd);
mutex_unlock(&pe->lock);
return ret;
}
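
The rewritten entry path keys partitioned-execution state by the node-proxy parent pid (req.ppid) and keeps the groups on a bounded list, evicting the oldest entry once PE_LIST_MAXLEN is reached. A sketch of that lookup-or-create-with-eviction shape on a plain singly linked list (PE_LIST_MAXLEN's value is not shown in the hunk, so the one below is an assumption):

#include <stdlib.h>

#define PE_LIST_MAXLEN 128	/* assumed value; the real one is defined elsewhere */

struct part_exec_s {
	struct part_exec_s *next;
	int node_proxy_pid;
	int nr_processes;
};

struct part_exec_s *pe_lookup_or_create(struct part_exec_s **head,
					int ppid, int nr_processes)
{
	struct part_exec_s *pe, **tail = head;
	int len = 0;

	for (pe = *head; pe; pe = pe->next, len++) {
		if (pe->node_proxy_pid == ppid)
			return pe;		/* group already exists */
		tail = &pe->next;
	}
	if (len >= PE_LIST_MAXLEN && *head) {	/* evict the oldest entry */
		pe = *head;
		if (tail == &pe->next)		/* list had a single element */
			tail = head;
		*head = pe->next;
		free(pe);
	}
	pe = calloc(1, sizeof(*pe));
	if (!pe)
		return NULL;
	pe->node_proxy_pid = ppid;
	pe->nr_processes = nr_processes;
	*tail = pe;				/* append at the tail */
	return pe;
}
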
@ -1199,7 +1309,7 @@ int mcexec_syscall(struct mcctrl_usrdata *ud, struct ikc_scd_packet *packet)
ppd = mcctrl_get_per_proc_data(ud, pid);
if (unlikely(!ppd)) {
kprintf("%s: ERROR: no per-process structure for PID %d, "
dprintk("%s: ERROR: no per-process structure for PID %d, "
"syscall nr: %lu\n",
__FUNCTION__, pid, packet->req.number);
@ -1414,7 +1524,7 @@ retry_alloc:
__FUNCTION__, task_pid_vnr(current), packet->ref);
mb();
if (!packet->req.valid) {
if (!smp_load_acquire(&packet->req.valid)) {
printk("%s: ERROR: stray wakeup pid: %d, tid: %d: SC %lu\n",
__FUNCTION__,
task_tgid_vnr(current),
@ -1424,7 +1534,7 @@ retry_alloc:
goto retry;
}
packet->req.valid = 0; /* ack */
smp_store_release(&packet->req.valid, 0); /* ack */
dprintk("%s: system call: %d, args[0]: %lu, args[1]: %lu, args[2]: %lu, "
"args[3]: %lu, args[4]: %lu, args[5]: %lu\n",
__FUNCTION__,
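
The smp_load_acquire()/smp_store_release() pair above replaces a bare mb() plus plain accesses and makes the protocol on req.valid explicit: the acquire guarantees the request payload is read only after valid is observed set, and the release publishes the ack only after the consumer is done with the payload. A C11 userspace analogue of the pairing (illustrative struct, not mcctrl's):

#include <stdatomic.h>

struct request { long args[6]; _Atomic int valid; };

/* Producer: fill the payload first, then publish with release so a
 * consumer that observes valid==1 also observes the filled args. */
void post_request(struct request *req, const long args[6])
{
	for (int i = 0; i < 6; i++)
		req->args[i] = args[i];
	atomic_store_explicit(&req->valid, 1, memory_order_release);
}

/* Consumer: acquire-load the flag before touching the payload, then
 * release-store 0 as the ack. */
int take_request(struct request *req, long out[6])
{
	if (!atomic_load_explicit(&req->valid, memory_order_acquire))
		return 0;			/* stray wakeup */
	for (int i = 0; i < 6; i++)
		out[i] = req->args[i];
	atomic_store_explicit(&req->valid, 0, memory_order_release);
	return 1;
}
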
@ -2376,7 +2486,7 @@ long mcctrl_getrusage(ihk_os_t ihk_os, struct mcctrl_ioctl_getrusage_desc *__use
{
struct mcctrl_ioctl_getrusage_desc desc;
struct rusage_global *rusage_global = ihk_os_get_rusage(ihk_os);
struct mckernel_rusage *rusage = NULL;
struct ihk_os_rusage *rusage = NULL;
int ret = 0;
int i;
unsigned long ut;
@ -2388,13 +2498,13 @@ long mcctrl_getrusage(ihk_os_t ihk_os, struct mcctrl_ioctl_getrusage_desc *__use
goto out;
}
rusage = kmalloc(sizeof(struct mckernel_rusage), GFP_KERNEL);
rusage = kmalloc(sizeof(struct ihk_os_rusage), GFP_KERNEL);
if (!rusage) {
printk("%s: kmalloc failed\n", __FUNCTION__);
ret = -ENOMEM;
goto out;
}
memset(rusage, 0, sizeof(struct mckernel_rusage));
memset(rusage, 0, sizeof(struct ihk_os_rusage));
/* Compile statistics */
for (i = 0; i < IHK_MAX_NUM_PGSIZES; i++) {
@ -2415,15 +2525,17 @@ long mcctrl_getrusage(ihk_os_t ihk_os, struct mcctrl_ioctl_getrusage_desc *__use
st += rusage_global->cpu[i].system_tsc * rusage_global->ns_per_tsc / 1000;
rusage->cpuacct_usage_percpu[i] = wt;
}
rusage->cpuacct_stat_system = st / 10000000;
rusage->cpuacct_stat_user = ut / 10000000;
rusage->cpuacct_stat_system = (st + 10000000 - 1) / 10000000;
rusage->cpuacct_stat_user = (ut + 10000000 - 1) / 10000000;
rusage->cpuacct_usage = ut;
rusage->num_threads = rusage_global->num_threads;
rusage->max_num_threads = rusage_global->max_num_threads;
if (desc.size_rusage > sizeof(struct mckernel_rusage)) {
printk("%s: desc.size_rusage=%ld > sizeof(struct mckernel_rusage)=%ld\n", __FUNCTION__, desc.size_rusage, sizeof(struct mckernel_rusage));
if (desc.size_rusage > sizeof(struct ihk_os_rusage)) {
printk("%s: desc.size_rusage=%ld > sizeof(struct mckernel_rusage)=%ld\n",
__func__, desc.size_rusage,
sizeof(struct ihk_os_rusage));
ret = -EINVAL;
goto out;
}
@ -2444,10 +2556,10 @@ long mcctrl_getrusage(ihk_os_t ihk_os, struct mcctrl_ioctl_getrusage_desc *__use
extern void *get_user_sp(void);
extern void set_user_sp(unsigned long);
extern void restore_fs(unsigned long fs);
extern void save_fs_ctx(void *);
extern unsigned long get_fs_ctx(void *);
extern unsigned long get_rsp_ctx(void *);
extern void restore_tls(unsigned long addr);
extern void save_tls_ctx(void __user *ctx);
extern unsigned long get_tls_ctx(void __user *ctx);
extern unsigned long get_rsp_ctx(void *ctx);
long mcexec_uti_get_ctx(ihk_os_t os, struct uti_get_ctx_desc __user *udesc)
{
@ -2491,14 +2603,15 @@ long mcexec_uti_get_ctx(ihk_os_t os, struct uti_get_ctx_desc __user *udesc)
return rc;
}
long mcexec_uti_save_fs(ihk_os_t os, struct uti_save_fs_desc __user *udesc, struct file *file)
long mcctrl_switch_ctx(ihk_os_t os, struct uti_switch_ctx_desc __user *udesc,
struct file *file)
{
int rc = 0;
void *usp = get_user_sp();
struct mcos_handler_info *info;
struct host_thread *thread;
unsigned long flags;
struct uti_save_fs_desc desc;
struct uti_switch_ctx_desc desc;
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
struct mcctrl_per_proc_data *ppd;
@ -2508,21 +2621,26 @@ long mcexec_uti_save_fs(ihk_os_t os, struct uti_save_fs_desc __user *udesc, stru
goto out;
}
if(copy_from_user(&desc, udesc, sizeof(struct uti_save_fs_desc))) {
if (copy_from_user(&desc, udesc, sizeof(struct uti_switch_ctx_desc))) {
printk("%s: Error: copy_from_user failed\n", __FUNCTION__);
rc = -EFAULT;
goto out;
}
save_fs_ctx(desc.lctx);
rc = arch_switch_ctx(&desc);
if (rc < 0) {
goto out;
}
save_tls_ctx(desc.lctx);
info = ihk_os_get_mcos_private_data(file);
thread = kmalloc(sizeof(struct host_thread), GFP_KERNEL);
if (!thread) {
rc = -ENOMEM;
goto out;
}
memset(thread, '\0', sizeof(struct host_thread));
thread->pid = task_tgid_vnr(current);
thread->tid = task_pid_vnr(current);
thread->usp = (unsigned long)usp;
thread->lfs = get_fs_ctx(desc.lctx);
thread->rfs = get_fs_ctx(desc.rctx);
thread->ltls = get_tls_ctx(desc.lctx);
thread->rtls = get_tls_ctx(desc.rctx);
thread->handler = info;
write_lock_irqsave(&host_thread_lock, flags);
@ -2568,9 +2686,9 @@ mcexec_sig_thread(ihk_os_t os, unsigned long arg, struct file *file)
read_unlock_irqrestore(&host_thread_lock, flags);
if (thread) {
if (arg)
restore_fs(thread->lfs);
restore_tls(thread->ltls);
else
restore_fs(thread->rfs);
restore_tls(thread->rtls);
goto out;
}
ret = -EINVAL;
@ -2774,8 +2892,7 @@ long mcexec_syscall_thread(ihk_os_t os, unsigned long arg, struct file *file)
return -EFAULT;
}
/* debug */
if (0 && param.number == __NR_futex) {
if (param.number == __NR_futex) {
struct uti_futex_resp resp = {
.done = 0
};
@ -2971,13 +3088,8 @@ mcexec_uti_attr(ihk_os_t os, struct uti_attr_desc __user *_desc)
cpumask_t *cpuset = NULL, *env_cpuset = NULL;
struct mcctrl_usrdata *ud = ihk_host_os_get_usrdata(os);
ihk_device_t dev = ihk_os_to_dev(os);
#ifdef POSTK_DEBUG_ARCH_DEP_40 /* cpu_topology name change */
struct mcctrl_cpu_topology *cpu_topo;
struct mcctrl_cpu_topology *target_cpu = NULL;
#else /* POSTK_DEBUG_ARCH_DEP_40 */
struct cpu_topology *cpu_topo;
struct cpu_topology *target_cpu = NULL;
#endif /* POSTK_DEBUG_ARCH_DEP_40 */
struct node_topology *node_topo;
struct ihk_cache_topology *lcache_topo;
struct ihk_node_topology *lnode_topo;
@ -3200,13 +3312,51 @@ out:
return rc;
}
static int __mcctrl_control_perm(unsigned int request)
{
int ret = 0;
kuid_t euid;
/* black list */
switch (request) {
case IHK_OS_AUX_PERF_NUM:
case IHK_OS_AUX_PERF_SET:
case IHK_OS_AUX_PERF_GET:
case IHK_OS_AUX_PERF_ENABLE:
case IHK_OS_AUX_PERF_DISABLE:
case IHK_OS_AUX_PERF_DESTROY:
euid = current_euid();
pr_debug("%s: request=0x%x, euid=%u\n",
__func__, request, euid.val);
if (euid.val) {
ret = -EPERM;
}
break;
default:
break;
}
pr_debug("%s: request=0x%x, ret=%d\n", __func__, request, ret);
return ret;
}
long __mcctrl_control(ihk_os_t os, unsigned int req, unsigned long arg,
struct file *file)
{
int ret;
ret = __mcctrl_control_perm(req);
if (ret) {
pr_err("%s: error: permission denied, req: %x\n",
__func__, req);
return ret;
}
switch (req) {
case MCEXEC_UP_PREPARE_IMAGE:
return mcexec_prepare_image(os,
(struct program_load_desc *)arg);
(struct program_load_desc *)arg,
file);
case MCEXEC_UP_TRANSFER:
return mcexec_transfer_image(os, (struct remote_transfer *)arg);
@ -3272,8 +3422,9 @@ long __mcctrl_control(ihk_os_t os, unsigned int req, unsigned long arg,
case MCEXEC_UP_UTI_GET_CTX:
return mcexec_uti_get_ctx(os, (struct uti_get_ctx_desc *)arg);
case MCEXEC_UP_UTI_SAVE_FS:
return mcexec_uti_save_fs(os, (struct uti_save_fs_desc *)arg, file);
case MCEXEC_UP_UTI_SWITCH_CTX:
return mcctrl_switch_ctx(os, (struct uti_switch_ctx_desc *)arg,
file);
case MCEXEC_UP_SIG_THREAD:
return mcexec_sig_thread(os, arg, file);
@ -3320,14 +3471,6 @@ long __mcctrl_control(ihk_os_t os, unsigned int req, unsigned long arg,
return -EINVAL;
}
/* Per-CPU register manipulation functions */
struct mcctrl_os_cpu_response {
int done;
unsigned long val;
int err;
wait_queue_head_t wq;
};
int mcctrl_get_request_os_cpu(ihk_os_t os, int *ret_cpu)
{
struct mcctrl_usrdata *usrdata;
@ -3379,7 +3522,8 @@ int mcctrl_get_request_os_cpu(ihk_os_t os, int *ret_cpu)
*ret_cpu = ch->send.queue->read_cpu;
ret = 0;
printk("%s: OS: %p, CPU: %d\n", __FUNCTION__, os, *ret_cpu);
pr_info("%s: OS: %lx, CPU: %d\n",
__func__, (unsigned long)os, *ret_cpu);
out_put_ppd:
mcctrl_put_per_thread_data(ptd);
@ -3390,73 +3534,67 @@ out_put_ppd:
return ret;
}
void mcctrl_os_read_write_cpu_response(ihk_os_t os,
struct ikc_scd_packet *pisp)
{
struct mcctrl_os_cpu_response *resp;
/* XXX: What if caller thread is unblocked by a signal
* before this message arrives? */
resp = pisp->resp;
if (!resp) {
return;
}
resp->val = pisp->desc.val;
resp->done = 1;
resp->err = pisp->err;
wake_up_interruptible(&resp->wq);
}
int __mcctrl_os_read_write_cpu_register(ihk_os_t os, int cpu,
struct ihk_os_cpu_register *desc,
enum mcctrl_os_cpu_operation op)
{
struct mcctrl_usrdata *udp = ihk_host_os_get_usrdata(os);
struct ikc_scd_packet isp;
struct mcctrl_os_cpu_response resp;
struct ihk_os_cpu_register *ldesc = NULL;
int do_free = 0;
int ret = -EINVAL;
if (!udp) {
pr_err("%s: error: mcctrl_usrdata not found\n", __func__);
ret = -EINVAL;
goto out;
}
if (cpu < 0 || cpu >= udp->cpu_info->n_cpus) {
pr_err("%s: error: cpu (%d) is out of range\n",
__func__, cpu);
ret = -EINVAL;
goto out;
}
/* Keep a dynamic structure around that can
* survive an early return due to a signal */
ldesc = kmalloc(sizeof(*ldesc), GFP_KERNEL);
if (!ldesc) {
printk("%s: ERROR: allocating cpu register desc\n", __FUNCTION__);
return -ENOMEM;
}
*ldesc = *desc;
memset(&isp, '\0', sizeof(struct ikc_scd_packet));
isp.msg = SCD_MSG_CPU_RW_REG;
isp.op = op;
isp.desc = *desc;
isp.resp = &resp;
isp.pdesc = virt_to_phys(ldesc);
resp.done = 0;
resp.err = 0;
init_waitqueue_head(&resp.wq);
mb();
ret = mcctrl_ikc_send(os, cpu, &isp);
if (ret < 0) {
ret = mcctrl_ikc_send_wait(os, cpu, &isp, 0, NULL, &do_free, 1, ldesc);
if (ret != 0) {
printk("%s: ERROR sending IKC msg: %d\n", __FUNCTION__, ret);
goto out;
}
/* Wait for response */
ret = wait_event_interruptible(resp.wq, resp.done);
if (ret < 0) {
printk("%s: ERROR after wait: %d\n", __FUNCTION__, ret);
goto out;
}
ret = resp.err;
if (ret != 0) {
printk("%s: ERROR receive: %d\n", __FUNCTION__, resp.err);
goto out;
}
/* Update if read */
if (ret == 0 && op == MCCTRL_OS_CPU_READ_REGISTER) {
desc->val = resp.val;
if (op == MCCTRL_OS_CPU_READ_REGISTER) {
desc->val = ldesc->val;
}
dprintk("%s: MCCTRL_OS_CPU_%s_REGISTER: reg: 0x%lx, val: 0x%lx\n",
/* Notify caller (for future async implementation) */
atomic_set(&desc->sync, 1);
dprintk("%s: MCCTRL_OS_CPU_%s_REGISTER: CPU: %d, addr_ext: 0x%lx, val: 0x%lx\n",
__FUNCTION__,
(op == MCCTRL_OS_CPU_READ_REGISTER ? "READ" : "WRITE"),
desc->addr, desc->val);
(op == MCCTRL_OS_CPU_READ_REGISTER ? "READ" : "WRITE"), cpu,
desc->addr_ext, desc->val);
out:
if (do_free) {
kfree(ldesc);
}
return ret;
}
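The rewritten register read/write illustrates a pattern that recurs in this change set: the response descriptor is heap-allocated so it can outlive the requesting thread if the interruptible wait is cut short by a signal, and ownership of the buffer is decided by the callee through a do_free out-parameter. A condensed sketch of that pattern, with a hypothetical send_and_wait() standing in for mcctrl_ikc_send_wait():
#include <linux/slab.h>

struct demo_desc { unsigned long val; };

/* Hypothetical stand-in for mcctrl_ikc_send_wait(): clears *do_free
 * when it keeps the buffer alive for a late response, so the caller
 * must free only conditionally. */
int send_and_wait(struct demo_desc *d, int *do_free);

static int demo_rw_register(void)
{
	struct demo_desc *d;
	int do_free = 0;
	int ret;

	d = kmalloc(sizeof(*d), GFP_KERNEL);
	if (!d)
		return -ENOMEM;

	ret = send_and_wait(d, &do_free);
	if (do_free)
		kfree(d);	/* safe: no late writer holds d anymore */
	return ret;
}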


@ -21,8 +21,10 @@
* 2013/08/19 shirasawa mcexec forward signal to MIC process
*/
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/miscdevice.h>
#include <linux/slab.h>
@ -80,11 +82,13 @@ static struct ihk_os_user_call_handler mcctrl_uchs[] = {
{ .request = MCEXEC_UP_CLOSE_EXEC, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_GET_CRED, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_GET_CREDV, .func = mcctrl_ioctl },
#ifdef MCEXEC_BIND_MOUNT
{ .request = MCEXEC_UP_SYS_MOUNT, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_SYS_UMOUNT, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_SYS_UNSHARE, .func = mcctrl_ioctl },
#endif // MCEXEC_BIND_MOUNT
{ .request = MCEXEC_UP_UTI_GET_CTX, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_UTI_SAVE_FS, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_UTI_SWITCH_CTX, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_SIG_THREAD, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_SYSCALL_THREAD, .func = mcctrl_ioctl },
{ .request = MCEXEC_UP_TERMINATE_THREAD, .func = mcctrl_ioctl },
@ -170,14 +174,6 @@ error_cleanup_channels:
int mcctrl_os_shutdown_notifier(int os_index)
{
if (os[os_index]) {
/* Wait for os running */
if (ihk_os_wait_for_status(os[os_index], IHK_OS_STATUS_RUNNING, 0, 200) != 0) {
printk("IHK: OS does not become RUNNING in shutdown. Force shutdown.\n");
/* send nmi to force shutdown */
ihk_os_send_nmi(os[os_index], 3);
mdelay(200);
}
pager_cleanup();
sysfsm_cleanup(os[os_index]);
free_topology_info(os[os_index]);


@ -208,6 +208,8 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
case SCD_MSG_PERF_ACK:
case SCD_MSG_SEND_SIGNAL_ACK:
case SCD_MSG_PROCFS_ANSWER:
case SCD_MSG_REMOTE_PAGE_FAULT_ANSWER:
case SCD_MSG_CPU_RW_REG_RESP:
mcctrl_wakeup_cb(__os, pisp);
break;
@ -238,10 +240,6 @@ static int syscall_packet_handler(struct ihk_ikc_channel_desc *c,
get_vdso_info(__os, pisp->arg);
break;
case SCD_MSG_CPU_RW_REG_RESP:
mcctrl_os_read_write_cpu_response(__os, pisp);
break;
case SCD_MSG_EVENTFD:
dkprintf("%s: SCD_MSG_EVENTFD,pisp->eventfd_type=%d\n", __FUNCTION__, pisp->eventfd_type);
mcctrl_eventfd(__os, pisp);
@ -464,7 +462,7 @@ int prepare_ikc_channels(ihk_os_t os)
int i;
int ret = 0;
usrdata = kzalloc(sizeof(struct mcctrl_usrdata), GFP_KERNEL);
usrdata = kzalloc(sizeof(struct mcctrl_usrdata), GFP_ATOMIC);
if (!usrdata) {
printk("%s: error: allocating mcctrl_usrdata\n", __FUNCTION__);
ret = -ENOMEM;
@ -490,7 +488,7 @@ int prepare_ikc_channels(ihk_os_t os)
usrdata->num_channels = usrdata->cpu_info->n_cpus;
usrdata->channels = kzalloc(sizeof(struct mcctrl_channel) *
usrdata->num_channels,
GFP_KERNEL);
GFP_ATOMIC);
if (!usrdata->channels) {
printk("Error: cannot allocate channels.\n");
@ -499,7 +497,7 @@ int prepare_ikc_channels(ihk_os_t os)
}
usrdata->ikc2linux = kzalloc(sizeof(struct ihk_ikc_channel_desc *) *
nr_cpu_ids, GFP_KERNEL);
nr_cpu_ids, GFP_ATOMIC);
if (!usrdata->ikc2linux) {
printk("Error: cannot allocate ikc2linux channels.\n");
@ -515,6 +513,7 @@ int prepare_ikc_channels(ihk_os_t os)
init_waitqueue_head(&usrdata->wq_procfs);
mutex_init(&usrdata->reserve_lock);
mutex_init(&usrdata->part_exec_lock);
for (i = 0; i < MCCTRL_PER_PROC_DATA_HASH_SIZE; ++i) {
INIT_LIST_HEAD(&usrdata->per_proc_data_hash[i]);
@ -523,10 +522,8 @@ int prepare_ikc_channels(ihk_os_t os)
INIT_LIST_HEAD(&usrdata->cpu_topology_list);
INIT_LIST_HEAD(&usrdata->node_topology_list);
INIT_LIST_HEAD(&usrdata->part_exec_list);
mutex_init(&usrdata->part_exec.lock);
INIT_LIST_HEAD(&usrdata->part_exec.pli_list);
usrdata->part_exec.nr_processes = -1;
INIT_LIST_HEAD(&usrdata->wakeup_descs_list);
spin_lock_init(&usrdata->wakeup_descs_lock);
@ -582,6 +579,18 @@ void destroy_ikc_channels(ihk_os_t os)
kfree(usrdata->channels);
kfree(usrdata->ikc2linux);
mutex_lock(&usrdata->part_exec_lock);
while (!list_empty(&usrdata->part_exec_list)) {
struct mcctrl_part_exec *pe;
pe = list_first_entry(&usrdata->part_exec_list,
struct mcctrl_part_exec, chain);
list_del(&pe->chain);
kfree(pe);
}
mutex_unlock(&usrdata->part_exec_lock);
kfree(usrdata);
}


@ -69,6 +69,9 @@
#define SCD_MSG_PROCFS_ANSWER 0x13
#define SCD_MSG_PROCFS_RELEASE 0x15
#define SCD_MSG_REMOTE_PAGE_FAULT 0x18
#define SCD_MSG_REMOTE_PAGE_FAULT_ANSWER 0x19
#define SCD_MSG_DEBUG_LOG 0x20
#define SCD_MSG_SYSFS_REQ_CREATE 0x30
@ -121,12 +124,6 @@ enum mcctrl_os_cpu_operation {
MCCTRL_OS_CPU_MAX_OP
};
/* Used to wake-up a Linux thread futex_wait()-ing */
struct uti_futex_resp {
int done;
wait_queue_head_t wq;
};
struct ikc_scd_packet {
struct ihk_ikc_packet_header header;
int msg;
@ -157,7 +154,7 @@ struct ikc_scd_packet {
/* SCD_MSG_CPU_RW_REG */
struct {
struct ihk_os_cpu_register desc;
unsigned long pdesc; /* Physical addr of the descriptor */
enum mcctrl_os_cpu_operation op;
void *resp;
};
@ -172,6 +169,14 @@ struct ikc_scd_packet {
void *resp;
int *spin_sleep; /* 1: waiting in linux_wait_event() 0: woken up by someone else */
} futex;
/* SCD_MSG_REMOTE_PAGE_FAULT */
struct {
int target_cpu;
int fault_tid;
unsigned long fault_address;
unsigned long fault_reason;
};
};
/* char padding[8]; */ /* We want the size to be 128 bytes */
};
@ -289,11 +294,7 @@ struct cache_topology {
struct list_head chain;
};
#ifdef POSTK_DEBUG_ARCH_DEP_40 /* cpu_topology name change */
struct mcctrl_cpu_topology {
#else /* POSTK_DEBUG_ARCH_DEP_40 */
struct cpu_topology {
#endif /* POSTK_DEBUG_ARCH_DEP_40 */
//struct mcctrl_usrdata *udp;
struct ihk_cpu_topology *saved;
int mckernel_cpu_id;
@ -323,13 +324,20 @@ struct process_list_item {
wait_queue_head_t pli_wq;
};
#define PE_LIST_MAXLEN 5
struct mcctrl_part_exec {
struct mutex lock;
int nr_processes;
/* number of processes to let in / out the synchronization point */
int nr_processes_left;
/* number of processes which have joined the partition */
int nr_processes_joined;
int process_rank;
pid_t node_proxy_pid;
cpumask_t cpus_used;
struct list_head pli_list;
struct list_head chain;
};
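struct mcctrl_part_exec is no longer a single instance embedded in mcctrl_usrdata: entries now carry a chain member and hang off usrdata->part_exec_list under part_exec_lock, and destroy_ikc_channels() (earlier in this diff) drains that list on teardown. A lookup-or-create sketch; the helper name and the choice of node_proxy_pid as lookup key are assumptions, not shown by these hunks:
static struct mcctrl_part_exec *demo_get_part_exec(
		struct mcctrl_usrdata *ud, pid_t proxy_pid)
{
	struct mcctrl_part_exec *pe;

	mutex_lock(&ud->part_exec_lock);
	list_for_each_entry(pe, &ud->part_exec_list, chain) {
		if (pe->node_proxy_pid == proxy_pid)
			goto out;	/* existing partition */
	}
	pe = kzalloc(sizeof(*pe), GFP_KERNEL);
	if (pe) {
		mutex_init(&pe->lock);
		INIT_LIST_HEAD(&pe->pli_list);
		pe->node_proxy_pid = proxy_pid;
		pe->nr_processes = -1;	/* not negotiated yet */
		list_add_tail(&pe->chain, &ud->part_exec_list);
	}
out:
	mutex_unlock(&ud->part_exec_lock);
	return pe;
}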
#define CPU_LONGS (((NR_CPUS) + (BITS_PER_LONG) - 1) / (BITS_PER_LONG))
@ -352,6 +360,7 @@ struct mcctrl_usrdata {
int job_pos;
int mcctrl_dma_abort;
struct mutex reserve_lock;
struct mutex part_exec_lock;
unsigned long last_thread_exec;
wait_queue_head_t wq_procfs;
struct list_head per_proc_data_hash[MCCTRL_PER_PROC_DATA_HASH_SIZE];
@ -367,7 +376,7 @@ struct mcctrl_usrdata {
nodemask_t numa_online;
struct list_head cpu_topology_list;
struct list_head node_topology_list;
struct mcctrl_part_exec part_exec;
struct list_head part_exec_list;
int perf_event_num;
};
@ -448,40 +457,8 @@ void mcctrl_put_per_proc_data(struct mcctrl_per_proc_data *ppd);
int mcctrl_add_per_thread_data(struct mcctrl_per_proc_data *ppd, void *data);
void mcctrl_put_per_thread_data_unsafe(struct mcctrl_per_thread_data *ptd);
void mcctrl_put_per_thread_data(struct mcctrl_per_thread_data* ptd);
#ifdef POSTK_DEBUG_ARCH_DEP_56 /* Strange how to use inline declaration fix. */
static inline struct mcctrl_per_thread_data *mcctrl_get_per_thread_data(struct mcctrl_per_proc_data *ppd,
struct task_struct *task)
{
struct mcctrl_per_thread_data *ptd_iter, *ptd = NULL;
int hash = (((uint64_t)task >> 4) & MCCTRL_PER_THREAD_DATA_HASH_MASK);
unsigned long flags;
/* Check if data for this thread exists */
write_lock_irqsave(&ppd->per_thread_data_hash_lock[hash], flags);
list_for_each_entry(ptd_iter, &ppd->per_thread_data_hash[hash], hash) {
if (ptd_iter->task == task) {
ptd = ptd_iter;
break;
}
}
if (ptd) {
if (atomic_read(&ptd->refcount) <= 0) {
printk("%s: ERROR: use-after-free detected (%d)", __FUNCTION__, atomic_read(&ptd->refcount));
ptd = NULL;
goto out;
}
atomic_inc(&ptd->refcount);
}
out:
write_unlock_irqrestore(&ppd->per_thread_data_hash_lock[hash], flags);
return ptd;
}
#else /* POSTK_DEBUG_ARCH_DEP_56 */
inline struct mcctrl_per_thread_data *mcctrl_get_per_thread_data(struct mcctrl_per_proc_data *ppd, struct task_struct *task);
#endif /* POSTK_DEBUG_ARCH_DEP_56 */
struct mcctrl_per_thread_data *mcctrl_get_per_thread_data(struct mcctrl_per_proc_data *ppd,
struct task_struct *task);
int mcctrl_clear_pte_range(uintptr_t start, uintptr_t len);
void __return_syscall(ihk_os_t os, struct ikc_scd_packet *packet,
@ -526,24 +503,6 @@ void reply_get_cpu_mapping(long req_pa);
void free_topology_info(ihk_os_t os);
/* archdep.c */
#ifndef POSTK_DEBUG_ARCH_DEP_52
#define VDSO_MAXPAGES 2
struct vdso {
long busy;
int vdso_npages;
char vvar_is_global;
char hpet_is_global;
char pvti_is_global;
char padding;
long vdso_physlist[VDSO_MAXPAGES];
void *vvar_virt;
long vvar_phys;
void *hpet_virt;
long hpet_phys;
void *pvti_virt;
long pvti_phys;
};
#endif /*POSTK_DEBUG_ARCH_DEP_52*/
int reserve_user_space(struct mcctrl_usrdata *usrdata, unsigned long *startp,
unsigned long *endp);
@ -573,8 +532,28 @@ struct ihk_perf_event_attr{
};
struct mcctrl_ioctl_getrusage_desc {
void* rusage;
struct ihk_os_rusage *rusage;
size_t size_rusage;
};
/* uti */
long mcctrl_switch_ctx(ihk_os_t os, struct uti_switch_ctx_desc __user *desc,
struct file *file);
long arch_switch_ctx(struct uti_switch_ctx_desc *desc);
struct host_thread {
struct list_head list;
struct mcos_handler_info *handler;
int pid;
int tid;
unsigned long usp;
unsigned long ltls;
unsigned long rtls;
};
/* Used to wake-up a Linux thread futex_wait()-ing */
struct uti_futex_resp {
int done;
wait_queue_head_t wq;
};
#endif


@ -1110,10 +1110,10 @@ static const struct procfs_entry pid_entry_stuff[] = {
// PROC_LNK("exe", mckernel_readlink),
// PROC_REG("limits", S_IRUSR|S_IWUSR, NULL),
PROC_REG("maps", 0444, &mckernel_buff_io),
PROC_REG("mem", 0400, NULL),
PROC_REG("mem", 0600, NULL),
PROC_REG("pagemap", 0444, NULL),
// PROC_REG("smaps", S_IRUGO, NULL),
// PROC_REG("stat", 0444, &mckernel_buff_io),
PROC_REG("stat", 0444, &mckernel_buff_io),
// PROC_REG("statm", S_IRUGO, NULL),
PROC_REG("status", 0444, &mckernel_buff_io),
// PROC_REG("syscall", S_IRUGO, NULL),


@ -43,6 +43,8 @@
#include <linux/semaphore.h>
#include <linux/spinlock.h>
#include <linux/mount.h>
#include <linux/kdev_t.h>
#include <linux/hugetlb.h>
#include <asm/uaccess.h>
#include <asm/delay.h>
#include <asm/io.h>
@ -178,8 +180,8 @@ int mcctrl_add_per_thread_data(struct mcctrl_per_proc_data *ppd, void *data)
return ret;
}
#ifndef POSTK_DEBUG_ARCH_DEP_56 /* Strange how to use inline declaration fix. */
struct mcctrl_per_thread_data *mcctrl_get_per_thread_data(struct mcctrl_per_proc_data *ppd, struct task_struct *task)
struct mcctrl_per_thread_data *mcctrl_get_per_thread_data(struct mcctrl_per_proc_data *ppd,
struct task_struct *task)
{
struct mcctrl_per_thread_data *ptd_iter, *ptd = NULL;
int hash = (((uint64_t)task >> 4) & MCCTRL_PER_THREAD_DATA_HASH_MASK);
@ -208,7 +210,6 @@ struct mcctrl_per_thread_data *mcctrl_get_per_thread_data(struct mcctrl_per_proc
read_unlock_irqrestore(&ppd->per_thread_data_hash_lock[hash], flags);
return ptd;
}
#endif /* !POSTK_DEBUG_ARCH_DEP_56 */
static int __notify_syscall_requester(ihk_os_t os, struct ikc_scd_packet *packet,
struct syscall_response *res)
@ -227,19 +228,19 @@ static int __notify_syscall_requester(ihk_os_t os, struct ikc_scd_packet *packet
c = (usrdata->channels + packet->ref)->c;
/* If spinning, no need for IKC message */
if (__sync_bool_compare_and_swap(&res->req_thread_status,
if (cmpxchg(&res->req_thread_status,
IHK_SCD_REQ_THREAD_SPINNING,
IHK_SCD_REQ_THREAD_TO_BE_WOKEN)) {
IHK_SCD_REQ_THREAD_TO_BE_WOKEN) ==
IHK_SCD_REQ_THREAD_SPINNING) {
dprintk("%s: no need to send IKC message for PID %d\n",
__FUNCTION__, packet->pid);
__FUNCTION__, packet->pid);
return ret;
}
/* Wait until the status goes back to IHK_SCD_REQ_THREAD_SPINNING or
IHK_SCD_REQ_THREAD_DESCHEDULED because two wake-up attempts are competing.
Note that mcexec_terminate_thread() and remote page fault and
returning EINTR would compete. */
if (res->req_thread_status == IHK_SCD_REQ_THREAD_TO_BE_WOKEN) {
Note that mcexec_terminate_thread() and returning EINTR would compete. */
if (smp_load_acquire(&res->req_thread_status) == IHK_SCD_REQ_THREAD_TO_BE_WOKEN) {
printk("%s: INFO: someone else is waking up the McKernel thread, "
"pid: %d, req status: %lu, syscall nr: %lu\n",
__FUNCTION__, packet->pid,
@ -247,9 +248,10 @@ static int __notify_syscall_requester(ihk_os_t os, struct ikc_scd_packet *packet
}
/* The thread is not spinning any more, make sure it's descheduled */
if (!__sync_bool_compare_and_swap(&res->req_thread_status,
if (cmpxchg(&res->req_thread_status,
IHK_SCD_REQ_THREAD_DESCHEDULED,
IHK_SCD_REQ_THREAD_TO_BE_WOKEN)) {
IHK_SCD_REQ_THREAD_TO_BE_WOKEN) !=
IHK_SCD_REQ_THREAD_DESCHEDULED) {
printk("%s: WARNING: inconsistent requester status, "
"pid: %d, req status: %lu, syscall nr: %lu\n",
__FUNCTION__, packet->pid,
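Both compare-and-swap sites above move from the GCC __sync_bool_compare_and_swap() builtin to the kernel's cmpxchg(). The builtin returns a success boolean, while cmpxchg() returns the value it actually observed, so the success test becomes a comparison against the expected old value; the plain read of req_thread_status between the two swaps correspondingly becomes an smp_load_acquire() for ordering. A minimal sketch of the translation:
/* old (GCC builtin, returns bool):
 *	if (__sync_bool_compare_and_swap(st, SPINNING, TO_BE_WOKEN)) ...
 * new (kernel primitive, returns the observed value): */
static void demo_cas(unsigned long *st)
{
	if (cmpxchg(st, IHK_SCD_REQ_THREAD_SPINNING,
		    IHK_SCD_REQ_THREAD_TO_BE_WOKEN) ==
	    IHK_SCD_REQ_THREAD_SPINNING) {
		/* the swap happened: this caller owns the wake-up */
	}
}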
@ -273,7 +275,7 @@ long syscall_backward(struct mcctrl_usrdata *usrdata, int num,
unsigned long *ret)
{
struct ikc_scd_packet *packet;
struct ikc_scd_packet *free_packet = NULL;
struct syscall_request *req;
struct syscall_response *resp;
unsigned long syscall_ret;
@ -282,15 +284,16 @@ long syscall_backward(struct mcctrl_usrdata *usrdata, int num,
struct mcctrl_per_proc_data *ppd;
struct mcctrl_per_thread_data *ptd;
unsigned long phys;
struct syscall_request _request[2];
struct syscall_request *request;
struct syscall_request *request = NULL;
int retry;
if (((unsigned long)_request ^ (unsigned long)(_request + 1)) &
~(PAGE_SIZE -1))
request = _request + 1;
else
request = _request;
request = kmalloc(sizeof(struct syscall_request), GFP_ATOMIC);
if (!request) {
printk("%s: ERROR: allocating request\n", __func__);
syscall_ret = -ENOMEM;
goto no_ppd;
}
request->number = num;
request->args[0] = arg1;
request->args[1] = arg2;
@ -305,8 +308,9 @@ long syscall_backward(struct mcctrl_usrdata *usrdata, int num,
if (!ppd) {
kprintf("%s: ERROR: no per-process structure for PID %d??\n",
__FUNCTION__, task_tgid_vnr(current));
return -EINVAL;
__func__, task_tgid_vnr(current));
syscall_ret = -EINVAL;
goto no_ppd;
}
ptd = mcctrl_get_per_thread_data(ppd, current);
@ -454,11 +458,13 @@ out:
out_put_ppd:
mcctrl_put_per_thread_data(ptd);
pr_ptd("put", task_pid_vnr(current), ptd);
no_ptd:
dprintk("%s: tid: %d, syscall: %d, syscall_ret: %lx\n",
__FUNCTION__, task_pid_vnr(current), num, syscall_ret);
mcctrl_put_per_proc_data(ppd);
no_ppd:
kfree(request);
return syscall_ret;
}
@ -479,214 +485,43 @@ extern struct host_thread *host_threads;
extern rwlock_t host_thread_lock;
#endif
int remote_page_fault(struct mcctrl_usrdata *usrdata, void *fault_addr, uint64_t reason)
int remote_page_fault(struct mcctrl_usrdata *usrdata, void *fault_addr,
uint64_t reason, struct mcctrl_per_proc_data *ppd,
struct ikc_scd_packet *packet)
{
struct ikc_scd_packet *packet;
struct ikc_scd_packet *free_packet = NULL;
struct syscall_request *req;
struct syscall_response *resp;
int error;
struct wait_queue_head_list_node *wqhln;
unsigned long irqflags;
struct mcctrl_per_proc_data *ppd;
struct mcctrl_per_thread_data *ptd;
unsigned long phys;
int retry;
struct mcctrl_wakeup_desc *desc;
int do_frees = 1;
dprintk("%s: tid: %d, fault_addr: %p, reason: %lu\n",
__FUNCTION__, task_pid_vnr(current), fault_addr, (unsigned long)reason);
/* Look up per-process structure */
ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(current));
if (!ppd) {
kprintf("%s: ERROR: no per-process structure for PID %d??\n",
__FUNCTION__, task_tgid_vnr(current));
return -EINVAL;
}
ptd = mcctrl_get_per_thread_data(ppd, current);
if (!ptd) {
printk("%s: ERROR: mcctrl_get_per_thread_data failed\n", __FUNCTION__);
error = -ENOENT;
goto no_ptd;
}
pr_ptd("get", task_pid_vnr(current), ptd);
packet = (struct ikc_scd_packet *)ptd->data;
if (!packet) {
printk("%s: no packet registered for TID %d\n",
__FUNCTION__, task_pid_vnr(current));
error = -ENOENT;
goto out_put_ppd;
}
req = &packet->req;
/* Map response structure */
phys = ihk_device_map_memory(ihk_os_to_dev(usrdata->os),
packet->resp_pa, sizeof(*resp));
resp = ihk_device_map_virtual(ihk_os_to_dev(usrdata->os),
phys, sizeof(*resp), NULL, 0);
if (!resp) {
printk("%s: ERROR: invalid response structure address\n",
__FUNCTION__);
error = -EINVAL;
goto out;
}
retry_alloc:
wqhln = kmalloc(sizeof(*wqhln), GFP_ATOMIC);
if (!wqhln) {
printk("WARNING: coudln't alloc wait queue head, retrying..\n");
goto retry_alloc;
}
memset(wqhln, 0, sizeof(struct wait_queue_head_list_node));
/* Prepare per-thread wait queue head */
wqhln->task = current;
/* Save the TID explicitly, because mcexec_syscall(), where the request
* will be matched, is in IRQ context and can't call task_pid_vnr() */
wqhln->rtid = task_pid_vnr(current);
wqhln->req = 0;
init_waitqueue_head(&wqhln->wq_syscall);
irqflags = ihk_ikc_spinlock_lock(&ppd->wq_list_lock);
/* Add to exact list */
list_add_tail(&wqhln->list, &ppd->wq_list_exact);
ihk_ikc_spinlock_unlock(&ppd->wq_list_lock, irqflags);
/* Request page fault */
resp->ret = -EFAULT;
resp->fault_address = (unsigned long)fault_addr;
resp->fault_reason = reason;
resp->stid = task_pid_vnr(current);
packet->msg = SCD_MSG_REMOTE_PAGE_FAULT;
packet->fault_address = (unsigned long)fault_addr;
packet->fault_reason = reason;
#define STATUS_PAGER_COMPLETED 1
#define STATUS_PAGE_FAULT 3
req->valid = 0;
if (__notify_syscall_requester(usrdata->os, packet, resp) < 0) {
printk("%s: WARNING: failed to notify PID %d\n",
__FUNCTION__, packet->pid);
/* we need to alloc desc ourselves because GFP_ATOMIC */
retry_alloc:
desc = kmalloc(sizeof(*desc), GFP_ATOMIC);
if (!desc) {
pr_warn("WARNING: coudln't alloc remote page fault wait desc, retrying..\n");
goto retry_alloc;
}
mb();
resp->status = STATUS_PAGE_FAULT;
retry = 0;
for (;;) {
dprintk("%s: tid: %d, fault_addr: %p SLEEPING\n",
__FUNCTION__, task_pid_vnr(current), fault_addr);
/* wait for response */
error = wait_event_interruptible(wqhln->wq_syscall, wqhln->req);
/* Delay signal handling */
if (error == -ERESTARTSYS) {
printk("%s: INFO: interrupted by signal\n", __FUNCTION__);
retry++;
if (retry < 5) { /* mcexec is alive */
printk("%s: INFO: retry=%d\n", __FUNCTION__, retry);
continue;
}
}
/* Remove per-thread wait queue head */
irqflags = ihk_ikc_spinlock_lock(&ppd->wq_list_lock);
list_del(&wqhln->list);
ihk_ikc_spinlock_unlock(&ppd->wq_list_lock, irqflags);
dprintk("%s: tid: %d, fault_addr: %p WOKEN UP\n",
__FUNCTION__, task_pid_vnr(current), fault_addr);
if (retry >= 5) {
kfree(wqhln);
kprintf("%s: INFO: mcexec is gone or retry count exceeded,pid=%d,retry=%d\n", __FUNCTION__, task_tgid_vnr(current), retry);
error = -EINVAL;
goto out;
}
if (error) {
kfree(wqhln);
printk("remote_page_fault:interrupted. %d\n", error);
goto out;
}
else {
/* Update packet reference */
packet = wqhln->packet;
free_packet = packet;
req = &packet->req;
{
unsigned long phys2;
struct syscall_response *resp2;
phys2 = ihk_device_map_memory(ihk_os_to_dev(usrdata->os),
packet->resp_pa, sizeof(*resp));
resp2 = ihk_device_map_virtual(ihk_os_to_dev(usrdata->os),
phys2, sizeof(*resp), NULL, 0);
if (resp != resp2) {
resp = resp2;
phys = phys2;
printk("%s: updated new remote PA for resp\n", __FUNCTION__);
}
}
}
if (!req->valid) {
printk("remote_page_fault:not valid\n");
}
req->valid = 0;
/* check result */
if (req->number != __NR_mmap) {
printk("remote_page_fault:unexpected response. %lx %lx\n",
req->number, req->args[0]);
error = -EIO;
goto out;
}
#define PAGER_REQ_RESUME 0x0101
else if (req->args[0] != PAGER_REQ_RESUME) {
resp->ret = pager_call(usrdata->os, (void *)req);
if (__notify_syscall_requester(usrdata->os, packet, resp) < 0) {
printk("%s: WARNING: failed to notify PID %d\n",
__FUNCTION__, packet->pid);
}
mb();
resp->status = STATUS_PAGER_COMPLETED;
break;
//continue;
}
else {
error = req->args[1];
if (error) {
printk("remote_page_fault:response %d\n", error);
kfree(wqhln);
goto out;
}
}
break;
/* packet->target_cpu was set in rus_vm_fault if a thread was found */
error = mcctrl_ikc_send_wait(usrdata->os, packet->target_cpu, packet,
0, desc, &do_frees, 0);
if (do_frees)
kfree(desc);
if (error < 0) {
pr_warn("%s: WARNING: failed to request remote page fault PID %d: %d\n",
__func__, packet->pid, error);
}
kfree(wqhln);
error = 0;
out:
/* Release remote page-fault response packet */
if (free_packet) {
ihk_ikc_release_packet((struct ihk_ikc_free_packet *)free_packet);
}
ihk_device_unmap_virtual(ihk_os_to_dev(usrdata->os), resp, sizeof(*resp));
ihk_device_unmap_memory(ihk_os_to_dev(usrdata->os), phys, sizeof(*resp));
out_put_ppd:
mcctrl_put_per_thread_data(ptd);
pr_ptd("put", task_pid_vnr(current), ptd);
no_ptd:
dprintk("%s: tid: %d, fault_addr: %p, reason: %lu, error: %d\n",
__FUNCTION__, task_pid_vnr(current), fault_addr, (unsigned long)reason, error);
mcctrl_put_per_proc_data(ppd);
__func__, task_pid_vnr(current), fault_addr,
(unsigned long)reason, error);
return error;
}
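With this rewrite, remote_page_fault() no longer parks on a private wait queue and replays the mmap-style handshake; it forwards the SCD_MSG_REMOTE_PAGE_FAULT packet prepared by rus_vm_fault() through mcctrl_ikc_send_wait(), and the SCD_MSG_REMOTE_PAGE_FAULT_ANSWER completion is routed through mcctrl_wakeup_cb() (see the syscall_packet_handler change earlier in this diff). A sketch of the message fields, matching the members added to struct ikc_scd_packet; the concrete values are illustrative only:
static void demo_fill_fault_packet(struct ikc_scd_packet *pkt)
{
	pkt->msg = SCD_MSG_REMOTE_PAGE_FAULT;
	pkt->target_cpu = 0;		/* CPU of the faulting McKernel thread */
	pkt->fault_tid = 1234;		/* remote TID, illustrative */
	pkt->fault_address = 0x7f00deadb000UL;	/* illustrative */
	pkt->fault_reason = 0x02;	/* PF_WRITE, as defined in rus_vm_fault() */
}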
@ -704,7 +539,11 @@ out_put_ppd:
#define USE_VM_INSERT_PFN 1
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0)
#if defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(8, 2)
static vm_fault_t rus_vm_fault(struct vm_fault *vmf)
#else
static int rus_vm_fault(struct vm_fault *vmf)
#endif
{
struct vm_area_struct *vma = vmf->vma;
#else
@ -726,70 +565,70 @@ static int rus_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
#endif
struct mcctrl_per_proc_data *ppd;
struct mcctrl_per_thread_data *ptd;
struct ikc_scd_packet *packet;
struct task_struct *task = current;
struct ikc_scd_packet packet = { };
unsigned long rsysnum = 0;
int ret = 0;
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0)
dprintk("mcctrl:page fault:flags %#x pgoff %#lx va %#lx page %p\n",
vmf->flags, vmf->pgoff, vmf->address, vmf->page);
unsigned long addr = vmf->address;
#else /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
dprintk("mcctrl:page fault:flags %#x pgoff %#lx va %p page %p\n",
vmf->flags, vmf->pgoff, vmf->virtual_address, vmf->page);
void __user *addr = vmf->virtual_address;
#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
/* Look up per-process structure */
ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(current));
ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(task));
if (!ppd) {
kprintf("%s: INFO: no per-process structure for pid %d (tid %d), try to use pid %d\n",
__FUNCTION__, task_tgid_vnr(current), task_pid_vnr(current), vma->vm_mm->owner->pid);
ppd = mcctrl_get_per_proc_data(usrdata, vma->vm_mm->owner->pid);
pr_err("%s: INFO: no per-process structure for "
"pid %d (tid %d), trying to use pid %d\n",
__func__,
task_tgid_vnr(task), task_pid_vnr(task),
vma->vm_mm->owner->pid);
task = vma->vm_mm->owner;
ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(task));
}
if (!ppd) {
kprintf("%s: ERROR: no per-process structure for PID %d??\n",
__FUNCTION__, task_tgid_vnr(current));
pr_err("%s: ERROR: no per-process structure for PID %d??\n",
__func__, task_tgid_vnr(task));
ret = VM_FAULT_SIGBUS;
goto no_ppd;
}
packet.fault_tid = ppd->pid;
ptd = mcctrl_get_per_thread_data(ppd, current);
if (!ptd) {
printk("%s: ERROR: mcctrl_get_per_thread_data failed\n", __FUNCTION__);
ret = VM_FAULT_SIGBUS;
goto no_ptd;
ptd = mcctrl_get_per_thread_data(ppd, task);
if (ptd) {
struct ikc_scd_packet *ptd_packet;
pr_ptd("get", task_pid_vnr(task), ptd);
ptd_packet = (struct ikc_scd_packet *)ptd->data;
if (ptd_packet) {
packet.target_cpu = ptd_packet->ref;
packet.fault_tid = ptd_packet->req.rtid;
rsysnum = ptd_packet->req.number;
}
mcctrl_put_per_thread_data(ptd);
pr_ptd("put", task_pid_vnr(task), ptd);
}
pr_ptd("get", task_pid_vnr(current), ptd);
packet = (struct ikc_scd_packet *)ptd->data;
if (!packet) {
/* Don't even bother looking up NULL */
if (!addr) {
pr_warn("%s: WARNING: attempted NULL pointer access\n",
__func__);
ret = VM_FAULT_SIGBUS;
printk("%s: no packet registered for TID %d\n",
__FUNCTION__, task_pid_vnr(current));
goto put_and_out;
}
for (try = 1; ; ++try) {
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0)
error = translate_rva_to_rpa(usrdata->os, ppd->rpgtable,
vmf->address, &rpa, &pgsize);
#else /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
error = translate_rva_to_rpa(usrdata->os, ppd->rpgtable,
(unsigned long)vmf->virtual_address,
&rpa, &pgsize);
#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
(unsigned long)addr, &rpa, &pgsize);
#define NTRIES 2
if (!error || (try >= NTRIES)) {
if (error) {
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0)
printk("%s: error translating 0x%#lx "
"(req: TID: %u, syscall: %lu)\n",
__FUNCTION__, vmf->address,
packet->req.rtid, packet->req.number);
#else /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
printk("%s: error translating 0x%p "
"(req: TID: %u, syscall: %lu)\n",
__FUNCTION__, vmf->virtual_address,
packet->req.rtid, packet->req.number);
#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
pr_err("%s: error translating 0x%#lx "
"(req: TID: %u, syscall: %lu)\n",
__func__,
(unsigned long)addr,
packet.fault_tid, rsysnum);
}
break;
@ -800,23 +639,14 @@ static int rus_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
#define PF_WRITE 0x02
reason |= PF_WRITE;
}
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0)
error = remote_page_fault(usrdata, (void *)vmf->address, reason);
#else /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
error = remote_page_fault(usrdata, vmf->virtual_address, reason);
#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
error = remote_page_fault(usrdata, (void *)addr,
reason, ppd, &packet);
if (error) {
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0)
printk("%s: error forwarding PF for 0x%#lx "
"(req: TID: %d, syscall: %lu)\n",
__FUNCTION__, vmf->address,
packet->req.rtid, packet->req.number);
#else /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
printk("%s: error forwarding PF for 0x%p "
"(req: TID: %d, syscall: %lu)\n",
__FUNCTION__, vmf->virtual_address,
packet->req.rtid, packet->req.number);
#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
pr_err("%s: error forwarding PF for 0x%#lx "
"(req: TID: %d, syscall: %lu)\n",
__func__,
(unsigned long)addr,
packet.fault_tid, rsysnum);
break;
}
}
@ -825,11 +655,7 @@ static int rus_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
goto put_and_out;
}
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0)
rva = vmf->address & ~(pgsize - 1);
#else /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
rva = (unsigned long)vmf->virtual_address & ~(pgsize - 1);
#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
rva = (unsigned long)addr & ~(pgsize - 1);
rpa = rpa & ~(pgsize - 1);
phys = ihk_device_map_memory(dev, rpa, pgsize);
@ -849,21 +675,13 @@ static int rus_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
error = vm_insert_page(vma, rva+(pix*PAGE_SIZE), page);
if (error) {
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0)
printk("%s: error inserting mapping for 0x%#lx "
"(req: TID: %d, syscall: %lu) error: %d, "
"vm_start: 0x%lx, vm_end: 0x%lx\n",
__FUNCTION__, vmf->address,
packet->req.rtid, packet->req.number, error,
vma->vm_start, vma->vm_end);
#else /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
printk("%s: error inserting mapping for 0x%p "
"(req: TID: %d, syscall: %lu) error: %d, "
"vm_start: 0x%lx, vm_end: 0x%lx\n",
__FUNCTION__, vmf->virtual_address,
packet->req.rtid, packet->req.number, error,
vma->vm_start, vma->vm_end);
#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
pr_err("%s: error inserting mapping for 0x%#lx "
"(req: TID: %d, syscall: %lu) error: %d,"
" vm_start: 0x%lx, vm_end: 0x%lx\n",
__func__,
(unsigned long)addr, packet.fault_tid,
rsysnum, error,
vma->vm_start, vma->vm_end);
}
}
else
@ -875,16 +693,13 @@ static int rus_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
pfn+pix);
#endif
if (error) {
#if 1 /* POSTK_DEBUG_TEMP_FIX_11 */ /* rus_vm_fault() multi-thread fix */
printk("%s: vm_insert_pfn returned %d\n", __FUNCTION__, error);
pr_err("%s: vm_insert_pfn returned %d\n",
__func__, error);
if (error == -EBUSY) {
error = 0;
} else {
break;
}
#else /* POSTK_DEBUG_TEMP_FIX_11 */
break;
#endif /* POSTK_DEBUG_TEMP_FIX_11 */
}
}
#else
@ -892,17 +707,11 @@ static int rus_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
#endif
ihk_device_unmap_memory(dev, phys, pgsize);
if (error) {
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0)
printk("%s: remote PF failed for 0x%#lx, pgoff: %lu "
"(req: TID: %d, syscall: %lu)\n",
__FUNCTION__, vmf->address, vmf->pgoff,
packet->req.rtid, packet->req.number);
#else /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
printk("%s: remote PF failed for 0x%p, pgoff: %lu "
"(req: TID: %d, syscall: %lu)\n",
__FUNCTION__, vmf->virtual_address, vmf->pgoff,
packet->req.rtid, packet->req.number);
#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) */
pr_err("%s: remote PF failed for 0x%#lx, pgoff: %lu"
" (req: TID: %d, syscall: %lu)\n",
__func__,
(unsigned long)addr, vmf->pgoff,
packet.fault_tid, rsysnum);
ret = VM_FAULT_SIGBUS;
goto put_and_out;
}
@ -910,9 +719,6 @@ static int rus_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
ret = VM_FAULT_NOPAGE;
put_and_out:
mcctrl_put_per_thread_data(ptd);
pr_ptd("put", task_pid_vnr(current), ptd);
no_ptd:
mcctrl_put_per_proc_data(ppd);
no_ppd:
return ret;
@ -1075,6 +881,7 @@ struct pager_create_result {
int maxprot;
uint32_t flags;
size_t size;
int pgshift;
char path[PATH_MAX];
};
@ -1136,6 +943,7 @@ static int pager_req_create(ihk_os_t os, int fd, uintptr_t result_pa)
struct kstat st;
int mf_flags = 0;
unsigned long irqflags;
int pgshift = 0;
dprintk("pager_req_create(%d,%lx)\n", fd, (long)result_pa);
@ -1144,8 +952,16 @@ static int pager_req_create(ihk_os_t os, int fd, uintptr_t result_pa)
printk("pager_req_create(%d,%lx):vfs_stat failed. %d\n", fd, (long)result_pa, error);
goto out;
}
if (S_ISCHR(st.mode) && (MAJOR(st.rdev) == 1)) {
/* treat memory devices as regular files */
if (S_ISCHR(st.mode) && (MAJOR(st.rdev) == 1) &&
(MINOR(st.rdev) == 1 || // /dev/mem
MINOR(st.rdev) == 5)) { // /dev/zero
/* treat memory devices and zero devices as regular files */
}
else if (S_ISCHR(st.mode) && (MAJOR(st.rdev) == 1)) {
error = -ENODEV;
dprintk("%s(%d,%lx):unmappable device %x\n",
__func__, fd, (long)result_pa, st.mode);
goto out;
}
else if (!S_ISREG(st.mode)) {
error = -ESRCH;
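The new device check relies on the fixed numbering of the Linux "memory" character devices: they all share major 1, and the minor selects the device (1:1 is /dev/mem, 1:3 /dev/null, 1:5 /dev/zero, per Documentation/admin-guide/devices.txt). Only 1:1 and 1:5 remain mappable here; everything else under major 1 now fails with -ENODEV. The predicate, extracted as a sketch (the helper name is ours):
static bool demo_is_mappable_memdev(const struct kstat *st)
{
	return S_ISCHR(st->mode) && MAJOR(st->rdev) == 1 &&
	       (MINOR(st->rdev) == 1 ||	/* /dev/mem */
		MINOR(st->rdev) == 5);	/* /dev/zero */
}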
@ -1160,6 +976,30 @@ static int pager_req_create(ihk_os_t os, int fd, uintptr_t result_pa)
goto out;
}
/* Shared memory hack */
{
char *pathbuf, *fullpath;
pathbuf = kmalloc(PATH_MAX, GFP_ATOMIC);
if (pathbuf) {
fullpath = d_path(&file->f_path, pathbuf, PATH_MAX);
if (!IS_ERR(fullpath)) {
if (!strncmp("/tmp/ompi.", fullpath, 10) ||
!strncmp("/dev/shm/", fullpath, 9) ||
(!strncmp("/var/opt/FJSVtcs/ple/daemonif/",
fullpath, 30) && !strstr(fullpath, "dstore_sm.lock"))) {
printk("%s: treating %s as a device file..\n",
__func__, fullpath);
kfree(pathbuf);
error = -ESRCH;
goto out;
}
kfree(pathbuf);
}
}
}
inode = file->f_path.dentry->d_inode;
if (!inode) {
error = -EBADF;
@ -1167,6 +1007,10 @@ static int pager_req_create(ihk_os_t os, int fd, uintptr_t result_pa)
goto out;
}
if (!strcmp(inode->i_sb->s_type->name, "tmpfs")) {
mf_flags = MF_IS_REMOVABLE;
}
if (!strcmp(inode->i_sb->s_type->name, "proc")) {
error = -ESRCH;
goto out;
@ -1188,13 +1032,14 @@ static int pager_req_create(ihk_os_t os, int fd, uintptr_t result_pa)
}
if (inode->i_op == mcctrl_hugetlbfs_inode_operations) {
struct hstate *h = hstate_file(file);
pgshift = PAGE_SHIFT + huge_page_order(h);
mf_flags = MF_HUGETLBFS;
/* pager is used as handle id on mckernel side, use inode */
pager = (void *)st.ino;
/* retrofit blksize in resp as well through st.size field;
* the actual file size is not used
*/
st.size = st.blksize;
/* file size is not used */
st.size = 0;
goto out_reply;
}
@ -1214,7 +1059,7 @@ static int pager_req_create(ihk_os_t os, int fd, uintptr_t result_pa)
pager = newpager;
newpager = NULL;
/* Intel MPI library and shared memory "prefetch" */
/* Shared library prefetch */
{
char *pathbuf, *fullpath;
@ -1222,15 +1067,7 @@ static int pager_req_create(ihk_os_t os, int fd, uintptr_t result_pa)
if (pathbuf) {
fullpath = d_path(&file->f_path, pathbuf, PATH_MAX);
if (!IS_ERR(fullpath)) {
if (!strncmp("/dev/shm/Intel_MPI", fullpath, 18)) {
mf_flags = (MF_PREMAP | MF_ZEROFILL);
dprintk("%s: filename: %s, premap & zerofill\n",
__FUNCTION__, fullpath);
}
else if (strstr(fullpath, "libmpi") ||
strstr(fullpath, "libiomp") ||
strstr(fullpath, "libpthread") ||
strstr(fullpath, "libc.so")) {
if (strstr(fullpath, ".so")) {
mf_flags = MF_PREFETCH;
dprintk("%s: filename: %s, prefetch\n",
__FUNCTION__, fullpath);
@ -1281,6 +1118,7 @@ out_reply:
resp->maxprot = maxprot;
resp->flags = mf_flags;
resp->size = st.size;
resp->pgshift = pgshift;
error = pager_get_path(file, resp->path);
@ -1355,8 +1193,9 @@ static int pager_req_read(ihk_os_t os, uintptr_t handle, off_t off, size_t size,
uintptr_t phys = -1;
ihk_device_t dev = ihk_os_to_dev(os);
void *buf = NULL;
loff_t pos;
loff_t pos, fsize;
unsigned long flags;
unsigned int major, minor;
dprintk("pager_req_read(%lx,%lx,%lx,%lx)\n", handle, off, size, rpa);
@ -1378,6 +1217,21 @@ static int pager_req_read(ihk_os_t os, uintptr_t handle, off_t off, size_t size,
goto out;
}
major = MAJOR(file->f_mapping->host->i_rdev);
minor = MINOR(file->f_mapping->host->i_rdev);
if ((major == 1 && minor == 1) || // /dev/mem
(major == 1 && minor == 5)) { // /dev/zero
/* Nothing to check */
}
else {
/* Check if the target page fits in the file */
fsize = i_size_read(file->f_mapping->host);
if (off > fsize) {
ss = 0;
goto out;
}
}
phys = ihk_device_map_memory(dev, rpa, size);
buf = ihk_device_map_virtual(dev, phys, size, NULL, 0);
if (!buf) {
@ -1585,6 +1439,26 @@ static int pager_req_map(ihk_os_t os, int fd, size_t len, off_t off,
#define ANY_WHERE 0
if (prot_and_flags & MAP_LOCKED) prot_and_flags |= MAP_POPULATE;
/* Shared memory hack */
{
char *pathbuf, *fullpath;
pathbuf = kmalloc(PATH_MAX, GFP_ATOMIC);
if (pathbuf) {
fullpath = d_path(&file->f_path, pathbuf, PATH_MAX);
if (!IS_ERR(fullpath)) {
if (!strncmp("/tmp/ompi.", fullpath, 10) ||
!strncmp("/dev/shm/", fullpath, 9) ||
!strncmp("/var/opt/FJSVtcs/ple/daemonif/",
fullpath, 30)) {
dprintk("%s: pre-populating %s..\n",
__func__, fullpath);
prot_and_flags |= MAP_POPULATE;
}
kfree(pathbuf);
}
}
}
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,5,0)
down_write(&current->mm->mmap_sem);
@ -1598,7 +1472,12 @@ static int pager_req_map(ihk_os_t os, int fd, size_t len, off_t off,
#endif
if (IS_ERR_VALUE(va)) {
printk("pager_req_map(%p,%d,%lx,%lx,%lx):do_mmap_pgoff failed. %d\n", os, fd, len, off, result_rpa, (int)va);
if ((int)va != -ENOTSUPP) {
pr_err("%s(%p,%d,%lx,%lx,%lx): "
"do_mmap_pgoff failed. %d\n",
__func__, os, fd, len, off, result_rpa,
(int)va);
}
error = va;
goto out;
}
@ -1712,16 +1591,9 @@ retry:
pfn |= PFN_VALID | PFN_PRESENT;
/* Check if mapping is write-combined */
#ifdef POSTK_DEBUG_ARCH_DEP_12
if (pte_is_write_combined(*pte)) {
pfn |= PFN_WRITE_COMBINED;
}
#else /* POSTK_DEBUG_ARCH_DEP_12 */
if ((pte_flags(*pte) & _PAGE_PWT) &&
!(pte_flags(*pte) & _PAGE_PCD)) {
pfn |= _PAGE_PWT;
}
#endif /* POSTK_DEBUG_ARCH_DEP_12 */
}
pte_unmap(pte);
}
@ -1754,10 +1626,27 @@ retry:
#else
fault = handle_mm_fault(current->mm, vma, va, flags);
#endif
#ifdef SC_DEBUG
if (fault != 0) {
printk("%s: error: faulting %lx at off: %lu\n",
__FUNCTION__, va, off);
char *pathbuf = NULL;
char *fullpath;
if (vma->vm_file) {
pathbuf = kmalloc(PATH_MAX, GFP_ATOMIC);
if (pathbuf) {
fullpath = d_path(&vma->vm_file->f_path,
pathbuf, PATH_MAX);
if (!IS_ERR(fullpath)) {
printk("%s: WARNING: couldn't fault 0x%lx"
" at off: %lu in %s\n",
__FUNCTION__, va, off, fullpath);
}
kfree(pathbuf);
}
}
}
#endif
page_fault_attempted = 1;
goto retry;
@ -2052,6 +1941,8 @@ int mcctrl_clear_pte_range(uintptr_t start, uintptr_t len)
}
if (addr < end) {
#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 18, 0)
/* Revert permission */
vma->vm_flags |= VM_READ | VM_WRITE | VM_EXEC;
error = zap_vma_ptes(vma, addr, end-addr);
if (error) {
mcctrl_zap_page_range(vma, addr, end-addr,
@ -2069,6 +1960,8 @@ int mcctrl_clear_pte_range(uintptr_t start, uintptr_t len)
NULL);
}
else {
/* Revert permission */
vma->vm_flags |= VM_READ | VM_WRITE | VM_EXEC;
zap_vma_ptes(vma, addr, end-addr);
}
#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(4, 18, 0) */
@ -2124,7 +2017,10 @@ int release_user_space(uintptr_t start, uintptr_t len)
* \param chunks The number of chunks which make a core file image in the whole.
*/
static int writecore(ihk_os_t os, unsigned long rcoretable, int chunks) {
static int writecore(ihk_os_t os, unsigned long rcoretable, int chunks,
unsigned long cmdline_rphys, unsigned long cmdline_len)
{
char *fn = NULL;
struct file *file;
struct coretable *coretable;
int i, tablesize, error = 0;
@ -2133,22 +2029,43 @@ static int writecore(ihk_os_t os, unsigned long rcoretable, int chunks) {
unsigned long phys, tablephys, rphys;
ihk_device_t dev = ihk_os_to_dev(os);
char *pt;
unsigned long cmdline_phys;
char *cmdline;
dprintk("coredump called as a pseudo syscall\n");
fn = kmalloc(PATH_MAX, GFP_ATOMIC);
if (!fn) {
dprintk("%s: ERROR: allocating file name\n", __func__);
error = -ENOMEM;
goto fail;
}
if (chunks <= 0) {
dprintk("no core data found!(%d)\n", chunks);
error = -EINVAL;
goto fail;
}
cmdline_phys = ihk_device_map_memory(dev, cmdline_rphys, cmdline_len);
cmdline = ihk_device_map_virtual(dev, cmdline_phys, cmdline_len, NULL,
0);
sprintf(fn, "mccore-%s.%d",
strrchr(cmdline, '/') ?
strrchr(cmdline, '/') + 1 : cmdline,
task_tgid_vnr(current));
pr_info("%s: fn=%s\n", __func__, fn);
ihk_device_unmap_virtual(dev, cmdline, cmdline_len);
ihk_device_unmap_memory(dev, cmdline_phys, cmdline_len);
/* All Linux documentation insists we should not
* open a file in a kernel module, but our karma
* leads us here. Precisely, here we emulate the core
* dump routine of the Linux kernel in linux/fs/exec.c.
* So we have a legitimate reason to do this.
*/
file = filp_open("core", O_CREAT | O_RDWR | O_LARGEFILE | O_TRUNC, 0600);
file = filp_open(fn, O_CREAT | O_RDWR | O_LARGEFILE | O_TRUNC, 0600);
if (IS_ERR(file) || !file->f_op) {
dprintk("cannot open core file\n");
error = PTR_ERR(file);
@ -2218,6 +2135,7 @@ fail:
/* make sure we do not travel to user land */
error = -EINVAL;
}
kfree(fn);
return error;
}
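writecore() now derives the dump name from the remote process's command line instead of the fixed name "core": the basename of argv[0] plus the caller's TGID. A condensed sketch of the naming step, assuming cmdline is NUL-terminated within the mapped cmdline_len bytes (the code above relies on the same assumption); the sketch uses snprintf() for bounds safety:
static void demo_core_name(char *fn, const char *cmdline, pid_t tgid)
{
	const char *base = strrchr(cmdline, '/');

	base = base ? base + 1 : cmdline;
	/* e.g. "/home/user/a.out" + tgid 4321 -> "mccore-a.out.4321" */
	snprintf(fn, PATH_MAX, "mccore-%s.%d", base, tgid);
}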
@ -2273,7 +2191,8 @@ int __do_in_kernel_syscall(ihk_os_t os, struct ikc_scd_packet *packet)
}
case __NR_coredump:
ret = writecore(os, sc->args[1], sc->args[0]);
ret = writecore(os, sc->args[1], sc->args[0], sc->args[2],
sc->args[3]);
break;
case __NR_sched_setparam: {


@ -157,13 +157,8 @@ static void free_node_topology(struct mcctrl_usrdata *udp)
return;
} /* free_node_topology() */
#ifdef POSTK_DEBUG_ARCH_DEP_40 /* cpu_topology name change */
static void free_cpu_topology_one(struct mcctrl_usrdata *udp,
struct mcctrl_cpu_topology *cpu)
#else /* POSTK_DEBUG_ARCH_DEP_40 */
static void free_cpu_topology_one(struct mcctrl_usrdata *udp,
struct cpu_topology *cpu)
#endif /* POSTK_DEBUG_ARCH_DEP_40 */
{
struct cache_topology *cache;
struct cache_topology *next;
@ -179,13 +174,8 @@ static void free_cpu_topology_one(struct mcctrl_usrdata *udp,
static void free_cpu_topology(struct mcctrl_usrdata *udp)
{
#ifdef POSTK_DEBUG_ARCH_DEP_40 /* cpu_topology name change */
struct mcctrl_cpu_topology *cpu;
struct mcctrl_cpu_topology *next;
#else /* POSTK_DEBUG_ARCH_DEP_40 */
struct cpu_topology *cpu;
struct cpu_topology *next;
#endif /* POSTK_DEBUG_ARCH_DEP_40 */
list_for_each_entry_safe(cpu, next, &udp->cpu_topology_list, chain) {
list_del(&cpu->chain);
@ -315,13 +305,8 @@ static int translate_cpumap(struct mcctrl_usrdata *udp,
return error;
} /* translate_cpumap() */
#ifdef POSTK_DEBUG_ARCH_DEP_40 /* cpu_topology name change */
static struct cache_topology *get_cache_topology(struct mcctrl_usrdata *udp,
struct mcctrl_cpu_topology *cpu_topo, struct ihk_cache_topology *saved)
#else /* POSTK_DEBUG_ARCH_DEP_40 */
static struct cache_topology *get_cache_topology(struct mcctrl_usrdata *udp,
struct cpu_topology *cpu_topo, struct ihk_cache_topology *saved)
#endif /* POSTK_DEBUG_ARCH_DEP_40 */
{
int error;
struct cache_topology *topo = NULL;
@ -355,21 +340,12 @@ out:
return (error)? ERR_PTR(error): topo;
} /* get_cache_topology() */
#ifdef POSTK_DEBUG_ARCH_DEP_40 /* cpu_topology name change */
static struct mcctrl_cpu_topology *get_one_cpu_topology(struct mcctrl_usrdata *udp,
int index)
#else /* POSTK_DEBUG_ARCH_DEP_40 */
static struct cpu_topology *get_one_cpu_topology(struct mcctrl_usrdata *udp,
int index)
#endif /* POSTK_DEBUG_ARCH_DEP_40 */
{
int error;
ihk_device_t dev = ihk_os_to_dev(udp->os);
#ifdef POSTK_DEBUG_ARCH_DEP_40 /* cpu_topology name change */
struct mcctrl_cpu_topology *topology = NULL;
#else /* POSTK_DEBUG_ARCH_DEP_40 */
struct cpu_topology *topology = NULL;
#endif /* POSTK_DEBUG_ARCH_DEP_40 */
struct cache_topology *cache;
struct ihk_cache_topology *saved_cache;
@ -387,12 +363,8 @@ static struct cpu_topology *get_one_cpu_topology(struct mcctrl_usrdata *udp,
topology->saved = ihk_device_get_cpu_topology(dev,
mckernel_cpu_2_hw_id(udp, index));
#ifdef POSTK_DEBUG_TEMP_FIX_21 /* IS_ERR() through return NULL */
if (!topology->saved) {
#else /* POSTK_DEBUG_TEMP_FIX_21 */
if (IS_ERR(topology->saved)) {
#endif /* POSTK_DEBUG_TEMP_FIX_21 */
error = PTR_ERR(topology->saved);
error = -ENOENT;
eprintk("mcctrl:get_one_cpu_topology:"
"ihk_device_get_cpu_topology failed. %d\n",
error);
@ -428,6 +400,9 @@ static struct cpu_topology *get_one_cpu_topology(struct mcctrl_usrdata *udp,
"get_cache_topology failed. %d\n",
error);
goto out;
} else if (!cache) {
error = -ENOENT;
goto out;
}
list_add(&cache->chain, &topology->cache_list);
@ -447,11 +422,7 @@ static int get_cpu_topology(struct mcctrl_usrdata *udp)
{
int error;
int index;
#ifdef POSTK_DEBUG_ARCH_DEP_40 /* cpu_topology name change */
struct mcctrl_cpu_topology *topology;
#else /* POSTK_DEBUG_ARCH_DEP_40 */
struct cpu_topology *topology;
#endif /* POSTK_DEBUG_ARCH_DEP_40 */
dprintk("get_cpu_topology(%p)\n", udp);
for (index = 0; index < udp->cpu_info->n_cpus; ++index) {
@ -473,13 +444,8 @@ out:
return error;
} /* get_cpu_topology() */
#ifdef POSTK_DEBUG_ARCH_DEP_40 /* cpu_topology name change */
static void setup_cpu_sysfs_cache_files(struct mcctrl_usrdata *udp,
struct mcctrl_cpu_topology *cpu, struct cache_topology *cache)
#else /* POSTK_DEBUG_ARCH_DEP_40 */
static void setup_cpu_sysfs_cache_files(struct mcctrl_usrdata *udp,
struct cpu_topology *cpu, struct cache_topology *cache)
#endif /* POSTK_DEBUG_ARCH_DEP_40 */
{
char *prefix = "/sys/devices/system/cpu";
int cpu_number = cpu->mckernel_cpu_id;
@ -531,13 +497,8 @@ static void setup_cpu_sysfs_cache_files(struct mcctrl_usrdata *udp,
return;
} /* setup_cpu_sysfs_cache_files() */
#ifdef POSTK_DEBUG_ARCH_DEP_40 /* cpu_topology name change */
static void setup_cpu_sysfs_files(struct mcctrl_usrdata *udp,
struct mcctrl_cpu_topology *cpu)
#else /* POSTK_DEBUG_ARCH_DEP_40 */
static void setup_cpu_sysfs_files(struct mcctrl_usrdata *udp,
struct cpu_topology *cpu)
#endif /* POSTK_DEBUG_ARCH_DEP_40 */
{
char *prefix = "/sys/devices/system/cpu";
int cpu_number = cpu->mckernel_cpu_id;
@ -586,11 +547,7 @@ static void setup_cpu_sysfs_files(struct mcctrl_usrdata *udp,
static void setup_cpus_sysfs_files(struct mcctrl_usrdata *udp)
{
int error;
#ifdef POSTK_DEBUG_ARCH_DEP_40 /* cpu_topology name change */
struct mcctrl_cpu_topology *cpu;
#else /* POSTK_DEBUG_ARCH_DEP_40 */
struct cpu_topology *cpu;
#endif /* POSTK_DEBUG_ARCH_DEP_40 */
error = get_cpu_topology(udp);
if (error) {


@ -1,14 +0,0 @@
# LESS/GREATER_EQUAL appears somewhere in 3.7... meh compat until we stop caring about 2.x
# ...apparently can't define macros to use inside if, so unfold manually
if(NOT (LINUX_VERSION_CODE LESS 262144) AND NOT (LINUX_VERSION_CODE GREATER 262400))
add_subdirectory("linux-4.0.9")
elseif(NOT (LINUX_VERSION_CODE LESS 263680) AND NOT (LINUX_VERSION_CODE GREATER 263936))
add_subdirectory("linux-4.6.7")
elseif(LINUX_VERSION_CODE EQUAL 199168)
add_subdirectory("linux-3.10.0-327.36.1.el7")
else()
#add_subdirectory("linux-3.10.0-327.36.1.el7")
add_subdirectory("linux-4.18.14")
#message(FATAL_ERROR "mcoverlayfs enabled but kernel version not compatible")
endif()
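For reference, the magic numbers in this removed file compare LINUX_VERSION_CODE, which packs a version as (major << 16) | (minor << 8) | patch, i.e. the value of the KERNEL_VERSION() macro:
#include <linux/version.h>

/* KERNEL_VERSION(4, 0, 0)  == 262144 -> lower bound of the linux-4.0.9 branch */
/* KERNEL_VERSION(4, 1, 0)  == 262400 -> upper bound of the linux-4.0.9 branch */
/* KERNEL_VERSION(4, 6, 0)  == 263680 -> lower bound of the linux-4.6.7 branch */
/* KERNEL_VERSION(4, 7, 0)  == 263936 -> upper bound of the linux-4.6.7 branch */
/* KERNEL_VERSION(3, 10, 0) == 199168 -> exact match for the RHEL 7 branch    */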


@ -1,7 +0,0 @@
kmod(mcoverlay
SOURCES
copy_up.c dir.c inode.c readdir.c super.c
INSTALL_DEST
${KMODDIR}
)


@ -1,461 +0,0 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/splice.h>
#include <linux/xattr.h>
#include <linux/security.h>
#include <linux/uaccess.h>
#include <linux/sched.h>
#include <linux/namei.h>
#include <linux/fdtable.h>
#include <linux/ratelimit.h>
#include "overlayfs.h"
#define OVL_COPY_UP_CHUNK_SIZE (1 << 20)
static unsigned ovl_check_copy_up = 1;
module_param_named(check_copy_up, ovl_check_copy_up, uint,
S_IWUSR | S_IRUGO);
MODULE_PARM_DESC(ovl_check_copy_up,
"Warn on copy-up when causing process also has a R/O fd open");
static int ovl_check_fd(const void *data, struct file *f, unsigned fd)
{
const struct dentry *dentry = data;
if (f->f_path.dentry == dentry)
pr_warn_ratelimited("overlayfs: Warning: Copying up %pD, but open R/O on fd %u which will cease to be coherent [pid=%d %s]\n",
f, fd, current->pid, current->comm);
return 0;
}
/*
* Check the fds open by this process and warn if something like the following
* scenario is about to occur:
*
* fd1 = open("foo", O_RDONLY);
* fd2 = open("foo", O_RDWR);
*/
static void ovl_do_check_copy_up(struct dentry *dentry)
{
if (ovl_check_copy_up)
iterate_fd(current->files, 0, ovl_check_fd, dentry);
}
int ovl_copy_xattr(struct dentry *old, struct dentry *new)
{
ssize_t list_size, size, value_size = 0;
char *buf, *name, *value = NULL;
int uninitialized_var(error);
if (!old->d_inode->i_op->getxattr ||
!new->d_inode->i_op->getxattr)
return 0;
list_size = vfs_listxattr(old, NULL, 0);
if (list_size <= 0) {
if (list_size == -EOPNOTSUPP)
return 0;
return list_size;
}
buf = kzalloc(list_size, GFP_KERNEL);
if (!buf)
return -ENOMEM;
list_size = vfs_listxattr(old, buf, list_size);
if (list_size <= 0) {
error = list_size;
goto out;
}
for (name = buf; name < (buf + list_size); name += strlen(name) + 1) {
retry:
size = vfs_getxattr(old, name, value, value_size);
if (size == -ERANGE)
size = vfs_getxattr(old, name, NULL, 0);
if (size < 0) {
error = size;
break;
}
if (size > value_size) {
void *new;
new = krealloc(value, size, GFP_KERNEL);
if (!new) {
error = -ENOMEM;
break;
}
value = new;
value_size = size;
goto retry;
}
error = vfs_setxattr(new, name, value, size, 0);
if (error)
break;
}
kfree(value);
out:
kfree(buf);
return error;
}
static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
{
struct file *old_file;
struct file *new_file;
loff_t old_pos = 0;
loff_t new_pos = 0;
int error = 0;
if (len == 0)
return 0;
old_file = ovl_path_open(old, O_RDONLY);
if (IS_ERR(old_file))
return PTR_ERR(old_file);
new_file = ovl_path_open(new, O_WRONLY);
if (IS_ERR(new_file)) {
error = PTR_ERR(new_file);
goto out_fput;
}
/* FIXME: copy up sparse files efficiently */
while (len) {
size_t this_len = OVL_COPY_UP_CHUNK_SIZE;
long bytes;
if (len < this_len)
this_len = len;
if (signal_pending_state(TASK_KILLABLE, current)) {
error = -EINTR;
break;
}
bytes = do_splice_direct(old_file, &old_pos,
new_file, &new_pos,
this_len, SPLICE_F_MOVE);
if (bytes <= 0) {
error = bytes;
break;
}
WARN_ON(old_pos != new_pos);
len -= bytes;
}
fput(new_file);
out_fput:
fput(old_file);
return error;
}
static char *ovl_read_symlink(struct dentry *realdentry)
{
int res;
char *buf;
struct inode *inode = realdentry->d_inode;
mm_segment_t old_fs;
res = -EINVAL;
if (!inode->i_op->readlink)
goto err;
res = -ENOMEM;
buf = (char *) __get_free_page(GFP_KERNEL);
if (!buf)
goto err;
old_fs = get_fs();
set_fs(get_ds());
/* The cast to a user pointer is valid due to the set_fs() */
res = inode->i_op->readlink(realdentry,
(char __user *)buf, PAGE_SIZE - 1);
set_fs(old_fs);
if (res < 0) {
free_page((unsigned long) buf);
goto err;
}
buf[res] = '\0';
return buf;
err:
return ERR_PTR(res);
}
static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
{
struct iattr attr = {
.ia_valid =
ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET,
.ia_atime = stat->atime,
.ia_mtime = stat->mtime,
};
return notify_change(upperdentry, &attr, NULL);
}
int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
{
int err = 0;
if (!S_ISLNK(stat->mode)) {
struct iattr attr = {
.ia_valid = ATTR_MODE,
.ia_mode = stat->mode,
};
err = notify_change(upperdentry, &attr, NULL);
}
if (!err) {
struct iattr attr = {
.ia_valid = ATTR_UID | ATTR_GID,
.ia_uid = stat->uid,
.ia_gid = stat->gid,
};
err = notify_change(upperdentry, &attr, NULL);
}
if (!err)
ovl_set_timestamps(upperdentry, stat);
return err;
}
static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
struct dentry *dentry, struct path *lowerpath,
struct kstat *stat, struct iattr *attr,
const char *link)
{
struct inode *wdir = workdir->d_inode;
struct inode *udir = upperdir->d_inode;
struct dentry *newdentry = NULL;
struct dentry *upper = NULL;
umode_t mode = stat->mode;
int err;
newdentry = ovl_lookup_temp(workdir, dentry);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out;
upper = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto out1;
/* Can't properly set mode on creation because of the umask */
stat->mode &= S_IFMT;
err = ovl_create_real(wdir, newdentry, stat, link, NULL, true);
stat->mode = mode;
if (err)
goto out2;
if (S_ISREG(stat->mode)) {
struct path upperpath;
ovl_path_upper(dentry, &upperpath);
BUG_ON(upperpath.dentry != NULL);
upperpath.dentry = newdentry;
err = ovl_copy_up_data(lowerpath, &upperpath, stat->size);
if (err)
goto out_cleanup;
}
err = ovl_copy_xattr(lowerpath->dentry, newdentry);
if (err)
goto out_cleanup;
mutex_lock(&newdentry->d_inode->i_mutex);
err = ovl_set_attr(newdentry, stat);
if (!err && attr)
err = notify_change(newdentry, attr, NULL);
mutex_unlock(&newdentry->d_inode->i_mutex);
if (err)
goto out_cleanup;
err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
if (err)
goto out_cleanup;
ovl_dentry_update(dentry, newdentry);
newdentry = NULL;
/*
* Non-directories become opaque when copied up.
*/
if (!S_ISDIR(stat->mode))
ovl_dentry_set_opaque(dentry, true);
out2:
dput(upper);
out1:
dput(newdentry);
out:
return err;
out_cleanup:
ovl_cleanup(wdir, newdentry);
goto out;
}
/*
* Copy up a single dentry
*
* Directory renames only allowed on "pure upper" (already created on
* upper filesystem, never copied up). Directories which are on lower or
* are merged may not be renamed. For these -EXDEV is returned and
* userspace has to deal with it. This means, when copying up a
* directory we can rely on it and ancestors being stable.
*
* Non-directory renames start with copy up of source if necessary. The
* actual rename will only proceed once the copy up was successful. Copy
* up uses upper parent i_mutex for exclusion. Since rename can change
* d_parent it is possible that the copy up will lock the old parent. At
* that point the file will have already been copied up anyway.
*/
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
struct path *lowerpath, struct kstat *stat,
struct iattr *attr)
{
struct dentry *workdir = ovl_workdir(dentry);
int err;
struct kstat pstat;
struct path parentpath;
struct dentry *upperdir;
struct dentry *upperdentry;
const struct cred *old_cred;
struct cred *override_cred;
char *link = NULL;
if (WARN_ON(!workdir))
return -EROFS;
ovl_do_check_copy_up(lowerpath->dentry);
ovl_path_upper(parent, &parentpath);
upperdir = parentpath.dentry;
err = vfs_getattr(&parentpath, &pstat);
if (err)
return err;
if (S_ISLNK(stat->mode)) {
link = ovl_read_symlink(lowerpath->dentry);
if (IS_ERR(link))
return PTR_ERR(link);
}
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_free_link;
override_cred->fsuid = stat->uid;
override_cred->fsgid = stat->gid;
/*
* CAP_SYS_ADMIN for copying up extended attributes
* CAP_DAC_OVERRIDE for create
* CAP_FOWNER for chmod, timestamp update
* CAP_FSETID for chmod
* CAP_CHOWN for chown
* CAP_MKNOD for mknod
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
cap_raise(override_cred->cap_effective, CAP_FSETID);
cap_raise(override_cred->cap_effective, CAP_CHOWN);
cap_raise(override_cred->cap_effective, CAP_MKNOD);
old_cred = override_creds(override_cred);
err = -EIO;
if (lock_rename(workdir, upperdir) != NULL) {
pr_err("overlayfs: failed to lock workdir+upperdir\n");
goto out_unlock;
}
upperdentry = ovl_dentry_upper(dentry);
if (upperdentry) {
unlock_rename(workdir, upperdir);
err = 0;
/* Raced with another copy-up? Do the setattr here */
if (attr) {
mutex_lock(&upperdentry->d_inode->i_mutex);
err = notify_change(upperdentry, attr, NULL);
mutex_unlock(&upperdentry->d_inode->i_mutex);
}
goto out_put_cred;
}
err = ovl_copy_up_locked(workdir, upperdir, dentry, lowerpath,
stat, attr, link);
if (!err) {
/* Restore timestamps on parent (best effort) */
ovl_set_timestamps(upperdir, &pstat);
}
out_unlock:
unlock_rename(workdir, upperdir);
out_put_cred:
revert_creds(old_cred);
put_cred(override_cred);
out_free_link:
if (link)
free_page((unsigned long) link);
return err;
}
int ovl_copy_up(struct dentry *dentry)
{
int err;
err = 0;
while (!err) {
struct dentry *next;
struct dentry *parent;
struct path lowerpath;
struct kstat stat;
enum ovl_path_type type = ovl_path_type(dentry);
if (OVL_TYPE_UPPER(type))
break;
next = dget(dentry);
/* find the topmost dentry not yet copied up */
for (;;) {
parent = dget_parent(next);
type = ovl_path_type(parent);
if (OVL_TYPE_UPPER(type))
break;
dput(next);
next = parent;
}
ovl_path_lower(next, &lowerpath);
err = vfs_getattr(&lowerpath, &stat);
if (!err)
err = ovl_copy_up_one(parent, next, &lowerpath, &stat, NULL);
dput(parent);
dput(next);
}
return err;
}


@ -1,972 +0,0 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/xattr.h>
#include <linux/security.h>
#include <linux/cred.h>
#include "overlayfs.h"
void ovl_cleanup(struct inode *wdir, struct dentry *wdentry)
{
int err;
dget(wdentry);
if (S_ISDIR(wdentry->d_inode->i_mode))
err = ovl_do_rmdir(wdir, wdentry);
else
err = ovl_do_unlink(wdir, wdentry);
dput(wdentry);
if (err) {
pr_err("overlayfs: cleanup of '%pd2' failed (%i)\n",
wdentry, err);
}
}
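/*
 * Build a temporary workdir name that is unique per in-core dentry.
 * An entry already existing under that name is unexpected (presumably
 * leftover from an interrupted operation) and is reported as -EIO.
 */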
struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry)
{
struct dentry *temp;
char name[20];
snprintf(name, sizeof(name), "#%lx", (unsigned long) dentry);
temp = lookup_one_len(name, workdir, strlen(name));
if (!IS_ERR(temp) && temp->d_inode) {
pr_err("overlayfs: workdir/%s already exists\n", name);
dput(temp);
temp = ERR_PTR(-EIO);
}
return temp;
}
/* caller holds i_mutex on workdir */
static struct dentry *ovl_whiteout(struct dentry *workdir,
struct dentry *dentry)
{
int err;
struct dentry *whiteout;
struct inode *wdir = workdir->d_inode;
whiteout = ovl_lookup_temp(workdir, dentry);
if (IS_ERR(whiteout))
return whiteout;
err = ovl_do_whiteout(wdir, whiteout);
if (err) {
dput(whiteout);
whiteout = ERR_PTR(err);
}
return whiteout;
}
int ovl_create_real(struct inode *dir, struct dentry *newdentry,
struct kstat *stat, const char *link,
struct dentry *hardlink, bool debug)
{
int err;
if (newdentry->d_inode)
return -ESTALE;
if (hardlink) {
err = ovl_do_link(hardlink, dir, newdentry, debug);
} else {
switch (stat->mode & S_IFMT) {
case S_IFREG:
err = ovl_do_create(dir, newdentry, stat->mode, debug);
break;
case S_IFDIR:
err = ovl_do_mkdir(dir, newdentry, stat->mode, debug);
break;
case S_IFCHR:
case S_IFBLK:
case S_IFIFO:
case S_IFSOCK:
err = ovl_do_mknod(dir, newdentry,
stat->mode, stat->rdev, debug);
break;
case S_IFLNK:
err = ovl_do_symlink(dir, newdentry, link, debug);
break;
default:
err = -EPERM;
}
}
if (!err && WARN_ON(!newdentry->d_inode)) {
/*
* Not quite sure if non-instantiated dentry is legal or not.
* VFS doesn't seem to care so check and warn here.
*/
err = -ENOENT;
}
return err;
}
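/* The opaque xattr on an upper directory hides all entries from the lower layers */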
static int ovl_set_opaque(struct dentry *upperdentry)
{
return ovl_do_setxattr(upperdentry, OVL_XATTR_OPAQUE, "y", 1, 0);
}
static void ovl_remove_opaque(struct dentry *upperdentry)
{
int err;
err = ovl_do_removexattr(upperdentry, OVL_XATTR_OPAQUE);
if (err) {
pr_warn("overlayfs: failed to remove opaque from '%s' (%i)\n",
upperdentry->d_name.name, err);
}
}
static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
int err;
enum ovl_path_type type;
struct path realpath;
type = ovl_path_real(dentry, &realpath);
err = vfs_getattr(&realpath, stat);
if (err)
return err;
stat->dev = dentry->d_sb->s_dev;
stat->ino = dentry->d_inode->i_ino;
/*
* It's probably not worth it to count subdirs to get the
* correct link count. nlink=1 seems to pacify 'find' and
* other utilities.
*/
if (OVL_TYPE_MERGE(type))
stat->nlink = 1;
return 0;
}
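/*
 * Create an object directly in the upper parent directory.  This is
 * the simple case, used when the new name does not cover a whiteout.
 */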
static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
struct kstat *stat, const char *link,
struct dentry *hardlink)
{
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *udir = upperdir->d_inode;
struct dentry *newdentry;
int err;
mutex_lock_nested(&udir->i_mutex, I_MUTEX_PARENT);
newdentry = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out_unlock;
err = ovl_create_real(udir, newdentry, stat, link, hardlink, false);
if (err)
goto out_dput;
ovl_dentry_version_inc(dentry->d_parent);
ovl_dentry_update(dentry, newdentry);
ovl_copyattr(newdentry->d_inode, inode);
d_instantiate(dentry, inode);
newdentry = NULL;
out_dput:
dput(newdentry);
out_unlock:
mutex_unlock(&udir->i_mutex);
return err;
}
static int ovl_lock_rename_workdir(struct dentry *workdir,
struct dentry *upperdir)
{
/* Workdir should not be the same as upperdir */
if (workdir == upperdir)
goto err;
/* Workdir should not be subdir of upperdir and vice versa */
if (lock_rename(workdir, upperdir) != NULL)
goto err_unlock;
return 0;
err_unlock:
unlock_rename(workdir, upperdir);
err:
pr_err("overlayfs: failed to lock workdir+upperdir\n");
return -EIO;
}
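/*
 * Replace an empty merged directory with a fresh opaque upper one:
 * create the replacement in workdir, copy xattrs and attributes over,
 * mark it opaque, then swap it in atomically with RENAME_EXCHANGE and
 * clean up the old upper directory and its whiteouts.
 */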
static struct dentry *ovl_clear_empty(struct dentry *dentry,
struct list_head *list)
{
struct dentry *workdir = ovl_workdir(dentry);
struct inode *wdir = workdir->d_inode;
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *udir = upperdir->d_inode;
struct path upperpath;
struct dentry *upper;
struct dentry *opaquedir;
struct kstat stat;
int err;
if (WARN_ON(!workdir))
return ERR_PTR(-EROFS);
err = ovl_lock_rename_workdir(workdir, upperdir);
if (err)
goto out;
ovl_path_upper(dentry, &upperpath);
err = vfs_getattr(&upperpath, &stat);
if (err)
goto out_unlock;
err = -ESTALE;
if (!S_ISDIR(stat.mode))
goto out_unlock;
upper = upperpath.dentry;
if (upper->d_parent->d_inode != udir)
goto out_unlock;
opaquedir = ovl_lookup_temp(workdir, dentry);
err = PTR_ERR(opaquedir);
if (IS_ERR(opaquedir))
goto out_unlock;
err = ovl_create_real(wdir, opaquedir, &stat, NULL, NULL, true);
if (err)
goto out_dput;
err = ovl_copy_xattr(upper, opaquedir);
if (err)
goto out_cleanup;
err = ovl_set_opaque(opaquedir);
if (err)
goto out_cleanup;
mutex_lock(&opaquedir->d_inode->i_mutex);
err = ovl_set_attr(opaquedir, &stat);
mutex_unlock(&opaquedir->d_inode->i_mutex);
if (err)
goto out_cleanup;
err = ovl_do_rename(wdir, opaquedir, udir, upper, RENAME_EXCHANGE);
if (err)
goto out_cleanup;
ovl_cleanup_whiteouts(upper, list);
ovl_cleanup(wdir, upper);
unlock_rename(workdir, upperdir);
/* dentry's upper doesn't match now; get rid of it */
d_drop(dentry);
return opaquedir;
out_cleanup:
ovl_cleanup(wdir, opaquedir);
out_dput:
dput(opaquedir);
out_unlock:
unlock_rename(workdir, upperdir);
out:
return ERR_PTR(err);
}
static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry)
{
int err;
struct dentry *ret = NULL;
LIST_HEAD(list);
err = ovl_check_empty_dir(dentry, &list);
if (err)
ret = ERR_PTR(err);
else {
/*
* If no upperdentry then skip clearing whiteouts.
*
* Can race with copy-up, since we don't hold the upperdir
* mutex. Doesn't matter, since copy-up can't create a
* non-empty directory from an empty one.
*/
if (ovl_dentry_upper(dentry))
ret = ovl_clear_empty(dentry, &list);
}
ovl_cache_free(&list);
return ret;
}
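/*
 * Create an object over an existing whiteout.  The object is prepared
 * in workdir and renamed over the whiteout; for directories the opaque
 * xattr is set first and RENAME_EXCHANGE is used so the displaced
 * whiteout can be cleaned up afterwards.
 */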
static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
struct kstat *stat, const char *link,
struct dentry *hardlink)
{
struct dentry *workdir = ovl_workdir(dentry);
struct inode *wdir = workdir->d_inode;
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *udir = upperdir->d_inode;
struct dentry *upper;
struct dentry *newdentry;
int err;
if (WARN_ON(!workdir))
return -EROFS;
err = ovl_lock_rename_workdir(workdir, upperdir);
if (err)
goto out;
newdentry = ovl_lookup_temp(workdir, dentry);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out_unlock;
upper = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto out_dput;
err = ovl_create_real(wdir, newdentry, stat, link, hardlink, true);
if (err)
goto out_dput2;
if (S_ISDIR(stat->mode)) {
err = ovl_set_opaque(newdentry);
if (err)
goto out_cleanup;
err = ovl_do_rename(wdir, newdentry, udir, upper,
RENAME_EXCHANGE);
if (err)
goto out_cleanup;
ovl_cleanup(wdir, upper);
} else {
err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
if (err)
goto out_cleanup;
}
ovl_dentry_version_inc(dentry->d_parent);
ovl_dentry_update(dentry, newdentry);
ovl_copyattr(newdentry->d_inode, inode);
d_instantiate(dentry, inode);
newdentry = NULL;
out_dput2:
dput(upper);
out_dput:
dput(newdentry);
out_unlock:
unlock_rename(workdir, upperdir);
out:
return err;
out_cleanup:
ovl_cleanup(wdir, newdentry);
goto out_dput2;
}
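/* Common back end for create, mkdir, mknod, symlink and link */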
static int ovl_create_or_link(struct dentry *dentry, int mode, dev_t rdev,
const char *link, struct dentry *hardlink)
{
int err;
struct inode *inode;
struct kstat stat = {
.mode = mode,
.rdev = rdev,
};
err = -ENOMEM;
inode = ovl_new_inode(dentry->d_sb, mode, dentry->d_fsdata);
if (!inode)
goto out;
err = ovl_copy_up(dentry->d_parent);
if (err)
goto out_iput;
if (!ovl_dentry_is_opaque(dentry)) {
err = ovl_create_upper(dentry, inode, &stat, link, hardlink);
} else {
const struct cred *old_cred;
struct cred *override_cred;
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_iput;
/*
* CAP_SYS_ADMIN for setting opaque xattr
* CAP_DAC_OVERRIDE for create in workdir, rename
* CAP_FOWNER for removing whiteout from sticky dir
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
old_cred = override_creds(override_cred);
err = ovl_create_over_whiteout(dentry, inode, &stat, link,
hardlink);
revert_creds(old_cred);
put_cred(override_cred);
}
if (!err)
inode = NULL;
out_iput:
iput(inode);
out:
return err;
}
static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
const char *link)
{
int err;
err = ovl_want_write(dentry);
if (!err) {
err = ovl_create_or_link(dentry, mode, rdev, link, NULL);
ovl_drop_write(dentry);
}
return err;
}
static int ovl_create(struct inode *dir, struct dentry *dentry, umode_t mode,
bool excl)
{
return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL);
}
static int ovl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL);
}
static int ovl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
dev_t rdev)
{
/* Don't allow creation of "whiteout" on overlay */
if (S_ISCHR(mode) && rdev == WHITEOUT_DEV)
return -EPERM;
return ovl_create_object(dentry, mode, rdev, NULL);
}
static int ovl_symlink(struct inode *dir, struct dentry *dentry,
const char *link)
{
return ovl_create_object(dentry, S_IFLNK, 0, link);
}
static int ovl_link(struct dentry *old, struct inode *newdir,
struct dentry *new)
{
int err;
struct dentry *upper;
err = ovl_want_write(old);
if (err)
goto out;
err = ovl_copy_up(old);
if (err)
goto out_drop_write;
upper = ovl_dentry_upper(old);
err = ovl_create_or_link(new, upper->d_inode->i_mode, 0, NULL, upper);
out_drop_write:
ovl_drop_write(old);
out:
return err;
}
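/*
 * Remove an object that is still visible on a lower layer by renaming
 * a freshly created whiteout from workdir over it.  Merged directories
 * must first be checked (and cleared) for emptiness, and are displaced
 * with RENAME_EXCHANGE so the old directory can be cleaned up.
 */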
static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir)
{
struct dentry *workdir = ovl_workdir(dentry);
struct inode *wdir = workdir->d_inode;
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *udir = upperdir->d_inode;
struct dentry *whiteout;
struct dentry *upper;
struct dentry *opaquedir = NULL;
int err;
int flags = 0;
if (WARN_ON(!workdir))
return -EROFS;
if (is_dir) {
if (OVL_TYPE_MERGE_OR_LOWER(ovl_path_type(dentry))) {
opaquedir = ovl_check_empty_and_clear(dentry);
err = PTR_ERR(opaquedir);
if (IS_ERR(opaquedir))
goto out;
} else {
LIST_HEAD(list);
/*
* When removing an empty opaque directory, then it
* makes no sense to replace it with an exact replica of
* itself. But emptiness still needs to be checked.
*/
err = ovl_check_empty_dir(dentry, &list);
ovl_cache_free(&list);
if (err)
goto out;
}
}
err = ovl_lock_rename_workdir(workdir, upperdir);
if (err)
goto out_dput;
upper = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto out_unlock;
err = -ESTALE;
if ((opaquedir && upper != opaquedir) ||
(!opaquedir && ovl_dentry_upper(dentry) &&
upper != ovl_dentry_upper(dentry))) {
goto out_dput_upper;
}
whiteout = ovl_whiteout(workdir, dentry);
err = PTR_ERR(whiteout);
if (IS_ERR(whiteout))
goto out_dput_upper;
if (d_is_dir(upper))
flags = RENAME_EXCHANGE;
err = ovl_do_rename(wdir, whiteout, udir, upper, flags);
if (err)
goto kill_whiteout;
if (flags)
ovl_cleanup(wdir, upper);
ovl_dentry_version_inc(dentry->d_parent);
out_d_drop:
d_drop(dentry);
dput(whiteout);
out_dput_upper:
dput(upper);
out_unlock:
unlock_rename(workdir, upperdir);
out_dput:
dput(opaquedir);
out:
return err;
kill_whiteout:
ovl_cleanup(wdir, whiteout);
goto out_d_drop;
}
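/* Removal of a pure upper object needs no whiteout; unlink/rmdir it in place */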
static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
{
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *dir = upperdir->d_inode;
struct dentry *upper;
int err;
mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
upper = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto out_unlock;
err = -ESTALE;
if (upper == ovl_dentry_upper(dentry)) {
if (is_dir)
err = vfs_rmdir(dir, upper);
else
err = vfs_unlink(dir, upper, NULL);
ovl_dentry_version_inc(dentry->d_parent);
}
dput(upper);
/*
* Keeping this dentry hashed would mean having to release
* upperpath/lowerpath, which could only be done if we are the
* sole user of this dentry. Too tricky... Just unhash for
* now.
*/
if (!err)
d_drop(dentry);
out_unlock:
mutex_unlock(&dir->i_mutex);
return err;
}
static inline int ovl_check_sticky(struct dentry *dentry)
{
struct inode *dir = ovl_dentry_real(dentry->d_parent)->d_inode;
struct inode *inode = ovl_dentry_real(dentry)->d_inode;
if (check_sticky(dir, inode))
return -EPERM;
return 0;
}
static int ovl_do_remove(struct dentry *dentry, bool is_dir)
{
enum ovl_path_type type;
int err;
err = ovl_check_sticky(dentry);
if (err)
goto out;
err = ovl_want_write(dentry);
if (err)
goto out;
err = ovl_copy_up(dentry->d_parent);
if (err)
goto out_drop_write;
type = ovl_path_type(dentry);
if (OVL_TYPE_PURE_UPPER(type)) {
err = ovl_remove_upper(dentry, is_dir);
} else {
const struct cred *old_cred;
struct cred *override_cred;
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_drop_write;
/*
* CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
* CAP_DAC_OVERRIDE for create in workdir, rename
* CAP_FOWNER for removing whiteout from sticky dir
* CAP_FSETID for chmod of opaque dir
* CAP_CHOWN for chown of opaque dir
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
cap_raise(override_cred->cap_effective, CAP_FSETID);
cap_raise(override_cred->cap_effective, CAP_CHOWN);
old_cred = override_creds(override_cred);
err = ovl_remove_and_whiteout(dentry, is_dir);
revert_creds(old_cred);
put_cred(override_cred);
}
out_drop_write:
ovl_drop_write(dentry);
out:
return err;
}
static int ovl_unlink(struct inode *dir, struct dentry *dentry)
{
return ovl_do_remove(dentry, false);
}
static int ovl_rmdir(struct inode *dir, struct dentry *dentry)
{
return ovl_do_remove(dentry, true);
}
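/*
 * Rename for overlayfs.  Pure upper objects are renamed in place;
 * anything involving a lower layer is copied up first, and whiteouts
 * are created, exchanged or cleaned up as needed so the source name
 * stays deleted and the target keeps hiding stale lower entries.
 */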
static int ovl_rename2(struct inode *olddir, struct dentry *old,
struct inode *newdir, struct dentry *new,
unsigned int flags)
{
int err;
enum ovl_path_type old_type;
enum ovl_path_type new_type;
struct dentry *old_upperdir;
struct dentry *new_upperdir;
struct dentry *olddentry;
struct dentry *newdentry;
struct dentry *trap;
bool old_opaque;
bool new_opaque;
bool new_create = false;
bool cleanup_whiteout = false;
bool overwrite = !(flags & RENAME_EXCHANGE);
bool is_dir = S_ISDIR(old->d_inode->i_mode);
bool new_is_dir = false;
struct dentry *opaquedir = NULL;
const struct cred *old_cred = NULL;
struct cred *override_cred = NULL;
err = -EINVAL;
if (flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE))
goto out;
flags &= ~RENAME_NOREPLACE;
err = ovl_check_sticky(old);
if (err)
goto out;
/* Don't copy up directory trees */
old_type = ovl_path_type(old);
err = -EXDEV;
if (OVL_TYPE_MERGE_OR_LOWER(old_type) && is_dir)
goto out;
if (new->d_inode) {
err = ovl_check_sticky(new);
if (err)
goto out;
if (S_ISDIR(new->d_inode->i_mode))
new_is_dir = true;
new_type = ovl_path_type(new);
err = -EXDEV;
if (!overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir)
goto out;
err = 0;
if (!OVL_TYPE_UPPER(new_type) && !OVL_TYPE_UPPER(old_type)) {
if (ovl_dentry_lower(old)->d_inode ==
ovl_dentry_lower(new)->d_inode)
goto out;
}
if (OVL_TYPE_UPPER(new_type) && OVL_TYPE_UPPER(old_type)) {
if (ovl_dentry_upper(old)->d_inode ==
ovl_dentry_upper(new)->d_inode)
goto out;
}
} else {
if (ovl_dentry_is_opaque(new))
new_type = __OVL_PATH_UPPER;
else
new_type = __OVL_PATH_UPPER | __OVL_PATH_PURE;
}
err = ovl_want_write(old);
if (err)
goto out;
err = ovl_copy_up(old);
if (err)
goto out_drop_write;
err = ovl_copy_up(new->d_parent);
if (err)
goto out_drop_write;
if (!overwrite) {
err = ovl_copy_up(new);
if (err)
goto out_drop_write;
}
old_opaque = !OVL_TYPE_PURE_UPPER(old_type);
new_opaque = !OVL_TYPE_PURE_UPPER(new_type);
if (old_opaque || new_opaque) {
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_drop_write;
/*
* CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
* CAP_DAC_OVERRIDE for create in workdir
* CAP_FOWNER for removing whiteout from sticky dir
* CAP_FSETID for chmod of opaque dir
* CAP_CHOWN for chown of opaque dir
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
cap_raise(override_cred->cap_effective, CAP_FSETID);
cap_raise(override_cred->cap_effective, CAP_CHOWN);
old_cred = override_creds(override_cred);
}
if (overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir) {
opaquedir = ovl_check_empty_and_clear(new);
err = PTR_ERR(opaquedir);
if (IS_ERR(opaquedir)) {
opaquedir = NULL;
goto out_revert_creds;
}
}
if (overwrite) {
if (old_opaque) {
if (new->d_inode || !new_opaque) {
/* Whiteout source */
flags |= RENAME_WHITEOUT;
} else {
/* Switch whiteouts */
flags |= RENAME_EXCHANGE;
}
} else if (is_dir && !new->d_inode && new_opaque) {
flags |= RENAME_EXCHANGE;
cleanup_whiteout = true;
}
}
old_upperdir = ovl_dentry_upper(old->d_parent);
new_upperdir = ovl_dentry_upper(new->d_parent);
trap = lock_rename(new_upperdir, old_upperdir);
olddentry = lookup_one_len(old->d_name.name, old_upperdir,
old->d_name.len);
err = PTR_ERR(olddentry);
if (IS_ERR(olddentry))
goto out_unlock;
err = -ESTALE;
if (olddentry != ovl_dentry_upper(old))
goto out_dput_old;
newdentry = lookup_one_len(new->d_name.name, new_upperdir,
new->d_name.len);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out_dput_old;
err = -ESTALE;
if (ovl_dentry_upper(new)) {
if (opaquedir) {
if (newdentry != opaquedir)
goto out_dput;
} else {
if (newdentry != ovl_dentry_upper(new))
goto out_dput;
}
} else {
new_create = true;
if (!d_is_negative(newdentry) &&
(!new_opaque || !ovl_is_whiteout(newdentry)))
goto out_dput;
}
if (olddentry == trap)
goto out_dput;
if (newdentry == trap)
goto out_dput;
if (is_dir && !old_opaque && new_opaque) {
err = ovl_set_opaque(olddentry);
if (err)
goto out_dput;
}
if (!overwrite && new_is_dir && old_opaque && !new_opaque) {
err = ovl_set_opaque(newdentry);
if (err)
goto out_dput;
}
if (old_opaque || new_opaque) {
err = ovl_do_rename(old_upperdir->d_inode, olddentry,
new_upperdir->d_inode, newdentry,
flags);
} else {
/* No debug for the plain case */
BUG_ON(flags & ~RENAME_EXCHANGE);
err = vfs_rename(old_upperdir->d_inode, olddentry,
new_upperdir->d_inode, newdentry,
NULL, flags);
}
if (err) {
if (is_dir && !old_opaque && new_opaque)
ovl_remove_opaque(olddentry);
if (!overwrite && new_is_dir && old_opaque && !new_opaque)
ovl_remove_opaque(newdentry);
goto out_dput;
}
if (is_dir && old_opaque && !new_opaque)
ovl_remove_opaque(olddentry);
if (!overwrite && new_is_dir && !old_opaque && new_opaque)
ovl_remove_opaque(newdentry);
if (old_opaque != new_opaque) {
ovl_dentry_set_opaque(old, new_opaque);
if (!overwrite)
ovl_dentry_set_opaque(new, old_opaque);
}
if (cleanup_whiteout)
ovl_cleanup(old_upperdir->d_inode, newdentry);
ovl_dentry_version_inc(old->d_parent);
ovl_dentry_version_inc(new->d_parent);
out_dput:
dput(newdentry);
out_dput_old:
dput(olddentry);
out_unlock:
unlock_rename(new_upperdir, old_upperdir);
out_revert_creds:
if (old_opaque || new_opaque) {
revert_creds(old_cred);
put_cred(override_cred);
}
out_drop_write:
ovl_drop_write(old);
out:
dput(opaquedir);
return err;
}
static int ovl_rename(struct inode *olddir, struct dentry *old,
struct inode *newdir, struct dentry *new)
{
return ovl_rename2(olddir, old, newdir, new, 0);
}
const struct inode_operations_wrapper ovl_dir_inode_operations = {
.ops = {
.lookup = ovl_lookup,
.mkdir = ovl_mkdir,
.symlink = ovl_symlink,
.unlink = ovl_unlink,
.rmdir = ovl_rmdir,
.rename = ovl_rename,
.link = ovl_link,
.setattr = ovl_setattr,
.create = ovl_create,
.mknod = ovl_mknod,
.permission = ovl_permission,
.getattr = ovl_dir_getattr,
.setxattr = ovl_setxattr,
.getxattr = ovl_getxattr,
.listxattr = ovl_listxattr,
.removexattr = ovl_removexattr,
},
.rename2 = ovl_rename2,
};


@ -1,442 +0,0 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/xattr.h>
#include "overlayfs.h"
static int ovl_copy_up_last(struct dentry *dentry, struct iattr *attr,
bool no_data)
{
int err;
struct dentry *parent;
struct kstat stat;
struct path lowerpath;
parent = dget_parent(dentry);
err = ovl_copy_up(parent);
if (err)
goto out_dput_parent;
ovl_path_lower(dentry, &lowerpath);
err = vfs_getattr(&lowerpath, &stat);
if (err)
goto out_dput_parent;
if (no_data)
stat.size = 0;
err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat, attr);
out_dput_parent:
dput(parent);
return err;
}
int ovl_setattr(struct dentry *dentry, struct iattr *attr)
{
int err;
struct dentry *upperdentry;
err = ovl_want_write(dentry);
if (err)
goto out;
err = ovl_copy_up(dentry);
if (!err) {
upperdentry = ovl_dentry_upper(dentry);
mutex_lock(&upperdentry->d_inode->i_mutex);
err = notify_change(upperdentry, attr, NULL);
mutex_unlock(&upperdentry->d_inode->i_mutex);
}
ovl_drop_write(dentry);
out:
return err;
}
static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
struct path realpath;
ovl_path_real(dentry, &realpath);
return vfs_getattr(&realpath, stat);
}
int ovl_permission(struct inode *inode, int mask)
{
struct ovl_entry *oe;
struct dentry *alias = NULL;
struct inode *realinode;
struct dentry *realdentry;
bool is_upper;
int err;
if (S_ISDIR(inode->i_mode)) {
oe = inode->i_private;
} else if (mask & MAY_NOT_BLOCK) {
return -ECHILD;
} else {
/*
* For non-directories find an alias and get the info
* from there.
*/
alias = d_find_any_alias(inode);
if (WARN_ON(!alias))
return -ENOENT;
oe = alias->d_fsdata;
}
realdentry = ovl_entry_real(oe, &is_upper);
/* Careful in RCU walk mode */
realinode = ACCESS_ONCE(realdentry->d_inode);
if (!realinode) {
WARN_ON(!(mask & MAY_NOT_BLOCK));
err = -ENOENT;
goto out_dput;
}
if (mask & MAY_WRITE) {
umode_t mode = realinode->i_mode;
/*
* Writes will always be redirected to upper layer, so
* ignore lower layer being read-only.
*
* If the overlay itself is read-only then proceed
* with the permission check, don't return EROFS.
* This will only happen if this is the lower layer of
* another overlayfs.
*
* If upper fs becomes read-only after the overlay was
* constructed return EROFS to prevent modification of
* upper layer.
*/
err = -EROFS;
if (is_upper && !IS_RDONLY(inode) && IS_RDONLY(realinode) &&
(S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
goto out_dput;
}
err = __inode_permission(realinode, mask);
out_dput:
dput(alias);
return err;
}
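/* State handed from ovl_follow_link() to ovl_put_link() via the cookie */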
struct ovl_link_data {
struct dentry *realdentry;
void *cookie;
};
static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd)
{
void *ret;
struct dentry *realdentry;
struct inode *realinode;
struct ovl_link_data *data = NULL;
realdentry = ovl_dentry_real(dentry);
realinode = realdentry->d_inode;
if (WARN_ON(!realinode->i_op->follow_link))
return ERR_PTR(-EPERM);
if (realinode->i_op->put_link) {
data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL);
if (!data)
return ERR_PTR(-ENOMEM);
data->realdentry = realdentry;
}
ret = realinode->i_op->follow_link(realdentry, nd);
if (IS_ERR(ret)) {
kfree(data);
return ret;
}
if (data)
data->cookie = ret;
return data;
}
static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
{
struct inode *realinode;
struct ovl_link_data *data = c;
if (!data)
return;
realinode = data->realdentry->d_inode;
realinode->i_op->put_link(data->realdentry, nd, data->cookie);
kfree(data);
}
static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
{
struct path realpath;
struct inode *realinode;
ovl_path_real(dentry, &realpath);
realinode = realpath.dentry->d_inode;
if (!realinode->i_op->readlink)
return -EINVAL;
touch_atime(&realpath);
return realinode->i_op->readlink(realpath.dentry, buf, bufsiz);
}
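/* "trusted.overlay.*" xattrs carry overlay-internal state and are never exposed */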
static bool ovl_is_private_xattr(const char *name)
{
return strncmp(name, OVL_XATTR_PRE_NAME, OVL_XATTR_PRE_LEN) == 0;
}
int ovl_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags)
{
int err;
struct dentry *upperdentry;
err = ovl_want_write(dentry);
if (err)
goto out;
err = -EPERM;
if (ovl_is_private_xattr(name))
goto out_drop_write;
err = ovl_copy_up(dentry);
if (err)
goto out_drop_write;
upperdentry = ovl_dentry_upper(dentry);
err = vfs_setxattr(upperdentry, name, value, size, flags);
out_drop_write:
ovl_drop_write(dentry);
out:
return err;
}
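/*
 * Only non-pure upper directories can carry private overlay xattrs
 * (such as the opaque marker), so only their listings need filtering.
 */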
static bool ovl_need_xattr_filter(struct dentry *dentry,
enum ovl_path_type type)
{
if ((type & (__OVL_PATH_PURE | __OVL_PATH_UPPER)) == __OVL_PATH_UPPER)
return S_ISDIR(dentry->d_inode->i_mode);
else
return false;
}
ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
void *value, size_t size)
{
struct path realpath;
enum ovl_path_type type = ovl_path_real(dentry, &realpath);
if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
return -ENODATA;
return vfs_getxattr(realpath.dentry, name, value, size);
}
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
{
struct path realpath;
enum ovl_path_type type = ovl_path_real(dentry, &realpath);
ssize_t res;
int off;
res = vfs_listxattr(realpath.dentry, list, size);
if (res <= 0 || size == 0)
return res;
if (!ovl_need_xattr_filter(dentry, type))
return res;
/* filter out private xattrs */
for (off = 0; off < res;) {
char *s = list + off;
size_t slen = strlen(s) + 1;
BUG_ON(off + slen > res);
if (ovl_is_private_xattr(s)) {
res -= slen;
memmove(s, s + slen, res - off);
} else {
off += slen;
}
}
return res;
}
int ovl_removexattr(struct dentry *dentry, const char *name)
{
int err;
struct path realpath;
enum ovl_path_type type = ovl_path_real(dentry, &realpath);
err = ovl_want_write(dentry);
if (err)
goto out;
err = -ENODATA;
if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
goto out_drop_write;
if (!OVL_TYPE_UPPER(type)) {
err = vfs_getxattr(realpath.dentry, name, NULL, 0);
if (err < 0)
goto out_drop_write;
err = ovl_copy_up(dentry);
if (err)
goto out_drop_write;
ovl_path_upper(dentry, &realpath);
}
err = vfs_removexattr(realpath.dentry, name);
out_drop_write:
ovl_drop_write(dentry);
out:
return err;
}
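/* Opening for write or with O_TRUNC forces copy-up, except for special files */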
static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type,
struct dentry *realdentry)
{
if (OVL_TYPE_UPPER(type))
return false;
if (special_file(realdentry->d_inode->i_mode))
return false;
if (!(OPEN_FMODE(flags) & FMODE_WRITE) && !(flags & O_TRUNC))
return false;
return true;
}
static int ovl_dentry_open(struct dentry *dentry, struct file *file,
const struct cred *cred)
{
int err;
struct path realpath;
enum ovl_path_type type;
bool want_write = false;
type = ovl_path_real(dentry, &realpath);
if (!ovl_is_nocopyupw(dentry)) {
if (ovl_open_need_copy_up(file->f_flags, type,
realpath.dentry)) {
want_write = true;
err = ovl_want_write(dentry);
if (err)
goto out;
if (file->f_flags & O_TRUNC)
err = ovl_copy_up_last(dentry, NULL, true);
else
err = ovl_copy_up(dentry);
if (err)
goto out_drop_write;
ovl_path_upper(dentry, &realpath);
}
}
err = vfs_open(&realpath, file, cred);
out_drop_write:
if (want_write)
ovl_drop_write(dentry);
out:
return err;
}
static const struct inode_operations_wrapper ovl_file_inode_operations = {
.ops = {
.setattr = ovl_setattr,
.permission = ovl_permission,
.getattr = ovl_getattr,
.setxattr = ovl_setxattr,
.getxattr = ovl_getxattr,
.listxattr = ovl_listxattr,
.removexattr = ovl_removexattr,
},
.dentry_open = ovl_dentry_open,
};
static const struct inode_operations ovl_symlink_inode_operations = {
.setattr = ovl_setattr,
.follow_link = ovl_follow_link,
.put_link = ovl_put_link,
.readlink = ovl_readlink,
.getattr = ovl_getattr,
.setxattr = ovl_setxattr,
.getxattr = ovl_getxattr,
.listxattr = ovl_listxattr,
.removexattr = ovl_removexattr,
};
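/*
 * Allocate an overlay inode and wire up the operation tables for its
 * file type.  Directories and regular-style files use wrapper tables,
 * flagged with S_IOPS_WRAPPER to mark that i_op really points to an
 * inode_operations_wrapper.
 */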
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
struct ovl_entry *oe)
{
struct inode *inode;
inode = new_inode(sb);
if (!inode)
return NULL;
mode &= S_IFMT;
inode->i_ino = get_next_ino();
inode->i_mode = mode;
inode->i_flags |= S_NOATIME | S_NOCMTIME;
switch (mode) {
case S_IFDIR:
inode->i_private = oe;
inode->i_op = &ovl_dir_inode_operations.ops;
inode->i_fop = &ovl_dir_operations;
inode->i_flags |= S_IOPS_WRAPPER;
break;
case S_IFLNK:
inode->i_op = &ovl_symlink_inode_operations;
break;
case S_IFREG:
case S_IFSOCK:
case S_IFBLK:
case S_IFCHR:
case S_IFIFO:
inode->i_op = &ovl_file_inode_operations.ops;
inode->i_flags |= S_IOPS_WRAPPER;
break;
default:
WARN(1, "illegal file type: %i\n", mode);
iput(inode);
inode = NULL;
}
return inode;
}


@ -1,200 +0,0 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/kernel.h>
struct ovl_entry;
enum ovl_path_type {
__OVL_PATH_PURE = (1 << 0),
__OVL_PATH_UPPER = (1 << 1),
__OVL_PATH_MERGE = (1 << 2),
};
#define OVL_TYPE_UPPER(type) ((type) & __OVL_PATH_UPPER)
#define OVL_TYPE_MERGE(type) ((type) & __OVL_PATH_MERGE)
#define OVL_TYPE_PURE_UPPER(type) ((type) & __OVL_PATH_PURE)
#define OVL_TYPE_MERGE_OR_LOWER(type) \
(OVL_TYPE_MERGE(type) || !OVL_TYPE_UPPER(type))
#define OVL_XATTR_PRE_NAME "trusted.overlay."
#define OVL_XATTR_PRE_LEN 16
#define OVL_XATTR_OPAQUE OVL_XATTR_PRE_NAME"opaque"
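/* Thin wrappers around the VFS helpers that add pr_debug tracing */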
static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry)
{
int err = vfs_rmdir(dir, dentry);
pr_debug("rmdir(%pd2) = %i\n", dentry, err);
return err;
}
static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry)
{
int err = vfs_unlink(dir, dentry, NULL);
pr_debug("unlink(%pd2) = %i\n", dentry, err);
return err;
}
static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir,
struct dentry *new_dentry, bool debug)
{
int err = vfs_link(old_dentry, dir, new_dentry, NULL);
if (debug) {
pr_debug("link(%pd2, %pd2) = %i\n",
old_dentry, new_dentry, err);
}
return err;
}
static inline int ovl_do_create(struct inode *dir, struct dentry *dentry,
umode_t mode, bool debug)
{
int err = vfs_create(dir, dentry, mode, true);
if (debug)
pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err);
return err;
}
static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry,
umode_t mode, bool debug)
{
int err = vfs_mkdir(dir, dentry, mode);
if (debug)
pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err);
return err;
}
static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry,
umode_t mode, dev_t dev, bool debug)
{
int err = vfs_mknod(dir, dentry, mode, dev);
if (debug) {
pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n",
dentry, mode, dev, err);
}
return err;
}
static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry,
const char *oldname, bool debug)
{
int err = vfs_symlink(dir, dentry, oldname);
if (debug)
pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err);
return err;
}
static inline int ovl_do_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags)
{
int err = vfs_setxattr(dentry, name, value, size, flags);
pr_debug("setxattr(%pd2, \"%s\", \"%*s\", 0x%x) = %i\n",
dentry, name, (int) size, (char *) value, flags, err);
return err;
}
static inline int ovl_do_removexattr(struct dentry *dentry, const char *name)
{
int err = vfs_removexattr(dentry, name);
pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, err);
return err;
}
static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry,
struct inode *newdir, struct dentry *newdentry,
unsigned int flags)
{
int err;
pr_debug("rename2(%pd2, %pd2, 0x%x)\n",
olddentry, newdentry, flags);
err = vfs_rename(olddir, olddentry, newdir, newdentry, NULL, flags);
if (err) {
pr_debug("...rename2(%pd2, %pd2, ...) = %i\n",
olddentry, newdentry, err);
}
return err;
}
static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry)
{
int err = vfs_whiteout(dir, dentry);
pr_debug("whiteout(%pd2) = %i\n", dentry, err);
return err;
}
bool ovl_is_nocopyupw(struct dentry *dentry);
enum ovl_path_type ovl_path_type(struct dentry *dentry);
u64 ovl_dentry_version_get(struct dentry *dentry);
void ovl_dentry_version_inc(struct dentry *dentry);
void ovl_path_upper(struct dentry *dentry, struct path *path);
void ovl_path_lower(struct dentry *dentry, struct path *path);
enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path);
int ovl_path_next(int idx, struct dentry *dentry, struct path *path);
struct dentry *ovl_dentry_upper(struct dentry *dentry);
struct dentry *ovl_dentry_lower(struct dentry *dentry);
struct dentry *ovl_dentry_real(struct dentry *dentry);
struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper);
struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry);
void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache);
struct dentry *ovl_workdir(struct dentry *dentry);
int ovl_want_write(struct dentry *dentry);
void ovl_drop_write(struct dentry *dentry);
bool ovl_dentry_is_opaque(struct dentry *dentry);
void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque);
bool ovl_is_whiteout(struct dentry *dentry);
void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry);
struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags);
struct file *ovl_path_open(struct path *path, int flags);
struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry,
struct kstat *stat, const char *link);
/* readdir.c */
extern const struct file_operations ovl_dir_operations;
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list);
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list);
void ovl_cache_free(struct list_head *list);
/* inode.c */
int ovl_setattr(struct dentry *dentry, struct iattr *attr);
int ovl_permission(struct inode *inode, int mask);
int ovl_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags);
ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
void *value, size_t size);
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
int ovl_removexattr(struct dentry *dentry, const char *name);
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
struct ovl_entry *oe);
static inline void ovl_copyattr(struct inode *from, struct inode *to)
{
to->i_uid = from->i_uid;
to->i_gid = from->i_gid;
}
/* dir.c */
extern const struct inode_operations_wrapper ovl_dir_inode_operations;
struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry);
int ovl_create_real(struct inode *dir, struct dentry *newdentry,
struct kstat *stat, const char *link,
struct dentry *hardlink, bool debug);
void ovl_cleanup(struct inode *dir, struct dentry *dentry);
/* copy_up.c */
int ovl_copy_up(struct dentry *dentry);
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
struct path *lowerpath, struct kstat *stat,
struct iattr *attr);
int ovl_copy_xattr(struct dentry *old, struct dentry *new);
int ovl_set_attr(struct dentry *upper, struct kstat *stat);


@ -1,626 +0,0 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/file.h>
#include <linux/xattr.h>
#include <linux/rbtree.h>
#include <linux/security.h>
#include <linux/cred.h>
#include <linux/version.h>
#include "overlayfs.h"
struct ovl_cache_entry {
unsigned int len;
unsigned int type;
u64 ino;
struct list_head l_node;
struct rb_node node;
struct ovl_cache_entry *next_maybe_whiteout;
bool is_whiteout;
char name[];
};
struct ovl_dir_cache {
long refcount;
u64 version;
struct list_head entries;
};
/* vfs_readdir vs. iterate_dir compat */
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 11, 0) || \
(defined(RHEL_RELEASE_CODE) && RHEL_RELEASE_CODE >= RHEL_RELEASE_VERSION(7, 5))
#define USE_ITERATE_DIR 1
#endif
#ifndef USE_ITERATE_DIR
struct dir_context {
const filldir_t actor;
//loff_t pos;
};
#endif
struct ovl_readdir_data {
struct dir_context ctx;
bool is_merge;
struct rb_root root;
struct list_head *list;
struct list_head middle;
struct ovl_cache_entry *first_maybe_whiteout;
int count;
int err;
};
struct ovl_dir_file {
bool is_real;
bool is_upper;
struct ovl_dir_cache *cache;
struct list_head *cursor;
struct file *realfile;
struct file *upperfile;
};
static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n)
{
return container_of(n, struct ovl_cache_entry, node);
}
static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root,
const char *name, int len)
{
struct rb_node *node = root->rb_node;
int cmp;
while (node) {
struct ovl_cache_entry *p = ovl_cache_entry_from_node(node);
cmp = strncmp(name, p->name, len);
if (cmp > 0)
node = p->node.rb_right;
else if (cmp < 0 || len < p->len)
node = p->node.rb_left;
else
return p;
}
return NULL;
}
static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd,
const char *name, int len,
u64 ino, unsigned int d_type)
{
struct ovl_cache_entry *p;
size_t size = offsetof(struct ovl_cache_entry, name[len + 1]);
p = kmalloc(size, GFP_KERNEL);
if (!p)
return NULL;
memcpy(p->name, name, len);
p->name[len] = '\0';
p->len = len;
p->type = d_type;
p->ino = ino;
p->is_whiteout = false;
if (d_type == DT_CHR) {
p->next_maybe_whiteout = rdd->first_maybe_whiteout;
rdd->first_maybe_whiteout = p;
}
return p;
}
static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
const char *name, int len, u64 ino,
unsigned int d_type)
{
struct rb_node **newp = &rdd->root.rb_node;
struct rb_node *parent = NULL;
struct ovl_cache_entry *p;
while (*newp) {
int cmp;
struct ovl_cache_entry *tmp;
parent = *newp;
tmp = ovl_cache_entry_from_node(*newp);
cmp = strncmp(name, tmp->name, len);
if (cmp > 0)
newp = &tmp->node.rb_right;
else if (cmp < 0 || len < tmp->len)
newp = &tmp->node.rb_left;
else
return 0;
}
p = ovl_cache_entry_new(rdd, name, len, ino, d_type);
if (p == NULL)
return -ENOMEM;
list_add_tail(&p->l_node, rdd->list);
rb_link_node(&p->node, parent, newp);
rb_insert_color(&p->node, &rdd->root);
return 0;
}
static int ovl_fill_lower(struct ovl_readdir_data *rdd,
const char *name, int namelen,
loff_t offset, u64 ino, unsigned int d_type)
{
struct ovl_cache_entry *p;
p = ovl_cache_entry_find(&rdd->root, name, namelen);
if (p) {
list_move_tail(&p->l_node, &rdd->middle);
} else {
p = ovl_cache_entry_new(rdd, name, namelen, ino, d_type);
if (p == NULL)
rdd->err = -ENOMEM;
else
list_add_tail(&p->l_node, &rdd->middle);
}
return rdd->err;
}
void ovl_cache_free(struct list_head *list)
{
struct ovl_cache_entry *p;
struct ovl_cache_entry *n;
list_for_each_entry_safe(p, n, list, l_node)
kfree(p);
INIT_LIST_HEAD(list);
}
static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry)
{
struct ovl_dir_cache *cache = od->cache;
WARN_ON(cache->refcount <= 0);
cache->refcount--;
if (!cache->refcount) {
if (ovl_dir_cache(dentry) == cache)
ovl_set_dir_cache(dentry, NULL);
ovl_cache_free(&cache->entries);
kfree(cache);
}
}
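/*
 * filldir callback: non-merge passes add entries to the rb-tree; the
 * merge pass over the lowest layer goes through ovl_fill_lower().
 */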
static int ovl_fill_merge(void *buf, const char *name, int namelen,
loff_t offset, u64 ino, unsigned int d_type)
{
struct dir_context *ctx = buf;
struct ovl_readdir_data *rdd =
container_of(ctx, struct ovl_readdir_data, ctx);
rdd->count++;
if (!rdd->is_merge)
return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type);
else
return ovl_fill_lower(rdd, name, namelen, offset, ino, d_type);
}
static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd)
{
int err;
struct ovl_cache_entry *p;
struct dentry *dentry;
const struct cred *old_cred;
struct cred *override_cred;
override_cred = prepare_creds();
if (!override_cred)
return -ENOMEM;
/*
* CAP_DAC_OVERRIDE for lookup
*/
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
old_cred = override_creds(override_cred);
err = mutex_lock_killable(&dir->d_inode->i_mutex);
if (!err) {
while (rdd->first_maybe_whiteout) {
p = rdd->first_maybe_whiteout;
rdd->first_maybe_whiteout = p->next_maybe_whiteout;
dentry = lookup_one_len(p->name, dir, p->len);
if (!IS_ERR(dentry)) {
p->is_whiteout = ovl_is_whiteout(dentry);
dput(dentry);
}
}
mutex_unlock(&dir->d_inode->i_mutex);
}
revert_creds(old_cred);
put_cred(override_cred);
return err;
}
static inline int ovl_dir_read(struct path *realpath,
struct ovl_readdir_data *rdd)
{
struct file *realfile;
int err;
realfile = ovl_path_open(realpath, O_RDONLY | O_DIRECTORY);
if (IS_ERR(realfile))
return PTR_ERR(realfile);
rdd->first_maybe_whiteout = NULL;
//rdd->ctx.pos = 0;
do {
rdd->count = 0;
rdd->err = 0;
#ifdef USE_ITERATE_DIR
err = iterate_dir(realfile, &rdd->ctx);
#else
err = vfs_readdir(realfile, rdd->ctx.actor, rdd);
#endif
if (err >= 0)
err = rdd->err;
} while (!err && rdd->count);
if (!err && rdd->first_maybe_whiteout)
err = ovl_check_whiteouts(realpath->dentry, rdd);
fput(realfile);
return err;
}
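/*
 * Drop a cached listing whose version no longer matches the directory,
 * and stop treating the directory as "real" once it has become merged.
 */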
static void ovl_dir_reset(struct file *file)
{
struct ovl_dir_file *od = file->private_data;
struct ovl_dir_cache *cache = od->cache;
struct dentry *dentry = file->f_path.dentry;
enum ovl_path_type type = ovl_path_type(dentry);
if (cache && ovl_dentry_version_get(dentry) != cache->version) {
ovl_cache_put(od, dentry);
od->cache = NULL;
od->cursor = NULL;
}
WARN_ON(!od->is_real && !OVL_TYPE_MERGE(type));
if (od->is_real && OVL_TYPE_MERGE(type))
od->is_real = false;
}
static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list)
{
int err;
struct path realpath;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_fill_merge,
.list = list,
.root = RB_ROOT,
.is_merge = false,
};
int idx, next;
for (idx = 0; idx != -1; idx = next) {
next = ovl_path_next(idx, dentry, &realpath);
if (next != -1) {
err = ovl_dir_read(&realpath, &rdd);
if (err)
break;
} else {
/*
* Insert lowest layer entries before upper ones; this
* keeps offsets reasonably constant
*/
list_add(&rdd.middle, rdd.list);
rdd.is_merge = true;
err = ovl_dir_read(&realpath, &rdd);
list_del(&rdd.middle);
}
}
return err;
}
static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos)
{
struct list_head *p;
loff_t off = 0;
list_for_each(p, &od->cache->entries) {
if (off >= pos)
break;
off++;
}
/* Cursor is safe since the cache is stable */
od->cursor = p;
}
static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
{
int res;
struct ovl_dir_cache *cache;
cache = ovl_dir_cache(dentry);
if (cache && ovl_dentry_version_get(dentry) == cache->version) {
cache->refcount++;
return cache;
}
ovl_set_dir_cache(dentry, NULL);
cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL);
if (!cache)
return ERR_PTR(-ENOMEM);
cache->refcount = 1;
INIT_LIST_HEAD(&cache->entries);
res = ovl_dir_read_merged(dentry, &cache->entries);
if (res) {
ovl_cache_free(&cache->entries);
kfree(cache);
return ERR_PTR(res);
}
cache->version = ovl_dentry_version_get(dentry);
ovl_set_dir_cache(dentry, cache);
return cache;
}
#ifdef USE_ITERATE_DIR
struct iterate_wrapper {
struct dir_context ctx;
filldir_t actor;
void *buf;
};
static int ovl_wrap_readdir(void *ctx, const char *name, int namelen,
loff_t offset, u64 ino, unsigned int d_type)
{
struct iterate_wrapper *w = ctx;
return w->actor(w->buf, name, namelen, offset, ino, d_type);
}
#endif
static int ovl_readdir(struct file *file, void *buf, filldir_t filler)
{
struct ovl_dir_file *od = file->private_data;
struct dentry *dentry = file->f_path.dentry;
struct ovl_cache_entry *p;
int res;
if (!file->f_pos)
ovl_dir_reset(file);
if (od->is_real) {
#ifdef USE_ITERATE_DIR
struct iterate_wrapper w = {
.ctx.actor = ovl_wrap_readdir,
.actor = filler,
.buf = buf,
};
res = iterate_dir(od->realfile, &w.ctx);
#else
res = vfs_readdir(od->realfile, filler, buf);
#endif
file->f_pos = od->realfile->f_pos;
return res;
}
if (!od->cache) {
struct ovl_dir_cache *cache;
cache = ovl_cache_get(dentry);
if (IS_ERR(cache))
return PTR_ERR(cache);
od->cache = cache;
ovl_seek_cursor(od, file->f_pos);
}
while (od->cursor != &od->cache->entries) {
p = list_entry(od->cursor, struct ovl_cache_entry, l_node);
if (!p->is_whiteout)
if (filler(buf, p->name, p->len, file->f_pos, p->ino, p->type))
break;
od->cursor = p->l_node.next;
file->f_pos++;
}
return 0;
}
static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin)
{
loff_t res;
struct ovl_dir_file *od = file->private_data;
mutex_lock(&file_inode(file)->i_mutex);
if (!file->f_pos)
ovl_dir_reset(file);
if (od->is_real) {
res = vfs_llseek(od->realfile, offset, origin);
file->f_pos = od->realfile->f_pos;
} else {
res = -EINVAL;
switch (origin) {
case SEEK_CUR:
offset += file->f_pos;
break;
case SEEK_SET:
break;
default:
goto out_unlock;
}
if (offset < 0)
goto out_unlock;
if (offset != file->f_pos) {
file->f_pos = offset;
if (od->cache)
ovl_seek_cursor(od, offset);
}
res = offset;
}
out_unlock:
mutex_unlock(&file_inode(file)->i_mutex);
return res;
}
static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
int datasync)
{
struct ovl_dir_file *od = file->private_data;
struct dentry *dentry = file->f_path.dentry;
struct file *realfile = od->realfile;
/*
* Need to check if we started out being a lower dir, but got copied up
*/
if (!od->is_upper && OVL_TYPE_UPPER(ovl_path_type(dentry))) {
struct inode *inode = file_inode(file);
realfile = lockless_dereference(od->upperfile);
if (!realfile) {
struct path upperpath;
ovl_path_upper(dentry, &upperpath);
realfile = ovl_path_open(&upperpath, O_RDONLY);
smp_mb__before_spinlock();
mutex_lock(&inode->i_mutex);
if (!od->upperfile) {
if (IS_ERR(realfile)) {
mutex_unlock(&inode->i_mutex);
return PTR_ERR(realfile);
}
od->upperfile = realfile;
} else {
/* somebody has beaten us to it */
if (!IS_ERR(realfile))
fput(realfile);
realfile = od->upperfile;
}
mutex_unlock(&inode->i_mutex);
}
}
return vfs_fsync_range(realfile, start, end, datasync);
}
static int ovl_dir_release(struct inode *inode, struct file *file)
{
struct ovl_dir_file *od = file->private_data;
if (od->cache) {
mutex_lock(&inode->i_mutex);
ovl_cache_put(od, file->f_path.dentry);
mutex_unlock(&inode->i_mutex);
}
fput(od->realfile);
if (od->upperfile)
fput(od->upperfile);
kfree(od);
return 0;
}
static int ovl_dir_open(struct inode *inode, struct file *file)
{
struct path realpath;
struct file *realfile;
struct ovl_dir_file *od;
enum ovl_path_type type;
od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL);
if (!od)
return -ENOMEM;
type = ovl_path_real(file->f_path.dentry, &realpath);
realfile = ovl_path_open(&realpath, file->f_flags);
if (IS_ERR(realfile)) {
kfree(od);
return PTR_ERR(realfile);
}
od->realfile = realfile;
od->is_real = !OVL_TYPE_MERGE(type);
od->is_upper = OVL_TYPE_UPPER(type);
file->private_data = od;
return 0;
}
const struct file_operations ovl_dir_operations = {
.read = generic_read_dir,
.open = ovl_dir_open,
.readdir = ovl_readdir,
.llseek = ovl_dir_llseek,
.fsync = ovl_dir_fsync,
.release = ovl_dir_release,
};
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list)
{
int err;
struct ovl_cache_entry *p;
err = ovl_dir_read_merged(dentry, list);
if (err)
return err;
err = 0;
list_for_each_entry(p, list, l_node) {
if (p->is_whiteout)
continue;
if (p->name[0] == '.') {
if (p->len == 1)
continue;
if (p->len == 2 && p->name[1] == '.')
continue;
}
err = -ENOTEMPTY;
break;
}
return err;
}
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
{
struct ovl_cache_entry *p;
mutex_lock_nested(&upper->d_inode->i_mutex, I_MUTEX_CHILD);
list_for_each_entry(p, list, l_node) {
struct dentry *dentry;
if (!p->is_whiteout)
continue;
dentry = lookup_one_len(p->name, upper, p->len);
if (IS_ERR(dentry)) {
pr_err("overlayfs: lookup '%s/%.*s' failed (%i)\n",
upper->d_name.name, p->len, p->name,
(int) PTR_ERR(dentry));
continue;
}
ovl_cleanup(upper->d_inode, dentry);
dput(dentry);
}
mutex_unlock(&upper->d_inode->i_mutex);
}

File diff suppressed because it is too large


@ -1,416 +0,0 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/splice.h>
#include <linux/xattr.h>
#include <linux/security.h>
#include <linux/uaccess.h>
#include <linux/sched.h>
#include <linux/namei.h>
#include "overlayfs.h"
#define OVL_COPY_UP_CHUNK_SIZE (1 << 20)
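/* Copy all xattrs from old to new; missing xattr support is not an error */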
int ovl_copy_xattr(struct dentry *old, struct dentry *new)
{
ssize_t list_size, size;
char *buf, *name, *value;
int error;
if (!old->d_inode->i_op->getxattr ||
!new->d_inode->i_op->getxattr)
return 0;
list_size = vfs_listxattr(old, NULL, 0);
if (list_size <= 0) {
if (list_size == -EOPNOTSUPP)
return 0;
return list_size;
}
buf = kzalloc(list_size, GFP_KERNEL);
if (!buf)
return -ENOMEM;
error = -ENOMEM;
value = kmalloc(XATTR_SIZE_MAX, GFP_KERNEL);
if (!value)
goto out;
list_size = vfs_listxattr(old, buf, list_size);
if (list_size <= 0) {
error = list_size;
goto out_free_value;
}
for (name = buf; name < (buf + list_size); name += strlen(name) + 1) {
size = vfs_getxattr(old, name, value, XATTR_SIZE_MAX);
if (size <= 0) {
error = size;
goto out_free_value;
}
error = vfs_setxattr(new, name, value, size, 0);
if (error)
goto out_free_value;
}
out_free_value:
kfree(value);
out:
kfree(buf);
return error;
}
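/*
 * Copy file contents in OVL_COPY_UP_CHUNK_SIZE (1MiB) chunks with
 * do_splice_direct(), checking for fatal signals between chunks so a
 * killed copy-up does not have to run to completion.
 */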
static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
{
struct file *old_file;
struct file *new_file;
loff_t old_pos = 0;
loff_t new_pos = 0;
int error = 0;
if (len == 0)
return 0;
old_file = ovl_path_open(old, O_RDONLY);
if (IS_ERR(old_file))
return PTR_ERR(old_file);
new_file = ovl_path_open(new, O_WRONLY);
if (IS_ERR(new_file)) {
error = PTR_ERR(new_file);
goto out_fput;
}
/* FIXME: copy up sparse files efficiently */
while (len) {
size_t this_len = OVL_COPY_UP_CHUNK_SIZE;
long bytes;
if (len < this_len)
this_len = len;
if (signal_pending_state(TASK_KILLABLE, current)) {
error = -EINTR;
break;
}
bytes = do_splice_direct(old_file, &old_pos,
new_file, &new_pos,
this_len, SPLICE_F_MOVE);
if (bytes <= 0) {
error = bytes;
break;
}
WARN_ON(old_pos != new_pos);
len -= bytes;
}
fput(new_file);
out_fput:
fput(old_file);
return error;
}
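/* Read the lower symlink target into a freshly allocated, NUL-terminated page */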
static char *ovl_read_symlink(struct dentry *realdentry)
{
int res;
char *buf;
struct inode *inode = realdentry->d_inode;
mm_segment_t old_fs;
res = -EINVAL;
if (!inode->i_op->readlink)
goto err;
res = -ENOMEM;
buf = (char *) __get_free_page(GFP_KERNEL);
if (!buf)
goto err;
old_fs = get_fs();
set_fs(get_ds());
/* The cast to a user pointer is valid due to the set_fs() */
res = inode->i_op->readlink(realdentry,
(char __user *)buf, PAGE_SIZE - 1);
set_fs(old_fs);
if (res < 0) {
free_page((unsigned long) buf);
goto err;
}
buf[res] = '\0';
return buf;
err:
return ERR_PTR(res);
}
static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
{
struct iattr attr = {
.ia_valid =
ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET,
.ia_atime = stat->atime,
.ia_mtime = stat->mtime,
};
return notify_change(upperdentry, &attr, NULL);
}
int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
{
int err = 0;
if (!S_ISLNK(stat->mode)) {
struct iattr attr = {
.ia_valid = ATTR_MODE,
.ia_mode = stat->mode,
};
err = notify_change(upperdentry, &attr, NULL);
}
if (!err) {
struct iattr attr = {
.ia_valid = ATTR_UID | ATTR_GID,
.ia_uid = stat->uid,
.ia_gid = stat->gid,
};
err = notify_change(upperdentry, &attr, NULL);
}
if (!err)
ovl_set_timestamps(upperdentry, stat);
return err;
}
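/*
 * The actual copy-up, with workdir and upperdir locked: create a
 * temporary object in workdir, fill in data and xattrs, apply mode,
 * ownership and timestamps, then rename it atomically to its final
 * name in upperdir.
 */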
static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
struct dentry *dentry, struct path *lowerpath,
struct kstat *stat, struct iattr *attr,
const char *link)
{
struct inode *wdir = workdir->d_inode;
struct inode *udir = upperdir->d_inode;
struct dentry *newdentry = NULL;
struct dentry *upper = NULL;
umode_t mode = stat->mode;
int err;
newdentry = ovl_lookup_temp(workdir, dentry);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out;
upper = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto out1;
/* Can't properly set mode on creation because of the umask */
stat->mode &= S_IFMT;
err = ovl_create_real(wdir, newdentry, stat, link, NULL, true);
stat->mode = mode;
if (err)
goto out2;
if (S_ISREG(stat->mode)) {
struct path upperpath;
ovl_path_upper(dentry, &upperpath);
BUG_ON(upperpath.dentry != NULL);
upperpath.dentry = newdentry;
err = ovl_copy_up_data(lowerpath, &upperpath, stat->size);
if (err)
goto out_cleanup;
}
err = ovl_copy_xattr(lowerpath->dentry, newdentry);
if (err)
goto out_cleanup;
mutex_lock(&newdentry->d_inode->i_mutex);
err = ovl_set_attr(newdentry, stat);
if (!err && attr)
err = notify_change(newdentry, attr, NULL);
mutex_unlock(&newdentry->d_inode->i_mutex);
if (err)
goto out_cleanup;
err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
if (err)
goto out_cleanup;
ovl_dentry_update(dentry, newdentry);
newdentry = NULL;
/*
* Non-directories become opaque when copied up.
*/
if (!S_ISDIR(stat->mode))
ovl_dentry_set_opaque(dentry, true);
out2:
dput(upper);
out1:
dput(newdentry);
out:
return err;
out_cleanup:
ovl_cleanup(wdir, newdentry);
goto out;
}
/*
* Copy up a single dentry
*
* Directory renames are only allowed on "pure upper" directories
* (already created on the upper filesystem, never copied up).
* Directories which are on a lower layer or are merged may not be
* renamed; for these -EXDEV is returned and userspace has to deal
* with it. This means that when copying up a directory we can rely
* on it and its ancestors being stable.
*
* Non-directory renames start with copy up of source if necessary. The
* actual rename will only proceed once the copy up was successful. Copy
* up uses upper parent i_mutex for exclusion. Since rename can change
* d_parent it is possible that the copy up will lock the old parent. At
* that point the file will have already been copied up anyway.
*/
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
struct path *lowerpath, struct kstat *stat,
struct iattr *attr)
{
struct dentry *workdir = ovl_workdir(dentry);
int err;
struct kstat pstat;
struct path parentpath;
struct dentry *upperdir;
struct dentry *upperdentry;
const struct cred *old_cred;
struct cred *override_cred;
char *link = NULL;
if (WARN_ON(!workdir))
return -EROFS;
ovl_path_upper(parent, &parentpath);
upperdir = parentpath.dentry;
err = vfs_getattr(&parentpath, &pstat);
if (err)
return err;
if (S_ISLNK(stat->mode)) {
link = ovl_read_symlink(lowerpath->dentry);
if (IS_ERR(link))
return PTR_ERR(link);
}
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_free_link;
override_cred->fsuid = stat->uid;
override_cred->fsgid = stat->gid;
/*
* CAP_SYS_ADMIN for copying up extended attributes
* CAP_DAC_OVERRIDE for create
* CAP_FOWNER for chmod, timestamp update
* CAP_FSETID for chmod
* CAP_CHOWN for chown
* CAP_MKNOD for mknod
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
cap_raise(override_cred->cap_effective, CAP_FSETID);
cap_raise(override_cred->cap_effective, CAP_CHOWN);
cap_raise(override_cred->cap_effective, CAP_MKNOD);
old_cred = override_creds(override_cred);
err = -EIO;
if (lock_rename(workdir, upperdir) != NULL) {
pr_err("overlayfs: failed to lock workdir+upperdir\n");
goto out_unlock;
}
upperdentry = ovl_dentry_upper(dentry);
if (upperdentry) {
unlock_rename(workdir, upperdir);
err = 0;
/* Raced with another copy-up? Do the setattr here */
if (attr) {
mutex_lock(&upperdentry->d_inode->i_mutex);
err = notify_change(upperdentry, attr, NULL);
mutex_unlock(&upperdentry->d_inode->i_mutex);
}
goto out_put_cred;
}
err = ovl_copy_up_locked(workdir, upperdir, dentry, lowerpath,
stat, attr, link);
if (!err) {
/* Restore timestamps on parent (best effort) */
ovl_set_timestamps(upperdir, &pstat);
}
out_unlock:
unlock_rename(workdir, upperdir);
out_put_cred:
revert_creds(old_cred);
put_cred(override_cred);
out_free_link:
if (link)
free_page((unsigned long) link);
return err;
}
int ovl_copy_up(struct dentry *dentry)
{
int err;
err = 0;
while (!err) {
struct dentry *next;
struct dentry *parent;
struct path lowerpath;
struct kstat stat;
enum ovl_path_type type = ovl_path_type(dentry);
if (OVL_TYPE_UPPER(type))
break;
next = dget(dentry);
/* find the topmost dentry not yet copied up */
for (;;) {
parent = dget_parent(next);
type = ovl_path_type(parent);
if (OVL_TYPE_UPPER(type))
break;
dput(next);
next = parent;
}
ovl_path_lower(next, &lowerpath);
err = vfs_getattr(&lowerpath, &stat);
if (!err)
err = ovl_copy_up_one(parent, next, &lowerpath, &stat, NULL);
dput(parent);
dput(next);
}
return err;
}
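/*
 * Illustration: as the comment on ovl_copy_up_one notes, renaming a merged
 * or lower directory returns -EXDEV and userspace has to deal with it.  A
 * minimal userspace sketch of that fallback; move_dir() and the copy_tree()
 * helper it calls are hypothetical names, not overlayfs interfaces.
 */
#include <errno.h>
#include <stdio.h>

extern int copy_tree(const char *src, const char *dst); /* assumed helper */

static int move_dir(const char *src, const char *dst)
{
	if (rename(src, dst) == 0)
		return 0;
	if (errno != EXDEV)	/* a real failure, not the overlayfs case */
		return -1;
	/* overlayfs refused a merged-dir rename: copy, then remove source */
	return copy_tree(src, dst);
}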


@@ -1,951 +0,0 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/xattr.h>
#include <linux/security.h>
#include <linux/cred.h>
#include "overlayfs.h"
void ovl_cleanup(struct inode *wdir, struct dentry *wdentry)
{
int err;
dget(wdentry);
if (d_is_dir(wdentry))
err = ovl_do_rmdir(wdir, wdentry);
else
err = ovl_do_unlink(wdir, wdentry);
dput(wdentry);
if (err) {
pr_err("overlayfs: cleanup of '%pd2' failed (%i)\n",
wdentry, err);
}
}
struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry)
{
struct dentry *temp;
char name[20];
snprintf(name, sizeof(name), "#%lx", (unsigned long) dentry);
temp = lookup_one_len(name, workdir, strlen(name));
if (!IS_ERR(temp) && temp->d_inode) {
pr_err("overlayfs: workdir/%s already exists\n", name);
dput(temp);
temp = ERR_PTR(-EIO);
}
return temp;
}
/* caller holds i_mutex on workdir */
static struct dentry *ovl_whiteout(struct dentry *workdir,
struct dentry *dentry)
{
int err;
struct dentry *whiteout;
struct inode *wdir = workdir->d_inode;
whiteout = ovl_lookup_temp(workdir, dentry);
if (IS_ERR(whiteout))
return whiteout;
err = ovl_do_whiteout(wdir, whiteout);
if (err) {
dput(whiteout);
whiteout = ERR_PTR(err);
}
return whiteout;
}
int ovl_create_real(struct inode *dir, struct dentry *newdentry,
struct kstat *stat, const char *link,
struct dentry *hardlink, bool debug)
{
int err;
if (newdentry->d_inode)
return -ESTALE;
if (hardlink) {
err = ovl_do_link(hardlink, dir, newdentry, debug);
} else {
switch (stat->mode & S_IFMT) {
case S_IFREG:
err = ovl_do_create(dir, newdentry, stat->mode, debug);
break;
case S_IFDIR:
err = ovl_do_mkdir(dir, newdentry, stat->mode, debug);
break;
case S_IFCHR:
case S_IFBLK:
case S_IFIFO:
case S_IFSOCK:
err = ovl_do_mknod(dir, newdentry,
stat->mode, stat->rdev, debug);
break;
case S_IFLNK:
err = ovl_do_symlink(dir, newdentry, link, debug);
break;
default:
err = -EPERM;
}
}
if (!err && WARN_ON(!newdentry->d_inode)) {
/*
* Not quite sure if non-instantiated dentry is legal or not.
* VFS doesn't seem to care so check and warn here.
*/
err = -ENOENT;
}
return err;
}
static int ovl_set_opaque(struct dentry *upperdentry)
{
return ovl_do_setxattr(upperdentry, OVL_XATTR_OPAQUE, "y", 1, 0);
}
static void ovl_remove_opaque(struct dentry *upperdentry)
{
int err;
err = ovl_do_removexattr(upperdentry, OVL_XATTR_OPAQUE);
if (err) {
pr_warn("overlayfs: failed to remove opaque from '%s' (%i)\n",
upperdentry->d_name.name, err);
}
}
static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
int err;
enum ovl_path_type type;
struct path realpath;
type = ovl_path_real(dentry, &realpath);
err = vfs_getattr(&realpath, stat);
if (err)
return err;
stat->dev = dentry->d_sb->s_dev;
stat->ino = dentry->d_inode->i_ino;
/*
* It's probably not worth it to count subdirs to get the
* correct link count. nlink=1 seems to pacify 'find' and
* other utilities.
*/
if (OVL_TYPE_MERGE(type))
stat->nlink = 1;
return 0;
}
static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
struct kstat *stat, const char *link,
struct dentry *hardlink)
{
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *udir = upperdir->d_inode;
struct dentry *newdentry;
int err;
mutex_lock_nested(&udir->i_mutex, I_MUTEX_PARENT);
newdentry = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out_unlock;
err = ovl_create_real(udir, newdentry, stat, link, hardlink, false);
if (err)
goto out_dput;
ovl_dentry_version_inc(dentry->d_parent);
ovl_dentry_update(dentry, newdentry);
ovl_copyattr(newdentry->d_inode, inode);
d_instantiate(dentry, inode);
newdentry = NULL;
out_dput:
dput(newdentry);
out_unlock:
mutex_unlock(&udir->i_mutex);
return err;
}
static int ovl_lock_rename_workdir(struct dentry *workdir,
struct dentry *upperdir)
{
/* Workdir should not be the same as upperdir */
if (workdir == upperdir)
goto err;
/* Workdir should not be subdir of upperdir and vice versa */
if (lock_rename(workdir, upperdir) != NULL)
goto err_unlock;
return 0;
err_unlock:
unlock_rename(workdir, upperdir);
err:
pr_err("overlayfs: failed to lock workdir+upperdir\n");
return -EIO;
}
static struct dentry *ovl_clear_empty(struct dentry *dentry,
struct list_head *list)
{
struct dentry *workdir = ovl_workdir(dentry);
struct inode *wdir = workdir->d_inode;
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *udir = upperdir->d_inode;
struct path upperpath;
struct dentry *upper;
struct dentry *opaquedir;
struct kstat stat;
int err;
if (WARN_ON(!workdir))
return ERR_PTR(-EROFS);
err = ovl_lock_rename_workdir(workdir, upperdir);
if (err)
goto out;
ovl_path_upper(dentry, &upperpath);
err = vfs_getattr(&upperpath, &stat);
if (err)
goto out_unlock;
err = -ESTALE;
if (!S_ISDIR(stat.mode))
goto out_unlock;
upper = upperpath.dentry;
if (upper->d_parent->d_inode != udir)
goto out_unlock;
opaquedir = ovl_lookup_temp(workdir, dentry);
err = PTR_ERR(opaquedir);
if (IS_ERR(opaquedir))
goto out_unlock;
err = ovl_create_real(wdir, opaquedir, &stat, NULL, NULL, true);
if (err)
goto out_dput;
err = ovl_copy_xattr(upper, opaquedir);
if (err)
goto out_cleanup;
err = ovl_set_opaque(opaquedir);
if (err)
goto out_cleanup;
mutex_lock(&opaquedir->d_inode->i_mutex);
err = ovl_set_attr(opaquedir, &stat);
mutex_unlock(&opaquedir->d_inode->i_mutex);
if (err)
goto out_cleanup;
err = ovl_do_rename(wdir, opaquedir, udir, upper, RENAME_EXCHANGE);
if (err)
goto out_cleanup;
ovl_cleanup_whiteouts(upper, list);
ovl_cleanup(wdir, upper);
unlock_rename(workdir, upperdir);
/* dentry's upper doesn't match now, get rid of it */
d_drop(dentry);
return opaquedir;
out_cleanup:
ovl_cleanup(wdir, opaquedir);
out_dput:
dput(opaquedir);
out_unlock:
unlock_rename(workdir, upperdir);
out:
return ERR_PTR(err);
}
static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry)
{
int err;
struct dentry *ret = NULL;
LIST_HEAD(list);
err = ovl_check_empty_dir(dentry, &list);
if (err)
ret = ERR_PTR(err);
else {
/*
* If no upperdentry then skip clearing whiteouts.
*
* Can race with copy-up, since we don't hold the upperdir
* mutex. Doesn't matter, since copy-up can't create a
* non-empty directory from an empty one.
*/
if (ovl_dentry_upper(dentry))
ret = ovl_clear_empty(dentry, &list);
}
ovl_cache_free(&list);
return ret;
}
static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode,
struct kstat *stat, const char *link,
struct dentry *hardlink)
{
struct dentry *workdir = ovl_workdir(dentry);
struct inode *wdir = workdir->d_inode;
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *udir = upperdir->d_inode;
struct dentry *upper;
struct dentry *newdentry;
int err;
if (WARN_ON(!workdir))
return -EROFS;
err = ovl_lock_rename_workdir(workdir, upperdir);
if (err)
goto out;
newdentry = ovl_lookup_temp(workdir, dentry);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out_unlock;
upper = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto out_dput;
err = ovl_create_real(wdir, newdentry, stat, link, hardlink, true);
if (err)
goto out_dput2;
if (S_ISDIR(stat->mode)) {
err = ovl_set_opaque(newdentry);
if (err)
goto out_cleanup;
err = ovl_do_rename(wdir, newdentry, udir, upper,
RENAME_EXCHANGE);
if (err)
goto out_cleanup;
ovl_cleanup(wdir, upper);
} else {
err = ovl_do_rename(wdir, newdentry, udir, upper, 0);
if (err)
goto out_cleanup;
}
ovl_dentry_version_inc(dentry->d_parent);
ovl_dentry_update(dentry, newdentry);
ovl_copyattr(newdentry->d_inode, inode);
d_instantiate(dentry, inode);
newdentry = NULL;
out_dput2:
dput(upper);
out_dput:
dput(newdentry);
out_unlock:
unlock_rename(workdir, upperdir);
out:
return err;
out_cleanup:
ovl_cleanup(wdir, newdentry);
goto out_dput2;
}
static int ovl_create_or_link(struct dentry *dentry, int mode, dev_t rdev,
const char *link, struct dentry *hardlink)
{
int err;
struct inode *inode;
struct kstat stat = {
.mode = mode,
.rdev = rdev,
};
err = -ENOMEM;
inode = ovl_new_inode(dentry->d_sb, mode, dentry->d_fsdata);
if (!inode)
goto out;
err = ovl_copy_up(dentry->d_parent);
if (err)
goto out_iput;
if (!ovl_dentry_is_opaque(dentry)) {
err = ovl_create_upper(dentry, inode, &stat, link, hardlink);
} else {
const struct cred *old_cred;
struct cred *override_cred;
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_iput;
/*
* CAP_SYS_ADMIN for setting opaque xattr
* CAP_DAC_OVERRIDE for create in workdir, rename
* CAP_FOWNER for removing whiteout from sticky dir
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
old_cred = override_creds(override_cred);
err = ovl_create_over_whiteout(dentry, inode, &stat, link,
hardlink);
revert_creds(old_cred);
put_cred(override_cred);
}
if (!err)
inode = NULL;
out_iput:
iput(inode);
out:
return err;
}
static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev,
const char *link)
{
int err;
err = ovl_want_write(dentry);
if (!err) {
err = ovl_create_or_link(dentry, mode, rdev, link, NULL);
ovl_drop_write(dentry);
}
return err;
}
static int ovl_create(struct inode *dir, struct dentry *dentry, umode_t mode,
bool excl)
{
return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL);
}
static int ovl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
{
return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL);
}
static int ovl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
dev_t rdev)
{
/* Don't allow creation of "whiteout" on overlay */
if (S_ISCHR(mode) && rdev == WHITEOUT_DEV)
return -EPERM;
return ovl_create_object(dentry, mode, rdev, NULL);
}
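/*
 * Illustration: the WHITEOUT_DEV check in ovl_mknod above exists because a
 * whiteout is represented in the upper layer as a character device with
 * device number 0:0, so letting mknod create one through the overlay would
 * forge a deletion marker.  A minimal userspace sketch of spotting such an
 * entry in the upper layer, assuming WHITEOUT_DEV == 0 as in the kernel
 * headers; looks_like_whiteout() is a hypothetical helper.
 */
#include <stdbool.h>
#include <sys/stat.h>
#include <sys/sysmacros.h>

static bool looks_like_whiteout(const char *upper_path)
{
	struct stat st;

	if (lstat(upper_path, &st) != 0)
		return false;
	/* a whiteout shows up as a char device with major 0, minor 0 */
	return S_ISCHR(st.st_mode) &&
	       major(st.st_rdev) == 0 && minor(st.st_rdev) == 0;
}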
static int ovl_symlink(struct inode *dir, struct dentry *dentry,
const char *link)
{
return ovl_create_object(dentry, S_IFLNK, 0, link);
}
static int ovl_link(struct dentry *old, struct inode *newdir,
struct dentry *new)
{
int err;
struct dentry *upper;
err = ovl_want_write(old);
if (err)
goto out;
err = ovl_copy_up(old);
if (err)
goto out_drop_write;
upper = ovl_dentry_upper(old);
err = ovl_create_or_link(new, upper->d_inode->i_mode, 0, NULL, upper);
out_drop_write:
ovl_drop_write(old);
out:
return err;
}
static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir)
{
struct dentry *workdir = ovl_workdir(dentry);
struct inode *wdir = workdir->d_inode;
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *udir = upperdir->d_inode;
struct dentry *whiteout;
struct dentry *upper;
struct dentry *opaquedir = NULL;
int err;
if (WARN_ON(!workdir))
return -EROFS;
if (is_dir) {
if (OVL_TYPE_MERGE_OR_LOWER(ovl_path_type(dentry))) {
opaquedir = ovl_check_empty_and_clear(dentry);
err = PTR_ERR(opaquedir);
if (IS_ERR(opaquedir))
goto out;
} else {
LIST_HEAD(list);
/*
 * When removing an empty opaque directory, it makes no
 * sense to replace it with an exact replica of itself.
 * But emptiness still needs to be checked.
 */
err = ovl_check_empty_dir(dentry, &list);
ovl_cache_free(&list);
if (err)
goto out;
}
}
err = ovl_lock_rename_workdir(workdir, upperdir);
if (err)
goto out_dput;
whiteout = ovl_whiteout(workdir, dentry);
err = PTR_ERR(whiteout);
if (IS_ERR(whiteout))
goto out_unlock;
upper = ovl_dentry_upper(dentry);
if (!upper) {
upper = lookup_one_len(dentry->d_name.name, upperdir,
dentry->d_name.len);
err = PTR_ERR(upper);
if (IS_ERR(upper))
goto kill_whiteout;
err = ovl_do_rename(wdir, whiteout, udir, upper, 0);
dput(upper);
if (err)
goto kill_whiteout;
} else {
int flags = 0;
if (opaquedir)
upper = opaquedir;
err = -ESTALE;
if (upper->d_parent != upperdir)
goto kill_whiteout;
if (is_dir)
flags |= RENAME_EXCHANGE;
err = ovl_do_rename(wdir, whiteout, udir, upper, flags);
if (err)
goto kill_whiteout;
if (is_dir)
ovl_cleanup(wdir, upper);
}
ovl_dentry_version_inc(dentry->d_parent);
out_d_drop:
d_drop(dentry);
dput(whiteout);
out_unlock:
unlock_rename(workdir, upperdir);
out_dput:
dput(opaquedir);
out:
return err;
kill_whiteout:
ovl_cleanup(wdir, whiteout);
goto out_d_drop;
}
static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
{
struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent);
struct inode *dir = upperdir->d_inode;
struct dentry *upper = ovl_dentry_upper(dentry);
int err;
mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
err = -ESTALE;
if (upper->d_parent == upperdir) {
/* Don't let d_delete() think it can reset d_inode */
dget(upper);
if (is_dir)
err = vfs_rmdir(dir, upper);
else
err = vfs_unlink(dir, upper, NULL);
dput(upper);
ovl_dentry_version_inc(dentry->d_parent);
}
/*
* Keeping this dentry hashed would mean having to release
* upperpath/lowerpath, which could only be done if we are the
* sole user of this dentry. Too tricky... Just unhash for
* now.
*/
d_drop(dentry);
mutex_unlock(&dir->i_mutex);
return err;
}
static inline int ovl_check_sticky(struct dentry *dentry)
{
struct inode *dir = ovl_dentry_real(dentry->d_parent)->d_inode;
struct inode *inode = ovl_dentry_real(dentry)->d_inode;
if (check_sticky(dir, inode))
return -EPERM;
return 0;
}
static int ovl_do_remove(struct dentry *dentry, bool is_dir)
{
enum ovl_path_type type;
int err;
err = ovl_check_sticky(dentry);
if (err)
goto out;
err = ovl_want_write(dentry);
if (err)
goto out;
err = ovl_copy_up(dentry->d_parent);
if (err)
goto out_drop_write;
type = ovl_path_type(dentry);
if (OVL_TYPE_PURE_UPPER(type)) {
err = ovl_remove_upper(dentry, is_dir);
} else {
const struct cred *old_cred;
struct cred *override_cred;
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_drop_write;
/*
* CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
* CAP_DAC_OVERRIDE for create in workdir, rename
* CAP_FOWNER for removing whiteout from sticky dir
* CAP_FSETID for chmod of opaque dir
* CAP_CHOWN for chown of opaque dir
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
cap_raise(override_cred->cap_effective, CAP_FSETID);
cap_raise(override_cred->cap_effective, CAP_CHOWN);
old_cred = override_creds(override_cred);
err = ovl_remove_and_whiteout(dentry, is_dir);
revert_creds(old_cred);
put_cred(override_cred);
}
out_drop_write:
ovl_drop_write(dentry);
out:
return err;
}
static int ovl_unlink(struct inode *dir, struct dentry *dentry)
{
return ovl_do_remove(dentry, false);
}
static int ovl_rmdir(struct inode *dir, struct dentry *dentry)
{
return ovl_do_remove(dentry, true);
}
static int ovl_rename2(struct inode *olddir, struct dentry *old,
struct inode *newdir, struct dentry *new,
unsigned int flags)
{
int err;
enum ovl_path_type old_type;
enum ovl_path_type new_type;
struct dentry *old_upperdir;
struct dentry *new_upperdir;
struct dentry *olddentry;
struct dentry *newdentry;
struct dentry *trap;
bool old_opaque;
bool new_opaque;
bool new_create = false;
bool cleanup_whiteout = false;
bool overwrite = !(flags & RENAME_EXCHANGE);
bool is_dir = d_is_dir(old);
bool new_is_dir = false;
struct dentry *opaquedir = NULL;
const struct cred *old_cred = NULL;
struct cred *override_cred = NULL;
err = -EINVAL;
if (flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE))
goto out;
flags &= ~RENAME_NOREPLACE;
err = ovl_check_sticky(old);
if (err)
goto out;
/* Don't copy up directory trees */
old_type = ovl_path_type(old);
err = -EXDEV;
if (OVL_TYPE_MERGE_OR_LOWER(old_type) && is_dir)
goto out;
if (new->d_inode) {
err = ovl_check_sticky(new);
if (err)
goto out;
if (d_is_dir(new))
new_is_dir = true;
new_type = ovl_path_type(new);
err = -EXDEV;
if (!overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir)
goto out;
err = 0;
if (!OVL_TYPE_UPPER(new_type) && !OVL_TYPE_UPPER(old_type)) {
if (ovl_dentry_lower(old)->d_inode ==
ovl_dentry_lower(new)->d_inode)
goto out;
}
if (OVL_TYPE_UPPER(new_type) && OVL_TYPE_UPPER(old_type)) {
if (ovl_dentry_upper(old)->d_inode ==
ovl_dentry_upper(new)->d_inode)
goto out;
}
} else {
if (ovl_dentry_is_opaque(new))
new_type = __OVL_PATH_UPPER;
else
new_type = __OVL_PATH_UPPER | __OVL_PATH_PURE;
}
err = ovl_want_write(old);
if (err)
goto out;
err = ovl_copy_up(old);
if (err)
goto out_drop_write;
err = ovl_copy_up(new->d_parent);
if (err)
goto out_drop_write;
if (!overwrite) {
err = ovl_copy_up(new);
if (err)
goto out_drop_write;
}
old_opaque = !OVL_TYPE_PURE_UPPER(old_type);
new_opaque = !OVL_TYPE_PURE_UPPER(new_type);
if (old_opaque || new_opaque) {
err = -ENOMEM;
override_cred = prepare_creds();
if (!override_cred)
goto out_drop_write;
/*
* CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir
* CAP_DAC_OVERRIDE for create in workdir
* CAP_FOWNER for removing whiteout from sticky dir
* CAP_FSETID for chmod of opaque dir
* CAP_CHOWN for chown of opaque dir
*/
cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN);
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
cap_raise(override_cred->cap_effective, CAP_FOWNER);
cap_raise(override_cred->cap_effective, CAP_FSETID);
cap_raise(override_cred->cap_effective, CAP_CHOWN);
old_cred = override_creds(override_cred);
}
if (overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir) {
opaquedir = ovl_check_empty_and_clear(new);
err = PTR_ERR(opaquedir);
if (IS_ERR(opaquedir)) {
opaquedir = NULL;
goto out_revert_creds;
}
}
if (overwrite) {
if (old_opaque) {
if (new->d_inode || !new_opaque) {
/* Whiteout source */
flags |= RENAME_WHITEOUT;
} else {
/* Switch whiteouts */
flags |= RENAME_EXCHANGE;
}
} else if (is_dir && !new->d_inode && new_opaque) {
flags |= RENAME_EXCHANGE;
cleanup_whiteout = true;
}
}
old_upperdir = ovl_dentry_upper(old->d_parent);
new_upperdir = ovl_dentry_upper(new->d_parent);
trap = lock_rename(new_upperdir, old_upperdir);
olddentry = ovl_dentry_upper(old);
newdentry = ovl_dentry_upper(new);
if (newdentry) {
if (opaquedir) {
newdentry = opaquedir;
opaquedir = NULL;
} else {
dget(newdentry);
}
} else {
new_create = true;
newdentry = lookup_one_len(new->d_name.name, new_upperdir,
new->d_name.len);
err = PTR_ERR(newdentry);
if (IS_ERR(newdentry))
goto out_unlock;
}
err = -ESTALE;
if (olddentry->d_parent != old_upperdir)
goto out_dput;
if (newdentry->d_parent != new_upperdir)
goto out_dput;
if (olddentry == trap)
goto out_dput;
if (newdentry == trap)
goto out_dput;
if (is_dir && !old_opaque && new_opaque) {
err = ovl_set_opaque(olddentry);
if (err)
goto out_dput;
}
if (!overwrite && new_is_dir && old_opaque && !new_opaque) {
err = ovl_set_opaque(newdentry);
if (err)
goto out_dput;
}
if (old_opaque || new_opaque) {
err = ovl_do_rename(old_upperdir->d_inode, olddentry,
new_upperdir->d_inode, newdentry,
flags);
} else {
/* No debug for the plain case */
BUG_ON(flags & ~RENAME_EXCHANGE);
err = vfs_rename(old_upperdir->d_inode, olddentry,
new_upperdir->d_inode, newdentry,
NULL, flags);
}
if (err) {
if (is_dir && !old_opaque && new_opaque)
ovl_remove_opaque(olddentry);
if (!overwrite && new_is_dir && old_opaque && !new_opaque)
ovl_remove_opaque(newdentry);
goto out_dput;
}
if (is_dir && old_opaque && !new_opaque)
ovl_remove_opaque(olddentry);
if (!overwrite && new_is_dir && !old_opaque && new_opaque)
ovl_remove_opaque(newdentry);
if (old_opaque != new_opaque) {
ovl_dentry_set_opaque(old, new_opaque);
if (!overwrite)
ovl_dentry_set_opaque(new, old_opaque);
}
if (cleanup_whiteout)
ovl_cleanup(old_upperdir->d_inode, newdentry);
ovl_dentry_version_inc(old->d_parent);
ovl_dentry_version_inc(new->d_parent);
out_dput:
dput(newdentry);
out_unlock:
unlock_rename(new_upperdir, old_upperdir);
out_revert_creds:
if (old_opaque || new_opaque) {
revert_creds(old_cred);
put_cred(override_cred);
}
out_drop_write:
ovl_drop_write(old);
out:
dput(opaquedir);
return err;
}
const struct inode_operations ovl_dir_inode_operations = {
.lookup = ovl_lookup,
.mkdir = ovl_mkdir,
.symlink = ovl_symlink,
.unlink = ovl_unlink,
.rmdir = ovl_rmdir,
.rename2 = ovl_rename2,
.link = ovl_link,
.setattr = ovl_setattr,
.create = ovl_create,
.mknod = ovl_mknod,
.permission = ovl_permission,
.getattr = ovl_dir_getattr,
.setxattr = ovl_setxattr,
.getxattr = ovl_getxattr,
.listxattr = ovl_listxattr,
.removexattr = ovl_removexattr,
};


@@ -1,438 +0,0 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/xattr.h>
#include "overlayfs.h"
static int ovl_copy_up_last(struct dentry *dentry, struct iattr *attr,
bool no_data)
{
int err;
struct dentry *parent;
struct kstat stat;
struct path lowerpath;
parent = dget_parent(dentry);
err = ovl_copy_up(parent);
if (err)
goto out_dput_parent;
ovl_path_lower(dentry, &lowerpath);
err = vfs_getattr(&lowerpath, &stat);
if (err)
goto out_dput_parent;
if (no_data)
stat.size = 0;
err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat, attr);
out_dput_parent:
dput(parent);
return err;
}
int ovl_setattr(struct dentry *dentry, struct iattr *attr)
{
int err;
struct dentry *upperdentry;
err = ovl_want_write(dentry);
if (err)
goto out;
upperdentry = ovl_dentry_upper(dentry);
if (upperdentry) {
mutex_lock(&upperdentry->d_inode->i_mutex);
err = notify_change(upperdentry, attr, NULL);
mutex_unlock(&upperdentry->d_inode->i_mutex);
} else {
err = ovl_copy_up_last(dentry, attr, false);
}
ovl_drop_write(dentry);
out:
return err;
}
static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry,
struct kstat *stat)
{
struct path realpath;
ovl_path_real(dentry, &realpath);
return vfs_getattr(&realpath, stat);
}
int ovl_permission(struct inode *inode, int mask)
{
struct ovl_entry *oe;
struct dentry *alias = NULL;
struct inode *realinode;
struct dentry *realdentry;
bool is_upper;
int err;
if (S_ISDIR(inode->i_mode)) {
oe = inode->i_private;
} else if (mask & MAY_NOT_BLOCK) {
return -ECHILD;
} else {
/*
* For non-directories find an alias and get the info
* from there.
*/
alias = d_find_any_alias(inode);
if (WARN_ON(!alias))
return -ENOENT;
oe = alias->d_fsdata;
}
realdentry = ovl_entry_real(oe, &is_upper);
/* Careful in RCU walk mode */
realinode = ACCESS_ONCE(realdentry->d_inode);
if (!realinode) {
WARN_ON(!(mask & MAY_NOT_BLOCK));
err = -ENOENT;
goto out_dput;
}
if (mask & MAY_WRITE) {
umode_t mode = realinode->i_mode;
/*
* Writes will always be redirected to upper layer, so
* ignore lower layer being read-only.
*
* If the overlay itself is read-only then proceed
* with the permission check, don't return EROFS.
* This will only happen if this is the lower layer of
* another overlayfs.
*
* If upper fs becomes read-only after the overlay was
* constructed return EROFS to prevent modification of
* upper layer.
*/
err = -EROFS;
if (is_upper && !IS_RDONLY(inode) && IS_RDONLY(realinode) &&
(S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)))
goto out_dput;
}
err = __inode_permission(realinode, mask);
out_dput:
dput(alias);
return err;
}
struct ovl_link_data {
struct dentry *realdentry;
void *cookie;
};
static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd)
{
void *ret;
struct dentry *realdentry;
struct inode *realinode;
realdentry = ovl_dentry_real(dentry);
realinode = realdentry->d_inode;
if (WARN_ON(!realinode->i_op->follow_link))
return ERR_PTR(-EPERM);
ret = realinode->i_op->follow_link(realdentry, nd);
if (IS_ERR(ret))
return ret;
if (realinode->i_op->put_link) {
struct ovl_link_data *data;
data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL);
if (!data) {
realinode->i_op->put_link(realdentry, nd, ret);
return ERR_PTR(-ENOMEM);
}
data->realdentry = realdentry;
data->cookie = ret;
return data;
} else {
return NULL;
}
}
static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
{
struct inode *realinode;
struct ovl_link_data *data = c;
if (!data)
return;
realinode = data->realdentry->d_inode;
realinode->i_op->put_link(data->realdentry, nd, data->cookie);
kfree(data);
}
static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
{
struct path realpath;
struct inode *realinode;
ovl_path_real(dentry, &realpath);
realinode = realpath.dentry->d_inode;
if (!realinode->i_op->readlink)
return -EINVAL;
touch_atime(&realpath);
return realinode->i_op->readlink(realpath.dentry, buf, bufsiz);
}
static bool ovl_is_private_xattr(const char *name)
{
return strncmp(name, OVL_XATTR_PRE_NAME, OVL_XATTR_PRE_LEN) == 0;
}
int ovl_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags)
{
int err;
struct dentry *upperdentry;
err = ovl_want_write(dentry);
if (err)
goto out;
err = -EPERM;
if (ovl_is_private_xattr(name))
goto out_drop_write;
err = ovl_copy_up(dentry);
if (err)
goto out_drop_write;
upperdentry = ovl_dentry_upper(dentry);
err = vfs_setxattr(upperdentry, name, value, size, flags);
out_drop_write:
ovl_drop_write(dentry);
out:
return err;
}
static bool ovl_need_xattr_filter(struct dentry *dentry,
enum ovl_path_type type)
{
if ((type & (__OVL_PATH_PURE | __OVL_PATH_UPPER)) == __OVL_PATH_UPPER)
return S_ISDIR(dentry->d_inode->i_mode);
else
return false;
}
ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
void *value, size_t size)
{
struct path realpath;
enum ovl_path_type type = ovl_path_real(dentry, &realpath);
if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
return -ENODATA;
return vfs_getxattr(realpath.dentry, name, value, size);
}
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
{
struct path realpath;
enum ovl_path_type type = ovl_path_real(dentry, &realpath);
ssize_t res;
int off;
res = vfs_listxattr(realpath.dentry, list, size);
if (res <= 0 || size == 0)
return res;
if (!ovl_need_xattr_filter(dentry, type))
return res;
/* filter out private xattrs */
for (off = 0; off < res;) {
char *s = list + off;
size_t slen = strlen(s) + 1;
BUG_ON(off + slen > res);
if (ovl_is_private_xattr(s)) {
res -= slen;
memmove(s, s + slen, res - off);
} else {
off += slen;
}
}
return res;
}
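/*
 * Illustration: listxattr(2) returns names as consecutive NUL-terminated
 * strings, so the loop above drops each private "trusted.overlay.*" name
 * by sliding the tail of the buffer left with memmove().  The same loop as
 * a standalone sketch; filter_private() is a hypothetical name.  For input
 * "user.a\0trusted.overlay.opaque\0user.b\0" (37 bytes) it returns 14 and
 * leaves "user.a\0user.b\0" in the buffer.
 */
#include <string.h>
#include <sys/types.h>

static ssize_t filter_private(char *list, ssize_t res)
{
	ssize_t off;

	for (off = 0; off < res;) {
		char *s = list + off;
		size_t slen = strlen(s) + 1;

		if (strncmp(s, "trusted.overlay.", 16) == 0) {
			res -= slen;		/* drop this name */
			memmove(s, s + slen, res - off);
		} else {
			off += slen;		/* keep it, advance */
		}
	}
	return res;
}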
int ovl_removexattr(struct dentry *dentry, const char *name)
{
int err;
struct path realpath;
enum ovl_path_type type = ovl_path_real(dentry, &realpath);
err = ovl_want_write(dentry);
if (err)
goto out;
err = -ENODATA;
if (ovl_need_xattr_filter(dentry, type) && ovl_is_private_xattr(name))
goto out_drop_write;
if (!OVL_TYPE_UPPER(type)) {
err = vfs_getxattr(realpath.dentry, name, NULL, 0);
if (err < 0)
goto out_drop_write;
err = ovl_copy_up(dentry);
if (err)
goto out_drop_write;
ovl_path_upper(dentry, &realpath);
}
err = vfs_removexattr(realpath.dentry, name);
out_drop_write:
ovl_drop_write(dentry);
out:
return err;
}
static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type,
struct dentry *realdentry)
{
if (OVL_TYPE_UPPER(type))
return false;
if (special_file(realdentry->d_inode->i_mode))
return false;
if (!(OPEN_FMODE(flags) & FMODE_WRITE) && !(flags & O_TRUNC))
return false;
return true;
}
static int ovl_dentry_open(struct dentry *dentry, struct file *file,
const struct cred *cred)
{
int err;
struct path realpath;
enum ovl_path_type type;
bool want_write = false;
type = ovl_path_real(dentry, &realpath);
if (!ovl_is_nocopyupw(dentry)) {
if (ovl_open_need_copy_up(file->f_flags, type, realpath.dentry)) {
want_write = true;
err = ovl_want_write(dentry);
if (err)
goto out;
if (file->f_flags & O_TRUNC)
err = ovl_copy_up_last(dentry, NULL, true);
else
err = ovl_copy_up(dentry);
if (err)
goto out_drop_write;
ovl_path_upper(dentry, &realpath);
}
}
err = vfs_open(&realpath, file, cred);
out_drop_write:
if (want_write)
ovl_drop_write(dentry);
out:
return err;
}
static const struct inode_operations ovl_file_inode_operations = {
.setattr = ovl_setattr,
.permission = ovl_permission,
.getattr = ovl_getattr,
.setxattr = ovl_setxattr,
.getxattr = ovl_getxattr,
.listxattr = ovl_listxattr,
.removexattr = ovl_removexattr,
.dentry_open = ovl_dentry_open,
};
static const struct inode_operations ovl_symlink_inode_operations = {
.setattr = ovl_setattr,
.follow_link = ovl_follow_link,
.put_link = ovl_put_link,
.readlink = ovl_readlink,
.getattr = ovl_getattr,
.setxattr = ovl_setxattr,
.getxattr = ovl_getxattr,
.listxattr = ovl_listxattr,
.removexattr = ovl_removexattr,
};
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
struct ovl_entry *oe)
{
struct inode *inode;
inode = new_inode(sb);
if (!inode)
return NULL;
mode &= S_IFMT;
inode->i_ino = get_next_ino();
inode->i_mode = mode;
inode->i_flags |= S_NOATIME | S_NOCMTIME;
switch (mode) {
case S_IFDIR:
inode->i_private = oe;
inode->i_op = &ovl_dir_inode_operations;
inode->i_fop = &ovl_dir_operations;
break;
case S_IFLNK:
inode->i_op = &ovl_symlink_inode_operations;
break;
case S_IFREG:
case S_IFSOCK:
case S_IFBLK:
case S_IFCHR:
case S_IFIFO:
inode->i_op = &ovl_file_inode_operations;
break;
default:
WARN(1, "illegal file type: %i\n", mode);
iput(inode);
inode = NULL;
}
return inode;
}


@@ -1,200 +0,0 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/kernel.h>
struct ovl_entry;
enum ovl_path_type {
__OVL_PATH_PURE = (1 << 0),
__OVL_PATH_UPPER = (1 << 1),
__OVL_PATH_MERGE = (1 << 2),
};
#define OVL_TYPE_UPPER(type) ((type) & __OVL_PATH_UPPER)
#define OVL_TYPE_MERGE(type) ((type) & __OVL_PATH_MERGE)
#define OVL_TYPE_PURE_UPPER(type) ((type) & __OVL_PATH_PURE)
#define OVL_TYPE_MERGE_OR_LOWER(type) \
(OVL_TYPE_MERGE(type) || !OVL_TYPE_UPPER(type))
#define OVL_XATTR_PRE_NAME "trusted.overlay."
#define OVL_XATTR_PRE_LEN 16
#define OVL_XATTR_OPAQUE OVL_XATTR_PRE_NAME"opaque"
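/*
 * Illustration of how the bits combine, derived from the macros above: a
 * pure upper dentry carries __OVL_PATH_UPPER | __OVL_PATH_PURE, a merged
 * directory carries __OVL_PATH_UPPER | __OVL_PATH_MERGE, and a lower-only
 * dentry carries no __OVL_PATH_UPPER bit.  Hence:
 *
 *	pure upper:  OVL_TYPE_MERGE_OR_LOWER() == false
 *	merged dir:  OVL_TYPE_MERGE_OR_LOWER() == true  (MERGE bit set)
 *	lower only:  OVL_TYPE_MERGE_OR_LOWER() == true  (UPPER bit clear)
 */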
static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry)
{
int err = vfs_rmdir(dir, dentry);
pr_debug("rmdir(%pd2) = %i\n", dentry, err);
return err;
}
static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry)
{
int err = vfs_unlink(dir, dentry, NULL);
pr_debug("unlink(%pd2) = %i\n", dentry, err);
return err;
}
static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir,
struct dentry *new_dentry, bool debug)
{
int err = vfs_link(old_dentry, dir, new_dentry, NULL);
if (debug) {
pr_debug("link(%pd2, %pd2) = %i\n",
old_dentry, new_dentry, err);
}
return err;
}
static inline int ovl_do_create(struct inode *dir, struct dentry *dentry,
umode_t mode, bool debug)
{
int err = vfs_create(dir, dentry, mode, true);
if (debug)
pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err);
return err;
}
static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry,
umode_t mode, bool debug)
{
int err = vfs_mkdir(dir, dentry, mode);
if (debug)
pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err);
return err;
}
static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry,
umode_t mode, dev_t dev, bool debug)
{
int err = vfs_mknod(dir, dentry, mode, dev);
if (debug) {
pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n",
dentry, mode, dev, err);
}
return err;
}
static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry,
const char *oldname, bool debug)
{
int err = vfs_symlink(dir, dentry, oldname);
if (debug)
pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err);
return err;
}
static inline int ovl_do_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags)
{
int err = vfs_setxattr(dentry, name, value, size, flags);
pr_debug("setxattr(%pd2, \"%s\", \"%*s\", 0x%x) = %i\n",
dentry, name, (int) size, (char *) value, flags, err);
return err;
}
static inline int ovl_do_removexattr(struct dentry *dentry, const char *name)
{
int err = vfs_removexattr(dentry, name);
pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, err);
return err;
}
static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry,
struct inode *newdir, struct dentry *newdentry,
unsigned int flags)
{
int err;
pr_debug("rename2(%pd2, %pd2, 0x%x)\n",
olddentry, newdentry, flags);
err = vfs_rename(olddir, olddentry, newdir, newdentry, NULL, flags);
if (err) {
pr_debug("...rename2(%pd2, %pd2, ...) = %i\n",
olddentry, newdentry, err);
}
return err;
}
static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry)
{
int err = vfs_whiteout(dir, dentry);
pr_debug("whiteout(%pd2) = %i\n", dentry, err);
return err;
}
bool ovl_is_nocopyupw(struct dentry *dentry);
enum ovl_path_type ovl_path_type(struct dentry *dentry);
u64 ovl_dentry_version_get(struct dentry *dentry);
void ovl_dentry_version_inc(struct dentry *dentry);
void ovl_path_upper(struct dentry *dentry, struct path *path);
void ovl_path_lower(struct dentry *dentry, struct path *path);
enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path);
int ovl_path_next(int idx, struct dentry *dentry, struct path *path);
struct dentry *ovl_dentry_upper(struct dentry *dentry);
struct dentry *ovl_dentry_lower(struct dentry *dentry);
struct dentry *ovl_dentry_real(struct dentry *dentry);
struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper);
struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry);
void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache);
struct dentry *ovl_workdir(struct dentry *dentry);
int ovl_want_write(struct dentry *dentry);
void ovl_drop_write(struct dentry *dentry);
bool ovl_dentry_is_opaque(struct dentry *dentry);
void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque);
bool ovl_is_whiteout(struct dentry *dentry);
void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry);
struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags);
struct file *ovl_path_open(struct path *path, int flags);
struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry,
struct kstat *stat, const char *link);
/* readdir.c */
extern const struct file_operations ovl_dir_operations;
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list);
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list);
void ovl_cache_free(struct list_head *list);
/* inode.c */
int ovl_setattr(struct dentry *dentry, struct iattr *attr);
int ovl_permission(struct inode *inode, int mask);
int ovl_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags);
ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
void *value, size_t size);
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
int ovl_removexattr(struct dentry *dentry, const char *name);
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode,
struct ovl_entry *oe);
static inline void ovl_copyattr(struct inode *from, struct inode *to)
{
to->i_uid = from->i_uid;
to->i_gid = from->i_gid;
}
/* dir.c */
extern const struct inode_operations ovl_dir_inode_operations;
struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry);
int ovl_create_real(struct inode *dir, struct dentry *newdentry,
struct kstat *stat, const char *link,
struct dentry *hardlink, bool debug);
void ovl_cleanup(struct inode *dir, struct dentry *dentry);
/* copy_up.c */
int ovl_copy_up(struct dentry *dentry);
int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
struct path *lowerpath, struct kstat *stat,
struct iattr *attr);
int ovl_copy_xattr(struct dentry *old, struct dentry *new);
int ovl_set_attr(struct dentry *upper, struct kstat *stat);


@@ -1,557 +0,0 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/namei.h>
#include <linux/file.h>
#include <linux/xattr.h>
#include <linux/rbtree.h>
#include <linux/security.h>
#include <linux/cred.h>
#include "overlayfs.h"
struct ovl_cache_entry {
unsigned int len;
unsigned int type;
u64 ino;
struct list_head l_node;
struct rb_node node;
bool is_whiteout;
char name[];
};
struct ovl_dir_cache {
long refcount;
u64 version;
struct list_head entries;
};
struct ovl_readdir_data {
struct dir_context ctx;
bool is_merge;
struct rb_root root;
struct list_head *list;
struct list_head middle;
struct dentry *dir;
int count;
int err;
};
struct ovl_dir_file {
bool is_real;
bool is_upper;
struct ovl_dir_cache *cache;
struct list_head *cursor;
struct file *realfile;
struct file *upperfile;
};
static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n)
{
return container_of(n, struct ovl_cache_entry, node);
}
static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root,
const char *name, int len)
{
struct rb_node *node = root->rb_node;
int cmp;
while (node) {
struct ovl_cache_entry *p = ovl_cache_entry_from_node(node);
cmp = strncmp(name, p->name, len);
if (cmp > 0)
node = p->node.rb_right;
else if (cmp < 0 || len < p->len)
node = p->node.rb_left;
else
return p;
}
return NULL;
}
static struct ovl_cache_entry *ovl_cache_entry_new(struct dentry *dir,
const char *name, int len,
u64 ino, unsigned int d_type)
{
struct ovl_cache_entry *p;
size_t size = offsetof(struct ovl_cache_entry, name[len + 1]);
p = kmalloc(size, GFP_KERNEL);
if (!p)
return NULL;
memcpy(p->name, name, len);
p->name[len] = '\0';
p->len = len;
p->type = d_type;
p->ino = ino;
p->is_whiteout = false;
if (d_type == DT_CHR) {
struct dentry *dentry;
const struct cred *old_cred;
struct cred *override_cred;
override_cred = prepare_creds();
if (!override_cred) {
kfree(p);
return NULL;
}
/*
* CAP_DAC_OVERRIDE for lookup
*/
cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
old_cred = override_creds(override_cred);
dentry = lookup_one_len(name, dir, len);
if (!IS_ERR(dentry)) {
p->is_whiteout = ovl_is_whiteout(dentry);
dput(dentry);
}
revert_creds(old_cred);
put_cred(override_cred);
}
return p;
}
static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd,
const char *name, int len, u64 ino,
unsigned int d_type)
{
struct rb_node **newp = &rdd->root.rb_node;
struct rb_node *parent = NULL;
struct ovl_cache_entry *p;
while (*newp) {
int cmp;
struct ovl_cache_entry *tmp;
parent = *newp;
tmp = ovl_cache_entry_from_node(*newp);
cmp = strncmp(name, tmp->name, len);
if (cmp > 0)
newp = &tmp->node.rb_right;
else if (cmp < 0 || len < tmp->len)
newp = &tmp->node.rb_left;
else
return 0;
}
p = ovl_cache_entry_new(rdd->dir, name, len, ino, d_type);
if (p == NULL)
return -ENOMEM;
list_add_tail(&p->l_node, rdd->list);
rb_link_node(&p->node, parent, newp);
rb_insert_color(&p->node, &rdd->root);
return 0;
}
static int ovl_fill_lower(struct ovl_readdir_data *rdd,
const char *name, int namelen,
loff_t offset, u64 ino, unsigned int d_type)
{
struct ovl_cache_entry *p;
p = ovl_cache_entry_find(&rdd->root, name, namelen);
if (p) {
list_move_tail(&p->l_node, &rdd->middle);
} else {
p = ovl_cache_entry_new(rdd->dir, name, namelen, ino, d_type);
if (p == NULL)
rdd->err = -ENOMEM;
else
list_add_tail(&p->l_node, &rdd->middle);
}
return rdd->err;
}
void ovl_cache_free(struct list_head *list)
{
struct ovl_cache_entry *p;
struct ovl_cache_entry *n;
list_for_each_entry_safe(p, n, list, l_node)
kfree(p);
INIT_LIST_HEAD(list);
}
static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry)
{
struct ovl_dir_cache *cache = od->cache;
WARN_ON(cache->refcount <= 0);
cache->refcount--;
if (!cache->refcount) {
if (ovl_dir_cache(dentry) == cache)
ovl_set_dir_cache(dentry, NULL);
ovl_cache_free(&cache->entries);
kfree(cache);
}
}
static int ovl_fill_merge(struct dir_context *ctx, const char *name,
int namelen, loff_t offset, u64 ino,
unsigned int d_type)
{
struct ovl_readdir_data *rdd =
container_of(ctx, struct ovl_readdir_data, ctx);
rdd->count++;
if (!rdd->is_merge)
return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type);
else
return ovl_fill_lower(rdd, name, namelen, offset, ino, d_type);
}
static inline int ovl_dir_read(struct path *realpath,
struct ovl_readdir_data *rdd)
{
struct file *realfile;
int err;
realfile = ovl_path_open(realpath, O_RDONLY | O_DIRECTORY);
if (IS_ERR(realfile))
return PTR_ERR(realfile);
rdd->dir = realpath->dentry;
rdd->ctx.pos = 0;
do {
rdd->count = 0;
rdd->err = 0;
err = iterate_dir(realfile, &rdd->ctx);
if (err >= 0)
err = rdd->err;
} while (!err && rdd->count);
fput(realfile);
return err;
}
static void ovl_dir_reset(struct file *file)
{
struct ovl_dir_file *od = file->private_data;
struct ovl_dir_cache *cache = od->cache;
struct dentry *dentry = file->f_path.dentry;
enum ovl_path_type type = ovl_path_type(dentry);
if (cache && ovl_dentry_version_get(dentry) != cache->version) {
ovl_cache_put(od, dentry);
od->cache = NULL;
od->cursor = NULL;
}
WARN_ON(!od->is_real && !OVL_TYPE_MERGE(type));
if (od->is_real && OVL_TYPE_MERGE(type))
od->is_real = false;
}
static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list)
{
int err;
struct path realpath;
struct ovl_readdir_data rdd = {
.ctx.actor = ovl_fill_merge,
.list = list,
.root = RB_ROOT,
.is_merge = false,
};
int idx, next;
for (idx = 0; idx != -1; idx = next) {
next = ovl_path_next(idx, dentry, &realpath);
if (next != -1) {
err = ovl_dir_read(&realpath, &rdd);
if (err)
break;
} else {
/*
 * Insert lowest layer entries before upper ones; this
 * allows offsets to be reasonably constant.
 */
list_add(&rdd.middle, rdd.list);
rdd.is_merge = true;
err = ovl_dir_read(&realpath, &rdd);
list_del(&rdd.middle);
}
}
return err;
}
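/*
 * Illustration, tracing the code above with upper = {a, b} and
 * lower = {b, c} (ignoring "." and ".."): the upper pass fills the rbtree
 * and the list with a, b; the lower pass then finds b already present and
 * moves it into "middle", and adds the new entry c there too.  Since the
 * "middle" sentinel was linked in at the head of the list, the merged
 * result comes out as b, c, a: lowest layer entries first, keeping their
 * offsets stable across copy-ups.
 */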
static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos)
{
struct list_head *p;
loff_t off = 0;
list_for_each(p, &od->cache->entries) {
if (off >= pos)
break;
off++;
}
/* Cursor is safe since the cache is stable */
od->cursor = p;
}
static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry)
{
int res;
struct ovl_dir_cache *cache;
cache = ovl_dir_cache(dentry);
if (cache && ovl_dentry_version_get(dentry) == cache->version) {
cache->refcount++;
return cache;
}
ovl_set_dir_cache(dentry, NULL);
cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL);
if (!cache)
return ERR_PTR(-ENOMEM);
cache->refcount = 1;
INIT_LIST_HEAD(&cache->entries);
res = ovl_dir_read_merged(dentry, &cache->entries);
if (res) {
ovl_cache_free(&cache->entries);
kfree(cache);
return ERR_PTR(res);
}
cache->version = ovl_dentry_version_get(dentry);
ovl_set_dir_cache(dentry, cache);
return cache;
}
static int ovl_iterate(struct file *file, struct dir_context *ctx)
{
struct ovl_dir_file *od = file->private_data;
struct dentry *dentry = file->f_path.dentry;
struct ovl_cache_entry *p;
if (!ctx->pos)
ovl_dir_reset(file);
if (od->is_real)
return iterate_dir(od->realfile, ctx);
if (!od->cache) {
struct ovl_dir_cache *cache;
cache = ovl_cache_get(dentry);
if (IS_ERR(cache))
return PTR_ERR(cache);
od->cache = cache;
ovl_seek_cursor(od, ctx->pos);
}
while (od->cursor != &od->cache->entries) {
p = list_entry(od->cursor, struct ovl_cache_entry, l_node);
if (!p->is_whiteout)
if (!dir_emit(ctx, p->name, p->len, p->ino, p->type))
break;
od->cursor = p->l_node.next;
ctx->pos++;
}
return 0;
}
static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin)
{
loff_t res;
struct ovl_dir_file *od = file->private_data;
mutex_lock(&file_inode(file)->i_mutex);
if (!file->f_pos)
ovl_dir_reset(file);
if (od->is_real) {
res = vfs_llseek(od->realfile, offset, origin);
file->f_pos = od->realfile->f_pos;
} else {
res = -EINVAL;
switch (origin) {
case SEEK_CUR:
offset += file->f_pos;
break;
case SEEK_SET:
break;
default:
goto out_unlock;
}
if (offset < 0)
goto out_unlock;
if (offset != file->f_pos) {
file->f_pos = offset;
if (od->cache)
ovl_seek_cursor(od, offset);
}
res = offset;
}
out_unlock:
mutex_unlock(&file_inode(file)->i_mutex);
return res;
}
static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
int datasync)
{
struct ovl_dir_file *od = file->private_data;
struct dentry *dentry = file->f_path.dentry;
struct file *realfile = od->realfile;
/*
* Need to check if we started out being a lower dir, but got copied up
*/
if (!od->is_upper && OVL_TYPE_UPPER(ovl_path_type(dentry))) {
struct inode *inode = file_inode(file);
realfile = lockless_dereference(od->upperfile);
if (!realfile) {
struct path upperpath;
ovl_path_upper(dentry, &upperpath);
realfile = ovl_path_open(&upperpath, O_RDONLY);
smp_mb__before_spinlock();
mutex_lock(&inode->i_mutex);
if (!od->upperfile) {
if (IS_ERR(realfile)) {
mutex_unlock(&inode->i_mutex);
return PTR_ERR(realfile);
}
od->upperfile = realfile;
} else {
/* somebody has beaten us to it */
if (!IS_ERR(realfile))
fput(realfile);
realfile = od->upperfile;
}
mutex_unlock(&inode->i_mutex);
}
}
return vfs_fsync_range(realfile, start, end, datasync);
}
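/*
 * Illustration: ovl_dir_fsync above publishes od->upperfile with a
 * lockless read plus a locked re-check, so only one opener wins and the
 * losers drop their file.  A minimal userspace sketch of the same
 * publish-once pattern, using C11 atomics and a pthread mutex in place of
 * lockless_dereference() and i_mutex; open_resource()/close_resource() are
 * assumed helpers, taken here to never fail (the kernel code does handle
 * the IS_ERR case).
 */
#include <pthread.h>
#include <stdatomic.h>

extern void *open_resource(void);		/* assumed helpers */
extern void close_resource(void *r);

static _Atomic(void *) shared;			/* od->upperfile analogue */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static void *get_resource(void)
{
	void *r = atomic_load_explicit(&shared, memory_order_acquire);

	if (r)
		return r;		/* fast path: already published */
	r = open_resource();		/* may race with other threads */
	pthread_mutex_lock(&lock);
	if (atomic_load_explicit(&shared, memory_order_relaxed) == NULL) {
		atomic_store_explicit(&shared, r, memory_order_release);
	} else {
		close_resource(r);	/* somebody beat us to it */
		r = atomic_load_explicit(&shared, memory_order_relaxed);
	}
	pthread_mutex_unlock(&lock);
	return r;
}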
static int ovl_dir_release(struct inode *inode, struct file *file)
{
struct ovl_dir_file *od = file->private_data;
if (od->cache) {
mutex_lock(&inode->i_mutex);
ovl_cache_put(od, file->f_path.dentry);
mutex_unlock(&inode->i_mutex);
}
fput(od->realfile);
if (od->upperfile)
fput(od->upperfile);
kfree(od);
return 0;
}
static int ovl_dir_open(struct inode *inode, struct file *file)
{
struct path realpath;
struct file *realfile;
struct ovl_dir_file *od;
enum ovl_path_type type;
od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL);
if (!od)
return -ENOMEM;
type = ovl_path_real(file->f_path.dentry, &realpath);
realfile = ovl_path_open(&realpath, file->f_flags);
if (IS_ERR(realfile)) {
kfree(od);
return PTR_ERR(realfile);
}
od->realfile = realfile;
od->is_real = !OVL_TYPE_MERGE(type);
od->is_upper = OVL_TYPE_UPPER(type);
file->private_data = od;
return 0;
}
const struct file_operations ovl_dir_operations = {
.read = generic_read_dir,
.open = ovl_dir_open,
.iterate = ovl_iterate,
.llseek = ovl_dir_llseek,
.fsync = ovl_dir_fsync,
.release = ovl_dir_release,
};
int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list)
{
int err;
struct ovl_cache_entry *p;
err = ovl_dir_read_merged(dentry, list);
if (err)
return err;
err = 0;
list_for_each_entry(p, list, l_node) {
if (p->is_whiteout)
continue;
if (p->name[0] == '.') {
if (p->len == 1)
continue;
if (p->len == 2 && p->name[1] == '.')
continue;
}
err = -ENOTEMPTY;
break;
}
return err;
}
void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
{
struct ovl_cache_entry *p;
mutex_lock_nested(&upper->d_inode->i_mutex, I_MUTEX_CHILD);
list_for_each_entry(p, list, l_node) {
struct dentry *dentry;
if (!p->is_whiteout)
continue;
dentry = lookup_one_len(p->name, upper, p->len);
if (IS_ERR(dentry)) {
pr_err("overlayfs: lookup '%s/%.*s' failed (%i)\n",
upper->d_name.name, p->len, p->name,
(int) PTR_ERR(dentry));
continue;
}
ovl_cleanup(upper->d_inode, dentry);
dput(dentry);
}
mutex_unlock(&upper->d_inode->i_mutex);
}

File diff suppressed because it is too large


@@ -1,7 +0,0 @@
kmod(mcoverlay
SOURCES
copy_up.c dir.c inode.c readdir.c super.c namei.c util.c export.c
INSTALL_DEST
${KMODDIR}
)


@@ -1,804 +0,0 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/splice.h>
#include <linux/xattr.h>
#include <linux/security.h>
#include <linux/uaccess.h>
#include <linux/sched/signal.h>
#include <linux/cred.h>
#include <linux/namei.h>
#include <linux/fdtable.h>
#include <linux/ratelimit.h>
#include <linux/exportfs.h>
#include "overlayfs.h"
#define OVL_COPY_UP_CHUNK_SIZE (1 << 20)
static bool __read_mostly ovl_check_copy_up;
module_param_named(check_copy_up, ovl_check_copy_up, bool,
S_IWUSR | S_IRUGO);
MODULE_PARM_DESC(ovl_check_copy_up,
"Warn on copy-up when causing process also has a R/O fd open");
static int ovl_check_fd(const void *data, struct file *f, unsigned int fd)
{
const struct dentry *dentry = data;
if (file_inode(f) == d_inode(dentry))
pr_warn_ratelimited("overlayfs: Warning: Copying up %pD, but open R/O on fd %u which will cease to be coherent [pid=%d %s]\n",
f, fd, current->pid, current->comm);
return 0;
}
/*
* Check the fds open by this process and warn if something like the following
* scenario is about to occur:
*
* fd1 = open("foo", O_RDONLY);
* fd2 = open("foo", O_RDWR);
*/
static void ovl_do_check_copy_up(struct dentry *dentry)
{
if (ovl_check_copy_up)
iterate_fd(current->files, 0, ovl_check_fd, dentry);
}
int ovl_copy_xattr(struct dentry *old, struct dentry *new)
{
ssize_t list_size, size, value_size = 0;
char *buf, *name, *value = NULL;
int uninitialized_var(error);
size_t slen;
if (!(old->d_inode->i_opflags & IOP_XATTR) ||
!(new->d_inode->i_opflags & IOP_XATTR))
return 0;
list_size = vfs_listxattr(old, NULL, 0);
if (list_size <= 0) {
if (list_size == -EOPNOTSUPP)
return 0;
return list_size;
}
buf = kzalloc(list_size, GFP_KERNEL);
if (!buf)
return -ENOMEM;
list_size = vfs_listxattr(old, buf, list_size);
if (list_size <= 0) {
error = list_size;
goto out;
}
for (name = buf; list_size; name += slen) {
slen = strnlen(name, list_size) + 1;
/* underlying fs providing us with a broken xattr list? */
if (WARN_ON(slen > list_size)) {
error = -EIO;
break;
}
list_size -= slen;
if (ovl_is_private_xattr(name))
continue;
retry:
size = vfs_getxattr(old, name, value, value_size);
if (size == -ERANGE)
size = vfs_getxattr(old, name, NULL, 0);
if (size < 0) {
/* NOFSCHECK: skip unreadable xattrs instead of failing the copy up */
continue;
}
if (size > value_size) {
void *new;
new = krealloc(value, size, GFP_KERNEL);
if (!new) {
error = -ENOMEM;
break;
}
value = new;
value_size = size;
goto retry;
}
error = security_inode_copy_up_xattr(name);
if (error < 0 && error != -EOPNOTSUPP)
break;
if (error == 1) {
error = 0;
continue; /* Discard */
}
error = vfs_setxattr(new, name, value, size, 0);
if (error)
break;
}
kfree(value);
out:
kfree(buf);
return error;
}
static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
{
struct file *old_file;
struct file *new_file;
loff_t old_pos = 0;
loff_t new_pos = 0;
int error = 0;
if (len == 0)
return 0;
old_file = ovl_path_open(old, O_LARGEFILE | O_RDONLY);
if (IS_ERR(old_file))
return PTR_ERR(old_file);
new_file = ovl_path_open(new, O_LARGEFILE | O_WRONLY);
if (IS_ERR(new_file)) {
error = PTR_ERR(new_file);
goto out_fput;
}
/* Try to use clone_file_range to clone up within the same fs */
error = vfs_clone_file_range(old_file, 0, new_file, 0, len);
if (!error)
goto out;
/* Couldn't clone, so now we try to copy the data */
error = 0;
/* FIXME: copy up sparse files efficiently */
while (len) {
size_t this_len = OVL_COPY_UP_CHUNK_SIZE;
long bytes;
if (len < this_len)
this_len = len;
if (signal_pending_state(TASK_KILLABLE, current)) {
error = -EINTR;
break;
}
bytes = do_splice_direct(old_file, &old_pos,
new_file, &new_pos,
this_len, SPLICE_F_MOVE);
if (bytes <= 0) {
error = bytes;
break;
}
WARN_ON(old_pos != new_pos);
len -= bytes;
}
out:
if (!error)
error = vfs_fsync(new_file, 0);
fput(new_file);
out_fput:
fput(old_file);
return error;
}
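/*
 * Illustration: ovl_copy_up_data above first tries a cheap clone and only
 * then copies in OVL_COPY_UP_CHUNK_SIZE pieces so the loop stays killable.
 * A rough userspace analogue of the chunked loop using copy_file_range(2)
 * (Linux >= 4.5, glibc >= 2.27); copy_data() is a hypothetical name, and
 * the clone attempt and read/write fallback are elided.
 */
#define _GNU_SOURCE
#include <unistd.h>

#define CHUNK (1 << 20)		/* matches OVL_COPY_UP_CHUNK_SIZE */

static int copy_data(int in_fd, int out_fd, off_t len)
{
	while (len > 0) {
		size_t this_len = len < CHUNK ? (size_t)len : CHUNK;
		ssize_t n = copy_file_range(in_fd, NULL, out_fd, NULL,
					    this_len, 0);

		if (n <= 0)
			return -1;	/* error or unexpected EOF */
		len -= n;
	}
	return fsync(out_fd);		/* mirror the final vfs_fsync() */
}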
static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
{
struct iattr attr = {
.ia_valid =
ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET,
.ia_atime = stat->atime,
.ia_mtime = stat->mtime,
};
return notify_change(upperdentry, &attr, NULL);
}
int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat)
{
int err = 0;
if (!S_ISLNK(stat->mode)) {
struct iattr attr = {
.ia_valid = ATTR_MODE,
.ia_mode = stat->mode,
};
err = notify_change(upperdentry, &attr, NULL);
}
if (!err) {
struct iattr attr = {
.ia_valid = ATTR_UID | ATTR_GID,
.ia_uid = stat->uid,
.ia_gid = stat->gid,
};
err = notify_change(upperdentry, &attr, NULL);
}
if (!err)
ovl_set_timestamps(upperdentry, stat);
return err;
}
struct ovl_fh *ovl_encode_real_fh(struct dentry *real, bool is_upper)
{
struct ovl_fh *fh;
int fh_type, fh_len, dwords;
void *buf;
int buflen = MAX_HANDLE_SZ;
uuid_t *uuid = &real->d_sb->s_uuid;
buf = kmalloc(buflen, GFP_KERNEL);
if (!buf)
return ERR_PTR(-ENOMEM);
/*
* We encode a non-connectable file handle for non-dir, because we
* only need to find the lower inode number and we don't want to pay
* the price of reconnecting the dentry.
*/
dwords = buflen >> 2;
fh_type = exportfs_encode_fh(real, buf, &dwords, 0);
buflen = (dwords << 2);
fh = ERR_PTR(-EIO);
if (WARN_ON(fh_type < 0) ||
WARN_ON(buflen > MAX_HANDLE_SZ) ||
WARN_ON(fh_type == FILEID_INVALID))
goto out;
BUILD_BUG_ON(MAX_HANDLE_SZ + offsetof(struct ovl_fh, fid) > 255);
fh_len = offsetof(struct ovl_fh, fid) + buflen;
fh = kmalloc(fh_len, GFP_KERNEL);
if (!fh) {
fh = ERR_PTR(-ENOMEM);
goto out;
}
fh->version = OVL_FH_VERSION;
fh->magic = OVL_FH_MAGIC;
fh->type = fh_type;
fh->flags = OVL_FH_FLAG_CPU_ENDIAN;
/*
* When we later want to decode an overlay dentry from this handle
* and all layers are on the same fs, if we get a disconnected real
* dentry when we decode the fid, the only way to tell if we should
* assign it to upperdentry or to lowerstack is by checking this flag.
*/
if (is_upper)
fh->flags |= OVL_FH_FLAG_PATH_UPPER;
fh->len = fh_len;
fh->uuid = *uuid;
memcpy(fh->fid, buf, buflen);
out:
kfree(buf);
return fh;
}
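/*
 * Illustration: ovl_encode_real_fh above wraps exportfs_encode_fh(), whose
 * userspace-visible counterpart is name_to_handle_at(2).  A minimal sketch
 * of obtaining such a handle from userspace (requires _GNU_SOURCE);
 * encode_fh() is a hypothetical name, and MAX_HANDLE_SZ is restated
 * locally to the kernel's value of 128.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdlib.h>

#define MAX_HANDLE_SZ 128	/* kernel's limit, restated for the sketch */

static struct file_handle *encode_fh(const char *path)
{
	struct file_handle *fh;
	int mount_id;

	fh = malloc(sizeof(*fh) + MAX_HANDLE_SZ);
	if (!fh)
		return NULL;
	fh->handle_bytes = MAX_HANDLE_SZ;	/* in: buffer size; out: bytes used */
	if (name_to_handle_at(AT_FDCWD, path, fh, &mount_id, 0) != 0) {
		free(fh);
		return NULL;
	}
	return fh;
}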
int ovl_set_origin(struct dentry *dentry, struct dentry *lower,
struct dentry *upper)
{
const struct ovl_fh *fh = NULL;
int err;
/*
* When the lower layer doesn't support export operations, store a 'null'
* fh, so we can use the overlay.origin xattr to distinguish between a
* copy up and a pure upper inode.
*/
if (ovl_can_decode_fh(lower->d_sb)) {
fh = ovl_encode_real_fh(lower, false);
if (IS_ERR(fh))
return PTR_ERR(fh);
}
/*
* Do not fail when upper doesn't support xattrs.
*/
err = ovl_check_setxattr(dentry, upper, OVL_XATTR_ORIGIN, fh,
fh ? fh->len : 0, 0);
kfree(fh);
return err;
}
/* Store file handle of @upper dir in @index dir entry */
static int ovl_set_upper_fh(struct dentry *upper, struct dentry *index)
{
const struct ovl_fh *fh;
int err;
fh = ovl_encode_real_fh(upper, true);
if (IS_ERR(fh))
return PTR_ERR(fh);
err = ovl_do_setxattr(index, OVL_XATTR_UPPER, fh, fh->len, 0);
kfree(fh);
return err;
}
/*
* Create and install index entry.
*
* Caller must hold i_mutex on indexdir.
*/
static int ovl_create_index(struct dentry *dentry, struct dentry *origin,
struct dentry *upper)
{
struct dentry *indexdir = ovl_indexdir(dentry->d_sb);
struct inode *dir = d_inode(indexdir);
struct dentry *index = NULL;
struct dentry *temp = NULL;
struct qstr name = { };
int err;
/*
* For now this is only used for creating index entries for directories,
* because non-directories are copied up directly to the index and then
* hardlinked to the upper dir.
*
* TODO: implement create index for non-dir, so we can call it when
* encoding file handle for non-dir in case index does not exist.
*/
if (WARN_ON(!d_is_dir(dentry)))
return -EIO;
/* Directory not expected to be indexed before copy up */
if (WARN_ON(ovl_test_flag(OVL_INDEX, d_inode(dentry))))
return -EIO;
err = ovl_get_index_name(origin, &name);
if (err)
return err;
temp = ovl_create_temp(indexdir, OVL_CATTR(S_IFDIR | 0));
err = PTR_ERR(temp);
if (IS_ERR(temp))
goto free_name;
err = ovl_set_upper_fh(upper, temp);
if (err)
goto out;
index = lookup_one_len(name.name, indexdir, name.len);
if (IS_ERR(index)) {
err = PTR_ERR(index);
} else {
err = ovl_do_rename(dir, temp, dir, index, 0);
dput(index);
}
out:
if (err)
ovl_cleanup(dir, temp);
dput(temp);
free_name:
kfree(name.name);
return err;
}
struct ovl_copy_up_ctx {
struct dentry *parent;
struct dentry *dentry;
struct path lowerpath;
struct kstat stat;
struct kstat pstat;
const char *link;
struct dentry *destdir;
struct qstr destname;
struct dentry *workdir;
bool tmpfile;
bool origin;
bool indexed;
};
static int ovl_link_up(struct ovl_copy_up_ctx *c)
{
int err;
struct dentry *upper;
struct dentry *upperdir = ovl_dentry_upper(c->parent);
struct inode *udir = d_inode(upperdir);
/* Mark parent "impure" because it may now contain non-pure upper */
err = ovl_set_impure(c->parent, upperdir);
if (err)
return err;
err = ovl_set_nlink_lower(c->dentry);
if (err)
return err;
inode_lock_nested(udir, I_MUTEX_PARENT);
upper = lookup_one_len(c->dentry->d_name.name, upperdir,
c->dentry->d_name.len);
err = PTR_ERR(upper);
if (!IS_ERR(upper)) {
err = ovl_do_link(ovl_dentry_upper(c->dentry), udir, upper);
dput(upper);
if (!err) {
/* Restore timestamps on parent (best effort) */
ovl_set_timestamps(upperdir, &c->pstat);
ovl_dentry_set_upper_alias(c->dentry);
}
}
inode_unlock(udir);
if (err)
return err;
err = ovl_set_nlink_upper(c->dentry);
return err;
}
static int ovl_install_temp(struct ovl_copy_up_ctx *c, struct dentry *temp,
struct dentry **newdentry)
{
int err;
struct dentry *upper;
struct inode *udir = d_inode(c->destdir);
upper = lookup_one_len(c->destname.name, c->destdir, c->destname.len);
if (IS_ERR(upper))
return PTR_ERR(upper);
if (c->tmpfile)
err = ovl_do_link(temp, udir, upper);
else
err = ovl_do_rename(d_inode(c->workdir), temp, udir, upper, 0);
if (!err)
*newdentry = dget(c->tmpfile ? upper : temp);
dput(upper);
return err;
}
static struct dentry *ovl_get_tmpfile(struct ovl_copy_up_ctx *c)
{
int err;
struct dentry *temp;
const struct cred *old_creds = NULL;
struct cred *new_creds = NULL;
struct ovl_cattr cattr = {
/* Can't properly set mode on creation because of the umask */
.mode = c->stat.mode & S_IFMT,
.rdev = c->stat.rdev,
.link = c->link
};
err = security_inode_copy_up(c->dentry, &new_creds);
temp = ERR_PTR(err);
if (err < 0)
goto out;
if (new_creds)
old_creds = override_creds(new_creds);
if (c->tmpfile)
temp = ovl_do_tmpfile(c->workdir, c->stat.mode);
else
temp = ovl_create_temp(c->workdir, &cattr);
out:
if (new_creds) {
revert_creds(old_creds);
put_cred(new_creds);
}
return temp;
}
static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp)
{
int err;
if (S_ISREG(c->stat.mode)) {
struct path upperpath;
ovl_path_upper(c->dentry, &upperpath);
BUG_ON(upperpath.dentry != NULL);
upperpath.dentry = temp;
err = ovl_copy_up_data(&c->lowerpath, &upperpath, c->stat.size);
if (err)
return err;
}
err = ovl_copy_xattr(c->lowerpath.dentry, temp);
if (err)
return err;
inode_lock(temp->d_inode);
err = ovl_set_attr(temp, &c->stat);
inode_unlock(temp->d_inode);
if (err)
return err;
/*
* Store identifier of lower inode in upper inode xattr to
* allow lookup of the copy up origin inode.
*
* Don't set origin when we are breaking the association with a lower
* hard link.
*/
if (c->origin) {
err = ovl_set_origin(c->dentry, c->lowerpath.dentry, temp);
if (err)
return err;
}
return 0;
}
static int ovl_copy_up_locked(struct ovl_copy_up_ctx *c)
{
struct inode *udir = c->destdir->d_inode;
struct inode *inode;
struct dentry *newdentry = NULL;
struct dentry *temp;
int err;
temp = ovl_get_tmpfile(c);
if (IS_ERR(temp))
return PTR_ERR(temp);
err = ovl_copy_up_inode(c, temp);
if (err)
goto out;
if (S_ISDIR(c->stat.mode) && c->indexed) {
err = ovl_create_index(c->dentry, c->lowerpath.dentry, temp);
if (err)
goto out;
}
if (c->tmpfile) {
inode_lock_nested(udir, I_MUTEX_PARENT);
err = ovl_install_temp(c, temp, &newdentry);
inode_unlock(udir);
} else {
err = ovl_install_temp(c, temp, &newdentry);
}
if (err)
goto out;
inode = d_inode(c->dentry);
ovl_inode_update(inode, newdentry);
if (S_ISDIR(inode->i_mode))
ovl_set_flag(OVL_WHITEOUTS, inode);
out:
if (err && !c->tmpfile)
ovl_cleanup(d_inode(c->workdir), temp);
dput(temp);
return err;
}
/*
* Copy up a single dentry
*
* All renames start with copy up of source if necessary. The actual
* rename will only proceed once the copy up was successful. Copy up uses
* upper parent i_mutex for exclusion. Since rename can change d_parent it
* is possible that the copy up will lock the old parent. At that point
* the file will have already been copied up anyway.
*/
static int ovl_do_copy_up(struct ovl_copy_up_ctx *c)
{
int err;
struct ovl_fs *ofs = c->dentry->d_sb->s_fs_info;
bool to_index = false;
/*
* Indexed non-dir is copied up directly to the index entry and then
* hardlinked to upper dir. Indexed dir is copied up to indexdir,
* then the index entry is created and the copied-up dir is installed.
* Copying dir up to indexdir instead of workdir simplifies locking.
*/
if (ovl_need_index(c->dentry)) {
c->indexed = true;
if (S_ISDIR(c->stat.mode))
c->workdir = ovl_indexdir(c->dentry->d_sb);
else
to_index = true;
}
if (S_ISDIR(c->stat.mode) || c->stat.nlink == 1 || to_index)
c->origin = true;
if (to_index) {
c->destdir = ovl_indexdir(c->dentry->d_sb);
err = ovl_get_index_name(c->lowerpath.dentry, &c->destname);
if (err)
return err;
} else if (WARN_ON(!c->parent)) {
/* Disconnected dentry must be copied up to index dir */
return -EIO;
} else {
/*
* Mark parent "impure" because it may now contain non-pure
* upper
*/
err = ovl_set_impure(c->parent, c->destdir);
if (err)
return err;
}
/* Should we copyup with O_TMPFILE or with workdir? */
if (S_ISREG(c->stat.mode) && ofs->tmpfile) {
c->tmpfile = true;
err = ovl_copy_up_locked(c);
} else {
err = ovl_lock_rename_workdir(c->workdir, c->destdir);
if (!err) {
err = ovl_copy_up_locked(c);
unlock_rename(c->workdir, c->destdir);
}
}
if (err)
goto out;
if (c->indexed)
ovl_set_flag(OVL_INDEX, d_inode(c->dentry));
if (to_index) {
/* Initialize nlink for copy up of disconnected dentry */
err = ovl_set_nlink_upper(c->dentry);
} else {
struct inode *udir = d_inode(c->destdir);
/* Restore timestamps on parent (best effort) */
inode_lock(udir);
ovl_set_timestamps(c->destdir, &c->pstat);
inode_unlock(udir);
ovl_dentry_set_upper_alias(c->dentry);
}
out:
if (to_index)
kfree(c->destname.name);
return err;
}
static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
int flags)
{
int err;
DEFINE_DELAYED_CALL(done);
struct path parentpath;
struct ovl_copy_up_ctx ctx = {
.parent = parent,
.dentry = dentry,
.workdir = ovl_workdir(dentry),
};
if (WARN_ON(!ctx.workdir))
return -EROFS;
ovl_path_lower(dentry, &ctx.lowerpath);
err = vfs_getattr(&ctx.lowerpath, &ctx.stat,
STATX_BASIC_STATS, AT_STATX_SYNC_AS_STAT);
if (err)
return err;
if (parent) {
ovl_path_upper(parent, &parentpath);
ctx.destdir = parentpath.dentry;
ctx.destname = dentry->d_name;
err = vfs_getattr(&parentpath, &ctx.pstat,
STATX_ATIME | STATX_MTIME,
AT_STATX_SYNC_AS_STAT);
if (err)
return err;
}
/* maybe truncate regular file. this has no effect on dirs */
if (flags & O_TRUNC)
ctx.stat.size = 0;
if (S_ISLNK(ctx.stat.mode)) {
ctx.link = vfs_get_link(ctx.lowerpath.dentry, &done);
if (IS_ERR(ctx.link))
return PTR_ERR(ctx.link);
}
ovl_do_check_copy_up(ctx.lowerpath.dentry);
err = ovl_copy_up_start(dentry);
/* err < 0: interrupted, err > 0: raced with another copy-up */
if (unlikely(err)) {
if (err > 0)
err = 0;
} else {
if (!ovl_dentry_upper(dentry))
err = ovl_do_copy_up(&ctx);
if (!err && parent && !ovl_dentry_has_upper_alias(dentry))
err = ovl_link_up(&ctx);
ovl_copy_up_end(dentry);
}
do_delayed_call(&done);
return err;
}
int ovl_copy_up_flags(struct dentry *dentry, int flags)
{
int err = 0;
const struct cred *old_cred = ovl_override_creds(dentry->d_sb);
bool disconnected = (dentry->d_flags & DCACHE_DISCONNECTED);
/*
* With NFS export, copy up can get called for a disconnected non-dir.
* In this case, we will copy up lower inode to index dir without
* linking it to upper dir.
*/
if (WARN_ON(disconnected && d_is_dir(dentry)))
return -EIO;
while (!err) {
struct dentry *next;
struct dentry *parent = NULL;
/*
* Check if copy-up has already happened and whether the upper alias (in
* case of hard links) is there.
*
* Both checks are lockless:
* - false negatives: will recheck under oi->lock
* - false positives:
* + ovl_dentry_upper() uses memory barriers to ensure the
* upper dentry is up-to-date
* + ovl_dentry_has_upper_alias() relies on locking of
* upper parent i_rwsem to prevent reordering copy-up
* with rename.
*/
if (ovl_dentry_upper(dentry) &&
(ovl_dentry_has_upper_alias(dentry) || disconnected))
break;
next = dget(dentry);
/* find the topmost dentry not yet copied up */
for (; !disconnected;) {
parent = dget_parent(next);
if (ovl_dentry_upper(parent))
break;
dput(next);
next = parent;
}
err = ovl_copy_up_one(parent, next, flags);
dput(parent);
dput(next);
}
revert_creds(old_cred);
return err;
}
int ovl_copy_up(struct dentry *dentry)
{
return ovl_copy_up_flags(dentry, 0);
}

File diff suppressed because it is too large


@@ -1,853 +0,0 @@
/*
* Overlayfs NFS export support.
*
* Amir Goldstein <amir73il@gmail.com>
*
* Copyright (C) 2017-2018 CTERA Networks. All Rights Reserved.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/cred.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/xattr.h>
#include <linux/exportfs.h>
#include <linux/ratelimit.h>
#include <linux/version.h>
#include "overlayfs.h"
static int ovl_encode_maybe_copy_up(struct dentry *dentry)
{
int err;
if (ovl_dentry_upper(dentry))
return 0;
err = ovl_want_write(dentry);
if (!err) {
err = ovl_copy_up(dentry);
ovl_drop_write(dentry);
}
if (err) {
pr_warn_ratelimited("overlayfs: failed to copy up on encode (%pd2, err=%i)\n",
dentry, err);
}
return err;
}
/*
* Before encoding a non-upper directory file handle from real layer N, we need
* to check if it will be possible to reconnect an overlay dentry from the real
* lower decoded dentry. This is done by following the overlay ancestry up to a
* "layer N connected" ancestor and verifying that all parents along the way are
* "layer N connectable". If an ancestor that is NOT "layer N connectable" is
* found, we need to copy up an ancestor, which is "layer N connectable", thus
* making that ancestor "layer N connected". For example:
*
* layer 1: /a
* layer 2: /a/b/c
*
* The overlay dentry /a is NOT "layer 2 connectable", because if dir /a is
* copied up and renamed, upper dir /a will be indexed by lower dir /a from
* layer 1. The dir /a from layer 2 will never be indexed, so the algorithm (*)
* in ovl_lookup_real_ancestor() will not be able to lookup a connected overlay
* dentry from the connected lower dentry /a/b/c.
*
* To avoid this problem on decode time, we need to copy up an ancestor of
* /a/b/c, which is "layer 2 connectable", on encode time. That ancestor is
* /a/b. After copy up (and index) of /a/b, it will become "layer 2 connected"
* and when the time comes to decode the file handle from lower dentry /a/b/c,
* ovl_lookup_real_ancestor() will find the indexed ancestor /a/b and decoding
* a connected overlay dentry will be accomplished.
*
* (*) the algorithm in ovl_lookup_real_ancestor() can be improved to lookup an
* entry /a in the lower layers above layer N and find the indexed dir /a from
* layer 1. If that improvement is made, then the check for "layer N connected"
* will need to verify there are no redirects in lower layers above N. In the
* example above, /a will be "layer 2 connectable". However, if layer 2 dir /a
* is a target of a layer 1 redirect, then /a will NOT be "layer 2 connectable":
*
* layer 1: /A (redirect = /a)
* layer 2: /a/b/c
*/
/* Return the lowest layer for encoding a connectable file handle */
static int ovl_connectable_layer(struct dentry *dentry)
{
struct ovl_entry *oe = OVL_E(dentry);
/* We can get overlay root from root of any layer */
if (dentry == dentry->d_sb->s_root)
return oe->numlower;
/*
* If it's an unindexed merge dir, then it's not connectable with any
* lower layer
*/
if (ovl_dentry_upper(dentry) &&
!ovl_test_flag(OVL_INDEX, d_inode(dentry)))
return 0;
/* We can get upper/overlay path from indexed/lower dentry */
return oe->lowerstack[0].layer->idx;
}
/*
* @dentry is "connected" if all ancestors up to root or a "connected" ancestor
* have the same uppermost lower layer as the origin's layer. We may need to
* copy up a "connectable" ancestor to make it "connected". A "connected" dentry
* cannot become non "connected", so cache positive result in dentry flags.
*
* Return the connected origin layer or < 0 on error.
*/
static int ovl_connect_layer(struct dentry *dentry)
{
struct dentry *next, *parent = NULL;
int origin_layer;
int err = 0;
if (WARN_ON(dentry == dentry->d_sb->s_root) ||
WARN_ON(!ovl_dentry_lower(dentry)))
return -EIO;
origin_layer = OVL_E(dentry)->lowerstack[0].layer->idx;
if (ovl_dentry_test_flag(OVL_E_CONNECTED, dentry))
return origin_layer;
/* Find the topmost origin layer connectable ancestor of @dentry */
next = dget(dentry);
for (;;) {
parent = dget_parent(next);
if (WARN_ON(parent == next)) {
err = -EIO;
break;
}
/*
* If @parent is not origin layer connectable, then copy up
* @next which is origin layer connectable and we are done.
*/
if (ovl_connectable_layer(parent) < origin_layer) {
err = ovl_encode_maybe_copy_up(next);
break;
}
/* If @parent is connected or indexed we are done */
if (ovl_dentry_test_flag(OVL_E_CONNECTED, parent) ||
ovl_test_flag(OVL_INDEX, d_inode(parent)))
break;
dput(next);
next = parent;
}
dput(parent);
dput(next);
if (!err)
ovl_dentry_set_flag(OVL_E_CONNECTED, dentry);
return err ?: origin_layer;
}
/*
* We only need to encode origin if there is a chance that the same object was
* encoded pre copy up and then we need to stay consistent with the same
* encoding also after copy up. If non-pure upper is not indexed, then it was
* copied up before NFS export was enabled. In that case we don't need to worry
* about staying consistent with pre copy up encoding and we encode an upper
* file handle. Overlay root dentry is a private case of non-indexed upper.
*
* The following table summarizes the different file handle encodings used for
* different overlay object types:
*
* Object type | Encoding
* --------------------------------
* Pure upper | U
* Non-indexed upper | U
* Indexed upper | L (*)
* Non-upper | L (*)
*
* U = upper file handle
* L = lower file handle
*
* (*) Connecting an overlay dir from real lower dentry is not always
* possible when there are redirects in lower layers and non-indexed merge dirs.
* To mitigate those cases, we may copy up the lower dir ancestor before encoding
* a lower dir file handle.
*
* Return 0 for upper file handle, > 0 for lower file handle or < 0 on error.
*/
static int ovl_check_encode_origin(struct dentry *dentry)
{
struct ovl_fs *ofs = dentry->d_sb->s_fs_info;
/* Upper file handle for pure upper */
if (!ovl_dentry_lower(dentry))
return 0;
/*
* Upper file handle for non-indexed upper.
*
* Root is never indexed, so if there's an upper layer, encode upper for
* root.
*/
if (ovl_dentry_upper(dentry) &&
!ovl_test_flag(OVL_INDEX, d_inode(dentry)))
return 0;
/*
* Decoding a merge dir, whose origin's ancestor is under a redirected
* lower dir or under a non-indexed upper is not always possible.
* ovl_connect_layer() will try to make origin's layer "connected" by
* copying up a "connectable" ancestor.
*/
if (d_is_dir(dentry) && ofs->upper_mnt)
return ovl_connect_layer(dentry);
/* Lower file handle for indexed and non-upper dir/non-dir */
return 1;
}
static int ovl_d_to_fh(struct dentry *dentry, char *buf, int buflen)
{
struct ovl_fh *fh = NULL;
int err, enc_lower;
/*
* Check if we should encode a lower or upper file handle and maybe
* copy up an ancestor to make lower file handle connectable.
*/
err = enc_lower = ovl_check_encode_origin(dentry);
if (enc_lower < 0)
goto fail;
/* Encode an upper or lower file handle */
fh = ovl_encode_real_fh(enc_lower ? ovl_dentry_lower(dentry) :
ovl_dentry_upper(dentry), !enc_lower);
err = PTR_ERR(fh);
if (IS_ERR(fh))
goto fail;
err = -EOVERFLOW;
if (fh->len > buflen)
goto fail;
memcpy(buf, (char *)fh, fh->len);
err = fh->len;
out:
kfree(fh);
return err;
fail:
pr_warn_ratelimited("overlayfs: failed to encode file handle (%pd2, err=%i, buflen=%d, len=%d, type=%d)\n",
dentry, err, buflen, fh ? (int)fh->len : 0,
fh ? fh->type : 0);
goto out;
}
static int ovl_dentry_to_fh(struct dentry *dentry, u32 *fid, int *max_len)
{
int res, len = *max_len << 2;
res = ovl_d_to_fh(dentry, (char *)fid, len);
if (res <= 0)
return FILEID_INVALID;
len = res;
/* Round up to dwords */
*max_len = (len + 3) >> 2;
return OVL_FILEID;
}
static int ovl_encode_fh(struct inode *inode, u32 *fid, int *max_len,
struct inode *parent)
{
struct dentry *dentry;
int type;
/* TODO: encode connectable file handles */
if (parent)
return FILEID_INVALID;
dentry = d_find_any_alias(inode);
if (WARN_ON(!dentry))
return FILEID_INVALID;
type = ovl_dentry_to_fh(dentry, fid, max_len);
dput(dentry);
return type;
}
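/*
 * Illustrative sketch, not part of this file: a userspace round trip
 * through the paths above. name_to_handle_at() reaches ovl_encode_fh()
 * and open_by_handle_at() reaches ovl_fh_to_dentry(). Decoding requires
 * CAP_DAC_READ_SEARCH; error handling is trimmed to the essentials.
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>

int main(int argc, char **argv)
{
	struct file_handle *fhp;
	int mount_id, mount_fd, fd;

	if (argc != 3) {
		fprintf(stderr, "usage: %s <file-on-overlay> <mountpoint>\n",
			argv[0]);
		return 1;
	}
	fhp = malloc(sizeof(*fhp) + MAX_HANDLE_SZ);
	if (!fhp)
		return 1;
	fhp->handle_bytes = MAX_HANDLE_SZ;
	/* Encode: the kernel side is ovl_encode_fh() -> ovl_d_to_fh() */
	if (name_to_handle_at(AT_FDCWD, argv[1], fhp, &mount_id, 0) < 0) {
		perror("name_to_handle_at");
		return 1;
	}
	printf("handle_type=%d handle_bytes=%u\n",
	       fhp->handle_type, fhp->handle_bytes);
	/* Decode: the kernel side is ovl_fh_to_dentry() */
	mount_fd = open(argv[2], O_RDONLY | O_DIRECTORY);
	fd = open_by_handle_at(mount_fd, fhp, O_RDONLY);
	if (fd < 0)
		perror("open_by_handle_at");
	else
		close(fd);
	free(fhp);
	return 0;
}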
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 16, 0)
/*
* Find or instantiate an overlay dentry from real dentries and index.
*/
static struct dentry *ovl_obtain_alias(struct super_block *sb,
struct dentry *upper_alias,
struct ovl_path *lowerpath,
struct dentry *index)
{
struct dentry *lower = lowerpath ? lowerpath->dentry : NULL;
struct dentry *upper = upper_alias ?: index;
struct dentry *dentry;
struct inode *inode;
struct ovl_entry *oe;
struct ovl_inode_params oip = {
.lowerpath = lowerpath,
.index = index,
.numlower = !!lower
};
/* We get overlay directory dentries with ovl_lookup_real() */
if (d_is_dir(upper ?: lower))
return ERR_PTR(-EIO);
oip.upperdentry = dget(upper);
inode = ovl_get_inode(sb, &oip);
if (IS_ERR(inode)) {
dput(upper);
return ERR_CAST(inode);
}
dentry = d_find_any_alias(inode);
if (!dentry) {
dentry = d_alloc_anon(inode->i_sb);
if (!dentry)
goto nomem;
oe = ovl_alloc_entry(lower ? 1 : 0);
if (!oe)
goto nomem;
if (lower) {
oe->lowerstack->dentry = dget(lower);
oe->lowerstack->layer = lowerpath->layer;
}
dentry->d_fsdata = oe;
if (upper_alias)
ovl_dentry_set_upper_alias(dentry);
}
return d_instantiate_anon(dentry, inode);
nomem:
iput(inode);
dput(dentry);
return ERR_PTR(-ENOMEM);
}
/* Get the upper or lower dentry in stack that is on layer @idx */
static struct dentry *ovl_dentry_real_at(struct dentry *dentry, int idx)
{
struct ovl_entry *oe = dentry->d_fsdata;
int i;
if (!idx)
return ovl_dentry_upper(dentry);
for (i = 0; i < oe->numlower; i++) {
if (oe->lowerstack[i].layer->idx == idx)
return oe->lowerstack[i].dentry;
}
return NULL;
}
/*
* Lookup a child overlay dentry to get a connected overlay dentry whose real
* dentry is @real. If @real is on upper layer, we lookup a child overlay
* dentry with the same name as the real dentry. Otherwise, we need to consult
* index for lookup.
*/
static struct dentry *ovl_lookup_real_one(struct dentry *connected,
struct dentry *real,
struct ovl_layer *layer)
{
struct inode *dir = d_inode(connected);
struct dentry *this, *parent = NULL;
struct name_snapshot name;
int err;
/*
* Lookup child overlay dentry by real name. The dir mutex protects us
* from racing with overlay rename. If the overlay dentry that is above
* real has already been moved to a parent that is not under the
* connected overlay dir, we return -ECHILD and restart the lookup of
* connected real path from the top.
*/
inode_lock_nested(dir, I_MUTEX_PARENT);
err = -ECHILD;
parent = dget_parent(real);
if (ovl_dentry_real_at(connected, layer->idx) != parent)
goto fail;
/*
* We also need to take a snapshot of real dentry name to protect us
* from racing with underlying layer rename. In this case, we don't
* care about returning ESTALE, only about not dereferencing a freed name
* pointer, because we hold no lock on the real dentry.
*/
take_dentry_name_snapshot(&name, real);
this = lookup_one_len(name.name, connected, strlen(name.name));
err = PTR_ERR(this);
if (IS_ERR(this)) {
goto fail;
} else if (!this || !this->d_inode) {
dput(this);
err = -ENOENT;
goto fail;
} else if (ovl_dentry_real_at(this, layer->idx) != real) {
dput(this);
err = -ESTALE;
goto fail;
}
out:
release_dentry_name_snapshot(&name);
dput(parent);
inode_unlock(dir);
return this;
fail:
pr_warn_ratelimited("overlayfs: failed to lookup one by real (%pd2, layer=%d, connected=%pd2, err=%i)\n",
real, layer->idx, connected, err);
this = ERR_PTR(err);
goto out;
}
static struct dentry *ovl_lookup_real(struct super_block *sb,
struct dentry *real,
struct ovl_layer *layer);
/*
* Lookup an indexed or hashed overlay dentry by real inode.
*/
static struct dentry *ovl_lookup_real_inode(struct super_block *sb,
struct dentry *real,
struct ovl_layer *layer)
{
struct ovl_fs *ofs = sb->s_fs_info;
struct ovl_layer upper_layer = { .mnt = ofs->upper_mnt };
struct dentry *index = NULL;
struct dentry *this = NULL;
struct inode *inode;
/*
* Decoding upper dir from index is expensive, so first try to lookup
* overlay dentry in inode/dcache.
*/
inode = ovl_lookup_inode(sb, real, !layer->idx);
if (IS_ERR(inode))
return ERR_CAST(inode);
if (inode) {
this = d_find_any_alias(inode);
iput(inode);
}
/*
* For decoded lower dir file handle, lookup index by origin to check
* if lower dir was copied up and/or removed.
*/
if (!this && layer->idx && ofs->indexdir && !WARN_ON(!d_is_dir(real))) {
index = ovl_lookup_index(ofs, NULL, real, false);
if (IS_ERR(index))
return index;
}
/* Get connected upper overlay dir from index */
if (index) {
struct dentry *upper = ovl_index_upper(ofs, index);
dput(index);
if (IS_ERR_OR_NULL(upper))
return upper;
/*
* ovl_lookup_real() in lower layer may call recursively once to
* ovl_lookup_real() in upper layer. The first level call walks
* back lower parents to the topmost indexed parent. The second
* recursive call walks back from indexed upper to the topmost
* connected/hashed upper parent (or up to root).
*/
this = ovl_lookup_real(sb, upper, &upper_layer);
dput(upper);
}
if (IS_ERR_OR_NULL(this))
return this;
if (WARN_ON(ovl_dentry_real_at(this, layer->idx) != real)) {
dput(this);
this = ERR_PTR(-EIO);
}
return this;
}
/*
* Lookup an indexed or hashed overlay dentry, whose real dentry is an
* ancestor of @real.
*/
static struct dentry *ovl_lookup_real_ancestor(struct super_block *sb,
struct dentry *real,
struct ovl_layer *layer)
{
struct dentry *next, *parent = NULL;
struct dentry *ancestor = ERR_PTR(-EIO);
if (real == layer->mnt->mnt_root)
return dget(sb->s_root);
/* Find the topmost indexed or hashed ancestor */
next = dget(real);
for (;;) {
parent = dget_parent(next);
/*
* Lookup a matching overlay dentry in inode/dentry
* cache or in index by real inode.
*/
ancestor = ovl_lookup_real_inode(sb, next, layer);
if (ancestor)
break;
if (parent == layer->mnt->mnt_root) {
ancestor = dget(sb->s_root);
break;
}
/*
* If @real has been moved out of the layer root directory,
* we will eventually hit the real fs root. This cannot happen
* by a legitimate overlay rename, so we return an error in that case.
*/
if (parent == next) {
ancestor = ERR_PTR(-EXDEV);
break;
}
dput(next);
next = parent;
}
dput(parent);
dput(next);
return ancestor;
}
/*
* Lookup a connected overlay dentry whose real dentry is @real.
* If @real is on upper layer, we lookup a child overlay dentry with the same
* path as the real dentry. Otherwise, we need to consult index for lookup.
*/
static struct dentry *ovl_lookup_real(struct super_block *sb,
struct dentry *real,
struct ovl_layer *layer)
{
struct dentry *connected;
int err = 0;
connected = ovl_lookup_real_ancestor(sb, real, layer);
if (IS_ERR(connected))
return connected;
while (!err) {
struct dentry *next, *this;
struct dentry *parent = NULL;
struct dentry *real_connected = ovl_dentry_real_at(connected,
layer->idx);
if (real_connected == real)
break;
/* Find the topmost dentry not yet connected */
next = dget(real);
for (;;) {
parent = dget_parent(next);
if (parent == real_connected)
break;
/*
* If real has been moved out of 'real_connected',
* we will not find 'real_connected' and hit the layer
* root. In that case, we need to restart connecting.
* This game can go on forever in the worst case. We
* may want to consider taking s_vfs_rename_mutex if
* this happens more than once.
*/
if (parent == layer->mnt->mnt_root) {
dput(connected);
connected = dget(sb->s_root);
break;
}
/*
* If real file has been moved out of the layer root
* directory, we will eventually hit the real fs root.
* This cannot happen by a legitimate overlay rename, so
* we return an error in that case.
*/
if (parent == next) {
err = -EXDEV;
break;
}
dput(next);
next = parent;
}
if (!err) {
this = ovl_lookup_real_one(connected, next, layer);
if (IS_ERR(this))
err = PTR_ERR(this);
/*
* Lookup of child in overlay can fail when racing with
* overlay rename of child away from 'connected' parent.
* In this case, we need to restart the lookup from the
* top, because we cannot trust that 'real_connected' is
* still an ancestor of 'real'. There is a good chance
* that the renamed overlay ancestor is now in cache, so
* ovl_lookup_real_ancestor() will find it and we can
* continue to connect exactly from where lookup failed.
*/
if (err == -ECHILD) {
this = ovl_lookup_real_ancestor(sb, real,
layer);
err = PTR_ERR_OR_ZERO(this);
}
if (!err) {
dput(connected);
connected = this;
}
}
dput(parent);
dput(next);
}
if (err)
goto fail;
return connected;
fail:
pr_warn_ratelimited("overlayfs: failed to lookup by real (%pd2, layer=%d, connected=%pd2, err=%i)\n",
real, layer->idx, connected, err);
dput(connected);
return ERR_PTR(err);
}
/*
* Get an overlay dentry from upper/lower real dentries and index.
*/
static struct dentry *ovl_get_dentry(struct super_block *sb,
struct dentry *upper,
struct ovl_path *lowerpath,
struct dentry *index)
{
struct ovl_fs *ofs = sb->s_fs_info;
struct ovl_layer upper_layer = { .mnt = ofs->upper_mnt };
struct ovl_layer *layer = upper ? &upper_layer : lowerpath->layer;
struct dentry *real = upper ?: (index ?: lowerpath->dentry);
/*
* Obtain a disconnected overlay dentry from a non-dir real dentry
* and index.
*/
if (!d_is_dir(real))
return ovl_obtain_alias(sb, upper, lowerpath, index);
/* Removed empty directory? */
if ((real->d_flags & DCACHE_DISCONNECTED) || d_unhashed(real))
return ERR_PTR(-ENOENT);
/*
* If real dentry is connected and hashed, get a connected overlay
* dentry whose real dentry is @real.
*/
return ovl_lookup_real(sb, real, layer);
}
static struct dentry *ovl_upper_fh_to_d(struct super_block *sb,
struct ovl_fh *fh)
{
struct ovl_fs *ofs = sb->s_fs_info;
struct dentry *dentry;
struct dentry *upper;
if (!ofs->upper_mnt)
return ERR_PTR(-EACCES);
upper = ovl_decode_real_fh(fh, ofs->upper_mnt, true);
if (IS_ERR_OR_NULL(upper))
return upper;
dentry = ovl_get_dentry(sb, upper, NULL, NULL);
dput(upper);
return dentry;
}
static struct dentry *ovl_lower_fh_to_d(struct super_block *sb,
struct ovl_fh *fh)
{
struct ovl_fs *ofs = sb->s_fs_info;
struct ovl_path origin = { };
struct ovl_path *stack = &origin;
struct dentry *dentry = NULL;
struct dentry *index = NULL;
struct inode *inode;
int err;
/* First lookup overlay inode in inode cache by origin fh */
err = ovl_check_origin_fh(ofs, fh, false, NULL, &stack);
if (err)
return ERR_PTR(err);
if (!d_is_dir(origin.dentry) ||
!(origin.dentry->d_flags & DCACHE_DISCONNECTED)) {
inode = ovl_lookup_inode(sb, origin.dentry, false);
err = PTR_ERR(inode);
if (IS_ERR(inode))
goto out_err;
if (inode) {
dentry = d_find_any_alias(inode);
iput(inode);
if (dentry)
goto out;
}
}
/* Then lookup indexed upper/whiteout by origin fh */
if (ofs->indexdir) {
index = ovl_get_index_fh(ofs, fh);
err = PTR_ERR(index);
if (IS_ERR(index)) {
index = NULL;
goto out_err;
}
}
/* Then try to get a connected upper dir by index */
if (index && d_is_dir(index)) {
struct dentry *upper = ovl_index_upper(ofs, index);
err = PTR_ERR(upper);
if (IS_ERR_OR_NULL(upper))
goto out_err;
dentry = ovl_get_dentry(sb, upper, NULL, NULL);
dput(upper);
goto out;
}
/* Otherwise, get a connected non-upper dir or disconnected non-dir */
if (d_is_dir(origin.dentry) &&
(origin.dentry->d_flags & DCACHE_DISCONNECTED)) {
dput(origin.dentry);
origin.dentry = NULL;
err = ovl_check_origin_fh(ofs, fh, true, NULL, &stack);
if (err)
goto out_err;
}
if (index) {
err = ovl_verify_origin(index, origin.dentry, false);
if (err)
goto out_err;
}
dentry = ovl_get_dentry(sb, NULL, &origin, index);
out:
dput(origin.dentry);
dput(index);
return dentry;
out_err:
dentry = ERR_PTR(err);
goto out;
}
static struct dentry *ovl_fh_to_dentry(struct super_block *sb, struct fid *fid,
int fh_len, int fh_type)
{
struct dentry *dentry = NULL;
struct ovl_fh *fh = (struct ovl_fh *) fid;
int len = fh_len << 2;
unsigned int flags = 0;
int err;
err = -EINVAL;
if (fh_type != OVL_FILEID)
goto out_err;
err = ovl_check_fh_len(fh, len);
if (err)
goto out_err;
flags = fh->flags;
dentry = (flags & OVL_FH_FLAG_PATH_UPPER) ?
ovl_upper_fh_to_d(sb, fh) :
ovl_lower_fh_to_d(sb, fh);
err = PTR_ERR(dentry);
if (IS_ERR(dentry) && err != -ESTALE)
goto out_err;
return dentry;
out_err:
pr_warn_ratelimited("overlayfs: failed to decode file handle (len=%d, type=%d, flags=%x, err=%i)\n",
len, fh_type, flags, err);
return ERR_PTR(err);
}
static struct dentry *ovl_fh_to_parent(struct super_block *sb, struct fid *fid,
int fh_len, int fh_type)
{
pr_warn_ratelimited("overlayfs: connectable file handles not supported; use 'no_subtree_check' exportfs option.\n");
return ERR_PTR(-EACCES);
}
static int ovl_get_name(struct dentry *parent, char *name,
struct dentry *child)
{
/*
* ovl_fh_to_dentry() returns connected dir overlay dentries and
* ovl_fh_to_parent() is not implemented, so we should not get here.
*/
WARN_ON_ONCE(1);
return -EIO;
}
static struct dentry *ovl_get_parent(struct dentry *dentry)
{
/*
* ovl_fh_to_dentry() returns connected dir overlay dentries, so we
* should not get here.
*/
WARN_ON_ONCE(1);
return ERR_PTR(-EIO);
}
#endif
const struct export_operations ovl_export_operations = {
.encode_fh = ovl_encode_fh,
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 16, 0)
.fh_to_dentry = ovl_fh_to_dentry,
.fh_to_parent = ovl_fh_to_parent,
.get_name = ovl_get_name,
.get_parent = ovl_get_parent,
#endif
};


@@ -1,874 +0,0 @@
/*
*
* Copyright (C) 2011 Novell Inc.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published by
* the Free Software Foundation.
*/
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/cred.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/ratelimit.h>
#include <linux/version.h>
#include "overlayfs.h"
int ovl_setattr(struct dentry *dentry, struct iattr *attr)
{
int err;
struct dentry *upperdentry;
const struct cred *old_cred;
/*
 * NOCOPYUPW: copy-up on write appears to be intentionally disabled in
 * this tree; report success without touching the layers.
 */
return 0;
/*
* Check for permissions before trying to copy-up. This is redundant
* since it will be rechecked later by ->setattr() on upper dentry. But
* without this, copy-up can be triggered by just about anybody.
*
* We don't initialize inode->size, which just means that
* inode_newsize_ok() will always check against MAX_LFS_FILESIZE and not
* check for a swapfile (which this won't be anyway).
*/
err = setattr_prepare(dentry, attr);
if (err)
return err;
err = ovl_want_write(dentry);
if (err)
goto out;
err = ovl_copy_up(dentry);
if (!err) {
upperdentry = ovl_dentry_upper(dentry);
if (attr->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID))
attr->ia_valid &= ~ATTR_MODE;
inode_lock(upperdentry->d_inode);
old_cred = ovl_override_creds(dentry->d_sb);
err = notify_change(upperdentry, attr, NULL);
revert_creds(old_cred);
if (!err)
ovl_copyattr(upperdentry->d_inode, dentry->d_inode);
inode_unlock(upperdentry->d_inode);
}
ovl_drop_write(dentry);
out:
return err;
}
static int ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat,
struct ovl_layer *lower_layer)
{
bool samefs = ovl_same_sb(dentry->d_sb);
unsigned int xinobits = ovl_xino_bits(dentry->d_sb);
if (samefs) {
/*
* When all layers are on the same fs, all real inode
* numbers are unique, so we use the overlay st_dev,
* which is friendly to du -x.
*/
stat->dev = dentry->d_sb->s_dev;
return 0;
} else if (xinobits) {
unsigned int shift = 64 - xinobits;
/*
* All inode numbers of underlying fs should not be using the
* high xinobits, so we use high xinobits to partition the
* overlay st_ino address space. The high bits holds the fsid
* (upper fsid is 0). This way overlay inode numbers are unique
* and all inodes use overlay st_dev. Inode numbers are also
* persistent for a given layer configuration.
*/
if (stat->ino >> shift) {
pr_warn_ratelimited("overlayfs: inode number too big (%pd2, ino=%llu, xinobits=%d)\n",
dentry, stat->ino, xinobits);
} else {
if (lower_layer)
stat->ino |= ((u64)lower_layer->fsid) << shift;
stat->dev = dentry->d_sb->s_dev;
return 0;
}
}
/* The inode could not be mapped to a unified st_ino address space */
if (S_ISDIR(dentry->d_inode->i_mode)) {
/*
* Always use the overlay st_dev for directories, so 'find
* -xdev' will scan the entire overlay mount and won't cross the
* overlay mount boundaries.
*
* If not all layers are on the same fs the pair {real st_ino;
* overlay st_dev} is not unique, so use the non-persistent
* overlay st_ino for directories.
*/
stat->dev = dentry->d_sb->s_dev;
stat->ino = dentry->d_inode->i_ino;
} else if (lower_layer && lower_layer->fsid) {
/*
* For non-samefs setup, if we cannot map all layers st_ino
* to a unified address space, we need to make sure that st_dev
* is unique per lower fs. Upper layer uses real st_dev and
* lower layers use the unique anonymous bdev assigned to the
* lower fs.
*/
stat->dev = lower_layer->fs->pseudo_dev;
}
return 0;
}
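/*
 * Illustrative sketch, not part of this file: a worked example of the
 * xino partitioning performed in ovl_map_dev_ino() above. With
 * xinobits = 8, shift = 64 - 8 = 56, so real inode numbers must fit in
 * the low 56 bits and the top byte carries the fsid (0 for upper).
 */
#include <stdio.h>
#include <stdint.h>

static uint64_t xino_map(uint64_t real_ino, uint64_t fsid, unsigned int xinobits)
{
	unsigned int shift = 64 - xinobits;

	/* Real ino already uses the high bits: cannot be mapped */
	if (real_ino >> shift)
		return 0;
	return real_ino | (fsid << shift);
}

int main(void)
{
	/* lower layer with fsid 2, real ino 1000, 8 xino bits */
	printf("0x%llx\n", (unsigned long long)xino_map(1000, 2, 8));
	/* prints 0x2000000000003e8: fsid in the top byte, real ino below */
	return 0;
}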
int ovl_getattr(const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int flags)
{
struct dentry *dentry = path->dentry;
enum ovl_path_type type;
struct path realpath;
const struct cred *old_cred;
bool is_dir = S_ISDIR(dentry->d_inode->i_mode);
bool samefs = ovl_same_sb(dentry->d_sb);
struct ovl_layer *lower_layer = NULL;
int err;
type = ovl_path_real(dentry, &realpath);
old_cred = ovl_override_creds(dentry->d_sb);
err = vfs_getattr(&realpath, stat, request_mask, flags);
if (err)
goto out;
/*
* For non-dir or same fs, we use st_ino of the copy up origin.
* This guarantees constant st_dev/st_ino across copy up.
* With xino feature and non-samefs, we use st_ino of the copy up
* origin masked with high bits that represent the layer id.
*
* If the lower filesystem supports NFS file handles, this also guarantees
* persistent st_ino across mount cycle.
*/
if (!is_dir || samefs || ovl_xino_bits(dentry->d_sb)) {
if (!OVL_TYPE_UPPER(type)) {
lower_layer = ovl_layer_lower(dentry);
} else if (OVL_TYPE_ORIGIN(type)) {
struct kstat lowerstat;
u32 lowermask = STATX_INO | (!is_dir ? STATX_NLINK : 0);
ovl_path_lower(dentry, &realpath);
err = vfs_getattr(&realpath, &lowerstat,
lowermask, flags);
if (err)
goto out;
/*
* Lower hardlinks may be broken on copy up to different
* upper files, so we cannot use the lower origin st_ino
* for those different files, even for the same fs case.
*
* Similarly, several redirected dirs can point to the
* same dir on a lower layer. With the "verify_lower"
* feature, we do not use the lower origin st_ino, if
* we haven't verified that this redirect is unique.
*
* With inodes index enabled, it is safe to use st_ino
* of an indexed origin. The index validates that the
* upper hardlink is not broken and that a redirected
* dir is the only redirect to that origin.
*/
if (ovl_test_flag(OVL_INDEX, d_inode(dentry)) ||
(!ovl_verify_lower(dentry->d_sb) &&
(is_dir || lowerstat.nlink == 1))) {
stat->ino = lowerstat.ino;
lower_layer = ovl_layer_lower(dentry);
}
}
}
err = ovl_map_dev_ino(dentry, stat, lower_layer);
if (err)
goto out;
/*
* It's probably not worth it to count subdirs to get the
* correct link count. nlink=1 seems to pacify 'find' and
* other utilities.
*/
if (is_dir && OVL_TYPE_MERGE(type))
stat->nlink = 1;
/*
* Return the overlay inode nlinks for indexed upper inodes.
* Overlay inode nlink counts the union of the upper hardlinks
* and non-covered lower hardlinks. It does not include the upper
* index hardlink.
*/
if (!is_dir && ovl_test_flag(OVL_INDEX, d_inode(dentry)))
stat->nlink = dentry->d_inode->i_nlink;
out:
revert_creds(old_cred);
return err;
}
int ovl_permission(struct inode *inode, int mask)
{
struct inode *upperinode = ovl_inode_upper(inode);
struct inode *realinode = upperinode ?: ovl_inode_lower(inode);
const struct cred *old_cred;
int err;
/* Careful in RCU walk mode */
if (!realinode) {
WARN_ON(!(mask & MAY_NOT_BLOCK));
return -ECHILD;
}
/*
* Check overlay inode with the creds of task and underlying inode
* with creds of mounter
*/
err = generic_permission(inode, mask);
if (err)
return err;
old_cred = ovl_override_creds(inode->i_sb);
if (!upperinode &&
!special_file(realinode->i_mode) && mask & MAY_WRITE) {
mask &= ~(MAY_WRITE | MAY_APPEND);
/* Make sure mounter can read file for copy up later */
mask |= MAY_READ;
}
err = inode_permission(realinode, mask);
revert_creds(old_cred);
return err;
}
static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
{
struct path realpath;
struct inode *realinode;
ovl_path_real(dentry, &realpath);
realinode = realpath.dentry->d_inode;
if (!realinode->i_op->readlink)
return -EINVAL;
touch_atime(&realpath);
return realinode->i_op->readlink(realpath.dentry, buf, bufsiz);
}
static const char *ovl_get_link(struct dentry *dentry,
struct inode *inode,
struct delayed_call *done)
{
const struct cred *old_cred;
const char *p;
if (!dentry)
return ERR_PTR(-ECHILD);
old_cred = ovl_override_creds(dentry->d_sb);
p = vfs_get_link(ovl_dentry_real(dentry), done);
revert_creds(old_cred);
return p;
}
bool ovl_is_private_xattr(const char *name)
{
return strncmp(name, OVL_XATTR_PREFIX,
sizeof(OVL_XATTR_PREFIX) - 1) == 0;
}
int ovl_xattr_set(struct dentry *dentry, struct inode *inode, const char *name,
const void *value, size_t size, int flags)
{
int err;
struct dentry *upperdentry = ovl_i_dentry_upper(inode);
struct dentry *realdentry = upperdentry ?: ovl_dentry_lower(dentry);
const struct cred *old_cred;
/*
 * NOCOPYUPW: copy-up on write appears to be intentionally disabled in
 * this tree; report success without writing the xattr.
 */
return 0;
err = ovl_want_write(dentry);
if (err)
goto out;
if (!value && !upperdentry) {
err = vfs_getxattr(realdentry, name, NULL, 0);
if (err < 0)
goto out_drop_write;
}
if (!upperdentry) {
err = ovl_copy_up(dentry);
if (err)
goto out_drop_write;
realdentry = ovl_dentry_upper(dentry);
}
old_cred = ovl_override_creds(dentry->d_sb);
if (value)
err = vfs_setxattr(realdentry, name, value, size, flags);
else {
WARN_ON(flags != XATTR_REPLACE);
err = vfs_removexattr(realdentry, name);
}
revert_creds(old_cred);
out_drop_write:
ovl_drop_write(dentry);
out:
return err;
}
int ovl_xattr_get(struct dentry *dentry, struct inode *inode, const char *name,
void *value, size_t size)
{
ssize_t res;
const struct cred *old_cred;
struct dentry *realdentry =
ovl_i_dentry_upper(inode) ?: ovl_dentry_lower(dentry);
old_cred = ovl_override_creds(dentry->d_sb);
res = vfs_getxattr(realdentry, name, value, size);
revert_creds(old_cred);
return res;
}
static bool ovl_can_list(const char *s)
{
/* List all non-trusted xattrs */
if (strncmp(s, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) != 0)
return true;
/* Never list trusted.overlay, list other trusted for superuser only */
return !ovl_is_private_xattr(s) && capable(CAP_SYS_ADMIN);
}
ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size)
{
struct dentry *realdentry = ovl_dentry_real(dentry);
ssize_t res;
size_t len;
char *s;
const struct cred *old_cred;
old_cred = ovl_override_creds(dentry->d_sb);
res = vfs_listxattr(realdentry, list, size);
revert_creds(old_cred);
if (res <= 0 || size == 0)
return res;
/* filter out private xattrs */
for (s = list, len = res; len;) {
size_t slen = strnlen(s, len) + 1;
/* underlying fs providing us with a broken xattr list? */
if (WARN_ON(slen > len))
return -EIO;
len -= slen;
if (!ovl_can_list(s)) {
res -= slen;
memmove(s, s + slen, len);
} else {
s += slen;
}
}
return res;
}
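/*
 * Illustrative sketch, not part of this file: a userspace walk over the
 * NUL-separated name list that ovl_listxattr() above returns. On an
 * overlay mount, "trusted.overlay.*" names never appear here because
 * ovl_can_list() filters them out before the list reaches userspace.
 */
#include <stdio.h>
#include <string.h>
#include <sys/xattr.h>

int main(int argc, char **argv)
{
	char list[4096];
	char *s;
	ssize_t len;

	if (argc != 2)
		return 1;
	len = listxattr(argv[1], list, sizeof(list));
	if (len < 0) {
		perror("listxattr");
		return 1;
	}
	for (s = list; s < list + len; s += strlen(s) + 1)
		printf("%s\n", s);
	return 0;
}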
struct posix_acl *ovl_get_acl(struct inode *inode, int type)
{
struct inode *realinode = ovl_inode_real(inode);
const struct cred *old_cred;
struct posix_acl *acl;
if (!IS_ENABLED(CONFIG_FS_POSIX_ACL) || !IS_POSIXACL(realinode))
return NULL;
old_cred = ovl_override_creds(inode->i_sb);
acl = get_acl(realinode, type);
revert_creds(old_cred);
return acl;
}
static bool ovl_open_need_copy_up(struct dentry *dentry, int flags)
{
/* Copy up of disconnected dentry does not set upper alias */
if (ovl_dentry_upper(dentry) &&
(ovl_dentry_has_upper_alias(dentry) ||
(dentry->d_flags & DCACHE_DISCONNECTED)))
return false;
if (special_file(d_inode(dentry)->i_mode))
return false;
if (!(OPEN_FMODE(flags) & FMODE_WRITE) && !(flags & O_TRUNC))
return false;
return true;
}
int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags)
{
int err = 0;
/*
 * NOCOPYUPW: copy-up on write appears to be intentionally disabled in
 * this tree; skip the copy-up that open-for-write would normally trigger.
 */
return err;
if (ovl_open_need_copy_up(dentry, file_flags)) {
err = ovl_want_write(dentry);
if (!err) {
err = ovl_copy_up_flags(dentry, file_flags);
ovl_drop_write(dentry);
}
}
return err;
}
int ovl_update_time(struct inode *inode, struct timespec64 *ts, int flags)
{
if (flags & S_ATIME) {
struct ovl_fs *ofs = inode->i_sb->s_fs_info;
struct path upperpath = {
.mnt = ofs->upper_mnt,
.dentry = ovl_upperdentry_dereference(OVL_I(inode)),
};
if (upperpath.dentry) {
touch_atime(&upperpath);
inode->i_atime = d_inode(upperpath.dentry)->i_atime;
}
}
return 0;
}
static const struct inode_operations ovl_file_inode_operations = {
.setattr = ovl_setattr,
.permission = ovl_permission,
.getattr = ovl_getattr,
.listxattr = ovl_listxattr,
.get_acl = ovl_get_acl,
.update_time = ovl_update_time,
};
static const struct inode_operations ovl_symlink_inode_operations = {
.setattr = ovl_setattr,
.get_link = ovl_get_link,
.readlink = ovl_readlink,
.getattr = ovl_getattr,
.listxattr = ovl_listxattr,
.update_time = ovl_update_time,
};
/*
* It is possible to stack an overlayfs instance on top of another
* overlayfs instance as a lower layer. We need to annotate the
* stackable i_mutex locks according to the stack level of the super
* block instance. An overlayfs instance can never be at stack
* depth 0 (there is always a real fs below it). An overlayfs
* inode lock will use the lockdep annotation ovl_i_mutex_key[depth].
*
* For example, here is a snip from /proc/lockdep_chains after
* dir_iterate of nested overlayfs:
*
* [...] &ovl_i_mutex_dir_key[depth] (stack_depth=2)
* [...] &ovl_i_mutex_dir_key[depth]#2 (stack_depth=1)
* [...] &type->i_mutex_dir_key (stack_depth=0)
*/
#define OVL_MAX_NESTING FILESYSTEM_MAX_STACK_DEPTH
static inline void ovl_lockdep_annotate_inode_mutex_key(struct inode *inode)
{
#ifdef CONFIG_LOCKDEP
static struct lock_class_key ovl_i_mutex_key[OVL_MAX_NESTING];
static struct lock_class_key ovl_i_mutex_dir_key[OVL_MAX_NESTING];
static struct lock_class_key ovl_i_lock_key[OVL_MAX_NESTING];
int depth = inode->i_sb->s_stack_depth - 1;
if (WARN_ON_ONCE(depth < 0 || depth >= OVL_MAX_NESTING))
depth = 0;
if (S_ISDIR(inode->i_mode))
lockdep_set_class(&inode->i_rwsem, &ovl_i_mutex_dir_key[depth]);
else
lockdep_set_class(&inode->i_rwsem, &ovl_i_mutex_key[depth]);
lockdep_set_class(&OVL_I(inode)->lock, &ovl_i_lock_key[depth]);
#endif
}
static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev,
unsigned long ino, int fsid)
{
int xinobits = ovl_xino_bits(inode->i_sb);
/*
* When NFS export is enabled and d_ino is consistent with st_ino
* (samefs or i_ino has enough bits to encode layer), set the same
* value used for d_ino to i_ino, because nfsd readdirplus compares
* d_ino values to i_ino values of child entries. When called from
* ovl_new_inode(), ino arg is 0, so i_ino will be updated to real
* upper inode i_ino on ovl_inode_init() or ovl_inode_update().
*/
if (inode->i_sb->s_export_op &&
(ovl_same_sb(inode->i_sb) || xinobits)) {
inode->i_ino = ino;
if (xinobits && fsid && !(ino >> (64 - xinobits)))
inode->i_ino |= (unsigned long)fsid << (64 - xinobits);
} else {
inode->i_ino = get_next_ino();
}
inode->i_mode = mode;
inode->i_flags |= S_NOCMTIME;
#ifdef CONFIG_FS_POSIX_ACL
inode->i_acl = inode->i_default_acl = ACL_DONT_CACHE;
#endif
ovl_lockdep_annotate_inode_mutex_key(inode);
switch (mode & S_IFMT) {
case S_IFREG:
inode->i_op = &ovl_file_inode_operations;
break;
case S_IFDIR:
inode->i_op = &ovl_dir_inode_operations;
inode->i_fop = &ovl_dir_operations;
break;
case S_IFLNK:
inode->i_op = &ovl_symlink_inode_operations;
break;
default:
inode->i_op = &ovl_file_inode_operations;
init_special_inode(inode, mode, rdev);
break;
}
}
/*
* With inodes index enabled, an overlay inode nlink counts the union of upper
* hardlinks and non-covered lower hardlinks. During the lifetime of a non-pure
* upper inode, the following nlink modifying operations can happen:
*
* 1. Lower hardlink copy up
* 2. Upper hardlink created, unlinked or renamed over
* 3. Lower hardlink whiteout or renamed over
*
* For the first, copy up case, the union nlink does not change, whether the
* operation succeeds or fails, but the upper inode nlink may change.
* Therefore, before copy up, we store the union nlink value relative to the
* lower inode nlink in the index inode xattr trusted.overlay.nlink.
*
* For the second, upper hardlink case, the union nlink should be incremented
* or decremented IFF the operation succeeds, aligned with nlink change of the
* upper inode. Therefore, before link/unlink/rename, we store the union nlink
* value relative to the upper inode nlink in the index inode.
*
* For the last, lower cover up case, we simplify things by preceding the
* whiteout or cover up with copy up. This makes sure that there is an index
* upper inode where the nlink xattr can be stored before the copied up upper
* entry is unlinked.
*/
#define OVL_NLINK_ADD_UPPER (1 << 0)
/*
* On-disk format for indexed nlink:
*
* nlink relative to the upper inode - "U[+-]NUM"
* nlink relative to the lower inode - "L[+-]NUM"
*/
static int ovl_set_nlink_common(struct dentry *dentry,
struct dentry *realdentry, const char *format)
{
struct inode *inode = d_inode(dentry);
struct inode *realinode = d_inode(realdentry);
char buf[13];
int len;
len = snprintf(buf, sizeof(buf), format,
(int) (inode->i_nlink - realinode->i_nlink));
if (WARN_ON(len >= sizeof(buf)))
return -EIO;
return ovl_do_setxattr(ovl_dentry_upper(dentry),
OVL_XATTR_NLINK, buf, len, 0);
}
int ovl_set_nlink_upper(struct dentry *dentry)
{
return ovl_set_nlink_common(dentry, ovl_dentry_upper(dentry), "U%+i");
}
int ovl_set_nlink_lower(struct dentry *dentry)
{
return ovl_set_nlink_common(dentry, ovl_dentry_lower(dentry), "L%+i");
}
unsigned int ovl_get_nlink(struct dentry *lowerdentry,
struct dentry *upperdentry,
unsigned int fallback)
{
int nlink_diff;
int nlink;
char buf[13];
int err;
if (!lowerdentry || !upperdentry || d_inode(lowerdentry)->i_nlink == 1)
return fallback;
err = vfs_getxattr(upperdentry, OVL_XATTR_NLINK, &buf, sizeof(buf) - 1);
if (err < 0)
goto fail;
buf[err] = '\0';
if ((buf[0] != 'L' && buf[0] != 'U') ||
(buf[1] != '+' && buf[1] != '-'))
goto fail;
err = kstrtoint(buf + 1, 10, &nlink_diff);
if (err < 0)
goto fail;
nlink = d_inode(buf[0] == 'L' ? lowerdentry : upperdentry)->i_nlink;
nlink += nlink_diff;
if (nlink <= 0)
goto fail;
return nlink;
fail:
pr_warn_ratelimited("overlayfs: failed to get index nlink (%pd2, err=%i)\n",
upperdentry, err);
return fallback;
}
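/*
 * Illustrative sketch, not part of this file: a round trip through the
 * "U[+-]NUM" / "L[+-]NUM" encoding documented above, mirroring what
 * ovl_set_nlink_common() writes and ovl_get_nlink() parses back.
 */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	char buf[13];
	long diff;

	/* encode: union nlink 3 relative to upper inode nlink 1 -> "U+2" */
	snprintf(buf, sizeof(buf), "U%+i", 3 - 1);
	printf("stored: %s\n", buf);

	/* decode: apply the stored delta to the current upper nlink */
	diff = strtol(buf + 1, NULL, 10);
	printf("union nlink: %ld\n", 1 + diff); /* prints 3 */
	return 0;
}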
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev)
{
struct inode *inode;
inode = new_inode(sb);
if (inode)
ovl_fill_inode(inode, mode, rdev, 0, 0);
return inode;
}
static int ovl_inode_test(struct inode *inode, void *data)
{
return inode->i_private == data;
}
static int ovl_inode_set(struct inode *inode, void *data)
{
inode->i_private = data;
return 0;
}
static bool ovl_verify_inode(struct inode *inode, struct dentry *lowerdentry,
struct dentry *upperdentry, bool strict)
{
/*
* For directories, @strict verify from lookup path performs consistency
* checks, so NULL lower/upper in dentry must match NULL lower/upper in
* inode. Non @strict verify from NFS handle decode path passes NULL for
* 'unknown' lower/upper.
*/
if (S_ISDIR(inode->i_mode) && strict) {
/* Real lower dir moved to upper layer under us? */
if (!lowerdentry && ovl_inode_lower(inode))
return false;
/* Lookup of an uncovered redirect origin? */
if (!upperdentry && ovl_inode_upper(inode))
return false;
}
/*
* Allow non-NULL lower inode in ovl_inode even if lowerdentry is NULL.
* This happens when finding a copied up overlay inode for a renamed
* or hardlinked overlay dentry and lower dentry cannot be followed
* by origin because lower fs does not support file handles.
*/
if (lowerdentry && ovl_inode_lower(inode) != d_inode(lowerdentry))
return false;
/*
* Allow non-NULL __upperdentry in inode even if upperdentry is NULL.
* This happens when finding a lower alias for a copied up hard link.
*/
if (upperdentry && ovl_inode_upper(inode) != d_inode(upperdentry))
return false;
return true;
}
struct inode *ovl_lookup_inode(struct super_block *sb, struct dentry *real,
bool is_upper)
{
struct inode *inode, *key = d_inode(real);
inode = ilookup5(sb, (unsigned long) key, ovl_inode_test, key);
if (!inode)
return NULL;
if (!ovl_verify_inode(inode, is_upper ? NULL : real,
is_upper ? real : NULL, false)) {
iput(inode);
return ERR_PTR(-ESTALE);
}
return inode;
}
/*
* Does overlay inode need to be hashed by lower inode?
*/
static bool ovl_hash_bylower(struct super_block *sb, struct dentry *upper,
struct dentry *lower, struct dentry *index)
{
struct ovl_fs *ofs = sb->s_fs_info;
/* No, if pure upper */
if (!lower)
return false;
/* Yes, if already indexed */
if (index)
return true;
/* Yes, if won't be copied up */
if (!ofs->upper_mnt)
return true;
/* No, if lower hardlink is or will be broken on copy up */
if ((upper || !ovl_indexdir(sb)) &&
!d_is_dir(lower) && d_inode(lower)->i_nlink > 1)
return false;
/* No, if non-indexed upper with NFS export */
if (sb->s_export_op && upper)
return false;
/* Otherwise, hash by lower inode for fsnotify */
return true;
}
static struct inode *ovl_iget5(struct super_block *sb, struct inode *newinode,
struct inode *key)
{
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 18, 0)
return newinode ? inode_insert5(newinode, (unsigned long) key,
ovl_inode_test, ovl_inode_set, key) :
#else
return
#endif
iget5_locked(sb, (unsigned long) key,
ovl_inode_test, ovl_inode_set, key);
}
struct inode *ovl_get_inode(struct super_block *sb,
struct ovl_inode_params *oip)
{
struct dentry *upperdentry = oip->upperdentry;
struct ovl_path *lowerpath = oip->lowerpath;
struct inode *realinode = upperdentry ? d_inode(upperdentry) : NULL;
struct inode *inode;
struct dentry *lowerdentry = lowerpath ? lowerpath->dentry : NULL;
bool bylower = ovl_hash_bylower(sb, upperdentry, lowerdentry,
oip->index);
int fsid = bylower ? oip->lowerpath->layer->fsid : 0;
bool is_dir;
unsigned long ino = 0;
if (!realinode)
realinode = d_inode(lowerdentry);
/*
* Copy up origin (lower) may exist for non-indexed upper, but we must
* not use lower as hash key if this is a broken hardlink.
*/
is_dir = S_ISDIR(realinode->i_mode);
if (upperdentry || bylower) {
struct inode *key = d_inode(bylower ? lowerdentry :
upperdentry);
unsigned int nlink = is_dir ? 1 : realinode->i_nlink;
inode = ovl_iget5(sb, oip->newinode, key);
if (!inode)
goto out_nomem;
if (!(inode->i_state & I_NEW)) {
/*
* Verify that the underlying files stored in the inode
* match those in the dentry.
*/
if (!ovl_verify_inode(inode, lowerdentry, upperdentry,
true)) {
iput(inode);
inode = ERR_PTR(-ESTALE);
goto out;
}
dput(upperdentry);
goto out;
}
/* Recalculate nlink for non-dir due to indexing */
if (!is_dir)
nlink = ovl_get_nlink(lowerdentry, upperdentry, nlink);
set_nlink(inode, nlink);
ino = key->i_ino;
} else {
/* Lower hardlink that will be broken on copy up */
inode = new_inode(sb);
if (!inode)
goto out_nomem;
}
ovl_fill_inode(inode, realinode->i_mode, realinode->i_rdev, ino, fsid);
ovl_inode_init(inode, upperdentry, lowerdentry);
if (upperdentry && ovl_is_impuredir(upperdentry))
ovl_set_flag(OVL_IMPURE, inode);
if (oip->index)
ovl_set_flag(OVL_INDEX, inode);
/* Check for non-merge dir that may have whiteouts */
if (is_dir) {
if (((upperdentry && lowerdentry) || oip->numlower > 1) ||
ovl_check_origin_xattr(upperdentry ?: lowerdentry)) {
ovl_set_flag(OVL_WHITEOUTS, inode);
}
}
if (inode->i_state & I_NEW)
unlock_new_inode(inode);
out:
return inode;
out_nomem:
inode = ERR_PTR(-ENOMEM);
goto out;
}

File diff suppressed because it is too large

Some files were not shown because too many files have changed in this diff