Compare commits

...

356 Commits

Author SHA1 Message Date
8f117cc0dc configure.ac: Update version number to 1.5.1-knl+hfi
Change-Id: Icbd08c9c5f65b22d007ec479a34acd20062e0e90
2019-05-14 17:22:33 +09:00
0b9a657a01 HFI: support IFS 10.8-0
Change-Id: Iebc0e2b50faf464efcc5134cc40dc52e0bd6eea7
2019-04-15 11:26:39 +09:00
c2d6651cd2 mcreboot: remove MCDRAM offline/online
Change-Id: Ia30180b4890508d041fc64ca35e1a9c58d903ddf
2019-04-15 11:26:39 +09:00
d979444049 file_ops: add missing break statement (harmless)
Change-Id: I97982c96623b571d94348fd4a3df6bb0aeb515e9
2018-07-26 05:06:16 +00:00
faa357d5a6 Merge "configure.ac: Update version number to 1.5.0-knl+hfi" into development+unimap+hfi+OFP 2018-06-21 02:39:43 +00:00
653aba17a1 mcreboot: load kernel modules from under /tmp
Change-Id: I81a8c451b6dd556a00699a2c8d0c7df5a99e4ea2
2018-06-20 20:53:00 +09:00
7736e25ca4 mpimcexec: fix empty ${COMMAND} check
Change-Id: I9e37e952fb756a4aafb4b2e218844120fe59af7b
2018-06-20 20:50:33 +09:00
73d16a9d79 configure.ac: Update version number to 1.5.0-knl+hfi
Change-Id: I9d36bcfe4b64a772f6492e39a1466a2e73ddd682
2018-06-20 17:07:30 +09:00
922bd7e6eb mpimcexec: use PJM_PROC_BY_NODE if available
Change-Id: Id8991f78e4d3bdfbb20adf202b43762a0d915c47
2018-06-20 15:18:53 +09:00
0d99072109 mpimcexec: man page proof-reading
Change-Id: I58223dd86e17fa896fe3e258d2dc2e5b881a0072
2018-06-18 16:31:42 +09:00
3ced3f6080 mcexec: Options -m and -M are described in man page
Change-Id: Ie4a860c8753af654ee842b16aabb9620e68f71a1
2018-06-18 15:00:29 +09:00
d9ff940528 mpimcexec: Man page
Change-Id: I99ea2821500cc1cfadc912d93c88d308b92ed9cf
2018-06-18 14:59:40 +09:00
cd63ec877d mpimcexec: Error handling is added
Change-Id: Id4e94adad2afff324b154d0c8be270ecc7568bab
2018-06-18 14:59:18 +09:00
6c0bb9e576 HFI1: Range-check proc->fd_priv_table[]
sockioctl01.c in LTP calls ioctl(1025, ...) and causes kernel page-fault without
the range-check.

Change-Id: I4117783e20107f274c0857b09745f12a5cc5ce2f
2018-06-13 00:31:44 +09:00
ca9894108b OFP: mpimcexec: use MPI_LOCALNRANKS for ppn if available 2018-06-13 00:31:44 +09:00
3f26e44f85 mremap: Don't premap destination vm_range
mremap works in the following steps:
(1) Unmap the destination memory area
(2) Create a new vm_range with add_process_memory_range
(3) Move the PTEs of the source range to the destination range by using move_pte_range

The problem is that step (3) expects the destination doesn't have any physical pages,
but step (2) premaps the destination when the optimization of premapping anonymous
map is turned on.

Change-Id: Ieeebd799b7169b9a6f6f658c204c31f49817030f
2018-06-13 00:31:44 +09:00
bacfb0c2b9 OFP: mpimcexec wrapper around mpirun for OFP users 2018-06-13 00:31:43 +09:00
09f63483cc OFP: temporary ANON mmap() rewrite 2018-06-13 00:31:43 +09:00
2f0c2aae9e OFP: avoid drop_caches in mcreboot 2018-06-13 00:31:43 +09:00
f7b277a623 HFI1: use ihk_mc_pt_lookup_fault_pte() in SDMA/exp receive 2018-06-13 00:31:43 +09:00
a3aa96af19 MM: introduction of ihk_mc_pt_lookup_fault_pte() 2018-06-13 00:31:43 +09:00
91d732308d HFI1: shorten lock held spin for SDMA status changes 2018-06-13 00:31:43 +09:00
166c6105ef queued_spin_lock: fix compatibility with Linux 2018-06-13 00:31:43 +09:00
5a2f8388a6 HFI1: handle Linux queued_spin_locks in the receive path as well 2018-06-13 00:31:42 +09:00
8164b63fc2 HFI1: port to IFS 10.7 rpv1 and support queued_spin_lock in Linux 3.10.0-693.11.6 2018-06-13 00:31:42 +09:00
af22ce62d2 HFI1: clean up and eliminate dead code in user SDMA 2018-06-13 00:31:42 +09:00
2eca75ead8 HFI1: clean up dead code in file ops 2018-06-13 00:31:42 +09:00
22992780cf HFI1: use kmalloc_cache_free() in clear_tid_node() for TID nodes 2018-06-13 00:31:42 +09:00
3043591e9a hfi1_user_exp_rcv_overlapping(): fix return value when overlapping 2018-06-13 00:31:42 +09:00
7e7c0f9ed3 init_process_vm(): remove vm_range_numa_policy_list (merge fix) 2018-06-13 00:31:42 +09:00
7193f165cc HFI1: fix page border iteration bug in hfi1_user_exp_rcv_setup() 2018-06-13 00:31:42 +09:00
c8c42576fd HFI1: increase lock timeout in sdma_send_txlist() 2018-06-13 00:31:42 +09:00
0412e1fcc6 HFI1: add generated user_sdma_request and user_sdma_txreq headers 2018-06-13 00:31:41 +09:00
238e346586 HFI1: use DWARF generated headers for user_sdma_request and user_sdma_txreq 2018-06-13 00:31:41 +09:00
0e57c715ad HFI1: look at DW_AT_upper_bound for resolving array size from DWARF info 2018-06-13 00:31:41 +09:00
3facd3dcca HFI1: release lock in sdma_send_txlist() when SDMA ring is full 2018-06-13 00:31:41 +09:00
ec5328de69 HFI1: refactor sdma_select_user_engine() 2018-06-13 00:31:41 +09:00
880dd6ddb2 page_fault_handler(): enable on-demand mapping of Linux ioremap area 2018-06-13 00:31:41 +09:00
898708b8b4 spinlock: rewrite spinlock to use Linux ticket head/tail format 2018-06-13 00:31:41 +09:00
b08331b21a ihk_hfi1_common.h: use IRQ restore unlock in spin_unlock 2018-06-13 00:31:41 +09:00
c196c996dd HFI: add dd to generated sdma_engine 2018-06-13 00:31:41 +09:00
20e179f6dc sdma_select_user_engine(): refactor selection code 2018-06-13 00:31:40 +09:00
32fbc015f5 HFI1: eliminate lots of dead code 2018-06-13 00:31:40 +09:00
558c250bb3 HFI1: generate headers for sdma_state and sdma_engine structures 2018-06-13 00:31:40 +09:00
96ea2d3658 dwarf-extract: support enumerations 2018-06-13 00:31:40 +09:00
9c91298ccf do_munmap(): hook to HFI1 deferred unmap 2018-06-13 00:31:40 +09:00
b08da83a51 hfi1_file_ioctl(): execute HFI1_IOCTL_TID_INVAL_READ locally 2018-06-13 00:31:40 +09:00
fcc8310454 HFI1: track receive TIDs in a tree 2018-06-13 00:31:40 +09:00
96b8b30516 MM: facility for deferred munmap()
Conflicts:
	kernel/process.c
2018-06-13 00:31:40 +09:00
521e0dc707 HFI1: add a bunch of fields to hfi1_devdata and hfi1_filedata for receive TID handling, do necessary mappings in hfi1_map_device_addresses() 2018-06-13 00:31:40 +09:00
e2e773d883 HFI: fix tidinfo and length calculation in program_rcvarray() 2018-06-13 00:31:39 +09:00
04d22d90a3 do_mmap(): debug message cosmetics 2018-06-13 00:31:39 +09:00
f6405081a6 page_fault_handler(): map Linux ioremap addresses on demand (disabled) 2018-06-13 00:31:39 +09:00
5bea237581 HFI1: make kmalloc caches per-CPU and pre-allocate at boot time 2018-06-13 00:31:39 +09:00
33ad55e72b kmalloc_cache_prealloc(): specify nr_elems as argument 2018-06-13 00:31:39 +09:00
6848c2ecf7 HFI1: move tid_rb_node to header 2018-06-13 00:31:39 +09:00
79f9a2d31a HFI1: don't print at open() time 2018-06-13 00:31:39 +09:00
2900ce20f7 HFI1: hfi1_unmap_device_addresses() at process terminate time 2018-06-13 00:31:39 +09:00
002b78372d open(): ignore /proc/sys/vm/overcommit_memory 2018-06-13 00:31:38 +09:00
5fce5e4e3c hfi1 generated headers: add missing filedata file 2018-06-13 00:31:38 +09:00
7a1ad31183 HFI: call hfi1_map_device_addresses() at initialization time
Conflicts:
	kernel/syscall.c
2018-06-13 00:31:38 +09:00
54bdb3419d hfi1 generated headers:
- split headers into one file per struct
 - add filedata
 - fix s/modprobe/modinfo/ for guessed .ko path
2018-06-13 00:31:38 +09:00
03fed4d1c8 automatically generate hfi structs from dwarf info 2018-06-13 00:31:38 +09:00
6279f69f5c compiler.h: take in recent linux updates for newer gcc support
Had to remove from original compiler-gcc:
 - things that deal with types, e.g. READ_ONCE macro and friends;
 - #define barrier(). This one would be better there at some point.

hfi1: remove ACCESS_ONCE from hfi1 header
2018-06-13 00:31:38 +09:00
6959d5ead4 HFI: port to SFI driver version 10.5.1.0.2 2018-06-13 00:31:38 +09:00
a5aa68744f hfi1: use kmalloc_cache for tid_rb_node allocations 2018-06-13 00:31:38 +09:00
89c5aaa9e9 hfi1_user_exp_rcv_setup(): rewrite main loop 2018-06-13 00:31:37 +09:00
15422d886f hfi1_file_ioctl(): use dkprintf() 2018-06-13 00:31:37 +09:00
f139bef0cb mmap(): remove force large page extension (meant to be RESET) 2018-06-13 00:31:37 +09:00
de82cf8779 hfi1/user_exp_rcv/setup: keep track of position within page
ihk_mc_pt_lookup_pte + pte_get_phys will get us the physical address
for the start of the page we're looking at.
Re-offset it by position within buffer.
2018-06-13 00:31:37 +09:00
662895c020 hfi1/user_exp_rcv: explicitly call hfi1_map_device_addresses
There were cases where nobody else did this mapping for us
2018-06-13 00:31:37 +09:00
d23939da8c process/vm: fix lookup_process_memory_range (again)
That optimistically going left was a more serious bug than just
last iteration, we could just pass by a match and continue down
the tree if the match was not a leaf.

Fix the actual algorithm issue

Conflicts:
	kernel/process.c
2018-06-13 00:31:37 +09:00
67529f21ff hfi1: replace true/false defines by stddef include 2018-06-13 00:31:37 +09:00
5c11ff0950 process/vm: fix lookup_process_memory_range with small start address
Cherry-picked from 6370520e

Conflicts:
	kernel/process.c
2018-06-13 00:31:37 +09:00
ce4eb0d409 hfi1/user_exp_rcv/setup: add access_ok check 2018-06-13 00:31:36 +09:00
04434320fc hfi1/user_exp_rcv/setup: do not skip over pages
If the vaddr we consider is not at the start of a page, we could skip
over (smaller, not contiguous) areas.

For example consider this segment of virtual memory:
[ 2MB | 4k | 4k | ... ]
Starting at 1MB offset, we would get a pgsize of 2MB so would skip
straight over 1MB worth of 4k pages.
2018-06-13 00:31:36 +09:00
50fafa6d71 hfi1/user_exp_rcv/setup: use cache_alloc for tidlist 2018-06-13 00:31:36 +09:00
f5ced648ef hfi1/user_exp_rcv: rework main loop
New loop now takes into account pages not physically contiguous.
Also some minor improvements, e.g. make the spin_lock used more locally,
reuse a group we had if we had one, etc.
2018-06-13 00:31:36 +09:00
0f8f88ca46 hfi1/user_exp_rcv/invalid: Remove function
user_exp_rcv_invalid is only used together with the mmu cache
(its purpose is the delayed freeing of tids that were invalidated in cache)

Since we do not use that cache, the function can go
2018-06-13 00:31:36 +09:00
e99f19e812 hfi1/user_exp_rcv/setup: set length in tidinfo
This was dropped early on by mistake/excessive haste, it's actually
pretty useful.
2018-06-13 00:31:36 +09:00
9a36e5d213 hfi1/user_exp_rcv/setup: increment phys appropriately
Old code was always registering the same section with different size,
instead of properly covering the requested map
2018-06-13 00:31:36 +09:00
4816f27639 hfi1/user_exp_rcv/setup: split into multiple tids
Do not round up to next power of two, but issue multiple requests
if necessary (e.g. 260k would be 256 + 4k in two registrations)
2018-06-13 00:31:36 +09:00
9c0b8aa812 mcctrl/control.c: fix debug print types 2018-06-13 00:31:36 +09:00
23f178d718 hfi1/user_exp_rcv/clear: implement TID_FREE ioctl 2018-06-13 00:31:36 +09:00
159c18b98b hfi1/ioctl: only forward ioctl if hfi1_file_ioctl didn't handle it
Conflicts:
	kernel/syscall.c
2018-06-13 00:31:35 +09:00
1847a3ac11 hfi1/user_exp_rcv/setup: cleanup locks/groups usage 2018-06-13 00:31:35 +09:00
15b16ffbbb hfi1/user_exp_rcv/setup: map is noop, skip it
In the original driver's dma.c hfi1_dma_map_single just passes
the physical address back, so directly use that.
2018-06-13 00:31:35 +09:00
e64d89cd48 hfi: bases for user_exp_rcv
This implements a skeleton setup function and call it on ioctl

Many missing points:
 - missing pci mapping to make setup work
 - no clear (passed to linux, so will likely bug out)
 - missing locks/safe-guards

Conflicts:
	kernel/Makefile.build.in
2018-06-13 00:31:35 +09:00
7366da4390 Fix other warnings
Most were harmless, but the change to ACCESS_ONCE from volatile
cast is probably useful.
Expanding macro, we basically went from:
    m = (volatile struct sdma_vl_map *)dd->sdma_map;
to
    m = *(volatile struct sdma_vl_map **)&(dd->sdma_map);
i.e. the explicit lookup is at a different level.
2018-06-13 00:31:35 +09:00
2dc85ee417 user_sdma: fix use of uninitialized variable (vl)
This defines a single field in hfi1_pportdata, getting offset
from dwarf headers -- need to compute that at configure time
2018-06-13 00:31:35 +09:00
73cc07f98e ioctl() investigation - TO RESET 2018-06-13 00:31:35 +09:00
815e2244ca HFI1: minor change of declarations 2018-06-13 00:31:34 +09:00
163af73554 HFI1: properly iterate iovecs according to underlying page sizes 2018-06-13 00:31:34 +09:00
fd316f3ca3 HFI1: pass per-CPU txreq_cache to user_sdma_send_pkts() 2018-06-13 00:31:34 +09:00
122588bc4d mcexec: --enable-hfi1 to runtime enable/disable HFI1 driver
Conflicts:
	executer/user/mcexec.c
2018-06-13 00:31:34 +09:00
70238982c2 HFI1: use embedded kmalloc cache for req->tids (fixes AllReduce hang) 2018-06-13 00:31:34 +09:00
5b5191ef64 HFI1: move txreq kmalloc cache header into CPU local variable 2018-06-13 00:31:34 +09:00
a65faeaed4 kmalloc cache: embed cache pointer into kmalloc_header
Conflicts:
	kernel/mem.c
2018-06-13 00:31:34 +09:00
4dea1842e0 kmalloc cache: embed cache pointer into kmalloc_header
Conflicts:
	kernel/mem.c
2018-06-13 00:31:34 +09:00
5353b11f90 HFI1: disable kmalloc cache for req->tids (AllReduce fails otherwise) 2018-06-13 00:31:34 +09:00
abdbf96254 HFI1: use process rank for SDMA engine selection 2018-06-13 00:31:33 +09:00
bd170e63ba kmalloc cache refactor and pre-alloc in HFI1 open() 2018-06-13 00:31:33 +09:00
d35fa16417 HFI1: more detailed profiling (disabled by default) 2018-06-13 00:31:33 +09:00
6406a0df6b HFI1: compute SDMA pkt length taking large pages into account 2018-06-13 00:31:33 +09:00
52e8f03b4b HFI1: store base physical address in iovec if physically contiguous 2018-06-13 00:31:33 +09:00
b071a3f32c HFI1: use fast_memcpy() in header fillings
Conflicts:
	kernel/user_sdma.c
2018-06-13 00:31:33 +09:00
90258f00bd HFI1: use generic kmalloc cache for user_sdma_txreqs and req tids 2018-06-13 00:31:33 +09:00
28eb649056 Generic lock-free kmalloc cache implementation
Conflicts:
	kernel/mem.c
2018-06-13 00:31:33 +09:00
744ebacf65 HFI1: more pre-allocation in txreq cache 2018-06-13 00:31:33 +09:00
62e438a0aa HFI1: do device ioremap() mappings in per-process fashion 2018-06-13 00:31:32 +09:00
5ac582a678 user_sdma_send_pkts(): unlikely() around slow path condition 2018-06-13 00:31:32 +09:00
51bc28acca sdma_select_user_engine(): hash on CPU number 2018-06-13 00:31:32 +09:00
c43654d69b user_sdma_send_pkts(): handle page sizes correctly 2018-06-13 00:31:32 +09:00
c1d2db6a73 fixed sdma_vl_map, just in case it will be used in the future 2018-06-13 00:31:32 +09:00
aeef55d1b0 kmalloc(): try to get from remote_free list when regular is empty 2018-06-13 00:31:32 +09:00
6e289e8d9f HFI1: txreq cache and profiling 2018-06-13 00:31:32 +09:00
3b5363c533 HFI1: use original length calculation in sdma_send_pkts()
Conflicts:
	kernel/include/hfi1/sdma.h
2018-06-13 00:31:32 +09:00
60f6862db2 HFI1: use local write if private data is present; fix length alignment 2018-06-13 00:31:31 +09:00
39deff4e10 HFI1: working but a bit slow 2018-06-13 00:31:31 +09:00
7f03c18d4d Real run test version (update_tail, kregbase+offset crash) 2018-06-13 00:31:31 +09:00
640dba627f Added debugging output. Bugfixes in user_sdma_send_pkts() and sdma_send_txreq(). 2018-06-13 00:31:31 +09:00
ae368d97d4 Implemented a replacement for sdma_txadd_page()
Conflicts:
	kernel/user_sdma.c
2018-06-13 00:31:31 +09:00
99c216d91e HFI1: fix kregbase/piobase types to avoid warnings 2018-06-13 00:31:31 +09:00
3c357dc30a HFI1: fix completion mapping 2018-06-13 00:31:31 +09:00
37866e61ab HFI1: map completion queues 2018-06-13 00:31:31 +09:00
076e6b9b12 Enabled _sdma_txadd_daddr() 2018-06-13 00:31:30 +09:00
fa6db686b4 Corrected spin_lock_irqsave() spin_unlock_irqrestore() definitions
Conflicts:
	kernel/include/hfi1/ihk_hfi1_common.h
2018-06-13 00:31:30 +09:00
74a636a612 Updated structs to use completion{} and wait_queue_head_t{} and added struct size checks in hfi1_aio_write() 2018-06-13 00:31:30 +09:00
1c4a6568e6 Updated sdma.h (fixed struct sdma_engine size) 2018-06-13 00:31:30 +09:00
7d2e2f93b0 HFI1: map piobase and rcvarray_wc 2018-06-13 00:31:30 +09:00
7005110697 Updated and confirmed struct iowait{} and struct hfi1_user_sdma_pkt_q {}
Conflicts:
	kernel/include/hfi1/ihk_hfi1_common.h
2018-06-13 00:31:30 +09:00
c4ca4ae3ab Updated struct hfi1_devdata and confirmed its size 2018-06-13 00:31:30 +09:00
b024a486b9 Updated hfi1_filedata {} and confirmed its size against the original on Linux
Conflicts:
	kernel/include/hfi1/hfi.h
2018-06-13 00:31:30 +09:00
fe4c461f2f Updated kcalloc/kmalloc calls and enabled sdma_select_user_engine dependencies
Conflicts:
	kernel/include/hfi1/ihk_hfi1_common.h
2018-06-13 00:31:29 +09:00
b60a980088 hfi1_user_sdma_process_request(): map HFI1 kregbase 2018-06-13 00:31:29 +09:00
ec66229063 HFI1: adjust sdma_select_user_engine()
Conflicts:
	kernel/user_sdma.c
2018-06-13 00:31:29 +09:00
b875b5186f spinlock: make increment compatible with XPPSL Linux (v3.10) 2018-06-13 00:31:29 +09:00
5cf884ef41 Updated TODO tags and struct hfi1_user_sdma_pkt_q 2018-06-13 00:31:29 +09:00
64e2639adc * The relevant files have been modified in order to compile with McKernel.
Conflicts:
	kernel/Makefile.build.in
2018-06-13 00:31:29 +09:00
14b360e867 * Added the original files of the driver as a basis for comparison
Conflicts:
	kernel/include/hfi1/sdma.h
	kernel/sdma.c
	kernel/user_sdma.c
2018-06-13 00:31:29 +09:00
4a0e389953 HFI1: comments to keep in mind
Conflicts:
	kernel/include/hfi1/sdma.h
	kernel/sdma.c
	kernel/user_sdma.c
2018-06-13 00:31:28 +09:00
34363c2b68 close(): clear fd_priv_table 2018-06-13 00:31:28 +09:00
8a1d756cb1 Added private_data structure in process
Conflicts:
	executer/user/mcexec.c
	kernel/include/process.h
	kernel/process.c
2018-06-13 00:31:28 +09:00
e36abe57e7 open(): check on private_data for /dev/hfi 2018-06-13 00:31:28 +09:00
b2c8cc50dc open(): record private_data
Conflicts:
	kernel/syscall.c
2018-06-13 00:31:28 +09:00
b9b4a4fe36 search_free_space(): manage region->map_end internally
Cherry-pick of 87f72548a232a1626f2ca103da7f1ce62d139359

Conflicts:
	kernel/syscall.c
2018-06-13 00:31:28 +09:00
4b652c9353 atobytes(): restore postfix before return 2018-06-13 00:31:28 +09:00
60ac94cbb9 process/vm/access_ok: fix edge checks.
Add check for start/end being larger than the range we're checking.
Fix corner case where the access_check() was done on last vm range, and
we would be looking beyond last element (null deref)
2018-06-13 00:31:28 +09:00
42bbf5f2a4 process/vm: implement access_ok() 2018-06-13 00:31:27 +09:00
e29a40331d partitioned execution: pass process rank to LWK
Cherry-pick of d2d134d5e6a4b16a34d55d31b14614a2a91ecf47

Conflicts:
	kernel/include/process.h
2018-06-13 00:31:27 +09:00
655de2cd82 ihk_mc_get_linux_kernel_pgt(): add declaration
Cherry-pick of caff967a442907dd75f8cd878b9f2ea7608c77b2
2018-06-13 00:31:27 +09:00
205747594b Exclude areas not assigned to Mckernel from direct map of all phys. memory
It's enabled by adding -s to mcreboot.sh.

Cherry-pick of the following commit:

commit b5c13ce51a5a4926c2cf11c817cd0d369ac4402d
Author: Katsuya Horigome <katsuya.horigome.rj@ps.hitachi-solutions.com>
Date:   Mon Nov 20 09:40:41 2017 +0900

    Include measures to prevent memory destruction on Linux side (This is rebase commit for merging to development+hfi)
2018-06-13 00:31:27 +09:00
21f9a1ea33 eclair: fix MAP_KERNEL_START and apply Fujitsu's proposals
(1) Cherry-pick of 644afd8b45fc253ad7b90849e99aae354bac5b17
(2) Pass length to functions with arguments of variable length
    * POSTK_DEBUG_ARCH_DEP_38
(3) Separate architecture dependent functions/structures
    * POSTK_DEBUG_ARCH_DEP_34
(4) Fix include path
    * POSTK_DEBUG_ARCH_DEP_76
(5) Include config.h
    * POSTK_DEBUG_ARCH_DEP_33
2018-06-13 00:31:27 +09:00
aed099fbcb kmalloc_header: use signed integer for target CPU id
Cherry-pick of bdb2d4d8fa94f9c0268cdfdb21af1a2a5c2bcae5
2018-06-13 00:31:27 +09:00
48515970a0 ihk_mc_get_processor_id(): return -1 for non-McKernel CPUs
Cherry-pick of c45641e97add9fde467844d9272f2626cf4317de
2018-06-13 00:31:27 +09:00
b888f31b30 Map LWK TEXT to the end of Linux modules section (0xFFFFFFFFFE800000)
Cherry-pick of b9827b25883a9622058cb78006e705f09eaf9a84
2018-06-13 00:31:27 +09:00
7982008b5b virt_to_phys(): fix debug messages
Cherry-pick of 46eb3b73dac75b28ead62476f017ad0f29ec4b0a
2018-06-13 00:31:26 +09:00
f658173269 init_normal_area(): fix mapping start physical address
Cherry-pick of 2d3006818473af50c38a3d0e33595b4e74588004
2018-06-13 00:31:26 +09:00
ca7edf1df8 mem: make McKernel kernel heap virtual addresses Linux compatible
Cherry-pick of e5334c646d2dc6fb11d419918d8139a0de583fde
2018-06-13 00:31:26 +09:00
9a5f3ad4e6 mem: map Linux kernel virtual addresses properly
Cherry-pick of 5f37e846c3d70e5d5c0baea5b8eb8ceee3411c88
2018-06-13 00:31:26 +09:00
cfbab0ee82 move McKernel out of Linux kernel virtual
Cherry-pick of 88a8277f17da62d349b4340b66d37482344db649
2018-06-13 00:31:26 +09:00
86ae1380e4 configure.ac: Move man directory to share/man
Change-Id: Idaa5c0f61fbbe3bda4697bc59487f562e09ff2d6
2018-06-11 13:13:13 +09:00
9bb48186e6 add testcases for #732 #1065 #1102 2018-06-07 10:11:23 +09:00
139123dc12 move test programs 2018-06-07 10:08:48 +09:00
6602cf442c add test cases 2018-06-07 10:04:33 +09:00
f148863586 pager_req_map(): do not take mmap_sem if not needed 2018-06-07 07:17:41 +09:00
ec375da27a pager_req_create(): prefetch libiomp, libpthread and libc 2018-06-07 07:17:31 +09:00
c50e7c1029 prepare_process_ranges_args_envs(): fix saving cmdline 2018-06-07 07:17:21 +09:00
5f4dbb2c71 mprotect: Fix early exit condition on page table attribute 2018-06-06 01:39:44 +09:00
328609269b Clean up "Detect hang of McKernel in mcexec"
* Clean up error checks
2018-06-01 14:51:07 +09:00
056fdb2633 Fix "Detect hang of McKernel in mcexec"
1. Call exit() when detecting hang
2. Clean up error checks
2018-06-01 14:21:19 +09:00
09d0a59e22 Detect hang of McKernel in mcexec
mcexec spawns a thread which detects hang of McKernel by using
ihk_os_get_eventfd().

Change-Id: I6cf0ee0c1f0c2c31a8422224b2105f64a9b9ab93
2018-06-01 10:44:34 +09:00
511555c8cb fix: /proc/<PID>/maps outputs an unnecessary NULL character 2018-05-30 16:38:28 +09:00
81699345cc mprotect: do not set page table writable for cow pages
Change-Id: If8b0bb56e7dae59aa9dc3d745a4cc4e43bf4bf9a
2018-05-30 13:29:55 +09:00
130751ff66 fileobj: avoid memory leak in path recording 2018-05-14 17:46:52 +09:00
f3d18eb9de fileobj/devobj: record path name (originally by Takagi-san) 2018-05-14 17:46:52 +09:00
249bda4aef fileobj: use MCS locks for per-file page hash 2018-05-14 17:46:52 +09:00
aaa246f86f mcexec: change debug printf macros to be more tolerant to trivial format
Enabling DEBUG fails to compile. It'd be easy to fix the dprintf to dprint
but this is just as generic and we can now use dprintf everywhere
2018-05-11 09:23:46 +09:00
c52f7a5b49 syscall wait4: add _WALL (POSTK_DEBUG_ARCH_DEP_44)
Needed by strace -f
2018-05-11 09:22:54 +09:00
90a34f54c9 mcreboot.sh,mcstop+release.sh: Disable irqbalance_mck forcefully 2018-04-26 15:06:53 +09:00
bfb5080b71 pager_req_unmap: Put per-process data at exit 2018-04-10 11:35:03 +09:00
641dfed37e configure.ac: Update version number 2018-04-06 09:14:27 +09:00
4572e6be3f fix mcctrl SMAP - everyone needs copy_to_user 2018-04-03 10:38:44 +09:00
12e44050c9 mcexec: drop READ_IMPLIES_EXEC from personality to avoid device file mapping failure 2018-04-02 20:12:54 +09:00
d5190990f5 mcreboot.sh,mcstop+release.sh: rm -rf /tmp/mcreboot when it's done 2018-03-27 23:25:44 +09:00
82822b1f16 mcreboot.sh: Fix error cases
(1) Restart irqbalance when error occurs after it's stopped
(2) Restore /proc/irq/*/smp_affinity when error occurs after
    they're modified
2018-03-27 22:20:25 +09:00
7f02889f76 mcreboot.sh,mcstop+release.sh: Save /proc/irq/*/smp_affinity to /tmp/mcreboot 2018-03-27 22:01:55 +09:00
9dc86869d8 test: Modify mng_mod/{863,870}/README 2018-03-27 19:36:07 +09:00
02bb127007 test: Modify mng_mod/*/README 2018-03-27 14:53:29 +09:00
c26c4aba4f test: Modify mng_mod/{863,870} 2018-03-13 10:24:52 +09:00
e8d8ad60c2 Modify README files of test/mng_mod/{863,870,882} 2018-03-13 05:04:06 +09:00
a7f645f7df terminate(): fix update_lock and threads_lock order to avoid deadlock 2018-03-25 08:29:53 +09:00
73731d2a0d ihk_mc_map/unmap_virtual(): do proper TLB invalidation 2018-03-24 07:58:08 +09:00
0f049c5ed7 Modify README of #863 and #870 2018-03-12 17:13:16 +09:00
8d5f95de04 schedule: Add comment on #1029
refs #1029
2018-03-12 17:11:20 +09:00
88fca2c0df issue/{863, 870}/README: update test items 2018-03-23 16:08:17 +09:00
81d18e35dd rename files 2018-03-23 15:35:24 +09:00
309da8fc53 issue/863: add 8 testcases 2018-03-23 14:48:18 +09:00
535e3f3af6 issue/863/CT300x: add timestamp and check 2018-03-23 13:28:19 +09:00
4c80dca479 issue/863/README: add how to execute stress_test 2018-03-23 12:26:13 +09:00
7bef1f5117 Remove debug-print from do_syscall() 2018-03-12 02:07:12 +09:00
bb8c8355c2 small fix: testcases for #1032, #1033, #1034 2018-03-19 16:28:18 +09:00
fab0641813 prepare_process_ranges_args_envs(): fix generating saved_cmdline to avoid PF in strlen() 2018-03-19 13:56:04 +09:00
ce3af4734a fix: dual hold_thread() in do_kill() 2018-03-19 11:12:50 +09:00
e2dea4e9f8 mcexec_start_image(): handle IKC send timeout 2018-03-17 21:33:17 +09:00
0d9c1df75a update: testcases and result for #1032, #1033, #1034 2018-03-16 11:14:29 +09:00
6a979cf4b8 add: testcases for #1032, #1033, #1034 2018-03-15 14:31:29 +09:00
c107d1fdf9 fix: Bug for measuring rss in fork()
refs: #1032
2018-03-15 14:29:16 +09:00
bc89a51e00 fix: getrusage's u|stime race-condition caused by release_thread() and getrusage() 2018-03-15 14:26:39 +09:00
9da9e755fa Issue#923: add test cases 2018-03-15 10:13:16 +09:00
fe42481d6f Add allow_oversubscribe kernel argument
It's not allowed in the default setting.
Execute mcreboot.sh with -O option to allow it.

refs #1072
2018-03-10 13:08:38 +09:00
b1ea6eb82a procfs: Show Linux /proc/self/cgroup
Support the case where McKernel process retrieves its job-id when running under
the Fujitsu TCS suite.
2018-03-10 11:58:45 +09:00
8c2e20c3aa uti: Fix uti thread on the McKernel side blocks others in do_syscall()
It could block other threads on the same CPU in do_syscall() since it busy-waits after woken up
because it's not allowed to sleep again.
2018-03-09 18:02:45 +09:00
65667709a8 Fix thread status race-condition caused by hold_thread() in do_kill() and terminate()
Conflicts:
	arch/x86_64/kernel/syscall.c
	kernel/syscall.c
2018-03-09 17:53:17 +09:00
51bc5fd61f uti: Fix wrong argument passed to ihk_ikc_release_packet() in mcexec_terminate_thread()
Conflicts:
	executer/kernel/mcctrl/control.c
2018-03-09 17:44:30 +09:00
3b277b2354 uti: Fix dead-lock of calling terminate() from terminate()
Conflicts:
	arch/x86_64/kernel/syscall.c
	kernel/syscall.c
2018-03-09 17:38:55 +09:00
3e4c9bdd90 Fix lock of struct wait_queue_head_list_node 2018-03-09 17:31:10 +09:00
06b1b4f8ab Fix deadlock on thread->times_update in getrusage()
Set thread->in_kernel properly on exiting interrupt handler when entering
it from kernel mode.

Conflicts:
	arch/x86_64/kernel/cpu.c
	kernel/mem.c
2018-03-09 17:26:31 +09:00
7b4de6e6c2 mcstat: Clean-up Makefile.in 2018-03-09 14:36:01 +09:00
1c266f4849 mcstat: Fix build error 2018-03-09 14:31:07 +09:00
b7a7281195 fix: Bug for getrusage often return incorrect ru_stime
refs #1034
2018-03-07 13:11:37 +09:00
b77732fb4f fix: Bug for getrusage(RUSAGE_CHILDREN) return parent info (POSTK_DEBUG_TEMP_FIX_72)
refs #1033
2018-03-07 13:10:45 +09:00
a224bf648a fix: Bug for getrusage return incorrect ru_maxrss
refs #1032
2018-03-07 13:09:24 +09:00
642520f80c rus_vm_fault: If page fault occurs in a thread that has not processed system call offloading, incorrectly return to normal.
refs #923
2018-03-07 10:22:47 +09:00
5cb75b00c7 mcexec_destroy_per_process_data: System calls delegation can not be terminated in error when the last process that closed /dev/mcos0 is a child process.
refs #882
2018-03-07 09:11:37 +09:00
7dd0d1137f revert for fix git message
This reverts commit 840acd6021.
2018-03-07 09:09:28 +09:00
cb2fe29f06 fix build error 2018-03-05 10:57:10 +09:00
3432f46d8b fix & add: testcases for refs #885, refs #1031 2018-03-01 15:41:58 +09:00
afcf1a24aa add: testcases for refs #885, refs #1031 2018-03-01 10:24:21 +09:00
140f813d77 fix: differences in behavior of sigaction between Linux and Mckernel 2018-03-01 09:44:44 +09:00
7ad6f9595c fix: bug for ptrace_attach self pid 2018-03-01 09:37:12 +09:00
1796c20b88 A bug for not installing mcstat is fixed. 2018-02-25 11:46:16 +09:00
0da5b76916 Merge branch 'development' of postpeta.pccluster.org:mckernel into development 2018-02-25 11:03:13 +09:00
4ac1efae6c - mcstat is a tool to report McKernel statistics from Linux side.
This is a response to a CEA's request.
	- The tools directory is created under the mckernel directory.
	- Some include files are now installed in the install directory,
	  but we should rethink of it.
2018-02-25 10:57:28 +09:00
523a066245 sigaction: support for SA_RESETHAND on x86_64
refs #1031
2018-02-22 11:55:32 +09:00
98df469d29 Issue#882: add test cases 2018-02-22 11:42:43 +09:00
f46287a711 ptrace: support for attaching child_process to parent
refs #885
2018-02-22 09:47:59 +09:00
c260b5c6f3 xpmem: support for fork()
refs #925
2018-02-22 09:37:48 +09:00
c9157f273f do_fork: If mcexec succeeds for fork and McKernel fails fork, the child process of mcexec will remain. 2018-02-14 16:37:38 +09:00
840acd6021 mcexec_destroy_per_process_data: System calls delegation can not be terminated in error when the last process that closed /dev/mcos0 is a child process.
refs #822
2018-02-14 16:34:08 +09:00
c949a894c6 Remove unnecessary files committed by mistake. 2018-02-06 10:43:21 +09:00
228f8f8533 Wait for LWK to run at shutdown.
refs #898
refs #928
2018-02-06 10:40:12 +09:00
8ee9eca74e issue 863: add test cases and test evidences 2018-02-05 16:07:00 +09:00
748429fc92 do_generic_syscall: Even if the system call is normal, if errno is not zero, it returns an error. (TEMP_FIX_75) 2018-02-03 21:37:12 +09:00
a9dfcd9a89 translate_rva_to_rpa(): use 2MB blocks in 1GB pages on x86 2018-01-31 11:16:44 +09:00
559fc9746c signal: check_signal must be called after check_need_resched. 2018-01-28 13:38:51 +09:00
54169bc3ea procfs: indicate heap in /proc/maps 2018-01-26 16:22:43 +09:00
142e923222 procfs: indicate VDSO, vsyscall and stack in /proc/maps 2018-01-26 16:02:32 +09:00
86efc86945 save_syscall_return_value(): separate from check_signal() and call from syscall() (for ARM64) 2018-01-26 14:43:18 +09:00
ebaafa95d8 settid(): clear syscall offload request before populating 2018-01-26 13:54:34 +09:00
b8ee144e67 do_fork(): return -ENOMEM when no more TIDs available 2018-01-26 13:53:05 +09:00
722ae0e7d5 ARM64 arch_clone_thread(): eliminate extra save_fp_regs() 2018-01-26 13:51:38 +09:00
f56e087208 init_process_stack(): fix stack alignment (align to 64 bytes) 2018-01-26 13:43:23 +09:00
f55f01cc11 signal: If the thread receiving the signal is not current, the signal is not processed. 2018-01-25 22:27:34 +09:00
1fa398cfab do_kill: fix to initialization leakage 2018-01-24 23:11:18 +09:00
8123cc413e Use version string in configure.ac when git repo is not found 2018-01-24 00:52:18 +09:00
d4459cf9f3 Add check to confirm IHK and McKernel with the same version are used 2018-01-24 00:20:57 +09:00
4bb65494e9 signal: When the process receives a termination signal, it first terminates mcexec.
refs #863
refs #870
2018-01-23 14:40:38 +09:00
2f2b3cdc6f signal: interrupt_syscall is called by the core executing the thread that received the signal.
refs #999
2018-01-23 14:31:04 +09:00
1e9f9d9809 update Test for Issue#1029 2018-01-14 14:58:19 +09:00
1b25379c02 small fix: reset switch_ctx flag in schedule() for redo 2018-01-14 14:50:31 +09:00
38bbb4e390 add Test programs for Issue#1029 2018-01-10 11:22:05 +09:00
0fa88f513f fix broken files 2017-12-27 15:28:13 +09:00
cd54c5983a fix openat 2017-12-27 14:59:13 +09:00
6084faeecd make McKernel's execve behave same as Linux when argv or envp is set to NULL (fix for TEMP_FIX_21) 2017-12-26 17:43:17 +09:00
d209c00a30 part of Issue#994
mcexec: open syscall moves to arch_dep
do_fork: don't use __NR_fork. use __NR_clone
vfork: moves to arch_dep
2017-12-26 10:30:33 +09:00
9a5d5feb9c time(): Split into architecture dependent functions
This fixes the bug reported as POSTK_ARCH_DEP_13 and POSTK_DEBUG_ARCH_DEP_13.
2017-12-23 11:36:52 +09:00
0cda763f95 fix /proc/*/pagemap
refs #387
2017-12-25 16:08:51 +09:00
cc7be46b7d make sure to context-switch to idle thread when thread's status is PS_EXITED
refs #1029
2017-12-25 13:32:42 +09:00
589504dc33 mcreboot: -h to indicate halting CPU in idle threads (e.g., in futex_wait()) 2017-12-18 11:22:15 +09:00
bf2f38051b mcreboot-smp: offline/online MCDRAM in one go 2017-12-06 14:41:25 +09:00
2d2d0af6fb add test for Issue#873, 1011 2017-11-29 12:23:20 +09:00
7f47dc78a1 add Issue#727 test cases 2017-11-29 11:32:40 +09:00
c3c9187ed5 add test for portability (kahansei_kojo in dev_V) 2017-11-28 17:55:23 +09:00
aebacb243e User Space:swapout (this is a rebase commit to merge into development) 2017-11-28 09:16:00 +09:00
5a8d1f09e8 add test/dump/README 2017-11-27 19:39:16 +09:00
0e10b6d1ee test/strace: Fix permission 2017-11-22 06:31:32 +09:00
d649d6fc2d Include mbind support (this is a rebase commit to merge into development) 2017-11-27 11:16:53 +09:00
bad487cc07 add regression test result for strace 2017-11-25 18:30:51 +09:00
3b6056fb1a add strace test cases and test result 2017-11-25 17:37:10 +09:00
5cc738d6bd add test programs for strace 2017-11-25 14:35:17 +09:00
c9fa445f54 Merge branch 'development' of pccluster.org:mckernel into development 2017-11-22 10:53:33 +09:00
d273a2f58b add strace bundled test cases 2017-11-22 10:52:30 +09:00
4e7069d499 add: proc|sys fs format_checker (tool) 2017-11-22 09:39:48 +09:00
66f44e77af mcstop+release.sh: Allow ihkmond to flush kmsg buffer 2017-11-20 18:28:48 +09:00
35f908b75c mcexec: protect against incorrect partitioned execution argument (-n) using timeouts 2017-11-20 17:06:01 +09:00
2f0089dfb9 mcstop+release: use ihkconfig release mem all 2017-11-20 17:06:01 +09:00
2af6d5115a fix: depending arch futex_atomic_op_inuser() (a part of ARCH_DEP_8) 2017-11-20 16:42:47 +09:00
ac25c5e1e7 fix: depending arch in Makefile (POSTK_DEBUG_ARCH_DEP_1) 2017-11-20 14:45:18 +09:00
90c0355d90 add setting process of pgshift to remap_process_memory_range
refs #955
2017-11-20 14:17:03 +09:00
43230eb623 fix: checking the return code of fork() in Linux.
refs #906
2017-11-15 15:46:47 +09:00
f18dc8428d fix: error code of perf_event_open, when unsupported event is specified.
refs #1030
2017-11-15 12:49:56 +09:00
ab53c8e0a4 execve: fix memory leak
refs #727
2017-11-09 16:44:31 +09:00
6c33e236d7 mcreboot: Fix umask for /proc and /sys files 2017-10-27 04:57:44 +09:00
85d36f1469 mcexec: check kernel version <= 3.10 for RHEL mcoverlayfs 2017-10-31 13:39:31 +09:00
0ecf31d896 modify:User space memory access(arm64) 2017-10-24 10:29:11 +09:00
08a625cc0d modify:User space memory access
perf_event_open,futex,process_vm_readv,process_vm_writev,move_pages
2017-10-23 20:27:56 +09:00
12840601e1 support PERF_TYPE_{HARDWARE|HW_CACHE} in perf_event_open
refs #829
2017-10-20 23:10:20 +09:00
2ae6883a8b mcreboot.sh, mcstop+release.sh: Fix retry loop of shutdown 2017-10-19 01:54:46 +09:00
d5629606c5 mcexec: -m: interpret as numactl -m (i.e., MPOL_BIND)
Conflicts:
	executer/include/uprotocol.h
	executer/user/mcexec.c
	kernel/include/syscall.h
2017-10-18 16:54:34 +09:00
285059e504 mcexec: use -M for --mpol-threshold
Conflicts:
	executer/user/mcexec.c
2017-10-18 16:44:49 +09:00
5b6d0a887c Add ARM64 arch_rusage header 2017-10-18 09:23:08 +09:00
3573b8649e Guard call to gencore and freecore
The gencore() and freecore() code in gencore.c is guarded by
POSTK_DEBUG_ARCH_DEP_18, so the call to these functions should
also be guarded, otherwise linking fails.
2017-10-18 09:20:52 +09:00
d7523cdd84 Remove assignment of ns_per_tsc in struct monitor
struct member seems to have been removed or moved to struct
global_rusage
2017-10-18 09:20:52 +09:00
5753db5846 Add ihk_mc_syscall_number() for ARM by reading x8 2017-10-18 09:20:52 +09:00
2d7cb0af89 Add copy_fp_regs to ARM (same as for x86_64) 2017-10-18 09:20:52 +09:00
1cb9b435a9 Fix (?) build system
- disable -mno-red-zone for ARM
- add missing INCLUDEDIR
- make gencore.c compile
2017-10-18 09:20:52 +09:00
43ecf06e83 arch: x86 -> x86_64 and build system changes 2017-10-18 09:20:52 +09:00
51982de36b Handle return value of mcctrl_ikc_send in mcexec_handle_prepare_image 2017-10-18 09:20:51 +09:00
0a22320a3c Don't allocate memory for 0-page-sized requests
Previously the allocator would return all available memory for a
request of 0 pages. This is rather counter-intuitive and left no
memory for subsequent allocations.
2017-10-18 09:20:51 +09:00
8813e890c5 Fix the check routine for elf sections 2017-10-18 09:20:51 +09:00
e664ffba18 Show context registers at the interrupt by SGI 6 2017-10-18 09:20:51 +09:00
3bd0137c25 Fix some race condition on arm64
* move barrier() to architecture-dependent region
* add barrier() in issue_ipi, kprintf, map_virtual
* enable the workaround for cavium thunderx
2017-10-18 09:20:51 +09:00
4f2b4aa402 Round the allocation for cpu-local variables up PAGE_SIZE
Previously, this resulted in 0 pages being allocated.
2017-10-18 09:20:51 +09:00
682cd34b74 Make mcstop+release architecture independent 2017-10-18 09:20:51 +09:00
2bc4d06a48 Add empty definition of visit_pte_range_safe()
This is for linking only. visit_pte_range_safe() is required only
for memdump, as far as i can tell. Since memdump is disabled anyway
I think it's ok to leave this function empty for now.
2017-10-18 09:20:51 +09:00
4f2c1e07c1 Add ARCH variable to Makefiles
In some Makefiles the ARCH variable was not set, although it was used.
In executer/user/Makefile.in it was used before it was set.
2017-10-18 09:20:50 +09:00
77bb3038d3 Add PT_ENTRIES macro 2017-10-18 09:20:50 +09:00
931448a94d Fix typo in page_align_up 2017-10-18 09:20:50 +09:00
c51bbbabc6 Change x86 to @ARCH@ in mcreboot-smp-x86.sh.in
since it is used for smp-x86 and smp-arm64
2017-10-18 09:20:50 +09:00
2ddc52e1a4 setitimer(): Fix error handling of copy_from_user()
This fixes POSTK_TEMP_FIX_40 (POSTK_DEBUG_TEMP_FIX_40)
2017-10-13 04:59:50 +09:00
3c93958c48 extend_process_region(): fix align_shift (POSTK_DEBUG_TEMP_FIX_68) 2017-10-17 15:07:57 +09:00
9763c40f64 set_robust_list: returns 0
refs #977
2017-10-16 09:54:23 +09:00
3bf77446cc mcreboot-smp-x86.sh: add extra_kopts param
This lets one specify arbitrary kernel parameters, instead of manually
fiddling with the script.
Could ultimately replace params like -t (turbo) and -d (dump_level) that
do not have any side effect (logmode starts a userland daemon)
2017-10-13 10:02:11 +09:00
c3dfb1663d page_fault_handler: do not try to fault addresses < 4k
There is no good reason to map these low addresses (userspace could with
mmap fixed, but that is grounds for many exploits...);

the main advantage however is if we do a null deref or close to (0->foo)
within a pagefault we will get a panic stack instead of getting a hang
because we cannot get some locks.
2017-10-13 10:02:11 +09:00
217dd9c1e5 x86 set_signal: panic if interrupt came from kernel
This makes debugging errors e.g. FPE from kernel much easier,
we really shouldn't be taking a user level coredump blaming user
in that case anyway
2017-10-13 10:02:11 +09:00
d4cd756a91 x86/cpu.c: unhandled page fault: print pre-fault stack
Do basic manual unwinding and print raw stack addresses, with a
suggested invocation of addr2line to pretty-print the result.
2017-10-13 10:02:11 +09:00
b894619d1b Speed up parallel builds
- make should be $(MAKE)
 - add + in front of rules spawning long-lasted make process in a
subshell. (This would not be needed with $(MAKE) -C .. target, but our
makefiles do not handle that because they use $(PWD))
 - split the main 'all' rule as all 4 targets are independent
 - fix dependencies where appropriate for parallelism

Extra, not speed-related changes:
 - remove some double-colon for targets as they do not need it

This cuts build time from 5s to 1.5s on a laptop with -j4, and more
importantly from 85s to 35s on a KNL node.
As a bonus, the fixed dependencies removes the need to clean before
rebuilding all the time. Probably.
2017-10-13 10:02:11 +09:00
b962da700b do_signal: ignore SIGWINCH
McKernel would terminate() running program on terminal resizing
It actually looks like there is nothing for us to do when we
get that anyway (tested with `dialog`)
2017-10-13 10:02:11 +09:00
196379854b Fix a few more harmless compiler warnings:
- myfree in pager.c was called with an argument, so add one to the
dummy definition
- pgoff is offset_t (unsigned) and doesn't need to be compared to 0
- clang says '*(int *)0 = 0' will be optimized away instead of keeping
the segfault without a volatile hint (?! that is wrong!), but it causes
no harm to add anyway.
2017-10-13 10:02:11 +09:00
d213efac79 mcctrl/sysfs: add parenthesis around SYSFS_UNLINK_KEEP_ANCESTOR check
! has more priority than &, so !flags & SYSFS_UNLINK_KEEP_ANCESTOR is
not very likely. Change to !(flags & SYSFS_UNLINK_KEEP_ANCESTOR)
2017-10-13 10:02:11 +09:00
38910fe13d mc_perf_event.h: s/EVNET/EVENT/ in the guard (improper ifndef) 2017-10-13 10:02:11 +09:00
4d4279121b process/vm; replace vm_range list by a rbtree
This replaces the chained list used to keep track of all memory ranges
of a process by a standard rbtree (no need of interval tree here
because there is no overlap)

Accesses that were done directly through vm_range_list before were
replaced by lookup_process_memory_range, even full list scan (e.g.
coredump).
The full scans will thus be less efficient because calls to rb_next()
will not be inlined, but these are rarer calls that can probably afford
this compared to code simplicity.

The only reference to the actual backing structure left outside of
process.c is a call to rb_erase in xpmem_free_process_memory_range.

v2: fix lookup_process_memory_range with small start address

v3: make vm_range_insert error out properly

Panic does not lead to easy debug, all error paths
are handled to just return something on error

v4: fix lookup_process_memory_range (again)

That optimistically going left was a more serious bug than just
last iteration, we could just pass by a match and continue down
the tree if the match was not a leaf.

v5: some users actually needed leftmost match, so restore behavior
without the breakage (hopefully)
2017-10-13 10:00:27 +09:00
99da5b6484 ptrace: unify flags PT_TRACE_SYSCALL_ENTER and PT_TRACE_SYSCALL_EXIT to PT_TRACE_SYSCALL
refs #961
2017-10-11 15:43:57 +09:00
6b60dee890 ihklib: Fix ihklib_rusage.h for x86 2017-10-04 05:06:17 +09:00
dd08a3151e mcreboot: Fix version check for mcoverlayfs 2017-10-04 00:37:01 +09:00
e1442bf12b mcexec: Fix usage 2017-10-03 15:34:00 +09:00
86f297ddc4 mcreboot: Fix change umask for /proc and /sys files 2017-10-03 15:21:44 +09:00
823b222af9 mcreboot: Change umask for /proc and /sys files 2017-10-03 06:03:44 +09:00
9c25eb8ef2 mcoverlayfs: Fix version check 2017-10-02 19:51:30 +09:00
665eead78b do_wait: delegate process status for ppid_parent if child process is tracee
refs #946
2017-09-29 14:59:34 +09:00
f8ef43c77d Merge branch 'development' of pccluster.org:mckernel into development 2017-09-29 14:59:10 +09:00
8f4afe410f Remove obsolete pc_init(), pc_ap_init(), pc_test() 2017-09-29 13:20:01 +09:00
da9bb421cc ptrace: call ptrace_syscall_exit before check_signal
refs #960
2017-09-29 10:03:44 +09:00
1e89796d3e Replace ihk_set_kmsg() with ihk_get_kmsg_buf() 2017-09-27 20:26:23 +09:00
a1a2900606 ptrace: Fix the timing of save_fp_regs, and Add copy fp_regs to child in clone_thread
refs #702
2017-09-27 17:02:30 +09:00
79b977ac06 Check xgetbv availability before use for machines without it (i.e. KVM) 2017-09-26 19:31:34 +09:00
37e3118df6 mcexec: Add --stack-premap=<premap_size>[,<max>] to man page 2017-09-26 18:45:52 +09:00
be4d84c0c1 mcexec: Add --stack-premap=<premap_size>[,<max>]
<premap_size> of stack is pre-mapped on creating a process.
And its max size of stack is set to <max>.
This replaces MCKERNEL_RLIMIT_STACK=<premap_size>,<max>.
2017-09-26 17:04:10 +09:00
c43c1b640a execve: call ptrace_syscall_exit if execve succeeded
refs #945
2017-09-26 14:31:07 +09:00
e294db7e53 syscall: set syscall_return before calling ptrace_syscall_exit
refs #944
2017-09-26 14:29:02 +09:00
df3f388e09 syscall: set -ENOSYS to syscall_return before calling ptrace_syscall_enter
refs #943
2017-09-26 14:25:49 +09:00
a2fbe99b60 madvise: support MADV_DONTDUMP/DODUMP
refs #661
2017-09-26 14:21:40 +09:00
9c847c0a8f Change permission of mcoverlay-create/destroy.sh from 600 to 755 2017-09-26 14:05:54 +09:00
58c1fd4512 Update test programs for qlmpi (do swap with using shared memory, ib_pingpong) 2017-09-25 16:56:52 +09:00
dae9a5ff13 mcexec: verify argument for -n/-t/-c 2017-09-25 16:43:47 +09:00
687 changed files with 41370 additions and 2298 deletions

1
.gitignore vendored
View File

@ -14,3 +14,4 @@ elfboot/elfboot_test
linux/executer/mcexec
linux/mod_test*
linux/target
kernel/script/dwarf-extract-struct

View File

@ -1,16 +1,25 @@
TARGET = @TARGET@
SBINDIR = @SBINDIR@
BINDIR = @BINDIR@
INCDIR = @INCDIR@
ETCDIR = @ETCDIR@
MANDIR = @MANDIR@
all::
@(cd executer/kernel/mcctrl; make modules)
@(cd executer/kernel/mcoverlayfs; make modules)
@(cd executer/user; make)
@case "$(TARGET)" in \
all: executer-mcctrl executer-mcoverlayfs executer-user mckernel mck-tools
executer-mcctrl:
+@(cd executer/kernel/mcctrl; $(MAKE) modules)
executer-mcoverlayfs:
+@(cd executer/kernel/mcoverlayfs; $(MAKE) modules)
executer-user:
+@(cd executer/user; $(MAKE))
mckernel:
+@case "$(TARGET)" in \
attached-mic | builtin-x86 | builtin-mic | smp-x86 | smp-arm64) \
(cd kernel; make) \
(cd kernel; $(MAKE)) \
;; \
*) \
echo "unknown target $(TARGET)" >&2 \
@ -18,13 +27,16 @@ all::
;; \
esac
install::
@(cd executer/kernel/mcctrl; make install)
@(cd executer/kernel/mcoverlayfs; make install)
@(cd executer/user; make install)
mck-tools:
+@(cd tools/mcstat; $(MAKE))
install:
@(cd executer/kernel/mcctrl; $(MAKE) install)
@(cd executer/kernel/mcoverlayfs; $(MAKE) install)
@(cd executer/user; $(MAKE) install)
@case "$(TARGET)" in \
attached-mic | builtin-x86 | builtin-mic | smp-x86 | smp-arm64) \
(cd kernel; make install) \
(cd kernel; $(MAKE) install) \
;; \
*) \
echo "unknown target $(TARGET)" >&2 \
@ -32,51 +44,41 @@ install::
;; \
esac
@case "$(TARGET)" in \
attached-mic) \
mkdir -p -m 755 $(SBINDIR); \
install -m 755 arch/x86/tools/mcreboot-attached-mic.sh $(SBINDIR)/mcreboot; \
install -m 755 arch/x86/tools/mcshutdown-attached-mic.sh $(SBINDIR)/mcshutdown; \
mkdir -p -m 755 $(MANDIR)/man1; \
install -m 644 arch/x86/tools/mcreboot.1 $(MANDIR)/man1/mcreboot.1; \
;; \
builtin-x86) \
mkdir -p -m 755 $(SBINDIR); \
install -m 755 arch/x86/tools/mcreboot-builtin-x86.sh $(SBINDIR)/mcreboot; \
install -m 755 arch/x86/tools/mcshutdown-builtin-x86.sh $(SBINDIR)/mcshutdown; \
mkdir -p -m 755 $(MANDIR)/man1; \
install -m 644 arch/x86/tools/mcreboot.1 $(MANDIR)/man1/mcreboot.1; \
;; \
smp-x86 | smp-arm64) \
mkdir -p -m 755 $(SBINDIR); \
install -m 755 arch/x86/tools/mcreboot-smp-x86.sh $(SBINDIR)/mcreboot.sh; \
install -m 755 arch/x86/tools/mcstop+release-smp-x86.sh $(SBINDIR)/mcstop+release.sh; \
install -m 600 arch/x86/tools/mcoverlay-destroy-smp-x86.sh $(SBINDIR)/mcoverlay-destroy.sh; \
install -m 600 arch/x86/tools/mcoverlay-create-smp-x86.sh $(SBINDIR)/mcoverlay-create.sh; \
install -m 755 arch/x86/tools/eclair-dump-backtrace.exp $(SBINDIR)/eclair-dump-backtrace.exp;\
install -m 755 arch/x86_64/tools/mcreboot-smp-x86.sh $(SBINDIR)/mcreboot.sh; \
install -m 755 arch/x86_64/tools/mcstop+release-smp-x86.sh $(SBINDIR)/mcstop+release.sh; \
install -m 755 arch/x86_64/tools/mpimcexec $(BINDIR)/mpimcexec; \
install -m 755 arch/x86_64/tools/mcoverlay-destroy-smp-x86.sh $(SBINDIR)/mcoverlay-destroy.sh; \
install -m 755 arch/x86_64/tools/mcoverlay-create-smp-x86.sh $(SBINDIR)/mcoverlay-create.sh; \
install -m 755 arch/x86_64/tools/eclair-dump-backtrace.exp $(SBINDIR)/eclair-dump-backtrace.exp;\
mkdir -p -m 755 $(ETCDIR); \
install -m 644 arch/x86/tools/irqbalance_mck.service $(ETCDIR)/irqbalance_mck.service; \
install -m 644 arch/x86/tools/irqbalance_mck.in $(ETCDIR)/irqbalance_mck.in; \
install -m 644 arch/x86_64/tools/irqbalance_mck.service $(ETCDIR)/irqbalance_mck.service; \
install -m 644 arch/x86_64/tools/irqbalance_mck.in $(ETCDIR)/irqbalance_mck.in; \
mkdir -p -m 755 $(INCDIR); \
install -m 644 kernel/include/swapfmt.h $(INCDIR); \
mkdir -p -m 755 $(MANDIR)/man1; \
install -m 644 arch/x86/tools/mcreboot.1 $(MANDIR)/man1/mcreboot.1; \
install -m 644 arch/x86_64/tools/mcreboot.1 $(MANDIR)/man1/mcreboot.1; \
install -m 644 arch/x86_64/tools/mpimcexec.1 $(MANDIR)/man1/mpimcexec.1; \
;; \
*) \
echo "unknown target $(TARGET)" >&2 \
exit 1 \
;; \
esac
@(cd tools/mcstat/; $(MAKE) install)
clean::
@(cd executer/kernel/mcctrl; make clean)
@(cd executer/kernel/mcoverlayfs; make clean)
@(cd executer/user; make clean)
clean:
@(cd executer/kernel/mcctrl; $(MAKE) clean)
@(cd executer/kernel/mcoverlayfs; $(MAKE) clean)
@(cd executer/user; $(MAKE) clean)
@case "$(TARGET)" in \
attached-mic | builtin-x86 | builtin-mic | smp-x86 | smp-arm64) \
(cd kernel; make clean) \
(cd kernel; $(MAKE) clean) \
;; \
*) \
echo "unknown target $(TARGET)" >&2 \
exit 1 \
;; \
esac
@(cd tools/mcstat; $(MAKE) clean)

View File

@ -590,6 +590,8 @@ static void show_context_stack(struct pt_regs *regs)
return;
}
ihk_mc_debug_show_interrupt_context(regs);
sp = (uintptr_t)regs + sizeof(*regs);
stack_top = ALIGN_UP(sp, (uintptr_t)KERNEL_STACK_SIZE);
max_loop = (stack_top - sp) / min_stack_frame_size;
@ -1170,8 +1172,6 @@ void arch_clone_thread(struct thread *othread, unsigned long pc,
asm("mrs %0, tpidr_el0" : "=r" (tls));
othread->tlsblock_base = nthread->tlsblock_base = tls;
/* copy fp_regs values from parent. */
save_fp_regs(othread);
if ((othread->fp_regs != NULL) && (check_and_allocate_fp_regs(nthread) == 0)) {
memcpy(nthread->fp_regs, othread->fp_regs, sizeof(fp_regs_struct));
}
@ -1205,6 +1205,10 @@ void ihk_mc_delay_us(int us)
arch_delay(us);
}
/*
 * arch_print_stack() - architecture hook for printing the current stack.
 *
 * The body is intentionally empty here: calling it has no effect.
 * NOTE(review): presumably a placeholder so that callers of this hook
 * link on this architecture -- confirm before relying on its output.
 */
void arch_print_stack()
{
}
void arch_show_interrupt_context(const void *reg)
{
const struct pt_regs *regs = (struct pt_regs *)reg;
@ -1428,6 +1432,13 @@ save_fp_regs(struct thread *thread)
}
}
/*
 * copy_fp_regs() - duplicate the saved FP register image of one thread
 * into another.
 *
 * @from: thread whose fp_regs buffer is the source (may have none).
 * @to:   thread that receives the copy; its buffer is obtained through
 *        check_and_allocate_fp_regs().
 *
 * Nothing is copied when @from has no fp_regs buffer or when
 * check_and_allocate_fp_regs(@to) reports a non-zero result.
 */
void copy_fp_regs(struct thread *from, struct thread *to)
{
	/* No source image: nothing to duplicate. */
	if (from->fp_regs == NULL) {
		return;
	}

	/* Only touch the destination once we know there is a source. */
	if (check_and_allocate_fp_regs(to) != 0) {
		return;
	}

	memcpy(to->fp_regs, from->fp_regs, sizeof(fp_regs_struct));
}
void
clear_fp_regs(struct thread *thread)
{
@ -1499,7 +1510,6 @@ unhandled_page_fault(struct thread *thread, void *fault_addr, void *regs)
const uintptr_t address = (uintptr_t)fault_addr;
struct process_vm *vm = thread->vm;
struct vm_range *range;
char found;
unsigned long irqflags;
unsigned long error = 0;
@ -1513,17 +1523,12 @@ unhandled_page_fault(struct thread *thread, void *fault_addr, void *regs)
(error & PF_RSVD ? "was" : "wasn't"),
(error & PF_INSTR ? "was" : "wasn't"));
found = 0;
list_for_each_entry(range, &vm->vm_range_list, list) {
if (range->start <= address && range->end > address) {
found = 1;
__kprintf("address is in range, flag: 0x%lx\n",
range->flag);
ihk_mc_pt_print_pte(vm->address_space->page_table, (void*)address);
break;
}
}
if (!found) {
range = lookup_process_memory_range(vm, address, address+1);
if (range) {
__kprintf("address is in range, flag: 0x%lx\n",
range->flag);
ihk_mc_pt_print_pte(vm->address_space->page_table, (void*)address);
} else {
__kprintf("address is out of range! \n");
}

View File

@ -221,7 +221,7 @@ int gencore(struct thread *thread, void *regs,
Elf64_Ehdr eh;
Elf64_Phdr *ph = NULL;
void *note = NULL;
struct vm_range *range;
struct vm_range *range, *next;
struct process_vm *vm = thread->vm;
int segs = 1; /* the first one is for NOTE */
int notesize, phsize, alignednotesize;
@ -235,7 +235,10 @@ int gencore(struct thread *thread, void *regs,
return -1;
}
list_for_each_entry(range, &vm->vm_range_list, list) {
next = lookup_process_memory_range(vm, 0, -1);
while ((range = next)) {
next = next_process_memory_range(vm, range);
dkprintf("start:%lx end:%lx flag:%lx objoff:%lx\n",
range->start, range->end, range->flag, range->objoff);
/* We omit reserved areas because they are only for
@ -323,7 +326,10 @@ int gencore(struct thread *thread, void *regs,
/* program header for each memory chunk */
i = 1;
list_for_each_entry(range, &vm->vm_range_list, list) {
next = lookup_process_memory_range(vm, 0, -1);
while ((range = next)) {
next = next_process_memory_range(vm, range);
unsigned long flag = range->flag;
unsigned long size = range->end - range->start;
@ -364,7 +370,10 @@ int gencore(struct thread *thread, void *regs,
dkprintf("coretable[2]: %lx@%lx(%lx)\n", ct[2].len, ct[2].addr, note);
i = 3; /* memory segments */
list_for_each_entry(range, &vm->vm_range_list, list) {
next = lookup_process_memory_range(vm, 0, -1);
while ((range = next)) {
next = next_process_memory_range(vm, range);
unsigned long phys;
if (range->flag & VR_RESERVED)

View File

@ -134,4 +134,12 @@ futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
return ret;
}
/*
 * get_futex_value_locked() - fetch the current 32-bit futex word.
 *
 * @dest: where the value read is stored.
 * @from: address of the futex word to read.
 *
 * The word is read through a volatile-qualified pointer so the load is
 * actually performed. Always returns 0.
 */
static inline int get_futex_value_locked(uint32_t *dest, uint32_t *from)
{
	volatile uint32_t *word = (volatile uint32_t *)from;
	uint32_t value = *word;

	*dest = value;
	return 0;
}
#endif /* !__HEADER_ARM64_COMMON_ARCH_FUTEX_H */

View File

@ -215,7 +215,7 @@ static const unsigned int PTL1_ENTRIES = __PTL1_ENTRIES;
#define PAGE_P2ALIGN 0
#define page_offset(addr) __page_offset(addr, PAGE_SIZE)
#define page_align(addr) __page_align(addr, PAGE_SIZE)
#define page_align_up(addr) __page_align_up((addr, PAGE_SIZE)
#define page_align_up(addr) __page_align_up(addr, PAGE_SIZE)
/*
* large page
@ -263,6 +263,8 @@ static const unsigned int PTL1_ENTRIES = __PTL1_ENTRIES;
#define PTE_FILEOFF PTE_SPECIAL
#define PT_ENTRIES (PAGE_SIZE >> 3)
#ifndef __ASSEMBLY__
#include <ihk/types.h>

View File

@ -25,6 +25,8 @@
#define smp_rmb() dmb(ishld)
#define smp_wmb() dmb(ishst)
#define arch_barrier() smp_mb()
#define smp_store_release(p, v) \
do { \
compiletime_assert_atomic_type(*p); \

View File

@ -0,0 +1,34 @@
#ifndef ARCH_RUSAGE_H_INCLUDED
#define ARCH_RUSAGE_H_INCLUDED
#include <arch-memory.h>
//#define DEBUG_RUSAGE
extern struct rusage_global *rusage;
#define IHK_OS_PGSIZE_4KB 0
#define IHK_OS_PGSIZE_16KB 1
#define IHK_OS_PGSIZE_64KB 2
/*
 * rusage_pgsize_to_pgtype() - map a page size in bytes to the
 * IHK_OS_PGSIZE_* index used for rusage accounting.
 *
 * @pgsize: page size; expected to be one of __PTL1_SIZE, __PTL2_SIZE
 *          or __PTL3_SIZE.
 *
 * Returns the matching IHK_OS_PGSIZE_* constant. For any other size an
 * error line is printed and IHK_OS_PGSIZE_4KB is returned as fallback.
 */
static inline int rusage_pgsize_to_pgtype(size_t pgsize)
{
	switch (pgsize) {
	case __PTL1_SIZE:
		return IHK_OS_PGSIZE_4KB;
	case __PTL2_SIZE:
		return IHK_OS_PGSIZE_16KB;
	case __PTL3_SIZE:
		return IHK_OS_PGSIZE_64KB;
	default:
		break;
	}

	/* Unrecognised size: report it and fall back to the smallest type. */
	kprintf("%s: Error: Unknown pgsize=%ld\n", __FUNCTION__, pgsize);
	return IHK_OS_PGSIZE_4KB;
}
#endif /* !defined(ARCH_RUSAGE_H_INCLUDED) */

View File

@ -6,6 +6,8 @@
#if defined(CONFIG_HAS_NMI)
#include <arm-gic-v3.h>
#else /* defined(CONFIG_HAS_NMI) */
#include <sysreg.h>
#endif /* defined(CONFIG_HAS_NMI) */
#if defined(CONFIG_HAS_NMI)

View File

@ -35,6 +35,8 @@
#define MIDR_IMPLEMENTOR(midr) \
(((midr) & MIDR_IMPLEMENTOR_MASK) >> MIDR_IMPLEMENTOR_SHIFT)
#define ARM_CPU_IMP_CAVIUM 0x43
#ifndef __ASSEMBLY__
static unsigned int read_cpuid_id(void)

View File

@ -73,6 +73,7 @@ static inline void pt_regs_write_reg(struct pt_regs *regs, int r,
#define ihk_mc_syscall_arg5(uc) (uc)->regs[5]
#define ihk_mc_syscall_ret(uc) (uc)->regs[0]
#define ihk_mc_syscall_number(uc) (uc)->regs[8]
#define ihk_mc_syscall_pc(uc) (uc)->pc
#define ihk_mc_syscall_sp(uc) (uc)->sp

View File

@ -144,5 +144,3 @@ SYSCALL_HANDLED(1045, signalfd)
SYSCALL_DELEGATED(1049, stat)
SYSCALL_DELEGATED(1060, getpgrp)
SYSCALL_DELEGATED(1062, time)
SYSCALL_HANDLED(1071, vfork)
SYSCALL_DELEGATED(1079, fork)

View File

@ -10,6 +10,8 @@
//#define DEBUG_GICV3
#define USE_CAVIUM_THUNDER_X
#ifdef DEBUG_GICV3
#define dkprintf(...) kprintf(__VA_ARGS__)
#define ekprintf(...) kprintf(__VA_ARGS__)
@ -18,6 +20,10 @@
#define ekprintf(...) kprintf(__VA_ARGS__)
#endif
#ifdef USE_CAVIUM_THUNDER_X
static char is_cavium_thunderx = 0;
#endif
void *dist_base;
void *rdist_base[NR_CPUS];
@ -108,8 +114,8 @@ static uint64_t gic_read_iar_cavium_thunderx(void)
asm volatile("nop;nop;nop;nop;");
asm volatile("mrs_s %0, " __stringify(ICC_IAR1_EL1) : "=r" (irqstat));
asm volatile("nop;nop;nop;nop;");
mb();
#endif /* CONFIG_HAS_NMI */
mb();
return irqstat;
}
@ -118,7 +124,7 @@ static uint64_t gic_read_iar_cavium_thunderx(void)
static uint64_t gic_read_iar(void)
{
#ifdef USE_CAVIUM_THUNDER_X
if (static_key_false(&is_cavium_thunderx))
if (is_cavium_thunderx)
return gic_read_iar_cavium_thunderx();
else
#endif
@ -266,6 +272,7 @@ void arm64_issue_ipi_gicv3(uint32_t cpuid, uint32_t vector)
{
dkprintf("Send irq#%d to cpuid=%d\n", vector, cpuid);
barrier();
if(vector < 16){
// send SGI
arm64_raise_sgi_gicv3(cpuid, vector);
@ -304,7 +311,9 @@ void gic_dist_init_gicv3(unsigned long dist_base_pa, unsigned long size)
#ifdef USE_CAVIUM_THUNDER_X
/* Cavium ThunderX erratum 23154 */
gicv3_check_capabilities();
if (MIDR_IMPLEMENTOR(read_cpuid_id()) == ARM_CPU_IMP_CAVIUM) {
is_cavium_thunderx = 1;
}
#endif
}
@ -399,6 +408,7 @@ void gic_enable_gicv3(void)
/* Set specific IPI to NMI */
writeb_relaxed(GICD_INT_NMI_PRI, rd_sgi_base + GIC_DIST_PRI + INTRID_CPU_STOP);
writeb_relaxed(GICD_INT_NMI_PRI, rd_sgi_base + GIC_DIST_PRI + INTRID_MEMDUMP);
writeb_relaxed(GICD_INT_NMI_PRI, rd_sgi_base + GIC_DIST_PRI + INTRID_STACK_TRACE);
/* sync wait */
gic_do_wait_for_rwp(rbase);

View File

@ -20,10 +20,11 @@ size_t arm64_cpu_local_variables_span = LOCALS_SPAN; /* for debugger */
void init_processors_local(int max_id)
{
int i = 0;
const int sz = (max_id + 1) * KERNEL_STACK_SIZE;
union arm64_cpu_local_variables *tmp;
/* allocate one more for alignment */
locals = ihk_mc_alloc_pages((max_id + 1) * (KERNEL_STACK_SIZE / PAGE_SIZE), IHK_MC_AP_CRITICAL);
locals = ihk_mc_alloc_pages(((sz + PAGE_SIZE - 1) / PAGE_SIZE), IHK_MC_AP_CRITICAL);
locals = (union arm64_cpu_local_variables *)ALIGN_UP((unsigned long)locals, KERNEL_STACK_SIZE);
/* clear struct process, struct process_vm, struct thread_info area */

View File

@ -1760,6 +1760,12 @@ int visit_pte_range(page_table_t pt, void *start0, void *end0, int pgshift,
return initial_lookup.walk(tt, 0, start, end, initial_lookup.callback, &args);
}
/*
 * visit_pte_range_safe() - empty stand-in for the "safe" PTE range walker.
 *
 * All arguments are ignored: no page table is walked and @funcp is never
 * invoked. Always returns 0.
 * NOTE(review): the commit log on this page describes this definition as
 * "for linking only" (needed by memdump, which is disabled) -- callers
 * must not expect any PTE to actually be visited.
 */
int visit_pte_range_safe(page_table_t pt, void *start0, void *end0, int pgshift,
enum visit_pte_flag flags, pte_visitor_t *funcp, void *arg)
{
return 0;
}
struct clear_range_args {
int free_physical;
struct memobj *memobj;
@ -1801,7 +1807,6 @@ static int clear_range_l1(void *args0, pte_t *ptep, uint64_t base,
ihk_mc_free_pages_user(phys_to_virt(phys), npages);
dkprintf("%s: freeing regular page at 0x%lx\n", __FUNCTION__, base);
}
args->vm->currss -= PTL1_SIZE;
}
return 0;
@ -1881,7 +1886,6 @@ static int clear_range_middle(void *args0, pte_t *ptep, uint64_t base,
ihk_mc_free_pages_user(phys_to_virt(phys), npages);
dkprintf("%s(level=%d): freeing large page at 0x%lx\n", __FUNCTION__, level, base);
}
args->vm->currss -= tbl.pgsize;
}
return 0;

View File

@ -965,7 +965,7 @@ void ptrace_report_signal(struct thread *thread, int sig)
proc->status = PS_TRACED;
#endif /* POSTK_DEBUG_TEMP_FIX_41 */
thread->status = PS_TRACED;
proc->ptrace &= ~PT_TRACE_SYSCALL_MASK;
proc->ptrace &= ~PT_TRACE_SYSCALL;
if (sig == SIGSTOP || sig == SIGTSTP ||
sig == SIGTTIN || sig == SIGTTOU) {
proc->signal_flags |= SIGNAL_STOP_STOPPED;

View File

@ -12,6 +12,8 @@
#include <lwk/compiler.h>
#include <hwcap.h>
#include <prctl.h>
#include <limits.h>
#include <syscall.h>
extern void ptrace_report_signal(struct thread *thread, int sig);
extern void clear_single_step(struct thread *thread);
@ -1321,6 +1323,17 @@ interrupt_from_user(void *regs0)
return((regs->pstate & PSR_MODE_MASK) == PSR_MODE_EL0t);
}
void save_syscall_return_value(int num, unsigned long rc)
{
/*
* Save syscall return value.
*/
if (cpu_local_var(current) && cpu_local_var(current)->uctx &&
num != __NR_rt_sigsuspend) {
ihk_mc_syscall_arg0(cpu_local_var(current)->uctx) = rc;
}
}
void
check_signal(unsigned long rc, void *regs0, int num)
{
@ -1345,16 +1358,6 @@ __check_signal(unsigned long rc, void *regs0, int num, int irq_disabled)
return;
thread = cpu_local_var(current);
/**
* If check_signal is called from syscall(),
* then save syscall return value.
*/
if((regs == NULL)&&(num != __NR_rt_sigsuspend)){ /* It's call from syscall! */
// Get user context through current thread
// and update syscall return.
ihk_mc_syscall_arg0(thread->uctx) = rc;
}
if(thread == NULL || thread->proc->pid == 0){
struct thread *t;
irqstate = ihk_mc_spinlock_lock(&(cpu_local_var(runq_lock)));
@ -1866,4 +1869,645 @@ save_uctx(void *uctx, struct pt_regs *regs)
/* TODO: skeleton for UTI */
}
/*
 * do_process_vm_read_writev() - copy data between the calling process and
 * another process, described by two iovec arrays.
 *
 * @pid:        target ("remote") process.
 * @local_iov:  iovec array describing buffers in the caller.
 * @liovcnt:    number of entries in @local_iov.
 * @remote_iov: iovec array (stored in the caller's address space)
 *              describing buffers in the remote process.
 * @riovcnt:    number of entries in @remote_iov.
 * @flags:      must be 0.
 * @op:         PROCESS_VM_READ copies remote -> local; any other value
 *              copies local -> remote.
 *
 * Returns the number of bytes copied on success, or a negative errno:
 *   -EINVAL  non-zero @flags, a count above IOV_MAX, total local and
 *            remote lengths differ, or no memory range covers a vector;
 *   -EFAULT  an iovec array or buffer base is not mapped, the range lacks
 *            the needed protection, or the remote page cannot be faulted;
 *   -ESRCH   @pid not found, or it is exited/zombie;
 *   -EPERM   caller is not euid 0 and its ruid/rgid do not match the
 *            remote process's real/effective/saved ids.
 *
 * Fixes relative to the previous revision:
 *  - the remote-vector validation used the local index (li) to index
 *    remote_iov[]; it now uses the remote index (ri);
 *  - the end address of each iovec array was computed as
 *    "ptr + cnt * sizeof(struct iovec)" on a typed pointer, scaling by the
 *    element size twice; it is now "ptr + cnt".
 */
int do_process_vm_read_writev(int pid,
		const struct iovec *local_iov,
		unsigned long liovcnt,
		const struct iovec *remote_iov,
		unsigned long riovcnt,
		unsigned long flags,
		int op)
{
	int ret = -EINVAL;
	int li, ri;
	int pli, pri;
	off_t loff, roff;
	size_t llen = 0, rlen = 0;
	size_t copied = 0;
	size_t to_copy;
	struct thread *lthread = cpu_local_var(current);
	struct process *rproc;
	struct process *lproc = lthread->proc;
	struct process_vm *rvm = NULL;
	unsigned long rphys;
	unsigned long rpage_left;
	unsigned long psize;
	void *rva;
	struct vm_range *range;
	struct mcs_rwlock_node_irqsave lock;
	struct mcs_rwlock_node update_lock;

	/* Sanity checks */
	if (flags) {
		return -EINVAL;
	}

	if (liovcnt > IOV_MAX || riovcnt > IOV_MAX) {
		return -EINVAL;
	}

	/* Check if parameters are okay */
	ihk_mc_spinlock_lock_noirq(&lthread->vm->memory_range_lock);

	/*
	 * Pointer arithmetic on a "struct iovec *" already advances in units
	 * of sizeof(struct iovec), so the array ends at local_iov + liovcnt.
	 */
	range = lookup_process_memory_range(lthread->vm,
			(uintptr_t)local_iov,
			(uintptr_t)(local_iov + liovcnt));

	if (!range) {
		ret = -EFAULT;
		goto arg_out;
	}

	/* The remote iovec array itself lives in the caller's address space */
	range = lookup_process_memory_range(lthread->vm,
			(uintptr_t)remote_iov,
			(uintptr_t)(remote_iov + riovcnt));

	if (!range) {
		ret = -EFAULT;
		goto arg_out;
	}

	ret = 0;
arg_out:
	ihk_mc_spinlock_unlock_noirq(&lthread->vm->memory_range_lock);

	if (ret != 0) {
		goto out;
	}

	for (li = 0; li < liovcnt; ++li) {
		llen += local_iov[li].iov_len;
		dkprintf("local_iov[%d].iov_base: 0x%lx, len: %lu\n",
			li, local_iov[li].iov_base, local_iov[li].iov_len);
	}

	for (ri = 0; ri < riovcnt; ++ri) {
		rlen += remote_iov[ri].iov_len;
		dkprintf("remote_iov[%d].iov_base: 0x%lx, len: %lu\n",
			ri, remote_iov[ri].iov_base, remote_iov[ri].iov_len);
	}

	/* Both sides must describe the same number of bytes */
	if (llen != rlen) {
		return -EINVAL;
	}

	/* Find remote process */
	rproc = find_process(pid, &lock);
	if (!rproc) {
		ret = -ESRCH;
		goto out;
	}

	mcs_rwlock_reader_lock_noirq(&rproc->update_lock, &update_lock);
	if(rproc->status == PS_EXITED ||
	   rproc->status == PS_ZOMBIE){
		mcs_rwlock_reader_unlock_noirq(&rproc->update_lock, &update_lock);
		process_unlock(rproc, &lock);
		ret = -ESRCH;
		goto out;
	}
	rvm = rproc->vm;
	/* Keep a reference on the remote vm; dropped on every exit path */
	hold_process_vm(rvm);
	mcs_rwlock_reader_unlock_noirq(&rproc->update_lock, &update_lock);
	process_unlock(rproc, &lock);

	if (lproc->euid != 0 &&
	    (lproc->ruid != rproc->ruid ||
	     lproc->ruid != rproc->euid ||
	     lproc->ruid != rproc->suid ||
	     lproc->rgid != rproc->rgid ||
	     lproc->rgid != rproc->egid ||
	     lproc->rgid != rproc->sgid)) {
		ret = -EPERM;
		goto out;
	}

	dkprintf("pid %d found, doing %s: liovcnt: %d, riovcnt: %d \n", pid,
		(op == PROCESS_VM_READ) ? "PROCESS_VM_READ" : "PROCESS_VM_WRITE",
		liovcnt, riovcnt);

	pli = pri = -1; /* Previous indices in iovecs */
	li = ri = 0; /* Current indices in iovecs */
	loff = roff = 0; /* Offsets in current iovec */

	/* Now iterate and do the copy */
	while (copied < llen) {
		int faulted = 0;

		/* New local vector? */
		if (pli != li) {
			struct vm_range *range;

			ihk_mc_spinlock_lock_noirq(&lthread->vm->memory_range_lock);

			/* Is base valid? */
			range = lookup_process_memory_range(lthread->vm,
					(uintptr_t)local_iov[li].iov_base,
					(uintptr_t)(local_iov[li].iov_base + 1));

			if (!range) {
				ret = -EFAULT;
				goto pli_out;
			}

			/* Is range valid? */
			range = lookup_process_memory_range(lthread->vm,
					(uintptr_t)local_iov[li].iov_base,
					(uintptr_t)(local_iov[li].iov_base + local_iov[li].iov_len));

			if (range == NULL) {
				ret = -EINVAL;
				goto pli_out;
			}

			/* A read writes into the local buffer; a write reads it */
			if (!(range->flag & ((op == PROCESS_VM_READ) ?
				VR_PROT_WRITE : VR_PROT_READ))) {
				ret = -EFAULT;
				goto pli_out;
			}

			ret = 0;
pli_out:
			ihk_mc_spinlock_unlock_noirq(&lthread->vm->memory_range_lock);

			if (ret != 0) {
				goto out;
			}

			pli = li;
		}

		/* New remote vector? */
		if (pri != ri) {
			struct vm_range *range;

			ihk_mc_spinlock_lock_noirq(&rvm->memory_range_lock);

			/* Is base valid? (indexed by ri, the remote cursor) */
			range = lookup_process_memory_range(rvm,
					(uintptr_t)remote_iov[ri].iov_base,
					(uintptr_t)(remote_iov[ri].iov_base + 1));

			if (range == NULL) {
				ret = -EFAULT;
				goto pri_out;
			}

			/* Is range valid? */
			range = lookup_process_memory_range(rvm,
					(uintptr_t)remote_iov[ri].iov_base,
					(uintptr_t)(remote_iov[ri].iov_base + remote_iov[ri].iov_len));

			if (range == NULL) {
				ret = -EINVAL;
				goto pri_out;
			}

			/* A read reads the remote buffer; a write writes into it */
			if (!(range->flag & ((op == PROCESS_VM_READ) ?
				VR_PROT_READ : VR_PROT_WRITE))) {
				ret = -EFAULT;
				goto pri_out;
			}

			ret = 0;
pri_out:
			ihk_mc_spinlock_unlock_noirq(&rvm->memory_range_lock);

			if (ret != 0) {
				goto out;
			}

			pri = ri;
		}

		/* Figure out how much we can copy at most in this iteration */
		to_copy = (local_iov[li].iov_len - loff);
		if ((remote_iov[ri].iov_len - roff) < to_copy) {
			to_copy = remote_iov[ri].iov_len - roff;
		}

retry_lookup:
		/* TODO: remember page and do this only if necessary */
		ret = ihk_mc_pt_virt_to_phys_size(rvm->address_space->page_table,
				remote_iov[ri].iov_base + roff, &rphys, &psize);

		if (ret) {
			uint64_t reason = PF_POPULATE | PF_WRITE | PF_USER;
			void *addr;

			/* Already faulted once and still unmapped: give up */
			if (faulted) {
				ret = -EFAULT;
				goto out;
			}

			/* Fault in pages */
			for (addr = (void *)
					(((unsigned long)remote_iov[ri].iov_base + roff)
					 & PAGE_MASK);
					addr < (remote_iov[ri].iov_base + roff + to_copy);
					addr += PAGE_SIZE) {

				ret = page_fault_process_vm(rvm, addr, reason);
				if (ret) {
					ret = -EFAULT;
					goto out;
				}
			}

			faulted = 1;
			goto retry_lookup;
		}

		/* Never copy across the end of the current remote page */
		rpage_left = ((((unsigned long)remote_iov[ri].iov_base + roff +
				psize) & ~(psize - 1)) -
				((unsigned long)remote_iov[ri].iov_base + roff));
		if (rpage_left < to_copy) {
			to_copy = rpage_left;
		}

		rva = phys_to_virt(rphys);

		fast_memcpy(
			(op == PROCESS_VM_READ) ? local_iov[li].iov_base + loff : rva,
			(op == PROCESS_VM_READ) ? rva : local_iov[li].iov_base + loff,
			to_copy);

		copied += to_copy;
		dkprintf("local_iov[%d]: 0x%lx %s remote_iov[%d]: 0x%lx, %lu copied, psize: %lu, rpage_left: %lu\n",
			li, local_iov[li].iov_base + loff,
			(op == PROCESS_VM_READ) ? "<-" : "->",
			ri, remote_iov[ri].iov_base + roff, to_copy,
			psize, rpage_left);

		loff += to_copy;
		roff += to_copy;

		/* Advance to the next vector once the current one is consumed */
		if (loff == local_iov[li].iov_len) {
			li++;
			loff = 0;
		}

		if (roff == remote_iov[ri].iov_len) {
			ri++;
			roff = 0;
		}
	}

	release_process_vm(rvm);
	return copied;

out:
	if(rvm)
		release_process_vm(rvm);
	return ret;
}
int move_pages_smp_handler(int cpu_index, int nr_cpus, void *arg)
{
int i, i_s, i_e, phase = 1;
struct move_pages_smp_req *mpsr =
(struct move_pages_smp_req *)arg;
struct process_vm *vm = mpsr->proc->vm;
int count = mpsr->count;
struct page_table *save_pt;
extern struct page_table *get_init_page_table(void);
i_s = (count / nr_cpus) * cpu_index;
i_e = i_s + (count / nr_cpus);
if (cpu_index == (nr_cpus - 1)) {
i_e = count;
}
/* Load target process' PT so that we can access user-space */
save_pt = cpu_local_var(current) == &cpu_local_var(idle) ?
get_init_page_table() :
cpu_local_var(current)->vm->address_space->page_table;
if (save_pt != vm->address_space->page_table) {
ihk_mc_load_page_table(vm->address_space->page_table);
}
else {
save_pt = NULL;
}
if (nr_cpus == 1) {
switch (cpu_index) {
case 0:
memcpy(mpsr->virt_addr, mpsr->user_virt_addr,
sizeof(void *) * count);
memcpy(mpsr->status, mpsr->user_status,
sizeof(int) * count);
memcpy(mpsr->nodes, mpsr->user_nodes,
sizeof(int) * count);
memset(mpsr->ptep, 0, sizeof(pte_t) * count);
memset(mpsr->status, 0, sizeof(int) * count);
memset(mpsr->nr_pages, 0, sizeof(int) * count);
memset(mpsr->dst_phys, 0,
sizeof(unsigned long) * count);
mpsr->nodes_ready = 1;
break;
default:
break;
}
}
else if (nr_cpus > 1 && nr_cpus < 4) {
switch (cpu_index) {
case 0:
memcpy(mpsr->virt_addr, mpsr->user_virt_addr,
sizeof(void *) * count);
memcpy(mpsr->status, mpsr->user_status,
sizeof(int) * count);
case 1:
memcpy(mpsr->nodes, mpsr->user_nodes,
sizeof(int) * count);
memset(mpsr->ptep, 0, sizeof(pte_t) * count);
memset(mpsr->status, 0, sizeof(int) * count);
memset(mpsr->nr_pages, 0, sizeof(int) * count);
memset(mpsr->dst_phys, 0,
sizeof(unsigned long) * count);
mpsr->nodes_ready = 1;
break;
default:
break;
}
}
else if (nr_cpus >= 4 && nr_cpus < 8) {
switch (cpu_index) {
case 0:
memcpy(mpsr->virt_addr, mpsr->user_virt_addr,
sizeof(void *) * count);
break;
case 1:
memcpy(mpsr->status, mpsr->user_status,
sizeof(int) * count);
break;
case 2:
memcpy(mpsr->nodes, mpsr->user_nodes,
sizeof(int) * count);
mpsr->nodes_ready = 1;
break;
case 3:
memset(mpsr->ptep, 0, sizeof(pte_t) * count);
memset(mpsr->status, 0, sizeof(int) * count);
memset(mpsr->nr_pages, 0, sizeof(int) * count);
memset(mpsr->dst_phys, 0,
sizeof(unsigned long) * count);
break;
default:
break;
}
}
else if (nr_cpus >= 8) {
switch (cpu_index) {
case 0:
memcpy(mpsr->virt_addr, mpsr->user_virt_addr,
sizeof(void *) * (count / 2));
break;
case 1:
memcpy(mpsr->virt_addr + (count / 2),
mpsr->user_virt_addr + (count / 2),
sizeof(void *) * (count / 2));
break;
case 2:
memcpy(mpsr->status, mpsr->user_status,
sizeof(int) * count);
break;
case 3:
memcpy(mpsr->nodes, mpsr->user_nodes,
sizeof(int) * count);
mpsr->nodes_ready = 1;
break;
case 4:
memset(mpsr->ptep, 0, sizeof(pte_t) * count);
break;
case 5:
memset(mpsr->status, 0, sizeof(int) * count);
break;
case 6:
memset(mpsr->nr_pages, 0, sizeof(int) * count);
break;
case 7:
memset(mpsr->dst_phys, 0,
sizeof(unsigned long) * count);
break;
default:
break;
}
}
while (!(volatile int)mpsr->nodes_ready) {
cpu_pause();
}
/* NUMA verification in parallel */
for (i = i_s; i < i_e; i++) {
if (mpsr->nodes[i] < 0 ||
mpsr->nodes[i] >= ihk_mc_get_nr_numa_nodes() ||
!test_bit(mpsr->nodes[i],
mpsr->proc->vm->numa_mask)) {
mpsr->phase_ret = -EINVAL;
break;
}
}
/* Barrier */
ihk_atomic_inc(&mpsr->phase_done);
while (ihk_atomic_read(&mpsr->phase_done) <
(phase * nr_cpus)) {
cpu_pause();
}
if (mpsr->phase_ret != 0) {
goto out;
}
dkprintf("%s: phase %d done\n", __FUNCTION__, phase);
++phase;
/* PTE lookup in parallel */
for (i = i_s; i < i_e; i++) {
void *phys;
size_t pgsize;
int p2align;
/*
* XXX: No page structures for anonymous mappings.
* Look up physical addresses by scanning page tables.
*/
mpsr->ptep[i] = ihk_mc_pt_lookup_pte(vm->address_space->page_table,
(void *)mpsr->virt_addr[i], 0, &phys, &pgsize, &p2align);
/* PTE valid? */
if (!mpsr->ptep[i] || !pte_is_present(mpsr->ptep[i])) {
mpsr->status[i] = -ENOENT;
mpsr->ptep[i] = NULL;
continue;
}
/* PTE is file? */
if (pte_is_fileoff(mpsr->ptep[i], PAGE_SIZE)) {
mpsr->status[i] = -EINVAL;
mpsr->ptep[i] = NULL;
continue;
}
dkprintf("%s: virt 0x%lx:%lu requested to be moved to node %d\n",
__FUNCTION__, mpsr->virt_addr[i], pgsize, mpsr->nodes[i]);
/* Large page? */
if (pgsize > PAGE_SIZE) {
int nr_sub_pages = (pgsize / PAGE_SIZE);
int j;
if (i + nr_sub_pages > count) {
kprintf("%s: ERROR: page at index %d exceeds the region\n",
__FUNCTION__, i);
mpsr->status[i] = -EINVAL;
break;
}
/* Is it contiguous across nr_sub_pages and all
* requested to be moved to the same target node? */
for (j = 0; j < nr_sub_pages; ++j) {
if (mpsr->virt_addr[i + j] !=
(mpsr->virt_addr[i] + (j * PAGE_SIZE)) ||
mpsr->nodes[i] != mpsr->nodes[i + j]) {
kprintf("%s: ERROR: virt address or node at index %d"
" is inconsistent\n",
__FUNCTION__, i + j);
mpsr->phase_ret = -EINVAL;
goto pte_out;
}
}
mpsr->nr_pages[i] = nr_sub_pages;
i += (nr_sub_pages - 1);
}
else {
mpsr->nr_pages[i] = 1;
}
}
pte_out:
/* Barrier */
ihk_atomic_inc(&mpsr->phase_done);
while (ihk_atomic_read(&mpsr->phase_done) <
(phase * nr_cpus)) {
cpu_pause();
}
if (mpsr->phase_ret != 0) {
goto out;
}
dkprintf("%s: phase %d done\n", __FUNCTION__, phase);
++phase;
if (cpu_index == 0) {
/* Allocate new pages on target NUMA nodes */
for (i = 0; i < count; i++) {
int pgalign = 0;
int j;
void *dst;
if (!mpsr->ptep[i] || mpsr->status[i] < 0 || !mpsr->nr_pages[i])
continue;
/* TODO: store pgalign info in an array as well? */
if (mpsr->nr_pages[i] > 1) {
if (mpsr->nr_pages[i] * PAGE_SIZE == PTL2_SIZE)
pgalign = PTL2_SHIFT - PTL1_SHIFT;
}
dst = ihk_mc_alloc_aligned_pages_node(mpsr->nr_pages[i],
pgalign, IHK_MC_AP_USER, mpsr->nodes[i]);
if (!dst) {
mpsr->status[i] = -ENOMEM;
continue;
}
for (j = i; j < (i + mpsr->nr_pages[i]); ++j) {
mpsr->status[j] = mpsr->nodes[i];
}
mpsr->dst_phys[i] = virt_to_phys(dst);
dkprintf("%s: virt 0x%lx:%lu to node %d, pgalign: %d,"
" allocated phys: 0x%lx\n",
__FUNCTION__, mpsr->virt_addr[i],
mpsr->nr_pages[i] * PAGE_SIZE,
mpsr->nodes[i], pgalign, mpsr->dst_phys[i]);
}
}
/* Barrier */
ihk_atomic_inc(&mpsr->phase_done);
while (ihk_atomic_read(&mpsr->phase_done) <
(phase * nr_cpus)) {
cpu_pause();
}
if (mpsr->phase_ret != 0) {
goto out;
}
dkprintf("%s: phase %d done\n", __FUNCTION__, phase);
++phase;
/* Copy, PTE update, memfree in parallel */
for (i = i_s; i < i_e; ++i) {
if (!mpsr->dst_phys[i])
continue;
fast_memcpy(phys_to_virt(mpsr->dst_phys[i]),
phys_to_virt(pte_get_phys(mpsr->ptep[i])),
mpsr->nr_pages[i] * PAGE_SIZE);
ihk_mc_free_pages(
phys_to_virt(pte_get_phys(mpsr->ptep[i])),
mpsr->nr_pages[i]);
pte_update_phys(mpsr->ptep[i], mpsr->dst_phys[i]);
dkprintf("%s: virt 0x%lx:%lu copied and remapped to phys: 0x%lu\n",
__FUNCTION__, mpsr->virt_addr[i],
mpsr->nr_pages[i] * PAGE_SIZE,
mpsr->dst_phys[i]);
}
/* XXX: do a separate SMP call with only CPUs running threads
* of this process? */
if (cpu_local_var(current)->proc == mpsr->proc) {
/* Invalidate all TLBs */
for (i = 0; i < mpsr->count; i++) {
if (!mpsr->dst_phys[i])
continue;
flush_tlb_single((unsigned long)mpsr->virt_addr[i]);
}
}
out:
if (save_pt) {
ihk_mc_load_page_table(save_pt);
}
return mpsr->phase_ret;
}
/*
 * Return the current time in seconds as computed from the TSC, or 0 when
 * local time calculation is not supported (gettime_local_support unset).
 */
time_t time(void) {
	struct timespec now;

	if (!gettime_local_support) {
		return (time_t)0;
	}

	calculate_time_from_tsc(&now);
	return now.tv_sec;
}
/*** End of File ***/

View File

@ -1,7 +0,0 @@
IHK_OBJS += cpu.o interrupt.o memory.o trampoline.o local.o context.o
IHK_OBJS += perfctr.o syscall.o vsyscall.o
# POSTK_DEBUG_ARCH_DEP_18 coredump arch separation.
# IHK_OBJS added coredump.o
ifeq ($(ARCH), arm64)
IHK_OBJS += coredump.o
endif

View File

@ -0,0 +1,2 @@
IHK_OBJS += cpu.o interrupt.o memory.o trampoline.o local.o context.o
IHK_OBJS += perfctr.o syscall.o vsyscall.o

View File

@ -849,6 +849,7 @@ void setup_x86_ap(void (*next_func)(void))
void arch_show_interrupt_context(const void *reg);
void set_signal(int sig, void *regs, struct siginfo *info);
void check_signal(unsigned long, void *, int);
void check_sig_pending();
extern void tlb_flush_handler(int vector);
void __show_stack(uintptr_t *sp) {
@ -870,6 +871,19 @@ void show_context_stack(uintptr_t *rbp) {
return;
}
/*
 * Common epilogue of the interrupt/exception handlers.
 *
 * When the interrupt was taken in kernel context only the pending-signal
 * check is run. When it came from user space, interrupts are re-enabled
 * first and then pending signals, rescheduling and signal delivery are
 * processed, in that order.
 */
void interrupt_exit(struct x86_user_context *regs)
{
	if (!interrupt_from_user(regs)) {
		check_sig_pending();
		return;
	}

	cpu_enable_interrupt();
	check_sig_pending();
	check_need_resched();
	check_signal(0, regs, 0);
}
void handle_interrupt(int vector, struct x86_user_context *regs)
{
struct ihk_mc_interrupt_handler *h;
@ -992,12 +1006,8 @@ void handle_interrupt(int vector, struct x86_user_context *regs)
}
}
if(interrupt_from_user(regs)){
cpu_enable_interrupt();
check_signal(0, regs, 0);
check_need_resched();
}
set_cputime(0);
interrupt_exit(regs);
set_cputime(interrupt_from_user(regs)? 0: 1);
--v->in_interrupt;
}
@ -1012,13 +1022,9 @@ void gpe_handler(struct x86_user_context *regs)
panic("gpe_handler");
}
set_signal(SIGSEGV, regs, NULL);
if(interrupt_from_user(regs)){
cpu_enable_interrupt();
check_signal(0, regs, 0);
check_need_resched();
}
set_cputime(0);
// panic("GPF");
interrupt_exit(regs);
set_cputime(interrupt_from_user(regs)? 0: 1);
panic("GPF");
}
void debug_handler(struct x86_user_context *regs)
@ -1045,12 +1051,8 @@ void debug_handler(struct x86_user_context *regs)
memset(&info, '\0', sizeof info);
info.si_code = si_code;
set_signal(SIGTRAP, regs, &info);
if(interrupt_from_user(regs)){
cpu_enable_interrupt();
check_signal(0, regs, 0);
check_need_resched();
}
set_cputime(0);
interrupt_exit(regs);
set_cputime(interrupt_from_user(regs)? 0: 1);
}
void int3_handler(struct x86_user_context *regs)
@ -1067,12 +1069,8 @@ void int3_handler(struct x86_user_context *regs)
memset(&info, '\0', sizeof info);
info.si_code = TRAP_BRKPT;
set_signal(SIGTRAP, regs, &info);
if(interrupt_from_user(regs)){
cpu_enable_interrupt();
check_signal(0, regs, 0);
check_need_resched();
}
set_cputime(0);
interrupt_exit(regs);
set_cputime(interrupt_from_user(regs)? 0: 1);
}
void
@ -1081,7 +1079,6 @@ unhandled_page_fault(struct thread *thread, void *fault_addr, void *regs)
const uintptr_t address = (uintptr_t)fault_addr;
struct process_vm *vm = thread->vm;
struct vm_range *range;
char found;
unsigned long irqflags;
unsigned long error = ((struct x86_user_context *)regs)->gpr.error;
@ -1095,17 +1092,12 @@ unhandled_page_fault(struct thread *thread, void *fault_addr, void *regs)
(error & PF_RSVD ? "was" : "wasn't"),
(error & PF_INSTR ? "was" : "wasn't"));
found = 0;
list_for_each_entry(range, &vm->vm_range_list, list) {
if (range->start <= address && range->end > address) {
found = 1;
__kprintf("address is in range, flag: 0x%lx\n",
range->flag);
ihk_mc_pt_print_pte(vm->address_space->page_table, (void*)address);
break;
}
}
if (!found) {
range = lookup_process_memory_range(vm, address, address+1);
if (range) {
__kprintf("address is in range, flag: 0x%lx\n",
range->flag);
ihk_mc_pt_print_pte(vm->address_space->page_table, (void*)address);
} else {
__kprintf("address is out of range! \n");
}
@ -1233,6 +1225,13 @@ void cpu_pause(void)
asm volatile("pause" ::: "memory");
}
/* From: kernel-xppsl_1.5.2/arch/x86/include/asm/processor.h */
/*
 * REP NOP (PAUSE) is a good thing to insert into busy-wait loops.
 * The "memory" clobber additionally keeps the compiler from caching
 * memory values across the call, so a polled variable is re-read on
 * every loop iteration.
 */
void cpu_relax(void)
{
asm volatile("rep; nop" ::: "memory");
}
/*@
@ assigns \nothing;
@ ensures \interrupt_disabled > 0;
@ -1473,29 +1472,91 @@ void ihk_mc_delay_us(int us)
arch_delay(us);
}
#define EXTENDED_ARCH_SHOW_CONTEXT
#ifdef EXTENDED_ARCH_SHOW_CONTEXT
void arch_show_extended_context(void)
{
unsigned long cr0, cr4, msr, xcr0;
unsigned long cr0, cr4, msr, xcr0 = 0;
/* Read and print CRs, MSR_EFER, XCR0 */
asm volatile("movq %%cr0, %0" : "=r"(cr0));
asm volatile("movq %%cr4, %0" : "=r"(cr4));
msr = rdmsr(MSR_EFER);
xcr0 = xgetbv(0);
if (xsave_available) {
xcr0 = xgetbv(0);
}
__kprintf("\n CR0 CR4\n");
__kprintf("%016lX %016lX\n", cr0, cr4);
__kprintf(" MSR_EFER\n");
__kprintf("%016lX\n", msr);
__kprintf(" XCR0\n");
__kprintf("%016lX\n", xcr0);
if (xsave_available) {
__kprintf(" XCR0\n");
__kprintf("%016lX\n", xcr0);
}
}
/*
 * Layout of one frame record as walked by the stack printers: the saved
 * frame pointer of the caller followed by the return address.
 * (presumably relies on the kernel being built with frame pointers -- verify)
 */
struct stack {
struct stack *rbp;
unsigned long eip;
};
/* KPRINTF_LOCAL_BUF_LEN is 1024, useless to go further */
#define STACK_BUF_LEN (1024-sizeof("[ 0]: "))
/*
 * Print a stack trace as one ready-to-paste addr2line command line.
 *
 * @rbp:   first frame record to walk
 * @first: extra address emitted before the walked frames (skipped if 0)
 *
 * Frames are followed while the frame pointer lies above
 * 0xffff880000000000 and the buffer still has room for another address.
 */
static void __print_stack(struct stack *rbp, unsigned long first) {
	char line[STACK_BUF_LEN];
	size_t used;
	struct stack *frame;

	/* Build string in buffer to output a single line */
	used = snprintf(line, STACK_BUF_LEN,
			"addr2line -e smp-x86/kernel/mckernel.img -fpia");

	if (first) {
		used += snprintf(line + used, STACK_BUF_LEN - used,
				" %#16lx", first);
	}

	for (frame = rbp;
	     (unsigned long)frame > 0xffff880000000000 &&
	     STACK_BUF_LEN - used > sizeof(" 0x0123456789abcdef");
	     frame = frame->rbp) {
		used += snprintf(line + used, STACK_BUF_LEN - used,
				" %#16lx", frame->eip);
	}

	__kprintf("%s\n", line);
}
/*
 * Print the stack trace of the context that was interrupted.
 *
 * @regs: register frame saved at interrupt entry
 *
 * Nothing is printed when the PF_USER bit is set in regs->error, i.e.
 * only kernel stacks are walked.
 * NOTE(review): the error field is tested with a page fault flag --
 * confirm this is meaningful for non page fault vectors.
 */
void arch_print_pre_interrupt_stack(const struct x86_basic_regs *regs) {
struct stack *rbp;
/* only for kernel stack */
if (regs->error & PF_USER)
return;
__kprintf("Pre-interrupt stack trace:\n");
/* interrupt stack heuristics:
* - the first entry looks like it is always garbage, so skip.
* (that is done by taking regs->rsp instead of &regs->rsp)
* - that still looks sometimes wrong. For now, if it is not
* within 64k of itself, look for the next entry that matches.
*/
rbp = (struct stack*)regs->rsp;
/* Advance one 8-byte word at a time until the candidate's saved frame
 * pointer points upwards and at most 64k away */
while ((uintptr_t)rbp > (uintptr_t)rbp->rbp
|| (uintptr_t)rbp + 0x10000 < (uintptr_t)rbp->rbp)
rbp = (struct stack *)(((uintptr_t *)rbp) + 1);
/* The interrupted instruction pointer is printed as the first entry */
__print_stack(rbp, regs->rip);
}
/*
 * Print a stack trace of the current call chain, starting from the
 * frame pointer (%rbp) of this function.
 */
void arch_print_stack() {
struct stack *rbp;
__kprintf("Approximative stack trace:\n");
/* Read the current frame pointer */
asm("mov %%rbp, %0" : "=r"(rbp) );
__print_stack(rbp, 0);
}
#endif
/*@
@ requires \valid(reg);
@ -1526,9 +1587,11 @@ void arch_show_interrupt_context(const void *reg)
__kprintf("%16lx %16lx %16lx %16lx\n",
regs->cs, regs->ss, regs->rflags, regs->error);
#ifdef EXTENDED_ARCH_SHOW_CONTEXT
arch_show_extended_context();
#endif
kprintf_unlock(irqflags);
return;
arch_show_extended_context();
arch_print_pre_interrupt_stack(regs);
kprintf_unlock(irqflags);
}
@ -1651,13 +1714,11 @@ release_fp_regs(struct thread *thread)
thread->fp_regs = NULL;
}
/*@
@ requires \valid(thread);
@*/
void
save_fp_regs(struct thread *thread)
static int
check_and_allocate_fp_regs(struct thread *thread)
{
int pages;
int pages;
int result = 0;
if (!thread->fp_regs) {
pages = (xsave_size + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
@ -1666,12 +1727,26 @@ save_fp_regs(struct thread *thread)
if (!thread->fp_regs) {
kprintf("error: allocating fp_regs pages\n");
return;
result = 1;
goto out;
}
memset(thread->fp_regs, 0, sizeof(fp_regs_struct));
memset(thread->fp_regs, 0, pages * PAGE_SIZE);
}
out:
return result;
}
/*@
@ requires \valid(thread);
@*/
void
save_fp_regs(struct thread *thread)
{
if (check_and_allocate_fp_regs(thread) != 0) {
// alloc error
return;
}
if (xsave_available) {
unsigned int low, high;
@ -1687,6 +1762,13 @@ save_fp_regs(struct thread *thread)
}
}
/*
 * Copy the saved FP register state of thread 'from' into thread 'to'.
 *
 * Does nothing when 'from' has no saved state or when the save area of
 * 'to' cannot be allocated.
 * NOTE(review): sizeof(fp_regs_struct) bytes are copied while the save
 * area is allocated from xsave_size -- confirm both sizes agree.
 */
void copy_fp_regs(struct thread *from, struct thread *to)
{
	if (from->fp_regs == NULL) {
		return;
	}

	if (check_and_allocate_fp_regs(to) != 0) {
		return;
	}

	memcpy(to->fp_regs, from->fp_regs, sizeof(fp_regs_struct));
}
#ifdef POSTK_DEBUG_TEMP_FIX_19
void
clear_fp_regs(struct thread *thread)

View File

@ -289,7 +289,7 @@ int gencore(struct thread *thread, void *regs,
Elf64_Ehdr eh;
Elf64_Phdr *ph = NULL;
void *note = NULL;
struct vm_range *range;
struct vm_range *range, *next;
struct process_vm *vm = thread->vm;
int segs = 1; /* the first one is for NOTE */
int notesize, phsize, alignednotesize;
@ -303,13 +303,18 @@ int gencore(struct thread *thread, void *regs,
return -1;
}
list_for_each_entry(range, &vm->vm_range_list, list) {
next = lookup_process_memory_range(vm, 0, -1);
while ((range = next)) {
next = next_process_memory_range(vm, range);
dkprintf("start:%lx end:%lx flag:%lx objoff:%lx\n",
range->start, range->end, range->flag, range->objoff);
/* We omit reserved areas because they are only for
mckernel's internal use. */
if (range->flag & VR_RESERVED)
continue;
if (range->flag & VR_DONTDUMP)
continue;
/* We need a chunk for each page for a demand paging area.
This can be optimized for space complexity but we would
lose simplicity instead. */
@ -391,7 +396,10 @@ int gencore(struct thread *thread, void *regs,
/* program header for each memory chunk */
i = 1;
list_for_each_entry(range, &vm->vm_range_list, list) {
next = lookup_process_memory_range(vm, 0, -1);
while ((range = next)) {
next = next_process_memory_range(vm, range);
unsigned long flag = range->flag;
unsigned long size = range->end - range->start;
@ -432,7 +440,10 @@ int gencore(struct thread *thread, void *regs,
dkprintf("coretable[2]: %lx@%lx(%lx)\n", ct[2].len, ct[2].addr, note);
i = 3; /* memory segments */
list_for_each_entry(range, &vm->vm_range_list, list) {
next = lookup_process_memory_range(vm, 0, -1);
while ((range = next)) {
next = next_process_memory_range(vm, range);
unsigned long phys;
if (range->flag & VR_RESERVED)

View File

@ -64,7 +64,6 @@ static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval,
return oldval;
}
#ifdef POSTK_DEBUG_ARCH_DEP_8 /* arch depend hide */
static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
{
int op = (encoded_op >> 28) & 7;
@ -128,6 +127,13 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
}
return ret;
}
#endif /* !POSTK_DEBUG_ARCH_DEP_8 */
/*
 * Read the futex word at 'from' with a single volatile load and store it
 * in '*dest'. Always returns 0.
 */
static inline int get_futex_value_locked(uint32_t *dest, uint32_t *from)
{
	volatile uint32_t *src = (volatile uint32_t *)from;

	*dest = *src;
	return 0;
}
#endif

View File

@ -14,7 +14,17 @@
int __kprintf(const char *format, ...);
#endif
typedef int ihk_spinlock_t;
/* One ticket counter */
typedef unsigned short __ticket_t;
/* Both ticket counters viewed as a single word */
typedef unsigned int __ticketpair_t;
/*
 * Spinlock made of two ticket counters (head and tail) that can also be
 * accessed together through head_tail; a value of 0 in head_tail is the
 * unlocked initial state.
 */
typedef struct ihk_spinlock {
union {
__ticketpair_t head_tail;
struct __raw_tickets {
__ticket_t head, tail;
} tickets;
};
} ihk_spinlock_t;
extern void preempt_enable(void);
extern void preempt_disable(void);
@ -23,9 +33,9 @@ extern void preempt_disable(void);
static void ihk_mc_spinlock_init(ihk_spinlock_t *lock)
{
*lock = 0;
lock->head_tail = 0;
}
#define SPIN_LOCK_UNLOCKED 0
#define SPIN_LOCK_UNLOCKED { .head_tail = 0 }
#ifdef DEBUG_SPINLOCK
#define ihk_mc_spinlock_lock_noirq(l) { \
@ -39,40 +49,24 @@ __kprintf("[%d] ret ihk_mc_spinlock_lock_noirq\n", ihk_mc_get_processor_id()); \
static void __ihk_mc_spinlock_lock_noirq(ihk_spinlock_t *lock)
{
int inc = 0x00010000;
int tmp;
#if 0
asm volatile("lock ; xaddl %0, %1\n"
"movzwl %w0, %2\n\t"
"shrl $16, %0\n\t"
"1:\t"
"cmpl %0, %2\n\t"
"je 2f\n\t"
"rep ; nop\n\t"
"movzwl %1, %2\n\t"
"jmp 1b\n"
"2:"
: "+Q" (inc), "+m" (*lock), "=r" (tmp) : : "memory", "cc");
#endif
register struct __raw_tickets inc = { .tail = 0x0002 };
preempt_disable();
asm volatile("lock; xaddl %0, %1\n"
"movzwl %w0, %2\n\t"
"shrl $16, %0\n\t"
"1:\t"
"cmpl %0, %2\n\t"
"je 2f\n\t"
"rep ; nop\n\t"
"movzwl %1, %2\n\t"
/* don't need lfence here, because loads are in-order */
"jmp 1b\n"
"2:"
: "+r" (inc), "+m" (*lock), "=&r" (tmp)
:
: "memory", "cc");
asm volatile ("lock xaddl %0, %1\n"
: "+r" (inc), "+m" (*(lock)) : : "memory", "cc");
if (inc.head == inc.tail)
goto out;
for (;;) {
if (*((volatile __ticket_t *)&lock->tickets.head) == inc.tail)
goto out;
cpu_pause();
}
out:
barrier(); /* make sure nothing creeps before the lock is taken */
}
#ifdef DEBUG_SPINLOCK
@ -106,8 +100,11 @@ __kprintf("[%d] ret ihk_mc_spinlock_unlock_noirq\n", ihk_mc_get_processor_id());
#endif
static void __ihk_mc_spinlock_unlock_noirq(ihk_spinlock_t *lock)
{
asm volatile ("lock incw %0" : "+m"(*lock) : : "memory", "cc");
__ticket_t inc = 0x0002;
asm volatile ("lock addw %1, %0\n"
: "+m" (lock->tickets.head) : "ri" (inc) : "memory", "cc");
preempt_enable();
}
@ -134,6 +131,8 @@ typedef struct mcs_lock_node {
unsigned long irqsave;
} __attribute__((aligned(64))) mcs_lock_node_t;
typedef mcs_lock_node_t mcs_lock_t;
static void mcs_lock_init(struct mcs_lock_node *node)
{
node->locked = 0;

View File

@ -40,18 +40,42 @@
#define LARGE_PAGE_MASK (~((unsigned long)LARGE_PAGE_SIZE - 1))
#define LARGE_PAGE_P2ALIGN (LARGE_PAGE_SHIFT - PAGE_SHIFT)
#define GB_PAGE_SHIFT 30
#define GB_PAGE_SIZE (1UL << GB_PAGE_SHIFT)
#define GB_PAGE_MASK (~((unsigned long)GB_PAGE_SIZE - 1))
#define GB_PAGE_P2ALIGN (GB_PAGE_SHIFT - PAGE_SHIFT)
#define USER_END 0x0000800000000000UL
#define TASK_UNMAPPED_BASE 0x00002AAAAAA00000UL
/*
* Canonical negative addresses (i.e., the smallest kernel virtual address)
* on x86 64 bit mode (in its most restricted 48 bit format) starts from
* 0xffff800000000000, but Linux starts mapping physical memory at 0xffff880000000000.
* The 0x80000000000 long gap (8TBs, i.e., 16 PGD level entries in the page tables)
* is used for the Xen hypervisor (see arch/x86/include/asm/page.h) and that is
* what we utilize for McKernel.
* This gives us the benefit of being able to use Linux kernel virtual
* addresses identically as in Linux.
*
* NOTE: update these also in eclair.c when modified!
*/
#define MAP_ST_START 0xffff800000000000UL
#define MAP_VMAP_START 0xfffff00000000000UL
#define MAP_FIXED_START 0xffffffff70000000UL
#define MAP_KERNEL_START 0xffffffff80000000UL
#define MAP_VMAP_START 0xffff850000000000UL
#define MAP_FIXED_START 0xffff860000000000UL
#define LINUX_PAGE_OFFSET 0xffff880000000000UL
/*
* MAP_KERNEL_START is 8MB below MODULES_END in Linux.
* Placing the LWK image in the virtual address space at the end of
* the Linux modules section enables us to map the LWK TEXT in Linux
* as well, so that Linux can also call into LWK text.
*/
#define MAP_KERNEL_START 0xFFFFFFFFFE800000UL
#define STACK_TOP(region) ((region)->user_end)
#define MAP_VMAP_SIZE 0x0000000100000000UL
#define KERNEL_PHYS_OFFSET MAP_ST_START
#define PTL4_SHIFT 39
#define PTL4_SIZE (1UL << PTL4_SHIFT)
#define PTL3_SHIFT 30

View File

@ -13,16 +13,16 @@
#ifndef ARCH_CPU_H
#define ARCH_CPU_H
#include <ihk/cpu.h>
#define arch_barrier() asm volatile("" : : : "memory")
static inline void rmb(void)
{
barrier();
arch_barrier();
}
static inline void wmb(void)
{
barrier();
arch_barrier();
}
static unsigned long read_tsc(void)

View File

@ -133,7 +133,7 @@ static inline void ihk_atomic64_inc(ihk_atomic64_t *v)
* Note 2: xchg has side effect, so that attribute volatile is necessary,
* but generally the primitive is invalid, *ptr is output argument. --ANK
*/
#define __xg(x) ((volatile long *)(x))
#define __xg(x) ((volatile typeof(x))(x))
#define xchg4(ptr, x) \
({ \

View File

@ -49,6 +49,7 @@ typedef struct x86_user_context ihk_mc_user_context_t;
#define ihk_mc_syscall_arg5(uc) (uc)->gpr.r9
#define ihk_mc_syscall_ret(uc) (uc)->gpr.rax
#define ihk_mc_syscall_number(uc) (uc)->gpr.orig_rax
#define ihk_mc_syscall_pc(uc) (uc)->gpr.rip
#define ihk_mc_syscall_sp(uc) (uc)->gpr.rsp

View File

@ -189,9 +189,30 @@ struct tss64 {
} __attribute__((packed));
struct x86_basic_regs {
unsigned long r15, r14, r13, r12, rbp, rbx, r11, r10;
unsigned long r9, r8, rax, rcx, rdx, rsi, rdi, error;
unsigned long rip, cs, rflags, rsp, ss;
unsigned long r15;
unsigned long r14;
unsigned long r13;
unsigned long r12;
unsigned long rbp;
unsigned long rbx;
unsigned long r11;
unsigned long r10;
unsigned long r9;
unsigned long r8;
unsigned long rax;
unsigned long rcx;
unsigned long rdx;
unsigned long rsi;
unsigned long rdi;
union {
unsigned long orig_rax; /* syscall */
unsigned long error; /* interrupts */
};
unsigned long rip;
unsigned long cs;
unsigned long rflags;
unsigned long rsp;
unsigned long ss;
};
struct x86_sregs {

View File

@ -18,6 +18,11 @@
#define _NSIG_BPW 64
#define _NSIG_WORDS (_NSIG / _NSIG_BPW)
/*
 * Return 1 if 'sig' is not larger than _NSIG, 0 otherwise.
 * Note that 0 is accepted as well.
 */
static inline int valid_signal(unsigned long sig)
{
	if (sig > _NSIG) {
		return 0;
	}
	return 1;
}
typedef unsigned long int __sigset_t;
#define __sigmask(sig) (((__sigset_t) 1) << ((sig) - 1))

View File

@ -39,7 +39,7 @@ SYSCALL_HANDLED(15, rt_sigreturn)
SYSCALL_HANDLED(16, ioctl)
SYSCALL_DELEGATED(17, pread64)
SYSCALL_DELEGATED(18, pwrite64)
SYSCALL_DELEGATED(20, writev)
SYSCALL_HANDLED(20, writev)
SYSCALL_DELEGATED(21, access)
SYSCALL_DELEGATED(23, select)
SYSCALL_HANDLED(24, sched_yield)
@ -56,7 +56,7 @@ SYSCALL_HANDLED(36, getitimer)
SYSCALL_HANDLED(38, setitimer)
SYSCALL_HANDLED(39, getpid)
SYSCALL_HANDLED(56, clone)
SYSCALL_DELEGATED(57, fork)
SYSCALL_HANDLED(57, fork)
SYSCALL_HANDLED(58, vfork)
SYSCALL_HANDLED(59, execve)
SYSCALL_HANDLED(60, exit)

View File

@ -145,6 +145,8 @@ nmi:
movq %rsp,%gs:PANIC_REGS+0x08
movl nmi_mode(%rip),%eax
cmp $3,%rax
je 4f
cmp $1,%rax
je 1f
cmp $2,%rax
@ -199,9 +201,9 @@ nmi:
movl %eax,%gs:PANIC_REGS+0xA0
movq $1,%gs:PANICED
call ihk_mc_query_mem_areas
1:
4:
hlt
jmp 1b
jmp 4b
.globl x86_syscall
x86_syscall:

View File

@ -107,9 +107,17 @@ void init_boot_processor_local(void)
@ ensures \result == %gs;
@ assigns \nothing;
*/
extern int num_processors;
int ihk_mc_get_processor_id(void)
{
int id;
void *gs;
gs = (void *)rdmsr(MSR_GS_BASE);
if (gs < (void *)locals ||
gs > ((void *)locals + LOCALS_SPAN * num_processors)) {
return -1;
}
asm volatile("movl %%gs:0, %0" : "=r"(id));

View File

@ -41,6 +41,8 @@ extern char _head[], _end[];
extern unsigned long x86_kernel_phys_base;
int safe_kernel_map = 0;
/* Arch specific early allocation routine */
void *early_alloc_pages(int nr_pages)
{
@ -109,6 +111,7 @@ struct page_table {
};
static struct page_table *init_pt;
static int init_pt_loaded = 0;
static ihk_spinlock_t init_pt_lock;
static int use_1gb_page = 0;
@ -172,19 +175,23 @@ static void init_normal_area(struct page_table *pt)
unsigned long map_start, map_end, phys, pt_phys;
int ident_index, virt_index;
map_start = ihk_mc_get_memory_address(IHK_MC_GMA_MAP_START, 0);
/*
* This has to start from 0x00, see load_file() in IHK-SMP.
* For security reasons, we could skip holes in the LWK
* assigned physical memory, but Linux mappings already map
* those anyway.
*/
map_start = 0;
map_end = ihk_mc_get_memory_address(IHK_MC_GMA_MAP_END, 0);
kprintf("map_start = %lx, map_end = %lx\n", map_start, map_end);
ident_index = map_start >> PTL4_SHIFT;
virt_index = (MAP_ST_START >> PTL4_SHIFT) & (PT_ENTRIES - 1);
memset(pt, 0, sizeof(struct page_table));
for (phys = (map_start & ~(PTL4_SIZE - 1)); phys < map_end;
phys += PTL4_SIZE) {
pt_phys = setup_l3(ihk_mc_alloc_pages(1, IHK_MC_AP_CRITICAL), phys,
map_start, map_end);
for (phys = map_start; phys < map_end; phys += PTL4_SIZE) {
pt_phys = setup_l3(ihk_mc_alloc_pages(1, IHK_MC_AP_CRITICAL),
phys, map_start, map_end);
pt->entry[ident_index++] = pt_phys | PFL4_PDIR_ATTR;
pt->entry[virt_index++] = pt_phys | PFL4_PDIR_ATTR;
@ -493,7 +500,7 @@ uint64_t ihk_mc_pt_virt_to_pagemap(struct page_table *pt, unsigned long virt)
error = ihk_mc_pt_virt_to_phys(pt, (void *)virt, &phys);
if (error) {
return 0;
return PM_PSHIFT(PAGE_SHIFT);
}
pagemap = PM_PFRAME(phys >> PAGE_SHIFT);
@ -724,6 +731,26 @@ static void destroy_page_table(int level, struct page_table *pt)
return;
}
/*
 * Destroy the whole page table subtree hanging off the top level (PML4)
 * entry that covers 'virt' and clear that entry.
 *
 * @pt:   top level page table
 * @virt: any virtual address inside the region to be torn down
 *
 * Nothing is done when the top level entry is not present.
 */
void ihk_mc_pt_destroy_pgd_subtree(struct page_table *pt, void *virt)
{
	int l4idx, l3idx, l2idx, l1idx;
	unsigned long v = (unsigned long)virt;
	struct page_table *l3_table;

	/* Only l4idx is used below, the macro fills in all four indices */
	GET_VIRT_INDICES(v, l4idx, l3idx, l2idx, l1idx);

	if ((pt->entry[l4idx] & PF_PRESENT)) {
		l3_table = (struct page_table *)
			phys_to_virt(pt->entry[l4idx] & PT_PHYSMASK);
		destroy_page_table(3, l3_table);

		pt->entry[l4idx] = 0;

		dkprintf("%s: virt: 0x%lx, l4idx: %d subtree destroyed\n",
				__FUNCTION__, virt, l4idx);
	}
}
void ihk_mc_pt_destroy(struct page_table *pt)
{
const int level = 4; /* PML4 */
@ -1542,7 +1569,6 @@ static int clear_range_l1(void *args0, pte_t *ptep, uint64_t base,
dkprintf("%lx-,%s: calling memory_stat_rss_sub(),phys=%lx,size=%ld,pgsize=%ld\n", pte_get_phys(&old), __FUNCTION__, pte_get_phys(&old), PTL1_SIZE, PTL1_SIZE);
rusage_memory_stat_sub(args->memobj, PTL1_SIZE, PTL1_SIZE);
}
args->vm->currss -= PTL1_SIZE;
} else {
dkprintf("%s: !calling memory_stat_rss_sub(),virt=%lx,phys=%lx\n", __FUNCTION__, base, pte_get_phys(&old));
}
@ -1611,7 +1637,6 @@ static int clear_range_l2(void *args0, pte_t *ptep, uint64_t base,
dkprintf("%lx-,%s: calling memory_stat_rss_sub(),phys=%lx,size=%ld,pgsize=%ld\n", pte_get_phys(&old), __FUNCTION__, pte_get_phys(&old), PTL2_SIZE, PTL2_SIZE);
rusage_memory_stat_sub(args->memobj, PTL2_SIZE, PTL2_SIZE);
}
args->vm->currss -= PTL2_SIZE;
}
}
@ -1693,7 +1718,6 @@ static int clear_range_l3(void *args0, pte_t *ptep, uint64_t base,
dkprintf("%lx-,%s: calling memory_stat_rss_sub(),phys=%lx,size=%ld,pgsize=%ld\n", pte_get_phys(&old), __FUNCTION__, pte_get_phys(&old), PTL3_SIZE, PTL3_SIZE);
rusage_memory_stat_sub(args->memobj, PTL3_SIZE, PTL3_SIZE);
}
args->vm->currss -= PTL3_SIZE;
}
}
@ -1963,6 +1987,28 @@ out:
return ptep;
}
/*
 * Look up the PTE of 'virt' in the page table of 'vm'; when it is missing
 * or not present, fault the page in once and look it up again.
 *
 * The remaining arguments are passed through to ihk_mc_pt_lookup_pte().
 * Returns whatever the last lookup returned, which may still be NULL or a
 * non-present entry if the fault did not populate the page.
 */
pte_t *ihk_mc_pt_lookup_fault_pte(struct process_vm *vm, void *virt,
		int pgshift, void **basep, size_t *sizep, int *p2alignp)
{
	pte_t *entry;

	entry = ihk_mc_pt_lookup_pte(vm->address_space->page_table,
			virt, pgshift, basep, sizep, p2alignp);
	if (entry && pte_is_present(entry)) {
		/* Already mapped, no fault needed */
		return entry;
	}

	/* Populate the page and retry exactly once */
	page_fault_process_vm(vm, virt, PF_POPULATE | PF_USER);

	entry = ihk_mc_pt_lookup_pte(vm->address_space->page_table,
			virt, pgshift, basep, sizep, p2alignp);
	if (entry && pte_is_present(entry)) {
		kprintf("%s: successfully faulted 0x%lx\n", __FUNCTION__, virt);
	}

	return entry;
}
pte_t *ihk_mc_pt_lookup_pte(page_table_t pt, void *virt, int pgshift,
void **basep, size_t *sizep, int *p2alignp)
{
@ -2262,7 +2308,7 @@ out:
int ihk_mc_pt_set_range(page_table_t pt, struct process_vm *vm, void *start,
void *end, uintptr_t phys, enum ihk_mc_pt_attribute attr,
int pgshift, struct vm_range *range)
int pgshift, struct vm_range *range)
{
int error;
struct set_range_args args;
@ -2606,6 +2652,61 @@ void init_low_area(struct page_table *pt)
set_pt_large_page(pt, 0, 0, PTATTR_NO_EXECUTE|PTATTR_WRITABLE);
}
/*
 * Map physical memory at LINUX_PAGE_OFFSET in page table 'pt' using large
 * pages, so that physical address p is reachable at LINUX_PAGE_OFFSET + p.
 *
 * Depending on safe_kernel_map either a fixed 2 TB range starting at
 * physical address 0 is mapped, or only the memory chunks reported by
 * ihk_mc_get_memory_chunk(). Mapping failures are logged and otherwise
 * ignored.
 */
static void init_linux_kernel_mapping(struct page_table *pt)
{
unsigned long map_start, map_end, phys;
void *virt;
int nr_memory_chunks, chunk_id, numa_id;
/* With the safe_kernel_map option (safe_kernel_map == 1) only the
memory chunks assigned to McKernel are mapped, which prevents the
destruction of memory areas on the Linux side */
if (safe_kernel_map == 0) {
kprintf("Straight-map entire physical memory\n");
/* Map 2 TB for now */
map_start = 0;
map_end = 0x20000000000;
virt = (void *)LINUX_PAGE_OFFSET;
kprintf("Linux kernel virtual: 0x%lx - 0x%lx -> 0x%lx - 0x%lx\n",
LINUX_PAGE_OFFSET, LINUX_PAGE_OFFSET + map_end, 0, map_end);
/* One large page per iteration; virt advances in step with phys */
for (phys = map_start; phys < map_end; phys += LARGE_PAGE_SIZE) {
if (set_pt_large_page(pt, virt, phys, PTATTR_WRITABLE) != 0) {
kprintf("%s: error setting mapping for 0x%lx\n", __FUNCTION__, virt);
}
virt += LARGE_PAGE_SIZE;
}
} else {
kprintf("Straight-map physical memory areas allocated to McKernel\n");
nr_memory_chunks = ihk_mc_get_nr_memory_chunks();
if (nr_memory_chunks == 0) {
kprintf("%s: ERROR: No memory chunk available.\n", __FUNCTION__);
return;
}
for (chunk_id = 0; chunk_id < nr_memory_chunks; chunk_id++) {
/* Chunks that cannot be queried are skipped */
if (ihk_mc_get_memory_chunk(chunk_id, &map_start, &map_end, &numa_id)) {
kprintf("%s: ERROR: Memory chunk id (%d) out of range.\n", __FUNCTION__, chunk_id);
continue;
}
dkprintf("Linux kernel virtual: 0x%lx - 0x%lx -> 0x%lx - 0x%lx\n",
LINUX_PAGE_OFFSET + map_start, LINUX_PAGE_OFFSET + map_end, map_start, map_end);
virt = (void *)(LINUX_PAGE_OFFSET + map_start);
/* NOTE(review): assumes chunk boundaries are large page aligned -- confirm */
for (phys = map_start; phys < map_end; phys += LARGE_PAGE_SIZE, virt += LARGE_PAGE_SIZE) {
if (set_pt_large_page(pt, virt, phys, PTATTR_WRITABLE) != 0) {
kprintf("%s: set_pt_large_page() failed for 0x%lx\n", __FUNCTION__, virt);
}
}
}
}
}
static void init_vsyscall_area(struct page_table *pt)
{
extern char vsyscall_page[];
@ -2631,13 +2732,15 @@ void init_page_table(void)
/* Normal memory area */
init_normal_area(init_pt);
init_linux_kernel_mapping(init_pt);
init_fixed_area(init_pt);
init_low_area(init_pt);
init_text_area(init_pt);
init_vsyscall_area(init_pt);
load_page_table(init_pt);
kprintf("Page table is now at %p\n", init_pt);
init_pt_loaded = 1;
kprintf("Page table is now at 0x%lx\n", init_pt);
}
extern void __reserve_arch_pages(unsigned long, unsigned long,
@ -2665,17 +2768,33 @@ void ihk_mc_reserve_arch_pages(struct ihk_page_allocator_desc *pa_allocator,
unsigned long virt_to_phys(void *v)
{
unsigned long va = (unsigned long)v;
if (va >= MAP_KERNEL_START) {
dkprintf("%s: MAP_KERNEL_START <= 0x%lx <= LINUX_PAGE_OFFSET\n",
__FUNCTION__, va);
return va - MAP_KERNEL_START + x86_kernel_phys_base;
} else {
}
else if (va >= LINUX_PAGE_OFFSET) {
return va - LINUX_PAGE_OFFSET;
}
else if (va >= MAP_FIXED_START) {
return va - MAP_FIXED_START;
}
else {
dkprintf("%s: MAP_ST_START <= 0x%lx <= MAP_FIXED_START\n",
__FUNCTION__, va);
return va - MAP_ST_START;
}
}
void *phys_to_virt(unsigned long p)
{
return (void *)(p + MAP_ST_START);
/* Before loading our own PT use straight mapping */
if (!init_pt_loaded) {
return (void *)(p + MAP_ST_START);
}
return (void *)(p + LINUX_PAGE_OFFSET);
}
int copy_from_user(void *dst, const void *src, size_t siz)

View File

@ -10,9 +10,12 @@
#include <ihk/perfctr.h>
#include <march.h>
#include <errno.h>
#include <cls.h>
#include <ihk/debug.h>
#include <ihk/cpu.h>
#include <registers.h>
#include <mc_perf_event.h>
#include <config.h>
extern unsigned int *x86_march_perfmap;
extern int running_on_kvm(void);
@ -57,6 +60,10 @@ void x86_init_perfctr(void)
uint64_t ecx;
uint64_t edx;
#ifndef ENABLE_PERF
return;
#endif //ENABLE_PERF
/* Do not do it on KVM */
if (running_on_kvm()) return;
@ -93,7 +100,7 @@ void x86_init_perfctr(void)
for(i = 0; i < X86_IA32_NUM_PERF_COUNTERS; i++) {
wrmsr(MSR_IA32_PERFEVTSEL0 + i, 0);
}
/* Enable PMC Control */
value = rdmsr(MSR_PERF_GLOBAL_CTRL);
value |= X86_IA32_PERF_COUNTERS_MASK;
@ -254,6 +261,41 @@ int ihk_mc_perfctr_init(int counter, enum ihk_perfctr_type type, int mode)
return set_perfctr_x86_direct(counter, mode, x86_march_perfmap[type]);
}
/*
 * Reserve an extra (OFFCORE_RSP) register for @event on behalf of the
 * current thread and program it with event->extra_reg.config.
 *
 * If the slot requested in event->extra_reg.idx is already marked in the
 * thread's extra_reg_alloc_map, the other OFFCORE_RSP slot is tried
 * instead. Returns 0 on success, -1 if the (possibly swapped) slot is
 * also taken.
 */
int ihk_mc_perfctr_set_extra(struct mc_perf_event *event)
{
	struct thread *thread = cpu_local_var(current);

	/* Requested slot busy: fall back to the sibling RSP slot */
	if (thread->extra_reg_alloc_map & (1UL << event->extra_reg.idx)) {
		if (event->extra_reg.idx == EXTRA_REG_RSP_0) {
			event->extra_reg.idx = EXTRA_REG_RSP_1;
		}
		else if (event->extra_reg.idx == EXTRA_REG_RSP_1) {
			event->extra_reg.idx = EXTRA_REG_RSP_0;
		}

		/* Still busy after the swap: no extra register is left */
		if (thread->extra_reg_alloc_map &
		    (1UL << event->extra_reg.idx)) {
			return -1;
		}
	}

	/*
	 * Replace the low 8 bits of hw_config with the event code that
	 * belongs to the chosen slot and remember the matching MSR.
	 */
	if (event->extra_reg.idx == EXTRA_REG_RSP_0 ||
	    event->extra_reg.idx == EXTRA_REG_RSP_1) {
		event->hw_config &= ~0xffUL;
		if (event->extra_reg.idx == EXTRA_REG_RSP_0) {
			event->hw_config |=
				ihk_mc_get_extra_reg_event(EXTRA_REG_RSP_0);
			event->extra_reg.reg = MSR_OFFCORE_RSP_0;
		}
		else {
			event->hw_config |=
				ihk_mc_get_extra_reg_event(EXTRA_REG_RSP_1);
			event->extra_reg.reg = MSR_OFFCORE_RSP_1;
		}
	}

	/* Mark the slot as used and write the configuration to the MSR */
	thread->extra_reg_alloc_map |= (1UL << event->extra_reg.idx);
	wrmsr(event->extra_reg.reg, event->extra_reg.config);

	return 0;
}
#ifdef HAVE_MARCH_PERFCTR_START
extern void x86_march_perfctr_start(unsigned long counter_mask);
#endif

View File

@ -29,11 +29,12 @@
#include <prctl.h>
#include <ihk/ikc.h>
#include <page.h>
#include <limits.h>
#include <syscall.h>
void terminate(int, int);
void terminate_mcexec(int, int);
extern long do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact);
long syscall(int num, ihk_mc_user_context_t *ctx);
extern void save_fp_regs(struct thread *proc);
void set_signal(int sig, void *regs0, siginfo_t *info);
void check_signal(unsigned long rc, void *regs0, int num);
extern unsigned long do_fork(int, unsigned long, unsigned long, unsigned long,
@ -142,8 +143,6 @@ SYSCALL_DECLARE(rt_sigaction)
struct k_sigaction new_sa, old_sa;
int rc;
if(sig == SIGKILL || sig == SIGSTOP || sig <= 0 || sig > 64)
return -EINVAL;
if (sigsetsize != sizeof(sigset_t))
return -EINVAL;
@ -251,8 +250,8 @@ SYSCALL_DECLARE(rt_sigreturn)
regs->gpr.rflags &= ~RFLAGS_TF;
info.si_code = TRAP_TRACE;
set_signal(SIGTRAP, regs, &info);
check_signal(0, regs, 0);
check_need_resched();
check_signal(0, regs, 0);
}
if(ksigsp.fpregs && xsavesize){
@ -279,6 +278,7 @@ SYSCALL_DECLARE(rt_sigreturn)
extern struct cpu_local_var *clv;
extern unsigned long do_kill(struct thread *thread, int pid, int tid, int sig, struct siginfo *info, int ptracecont);
extern void interrupt_syscall(struct thread *, int sig);
extern void terminate(int, int);
extern int num_processors;
#define RFLAGS_MASK (RFLAGS_CF | RFLAGS_PF | RFLAGS_AF | RFLAGS_ZF | \
@ -460,7 +460,6 @@ void set_single_step(struct thread *thread)
long ptrace_read_fpregs(struct thread *thread, void *fpregs)
{
save_fp_regs(thread);
if (thread->fp_regs == NULL) {
return -ENOMEM;
}
@ -470,7 +469,6 @@ long ptrace_read_fpregs(struct thread *thread, void *fpregs)
long ptrace_write_fpregs(struct thread *thread, void *fpregs)
{
save_fp_regs(thread);
if (thread->fp_regs == NULL) {
return -ENOMEM;
}
@ -540,7 +538,7 @@ void ptrace_report_signal(struct thread *thread, int sig)
/* Transition thread state */
proc->status = PS_TRACED;
thread->status = PS_TRACED;
proc->ptrace &= ~PT_TRACE_SYSCALL_MASK;
proc->ptrace &= ~PT_TRACE_SYSCALL;
if (sig == SIGSTOP || sig == SIGTSTP ||
sig == SIGTTIN || sig == SIGTTOU) {
proc->signal_flags |= SIGNAL_STOP_STOPPED;
@ -809,6 +807,11 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
regs->gpr.rip = (unsigned long)k->sa.sa_handler;
regs->gpr.rsp = (unsigned long)usp;
// check signal handler is ONESHOT
if (k->sa.sa_flags & SA_RESETHAND) {
k->sa.sa_handler = SIG_DFL;
}
if(!(k->sa.sa_flags & SA_NODEFER))
thread->sigmask.__val[0] |= pending->sigmask.__val[0];
kfree(pending);
@ -820,8 +823,8 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
regs->gpr.rflags &= ~RFLAGS_TF;
info.si_code = TRAP_TRACE;
set_signal(SIGTRAP, regs, &info);
check_signal(0, regs, 0);
check_need_resched();
check_signal(0, regs, 0);
}
}
else {
@ -927,6 +930,7 @@ do_signal(unsigned long rc, void *regs0, struct thread *thread, struct sig_pendi
break;
case SIGCHLD:
case SIGURG:
case SIGWINCH:
break;
default:
dkprintf("do_signal,default,terminate,sig=%d\n", sig);
@ -1007,6 +1011,12 @@ interrupt_from_user(void *regs0)
return !(regs->gpr.rsp & 0x8000000000000000);
}
/*
 * Hook for recording a system call's return value.
 * Nothing has to be done on x86, so both arguments are ignored.
 */
void save_syscall_return_value(int num, unsigned long rc)
{
	(void)num;
	(void)rc;
}
void
check_signal(unsigned long rc, void *regs0, int num)
{
@ -1054,6 +1064,110 @@ out:
return;
}
static int
check_sig_pending_thread(struct thread *thread)
{
int found = 0;
struct list_head *head;
mcs_rwlock_lock_t *lock;
struct mcs_rwlock_node_irqsave mcs_rw_node;
struct sig_pending *next;
struct sig_pending *pending;
__sigset_t w;
__sigset_t x;
int sig = 0;
struct k_sigaction *k;
struct cpu_local_var *v;
v = get_this_cpu_local_var();
w = thread->sigmask.__val[0];
lock = &thread->sigcommon->lock;
head = &thread->sigcommon->sigpending;
for (;;) {
mcs_rwlock_reader_lock(lock, &mcs_rw_node);
list_for_each_entry_safe(pending, next, head, list){
for (x = pending->sigmask.__val[0], sig = 0; x;
sig++, x >>= 1);
k = thread->sigcommon->action + sig - 1;
if ((sig != SIGCHLD && sig != SIGURG) ||
(k->sa.sa_handler != (void *)1 &&
k->sa.sa_handler != NULL)) {
if (!(pending->sigmask.__val[0] & w)) {
if (pending->interrupted == 0) {
pending->interrupted = 1;
found = 1;
if (sig != SIGCHLD &&
sig != SIGURG &&
!k->sa.sa_handler) {
found = 2;
break;
}
}
}
}
}
mcs_rwlock_reader_unlock(lock, &mcs_rw_node);
if (found == 2) {
break;
}
if (lock == &thread->sigpendinglock) {
break;
}
lock = &thread->sigpendinglock;
head = &thread->sigpending;
}
if (found == 2) {
ihk_mc_spinlock_unlock(&v->runq_lock, v->runq_irqstate);
terminate_mcexec(0, sig);
return 1;
}
else if (found == 1) {
ihk_mc_spinlock_unlock(&v->runq_lock, v->runq_irqstate);
interrupt_syscall(thread, 0);
return 1;
}
return 0;
}
/*
 * Walk this CPU's run queue and run check_sig_pending_thread() on every
 * thread that is currently in an offloaded system call.
 *
 * The idle thread, threads with in_syscall_offload == 0 and threads whose
 * process has bit 32 of exit_status set are skipped.
 *
 * check_sig_pending_thread() releases the run-queue lock when it returns
 * non-zero, so in that case the lock is re-taken and the scan restarts
 * from the head of the queue.
 */
void
check_sig_pending()
{
	struct thread *thread;
	struct cpu_local_var *v;
	int rescan;

	/* Per-CPU data not set up yet */
	if (clv == NULL)
		return;

	v = get_this_cpu_local_var();
	do {
		rescan = 0;
		v->runq_irqstate = ihk_mc_spinlock_lock(&v->runq_lock);
		list_for_each_entry(thread, &(v->runq), sched_list) {
			if (thread == NULL ||
			    thread == &cpu_local_var(idle) ||
			    thread->in_syscall_offload == 0 ||
			    (thread->proc->exit_status &
			     0x0000000100000000L)) {
				continue;
			}
			if (check_sig_pending_thread(thread)) {
				/* Lock was dropped by the callee */
				rescan = 1;
				break;
			}
		}
	} while (rescan);
	ihk_mc_spinlock_unlock(&v->runq_lock, v->runq_irqstate);
}
unsigned long
do_kill(struct thread *thread, int pid, int tid, int sig, siginfo_t *info,
int ptracecont)
@ -1215,15 +1329,19 @@ done:
mcs_rwlock_reader_lock_noirq(&tproc->update_lock, &updatelock);
savelock = &tthread->sigpendinglock;
head = &tthread->sigpending;
if(sig == SIGKILL ||
(tproc->status != PS_EXITED &&
tproc->status != PS_ZOMBIE &&
tthread->status != PS_EXITED)){
hold_thread(tthread);
mcs_rwlock_reader_lock_noirq(&tproc->threads_lock, &lock);
if (tthread->status != PS_EXITED &&
(sig == SIGKILL ||
(tproc->status != PS_EXITED && tproc->status != PS_ZOMBIE))) {
if ((rc = hold_thread(tthread))) {
kprintf("%s: ERROR hold_thread returned %d,tid=%d\n", __FUNCTION__, rc, tthread->tid);
tthread = NULL;
}
}
else{
tthread = NULL;
}
mcs_rwlock_reader_unlock_noirq(&tproc->threads_lock, &lock);
mcs_rwlock_reader_unlock_noirq(&tproc->update_lock, &updatelock);
mcs_rwlock_reader_unlock_noirq(&thash->lock[hash], &lock);
}
@ -1250,7 +1368,9 @@ done:
}
if (tthread->thread_offloaded) {
interrupt_syscall(tthread, sig);
if (!tthread->proc->nohost) {
interrupt_syscall(tthread, sig);
}
release_thread(tthread);
return 0;
}
@ -1285,6 +1405,7 @@ done:
rc = -ENOMEM;
}
else{
memset(pending, 0, sizeof(struct sig_pending));
pending->sigmask.__val[0] = mask;
memcpy(&pending->info, info, sizeof(siginfo_t));
pending->ptracecont = ptracecont;
@ -1308,9 +1429,6 @@ done:
ihk_mc_interrupt_cpu(get_x86_cpu_local_variable(tthread->cpu_id)->apic_id, 0xd0);
}
if(!tthread->proc->nohost)
interrupt_syscall(tthread, 0);
if (status != PS_RUNNING) {
if(sig == SIGKILL){
/* Wake up the target only when stopped by ptrace-reporting */
@ -1336,15 +1454,19 @@ set_signal(int sig, void *regs0, siginfo_t *info)
struct x86_user_context *regs = regs0;
struct thread *thread = cpu_local_var(current);
if(thread == NULL || thread->proc->pid == 0)
if (thread == NULL || thread->proc->pid == 0)
return;
if((__sigmask(sig) & thread->sigmask.__val[0]) ||
(regs->gpr.rsp & 0x8000000000000000)){
if (!interrupt_from_user(regs)) {
ihk_mc_debug_show_interrupt_context(regs);
panic("panic: kernel mode signal");
}
if ((__sigmask(sig) & thread->sigmask.__val[0])) {
coredump(thread, regs0);
terminate(0, sig | 0x80);
}
do_kill(thread, thread->proc->pid, thread->tid, sig, info, 0);
do_kill(thread, thread->proc->pid, thread->tid, sig, info, 0);
}
SYSCALL_DECLARE(mmap)
@ -1483,6 +1605,16 @@ SYSCALL_DECLARE(clone)
ihk_mc_syscall_sp(ctx));
}
/*
 * fork(): implemented on top of do_fork() with SIGCHLD as the only clone
 * flag, all optional arguments zero, and the caller's current program
 * counter and stack pointer.
 */
SYSCALL_DECLARE(fork)
{
	const int clone_flags = SIGCHLD;

	return do_fork(clone_flags, 0, 0, 0, 0,
		       ihk_mc_syscall_pc(ctx),
		       ihk_mc_syscall_sp(ctx));
}
/*
 * vfork(): same as fork() but with CLONE_VFORK added to the clone flags
 * handed to do_fork().
 */
SYSCALL_DECLARE(vfork)
{
	const int clone_flags = CLONE_VFORK|SIGCHLD;

	return do_fork(clone_flags, 0, 0, 0, 0,
		       ihk_mc_syscall_pc(ctx),
		       ihk_mc_syscall_sp(ctx));
}
SYSCALL_DECLARE(shmget)
{
const key_t key = ihk_mc_syscall_arg0(ctx);
@ -1907,4 +2039,644 @@ save_uctx(void *uctx, struct x86_user_context *regs)
ctx->fregsize = 0;
}
int do_process_vm_read_writev(int pid,
const struct iovec *local_iov,
unsigned long liovcnt,
const struct iovec *remote_iov,
unsigned long riovcnt,
unsigned long flags,
int op)
{
int ret = -EINVAL;
int li, ri;
int pli, pri;
off_t loff, roff;
size_t llen = 0, rlen = 0;
size_t copied = 0;
size_t to_copy;
struct thread *lthread = cpu_local_var(current);
struct process *rproc;
struct process *lproc = lthread->proc;
struct process_vm *rvm = NULL;
unsigned long rphys;
unsigned long rpage_left;
unsigned long psize;
void *rva;
struct vm_range *range;
struct mcs_rwlock_node_irqsave lock;
struct mcs_rwlock_node update_lock;
/* Sanity checks */
if (flags) {
return -EINVAL;
}
if (liovcnt > IOV_MAX || riovcnt > IOV_MAX) {
return -EINVAL;
}
/* Check if parameters are okay */
ihk_mc_spinlock_lock_noirq(&lthread->vm->memory_range_lock);
range = lookup_process_memory_range(lthread->vm,
(uintptr_t)local_iov,
(uintptr_t)(local_iov + liovcnt * sizeof(struct iovec)));
if (!range) {
ret = -EFAULT;
goto arg_out;
}
range = lookup_process_memory_range(lthread->vm,
(uintptr_t)remote_iov,
(uintptr_t)(remote_iov + riovcnt * sizeof(struct iovec)));
if (!range) {
ret = -EFAULT;
goto arg_out;
}
ret = 0;
arg_out:
ihk_mc_spinlock_unlock_noirq(&lthread->vm->memory_range_lock);
if (ret != 0) {
goto out;
}
for (li = 0; li < liovcnt; ++li) {
llen += local_iov[li].iov_len;
dkprintf("local_iov[%d].iov_base: 0x%lx, len: %lu\n",
li, local_iov[li].iov_base, local_iov[li].iov_len);
}
for (ri = 0; ri < riovcnt; ++ri) {
rlen += remote_iov[ri].iov_len;
dkprintf("remote_iov[%d].iov_base: 0x%lx, len: %lu\n",
ri, remote_iov[ri].iov_base, remote_iov[ri].iov_len);
}
if (llen != rlen) {
return -EINVAL;
}
/* Find remote process */
rproc = find_process(pid, &lock);
if (!rproc) {
ret = -ESRCH;
goto out;
}
mcs_rwlock_reader_lock_noirq(&rproc->update_lock, &update_lock);
if(rproc->status == PS_EXITED ||
rproc->status == PS_ZOMBIE){
mcs_rwlock_reader_unlock_noirq(&rproc->update_lock, &update_lock);
process_unlock(rproc, &lock);
ret = -ESRCH;
goto out;
}
rvm = rproc->vm;
hold_process_vm(rvm);
mcs_rwlock_reader_unlock_noirq(&rproc->update_lock, &update_lock);
process_unlock(rproc, &lock);
if (lproc->euid != 0 &&
(lproc->ruid != rproc->ruid ||
lproc->ruid != rproc->euid ||
lproc->ruid != rproc->suid ||
lproc->rgid != rproc->rgid ||
lproc->rgid != rproc->egid ||
lproc->rgid != rproc->sgid)) {
ret = -EPERM;
goto out;
}
dkprintf("pid %d found, doing %s: liovcnt: %d, riovcnt: %d \n", pid,
(op == PROCESS_VM_READ) ? "PROCESS_VM_READ" : "PROCESS_VM_WRITE",
liovcnt, riovcnt);
pli = pri = -1; /* Previous indeces in iovecs */
li = ri = 0; /* Current indeces in iovecs */
loff = roff = 0; /* Offsets in current iovec */
/* Now iterate and do the copy */
while (copied < llen) {
int faulted = 0;
/* New local vector? */
if (pli != li) {
struct vm_range *range;
ihk_mc_spinlock_lock_noirq(&lthread->vm->memory_range_lock);
/* Is base valid? */
range = lookup_process_memory_range(lthread->vm,
(uintptr_t)local_iov[li].iov_base,
(uintptr_t)(local_iov[li].iov_base + 1));
if (!range) {
ret = -EFAULT;
goto pli_out;
}
/* Is range valid? */
range = lookup_process_memory_range(lthread->vm,
(uintptr_t)local_iov[li].iov_base,
(uintptr_t)(local_iov[li].iov_base + local_iov[li].iov_len));
if (range == NULL) {
ret = -EINVAL;
goto pli_out;
}
if (!(range->flag & ((op == PROCESS_VM_READ) ?
VR_PROT_WRITE : VR_PROT_READ))) {
ret = -EFAULT;
goto pli_out;
}
ret = 0;
pli_out:
ihk_mc_spinlock_unlock_noirq(&lthread->vm->memory_range_lock);
if (ret != 0) {
goto out;
}
pli = li;
}
/* New remote vector? */
if (pri != ri) {
struct vm_range *range;
ihk_mc_spinlock_lock_noirq(&rvm->memory_range_lock);
/* Is base valid? */
range = lookup_process_memory_range(rvm,
(uintptr_t)remote_iov[li].iov_base,
(uintptr_t)(remote_iov[li].iov_base + 1));
if (range == NULL) {
ret = -EFAULT;
goto pri_out;
}
/* Is range valid? */
range = lookup_process_memory_range(rvm,
(uintptr_t)remote_iov[li].iov_base,
(uintptr_t)(remote_iov[li].iov_base + remote_iov[li].iov_len));
if (range == NULL) {
ret = -EINVAL;
goto pri_out;
}
if (!(range->flag & ((op == PROCESS_VM_READ) ?
VR_PROT_READ : VR_PROT_WRITE))) {
ret = -EFAULT;
goto pri_out;
}
ret = 0;
pri_out:
ihk_mc_spinlock_unlock_noirq(&rvm->memory_range_lock);
if (ret != 0) {
goto out;
}
pri = ri;
}
/* Figure out how much we can copy at most in this iteration */
to_copy = (local_iov[li].iov_len - loff);
if ((remote_iov[ri].iov_len - roff) < to_copy) {
to_copy = remote_iov[ri].iov_len - roff;
}
retry_lookup:
/* TODO: remember page and do this only if necessary */
ret = ihk_mc_pt_virt_to_phys_size(rvm->address_space->page_table,
remote_iov[ri].iov_base + roff, &rphys, &psize);
if (ret) {
uint64_t reason = PF_POPULATE | PF_WRITE | PF_USER;
void *addr;
if (faulted) {
ret = -EFAULT;
goto out;
}
/* Fault in pages */
for (addr = (void *)
(((unsigned long)remote_iov[ri].iov_base + roff)
& PAGE_MASK);
addr < (remote_iov[ri].iov_base + roff + to_copy);
addr += PAGE_SIZE) {
ret = page_fault_process_vm(rvm, addr, reason);
if (ret) {
ret = -EFAULT;
goto out;
}
}
faulted = 1;
goto retry_lookup;
}
rpage_left = ((((unsigned long)remote_iov[ri].iov_base + roff +
psize) & ~(psize - 1)) -
((unsigned long)remote_iov[ri].iov_base + roff));
if (rpage_left < to_copy) {
to_copy = rpage_left;
}
rva = phys_to_virt(rphys);
fast_memcpy(
(op == PROCESS_VM_READ) ? local_iov[li].iov_base + loff : rva,
(op == PROCESS_VM_READ) ? rva : local_iov[li].iov_base + loff,
to_copy);
copied += to_copy;
dkprintf("local_iov[%d]: 0x%lx %s remote_iov[%d]: 0x%lx, %lu copied, psize: %lu, rpage_left: %lu\n",
li, local_iov[li].iov_base + loff,
(op == PROCESS_VM_READ) ? "<-" : "->",
ri, remote_iov[ri].iov_base + roff, to_copy,
psize, rpage_left);
loff += to_copy;
roff += to_copy;
if (loff == local_iov[li].iov_len) {
li++;
loff = 0;
}
if (roff == remote_iov[ri].iov_len) {
ri++;
roff = 0;
}
}
release_process_vm(rvm);
return copied;
out:
if(rvm)
release_process_vm(rvm);
return ret;
}
/*
 * Per-CPU worker of a move_pages request; called with the same @arg
 * (struct move_pages_smp_req) on @nr_cpus CPUs, @cpu_index identifying
 * the caller.
 *
 * The work is split into phases, separated by a spin barrier on
 * mpsr->phase_done:
 *   0. copy the user arrays into the request and clear the work arrays
 *      (which CPU does what depends on nr_cpus),
 *   1. validate the requested NUMA nodes (index range split per CPU),
 *   2. look up the PTE of every address (index range split per CPU),
 *   3. CPU 0 allocates destination pages on the target nodes,
 *   4. copy the data, free the old pages and update the PTEs (index range
 *      split per CPU), then flush the TLB entries if this CPU is running
 *      a thread of the target process.
 *
 * Returns mpsr->phase_ret (0, or -EINVAL set by any CPU in phase 1/2).
 */
int move_pages_smp_handler(int cpu_index, int nr_cpus, void *arg)
{
	int i, i_s, i_e, phase = 1;
	struct move_pages_smp_req *mpsr =
		(struct move_pages_smp_req *)arg;
	struct process_vm *vm = mpsr->proc->vm;
	int count = mpsr->count;
	struct page_table *save_pt;
	extern struct page_table *get_init_page_table(void);

	/* Index range handled by this CPU; the last CPU takes the remainder */
	i_s = (count / nr_cpus) * cpu_index;
	i_e = i_s + (count / nr_cpus);
	if (cpu_index == (nr_cpus - 1)) {
		i_e = count;
	}

	/* Load target process' PT so that we can access user-space */
	save_pt = cpu_local_var(current) == &cpu_local_var(idle) ?
		get_init_page_table() :
		cpu_local_var(current)->vm->address_space->page_table;

	if (save_pt != vm->address_space->page_table) {
		ihk_mc_load_page_table(vm->address_space->page_table);
	}
	else {
		/* Already on the right page table, nothing to restore */
		save_pt = NULL;
	}

	if (nr_cpus == 1) {
		switch (cpu_index) {
		case 0:
			memcpy(mpsr->virt_addr, mpsr->user_virt_addr,
				sizeof(void *) * count);
			memcpy(mpsr->status, mpsr->user_status,
				sizeof(int) * count);
			memcpy(mpsr->nodes, mpsr->user_nodes,
				sizeof(int) * count);
			memset(mpsr->ptep, 0, sizeof(pte_t) * count);
			memset(mpsr->status, 0, sizeof(int) * count);
			memset(mpsr->nr_pages, 0, sizeof(int) * count);
			memset(mpsr->dst_phys, 0,
				sizeof(unsigned long) * count);
			mpsr->nodes_ready = 1;
			break;

		default:
			break;
		}
	}
	else if (nr_cpus > 1 && nr_cpus < 4) {
		switch (cpu_index) {
		case 0:
			memcpy(mpsr->virt_addr, mpsr->user_virt_addr,
				sizeof(void *) * count);
			memcpy(mpsr->status, mpsr->user_status,
				sizeof(int) * count);
			/*
			 * Falls through: CPU 0 also performs the case 1 work.
			 * NOTE(review): unclear whether this is deliberate;
			 * kept as is because adding a break would let CPU 1's
			 * memset of status race with the memcpy above.
			 */
		case 1:
			memcpy(mpsr->nodes, mpsr->user_nodes,
				sizeof(int) * count);
			memset(mpsr->ptep, 0, sizeof(pte_t) * count);
			memset(mpsr->status, 0, sizeof(int) * count);
			memset(mpsr->nr_pages, 0, sizeof(int) * count);
			memset(mpsr->dst_phys, 0,
				sizeof(unsigned long) * count);
			mpsr->nodes_ready = 1;
			break;

		default:
			break;
		}
	}
	else if (nr_cpus >= 4 && nr_cpus < 8) {
		switch (cpu_index) {
		case 0:
			memcpy(mpsr->virt_addr, mpsr->user_virt_addr,
				sizeof(void *) * count);
			break;
		case 1:
			memcpy(mpsr->status, mpsr->user_status,
				sizeof(int) * count);
			break;
		case 2:
			memcpy(mpsr->nodes, mpsr->user_nodes,
				sizeof(int) * count);
			mpsr->nodes_ready = 1;
			break;
		case 3:
			memset(mpsr->ptep, 0, sizeof(pte_t) * count);
			memset(mpsr->status, 0, sizeof(int) * count);
			memset(mpsr->nr_pages, 0, sizeof(int) * count);
			memset(mpsr->dst_phys, 0,
				sizeof(unsigned long) * count);
			break;

		default:
			break;
		}
	}
	else if (nr_cpus >= 8) {
		switch (cpu_index) {
		case 0:
			/* First half of the address array */
			memcpy(mpsr->virt_addr, mpsr->user_virt_addr,
				sizeof(void *) * (count / 2));
			break;
		case 1:
			/*
			 * Second half of the address array.  The length is
			 * "count - count / 2" so that the last element is
			 * not dropped when count is odd.
			 */
			memcpy(mpsr->virt_addr + (count / 2),
				mpsr->user_virt_addr + (count / 2),
				sizeof(void *) * (count - (count / 2)));
			break;
		case 2:
			memcpy(mpsr->status, mpsr->user_status,
				sizeof(int) * count);
			break;
		case 3:
			memcpy(mpsr->nodes, mpsr->user_nodes,
				sizeof(int) * count);
			mpsr->nodes_ready = 1;
			break;
		case 4:
			memset(mpsr->ptep, 0, sizeof(pte_t) * count);
			break;
		case 5:
			memset(mpsr->status, 0, sizeof(int) * count);
			break;
		case 6:
			memset(mpsr->nr_pages, 0, sizeof(int) * count);
			break;
		case 7:
			memset(mpsr->dst_phys, 0,
				sizeof(unsigned long) * count);
			break;

		default:
			break;
		}
	}

	/*
	 * Wait until the nodes array has been copied in.  The flag is read
	 * through a volatile lvalue so that it is re-loaded on every
	 * iteration regardless of whether cpu_pause() acts as a compiler
	 * barrier (a cast of the value itself to volatile has no effect).
	 */
	while (!*(volatile int *)&mpsr->nodes_ready) {
		cpu_pause();
	}

	/* NUMA verification in parallel */
	for (i = i_s; i < i_e; i++) {
		if (mpsr->nodes[i] < 0 ||
				mpsr->nodes[i] >= ihk_mc_get_nr_numa_nodes() ||
				!test_bit(mpsr->nodes[i],
					mpsr->proc->vm->numa_mask)) {
			mpsr->phase_ret = -EINVAL;
			break;
		}
	}

	/* Barrier */
	ihk_atomic_inc(&mpsr->phase_done);
	while (ihk_atomic_read(&mpsr->phase_done) <
			(phase * nr_cpus)) {
		cpu_pause();
	}

	if (mpsr->phase_ret != 0) {
		goto out;
	}

	dkprintf("%s: phase %d done\n", __FUNCTION__, phase);
	++phase;

	/* PTE lookup in parallel */
	for (i = i_s; i < i_e; i++) {
		void *phys;
		size_t pgsize;
		int p2align;
		/*
		 * XXX: No page structures for anonymous mappings.
		 * Look up physical addresses by scanning page tables.
		 */
		mpsr->ptep[i] = ihk_mc_pt_lookup_pte(vm->address_space->page_table,
				(void *)mpsr->virt_addr[i], 0, &phys, &pgsize, &p2align);

		/* PTE valid? */
		if (!mpsr->ptep[i] || !pte_is_present(mpsr->ptep[i])) {
			mpsr->status[i] = -ENOENT;
			mpsr->ptep[i] = NULL;
			continue;
		}

		/* PTE is file? */
		if (pte_is_fileoff(mpsr->ptep[i], PAGE_SIZE)) {
			mpsr->status[i] = -EINVAL;
			mpsr->ptep[i] = NULL;
			continue;
		}

		dkprintf("%s: virt 0x%lx:%lu requested to be moved to node %d\n",
			__FUNCTION__, mpsr->virt_addr[i], pgsize, mpsr->nodes[i]);

		/* Large page? */
		if (pgsize > PAGE_SIZE) {
			int nr_sub_pages = (pgsize / PAGE_SIZE);
			int j;

			if (i + nr_sub_pages > count) {
				kprintf("%s: ERROR: page at index %d exceeds the region\n",
					__FUNCTION__, i);
				mpsr->status[i] = -EINVAL;
				break;
			}

			/* Is it contiguous across nr_sub_pages and all
			 * requested to be moved to the same target node? */
			for (j = 0; j < nr_sub_pages; ++j) {
				if (mpsr->virt_addr[i + j] !=
						(mpsr->virt_addr[i] + (j * PAGE_SIZE)) ||
						mpsr->nodes[i] != mpsr->nodes[i + j]) {
					kprintf("%s: ERROR: virt address or node at index %d"
						" is inconsistent\n",
						__FUNCTION__, i + j);
					mpsr->phase_ret = -EINVAL;
					goto pte_out;
				}
			}

			/* One entry describes the whole large page */
			mpsr->nr_pages[i] = nr_sub_pages;
			i += (nr_sub_pages - 1);
		}
		else {
			mpsr->nr_pages[i] = 1;
		}
	}

pte_out:
	/* Barrier */
	ihk_atomic_inc(&mpsr->phase_done);
	while (ihk_atomic_read(&mpsr->phase_done) <
			(phase * nr_cpus)) {
		cpu_pause();
	}

	if (mpsr->phase_ret != 0) {
		goto out;
	}

	dkprintf("%s: phase %d done\n", __FUNCTION__, phase);
	++phase;

	if (cpu_index == 0) {
		/* Allocate new pages on target NUMA nodes */
		for (i = 0; i < count; i++) {
			int pgalign = 0;
			int j;
			void *dst;

			if (!mpsr->ptep[i] || mpsr->status[i] < 0 || !mpsr->nr_pages[i])
				continue;

			/* TODO: store pgalign info in an array as well? */
			if (mpsr->nr_pages[i] > 1) {
				if (mpsr->nr_pages[i] * PAGE_SIZE == PTL2_SIZE)
					pgalign = PTL2_SHIFT - PTL1_SHIFT;
			}

			dst = ihk_mc_alloc_aligned_pages_node(mpsr->nr_pages[i],
					pgalign, IHK_MC_AP_USER, mpsr->nodes[i]);

			if (!dst) {
				mpsr->status[i] = -ENOMEM;
				continue;
			}

			/* Report the target node for every covered index */
			for (j = i; j < (i + mpsr->nr_pages[i]); ++j) {
				mpsr->status[j] = mpsr->nodes[i];
			}

			mpsr->dst_phys[i] = virt_to_phys(dst);

			dkprintf("%s: virt 0x%lx:%lu to node %d, pgalign: %d,"
				" allocated phys: 0x%lx\n",
				__FUNCTION__, mpsr->virt_addr[i],
				mpsr->nr_pages[i] * PAGE_SIZE,
				mpsr->nodes[i], pgalign, mpsr->dst_phys[i]);
		}
	}

	/* Barrier */
	ihk_atomic_inc(&mpsr->phase_done);
	while (ihk_atomic_read(&mpsr->phase_done) <
			(phase * nr_cpus)) {
		cpu_pause();
	}

	if (mpsr->phase_ret != 0) {
		goto out;
	}

	dkprintf("%s: phase %d done\n", __FUNCTION__, phase);
	++phase;

	/* Copy, PTE update, memfree in parallel */
	for (i = i_s; i < i_e; ++i) {
		/* No destination allocated for this index: nothing to move */
		if (!mpsr->dst_phys[i])
			continue;

		fast_memcpy(phys_to_virt(mpsr->dst_phys[i]),
			phys_to_virt(pte_get_phys(mpsr->ptep[i])),
			mpsr->nr_pages[i] * PAGE_SIZE);

		ihk_mc_free_pages(
			phys_to_virt(pte_get_phys(mpsr->ptep[i])),
			mpsr->nr_pages[i]);

		pte_update_phys(mpsr->ptep[i], mpsr->dst_phys[i]);

		dkprintf("%s: virt 0x%lx:%lu copied and remapped to phys: 0x%lx\n",
			__FUNCTION__, mpsr->virt_addr[i],
			mpsr->nr_pages[i] * PAGE_SIZE,
			mpsr->dst_phys[i]);
	}

	/* XXX: do a separate SMP call with only CPUs running threads
	 * of this process? */
	if (cpu_local_var(current)->proc == mpsr->proc) {
		/* Invalidate all TLBs */
		for (i = 0; i < mpsr->count; i++) {
			if (!mpsr->dst_phys[i])
				continue;

			flush_tlb_single((unsigned long)mpsr->virt_addr[i]);
		}
	}

out:
	/* Switch back to the page table that was active on entry */
	if (save_pt) {
		ihk_mc_load_page_table(save_pt);
	}

	return mpsr->phase_ret;
}
/*
 * Kernel-side time(): issues a __NR_time request with a NULL argument
 * through do_syscall() on behalf of the current thread's process and
 * returns the result.
 */
time_t time(void)
{
	struct syscall_request sreq IHK_DMA_ALIGN;
	struct thread *cur = cpu_local_var(current);

	sreq.number = __NR_time;
	sreq.args[0] = (uintptr_t)NULL;

	return (time_t)do_syscall(&sreq, ihk_mc_get_processor_id(),
				  cur->proc->pid);
}
/*** End of File ***/

View File

@ -102,7 +102,7 @@ int vsyscall_gettimeofday(struct timeval *tv, void *tz)
: "%rcx", "%r11", "memory");
if (error) {
*(int *)0 = 0; /* i.e. raise(SIGSEGV) */
*(volatile int *)0 = 0; /* i.e. raise(SIGSEGV) */
}
return error;
} /* vsyscall_gettimeofday() */

View File

@ -45,7 +45,6 @@ error_exit() {
exit 1
}
fi
if [ ! -e /tmp/mcos ]; then

View File

@ -19,6 +19,7 @@ ETCDIR=@ETCDIR@
KMODDIR="${prefix}/kmod"
KERNDIR="${prefix}/@TARGET@/kernel"
ENABLE_MCOVERLAYFS="@ENABLE_MCOVERLAYFS@"
MCK_BUILDID=@BUILDID@
mem="512M@0"
cpus=""
@ -43,8 +44,12 @@ fi
turbo=""
ihk_irq=""
safe_kernel_map=""
umask_old=`umask`
idle_halt=""
allow_oversubscribe=""
while getopts :tk:c:m:o:f:r:q:i:d: OPT
while getopts :stk:c:m:o:f:r:q:i:d:e:hO OPT
do
case ${OPT} in
f) facility=${OPTARG}
@ -57,21 +62,32 @@ do
;;
m) mem=${OPTARG}
;;
s) safe_kernel_map="safe_kernel_map"
;;
r) ikc_map=${OPTARG}
;;
q) ihk_irq=${OPTARG}
;;
t) turbo="turbo"
;;
e) extra_kopts=${OPTARG}
;;
d) DUMP_LEVEL=${OPTARG}
;;
i) mon_interval=${OPTARG}
;;
h) idle_halt="idle_halt"
;;
O) allow_oversubscribe="allow_oversubscribe"
;;
*) echo "invalid option -${OPT}" >&2
exit 1
esac
done
redirect_kmsg=0
turbo="turbo"
# Start ihkmond
pid=`pidof ihkmond`
if [ "${pid}" != "" ]; then
@ -80,6 +96,16 @@ fi
if [ "${redirect_kmsg}" != "0" -o "${mon_interval}" != "-1" ]; then
${SBINDIR}/ihkmond -f ${facility} -k ${redirect_kmsg} -i ${mon_interval}
fi
# Disable the irqbalance_mck systemd unit and delete its unit file.
# Does nothing when the unit file is not present.
disable_irqbalance_mck() {
	local unit_file=/etc/systemd/system/irqbalance_mck.service

	if [ ! -f "${unit_file}" ]; then
		return 0
	fi

	systemctl disable irqbalance_mck.service >/dev/null 2>/dev/null
	# The (now invalid) .service file stays behind after "disable",
	# so remove it explicitly
	rm -f "${unit_file}"
}
#
# Revert any state that has been initialized before the error occurred.
#
@ -87,6 +113,16 @@ error_exit() {
local status=$1
case $status in
irqbalance_mck_started)
if [ "${irqbalance_used}" == "yes" ]; then
if [ "`systemctl status irqbalance_mck.service 2> /dev/null |grep -E 'Active: active'`" != "" ]; then
if ! systemctl stop irqbalance_mck.service 2>/dev/null; then
echo "warning: failed to stop irqbalance_mck" >&2
fi
disable_irqbalance_mck
fi
fi
;&
mcos_sys_mounted)
if [ "$enable_mcoverlay" == "yes" ]; then
umount /tmp/mcos/mcos0_sys
@ -148,25 +184,25 @@ error_exit() {
fi
;&
ihk_smp_loaded)
rmmod ihk_smp_x86 2>/dev/null || echo "warning: failed to remove ihk_smp_x86" >&2
rmmod ihk_smp_@ARCH@ 2>/dev/null || echo "warning: failed to remove ihk_smp_@ARCH@" >&2
;&
ihk_loaded)
rmmod ihk 2>/dev/null || echo "warning: failed to remove ihk" >&2
;&
smp_affinity_modified)
umask $umask_old
if [ "${irqbalance_used}" == "yes" ]; then
if ! perl -e '$tmpdir="/tmp/mcreboot"; @files = grep { -f } glob "$tmpdir/proc/irq/*/smp_affinity"; foreach $file (@files) { $dest = substr($file, length($tmpdir)); if (0) {print "cp $file $dest\n";} system("cp $file $dest 2>/dev/null"); }'; then
echo "warning: failed to restore /proc/irq/*/smp_affinity" >&2
fi
if [ -e /tmp/mcreboot ]; then rm -rf /tmp/mcreboot; fi
fi
;&
irqbalance_stopped)
if [ "`systemctl status irqbalance_mck.service 2> /dev/null |grep -E 'Active: active'`" != "" ]; then
if ! systemctl stop irqbalance_mck.service 2>/dev/null; then
echo "warning: failed to stop irqbalance_mck" >&2
fi
if ! systemctl disable irqbalance_mck.service >/dev/null 2>/dev/null; then
echo "warning: failed to disable irqbalance_mck" >&2
fi
if ! etcdir=@ETCDIR@ perl -e '$etcdir=$ENV{'etcdir'}; @files = grep { -f } glob "$etcdir/proc/irq/*/smp_affinity"; foreach $file (@files) { $dest = substr($file, length($etcdir)); if(0) {print "cp $file $dest\n";} system("cp $file $dest 2>/dev/null"); }'; then
echo "warning: failed to restore /proc/irq/*/smp_affinity" >&2
fi
if ! systemctl start irqbalance.service; then
echo "warning: failed to start irqbalance" >&2;
fi
if [ "${irqbalance_used}" == "yes" ]; then
if ! systemctl start irqbalance.service; then
echo "warning: failed to start irqbalance" >&2;
fi
fi
;&
initial)
@ -200,7 +236,7 @@ if [ "${ENABLE_MCOVERLAYFS}" == "yes" ]; then
enable_mcoverlay="yes"
fi
else
if [ ${linux_version_code} -eq 199168 -a ${rhel_release} -ge 327 ]; then
if [ ${linux_version_code} -eq 199168 -a ${rhel_release} -ge 327 -a ${rhel_release} -le 693 ]; then
enable_mcoverlay="yes"
fi
if [ ${linux_version_code} -ge 262144 -a ${linux_version_code} -lt 262400 ]; then
@ -236,9 +272,9 @@ if [ "${irqbalance_used}" == "yes" ]; then
exit 1
fi;
if ! etcdir=@ETCDIR@ perl -e 'use File::Copy qw(copy); $etcdir=$ENV{'etcdir'}; @files = grep { -f } glob "/proc/irq/*/smp_affinity"; foreach $file (@files) { $rel = substr($file, 1); $dir=substr($rel, 0, length($rel)-length("/smp_affinity")); if(0) { print "cp $file $etcdir/$rel\n";} if(system("mkdir -p $etcdir/$dir")){ exit 1;} if(!copy($file,"$etcdir/$rel")){ exit 1;} }'; then
if ! perl -e 'use File::Copy qw(copy); $tmpdir="/tmp/mcreboot"; @files = grep { -f } glob "/proc/irq/*/smp_affinity"; foreach $file (@files) { $rel = substr($file, 1); $dir = substr($rel, 0, length($rel) - length("/smp_affinity")); if (system("mkdir -p $tmpdir/$dir")) { exit 1; } if (0) { print "cp $file $tmpdir/$rel\n"; } if (!copy($file,"$tmpdir/$rel")) { exit 1; } }'; then
echo "error: saving /proc/irq/*/smp_affinity" >&2
error_exit "mcos_sys_mounted"
error_exit "irqbalance_stopped"
fi;
# Prevent /proc/irq/*/smp_affinity from getting zero after offlining
@ -252,32 +288,45 @@ if [ "${irqbalance_used}" == "yes" ]; then
if ! ncpus=$ncpus smp_affinity_mask=$smp_affinity_mask perl -e '@dirs = grep { -d } glob "/proc/irq/*"; foreach $dir (@dirs) { $hit = 0; $affinity_str = `cat $dir/smp_affinity`; chomp $affinity_str; @int32strs = split /,/, $affinity_str; @int32strs_mask=split /,/, $ENV{'smp_affinity_mask'}; for($i=0;$i <= $#int32strs_mask; $i++) { $int32strs_inv[$i] = sprintf("%08x",hex($int32strs_mask[$i])^0xffffffff); if($i == 0) { $len = int((($ENV{'ncpus'}%32)+3)/4); if($len != 0) { $int32strs_inv[$i] = substr($int32strs_inv[$i], -$len, $len); } } } $inv = join(",", @int32strs_inv); $nint32s = int(($ENV{'ncpus'}+31)/32); for($j = $nint32s - 1; $j >= 0; $j--) { if(hex($int32strs[$nint32s - 1 - $j]) & hex($int32strs_mask[$nint32s - 1 - $j])) { $hit = 1; }} if($hit == 1) { $cmd = "echo $inv > $dir/smp_affinity 2>/dev/null"; system $cmd;}}'; then
echo "error: modifying /proc/irq/*/smp_affinity" >&2
error_exit "mcos_sys_mounted"
error_exit "irqbalance_stopped"
fi
fi
# Set umask so that proc/sys files/directories created by
# mcctrl.ko and mcreboot.sh have appropriate permission bits
umask_dec=$(( 8#${umask_old} & 8#0002 ))
umask 0`printf "%o" ${umask_dec}`
# Load IHK if not loaded
if ! grep -E 'ihk\s' /proc/modules &>/dev/null; then
if ! taskset -c 0 insmod ${KMODDIR}/ihk.ko 2>/dev/null; then
echo "error: loading ihk" >&2
error_exit "irqbalance_stopped"
error_exit "smp_affinity_modified"
fi
fi
# Increase swappiness so that we have better chance to allocate memory for IHK
echo 100 > /proc/sys/vm/swappiness
# Copy modules under /tmp to avoid loading from shared FS
if mkdir -p /tmp/mcos-kmod; then
cp ${KMODDIR}/* /tmp/mcos-kmod/
KMODDIR="/tmp/mcos-kmod/"
fi
# Drop Linux caches to free memory
sync && echo 3 > /proc/sys/vm/drop_caches
# Fujitsu drops caches for us in between jobs so don't do it on OFP
if [ "`hostname | grep "c[0-9][0-9][0-9][0-9].ofp"`" == "" ]; then
# Increase swappiness so that we have better chance to allocate memory for IHK
echo 100 > /proc/sys/vm/swappiness
# Merge free memory areas into large, physically contiguous ones
echo 1 > /proc/sys/vm/compact_memory 2>/dev/null
# Drop Linux caches to free memory
sync && echo 3 > /proc/sys/vm/drop_caches
sync
# Merge free memory areas into large, physically contiguous ones
echo 1 > /proc/sys/vm/compact_memory 2>/dev/null
sync
fi
# Load IHK-SMP if not loaded and reserve CPUs and memory
if ! grep ihk_smp_x86 /proc/modules &>/dev/null; then
if ! grep ihk_smp_@ARCH@ /proc/modules &>/dev/null; then
if [ "$ihk_irq" == "" ]; then
for i in `seq 64 255`; do
if [ ! -d /proc/irq/$i ] && [ "`cat /proc/interrupts | grep ":" | awk '{print $1}' | grep -o '[0-9]*' | grep -e '^$i$'`" == "" ]; then
@ -290,30 +339,46 @@ if ! grep ihk_smp_x86 /proc/modules &>/dev/null; then
error_exit "ihk_loaded"
fi
fi
if ! taskset -c 0 insmod ${KMODDIR}/ihk-smp-x86.ko ihk_start_irq=$ihk_irq ihk_ikc_irq_core=$ihk_ikc_irq_core 2>/dev/null; then
echo "error: loading ihk-smp-x86" >&2
if ! taskset -c 0 insmod ${KMODDIR}/ihk-smp-@ARCH@.ko ihk_start_irq=$ihk_irq ihk_ikc_irq_core=$ihk_ikc_irq_core 2>/dev/null; then
echo "error: loading ihk-smp-@ARCH@" >&2
error_exit "ihk_loaded"
fi
# Offline-reonline RAM (special case for OFP SNC-4 mode)
if [ "`hostname | grep "c[0-9][0-9][0-9][0-9].ofp"`" != "" ] && [ "`cat /sys/devices/system/node/online`" == "0-7" ]; then
for i in 0 1 2 3; do
find /sys/devices/system/node/node$i/memory*/ -name "online" | while read f; do
echo 0 > $f 2>&1 > /dev/null;
done
find /sys/devices/system/node/node$i/memory*/ -name "online" | while read f; do
echo 1 > $f 2>&1 > /dev/null;
done
done
for i in 4 5 6 7; do
find /sys/devices/system/node/node$i/memory*/ -name "online" | while read f; do
echo 0 > $f 2>&1 > /dev/null;
done
find /sys/devices/system/node/node$i/memory*/ -name "online" | while read f; do
echo 1 > $f 2>&1 > /dev/null;
done
done
fi
# # Offline-reonline RAM (special case for OFP SNC-4 flat mode)
# if [ "`hostname | grep "c[0-9][0-9][0-9][0-9].ofp"`" != "" ] && [ "`cat /sys/devices/system/node/online`" == "0-7" ]; then
# for i in 0 1 2 3; do
# find /sys/devices/system/node/node$i/memory*/ -name "online" | while read f; do
# echo 0 | tee $f 2>/dev/null 1>/dev/null
# done
# find /sys/devices/system/node/node$i/memory*/ -name "online" | while read f; do
# echo 1 | tee $f 2>/dev/null 1>/dev/null
# done
# done
# for i in 4 5 6 7; do
# find /sys/devices/system/node/node$i/memory*/ -name "online" | while read f; do
# echo 0 | tee $f 2>/dev/null 1>/dev/null
# done
# done
# for i in 4 5 6 7; do
# find /sys/devices/system/node/node$i/memory*/ -name "online" | while read f; do
# echo 1 | tee $f 2>/dev/null 1>/dev/null
# done
# done
# fi
#
# # Offline-reonline RAM (special case for OFP Quadrant flat mode)
# if [ "`hostname | grep "c[0-9][0-9][0-9][0-9].ofp"`" != "" ] && [ "`cat /sys/devices/system/node/online`" == "0-1" ]; then
# for i in 1; do
# find /sys/devices/system/node/node$i/memory*/ -name "online" | while read f; do
# echo 0 | tee $f 2>/dev/null 1>/dev/null
# done
# done
# for i in 1; do
# find /sys/devices/system/node/node$i/memory*/ -name "online" | while read f; do
# echo 1 | tee $f 2>/dev/null 1>/dev/null
# done
# done
# fi
if ! ${SBINDIR}/ihkconfig 0 reserve mem ${mem}; then
echo "error: reserving memory" >&2
@ -333,13 +398,20 @@ if ! grep mcctrl /proc/modules &>/dev/null; then
fi
fi
# Check that different versions of binaries/scripts are not mixed
IHK_BUILDID=`${SBINDIR}/ihkconfig 0 get buildid`
if [ "${IHK_BUILDID}" != "${MCK_BUILDID}" ]; then
echo "IHK build-id (${IHK_BUILDID}) didn't match McKernel build-id (${MCK_BUILDID})." >&2
exit 1
fi
# Destroy all LWK instances
if ls /dev/mcos* 1>/dev/null 2>&1; then
for i in /dev/mcos*; do
ind=`echo $i|cut -c10-`;
# Retry when conflicting with ihkmond
nretry=0
until ${SBINDIR}/ihkconfig 0 destroy $ind || [ $nretry -lt 4 ]; do
until ${SBINDIR}/ihkconfig 0 destroy $ind || [ $nretry -ge 4 ]; do
sleep 0.25
nretry=$[ $nretry + 1 ]
done
@ -383,7 +455,7 @@ if ! ${SBINDIR}/ihkosctl 0 load ${KERNDIR}/mckernel.img; then
fi
# Set kernel arguments
if ! ${SBINDIR}/ihkosctl 0 kargs "hidos $turbo dump_level=${DUMP_LEVEL}"; then
if ! ${SBINDIR}/ihkosctl 0 kargs "hidos $turbo $safe_kernel_map $idle_halt dump_level=${DUMP_LEVEL} $extra_kopts $allow_oversubscribe"; then
echo "error: setting kernel arguments" >&2
error_exit "os_created"
fi
@ -409,7 +481,9 @@ if [ "${irqbalance_used}" == "yes" ]; then
banirq=`cat /proc/interrupts| perl -e 'while(<>) { if(/^\s*(\d+).*IHK\-SMP\s*$/) {print $1;}}'`
sed "s/%mask%/$smp_affinity_mask/g" $ETCDIR/irqbalance_mck.in | sed "s/%banirq%/$banirq/g" > /tmp/irqbalance_mck
systemctl disable irqbalance_mck.service >/dev/null 2>/dev/null
disable_irqbalance_mck
if ! systemctl link $ETCDIR/irqbalance_mck.service >/dev/null 2>/dev/null; then
echo "error: linking irqbalance_mck" >&2
error_exit "mcos_sys_mounted"
@ -421,3 +495,8 @@ if [ "${irqbalance_used}" == "yes" ]; then
fi
# echo cpus=$cpus ncpus=$ncpus banirq=$banirq
fi
# Restore umask
umask ${umask_old}
exit 0

View File

@ -18,17 +18,24 @@ mem=""
cpus=""
irqbalance_used=""
# Disable the irqbalance_mck systemd unit and delete its unit file.
# Does nothing (and succeeds) when the unit file is not present.
disable_irqbalance_mck() {
	local unit_file=/etc/systemd/system/irqbalance_mck.service

	[ -f "${unit_file}" ] || return 0

	systemctl disable irqbalance_mck.service >/dev/null 2>/dev/null
	# Invalid .service file persists so remove it
	rm -f "${unit_file}"
}
# No SMP module? Exit.
if ! grep ihk_smp_x86 /proc/modules &>/dev/null; then exit 0; fi
if ! grep ihk_smp_@ARCH@ /proc/modules &>/dev/null; then exit 0; fi
if [ "`systemctl status irqbalance_mck.service 2> /dev/null |grep -E 'Active: active'`" != "" ]; then
irqbalance_used="yes"
if ! systemctl stop irqbalance_mck.service 2>/dev/null; then
echo "warning: failed to stop irqbalance_mck" >&2
fi
if ! systemctl disable irqbalance_mck.service >/dev/null 2>/dev/null; then
echo "warning: failed to disable irqbalance_mck" >&2
fi
disable_irqbalance_mck
fi
# Destroy all LWK instances
@ -37,7 +44,7 @@ if ls /dev/mcos* 1>/dev/null 2>&1; then
ind=`echo $i|cut -c10-`;
# Retry when conflicting with ihkmond
nretry=0
until ${SBINDIR}/ihkconfig 0 destroy $ind || [ $nretry -lt 4 ]; do
until ${SBINDIR}/ihkconfig 0 destroy $ind || [ $nretry -ge 4 ]; do
sleep 0.25
nretry=$[ $nretry + 1 ]
done
@ -48,6 +55,9 @@ if ls /dev/mcos* 1>/dev/null 2>&1; then
done
fi
# Allow ihkmond to flush kmsg buffer
sleep 2.0
# Query IHK-SMP resources and release them
if ! ${SBINDIR}/ihkconfig 0 query cpu > /dev/null; then
echo "error: querying cpus" >&2
@ -62,17 +72,23 @@ if [ "${cpus}" != "" ]; then
fi
fi
if ! ${SBINDIR}/ihkconfig 0 query mem > /dev/null; then
echo "error: querying memory" >&2
exit 1
fi
#if ! ${SBINDIR}/ihkconfig 0 query mem > /dev/null; then
# echo "error: querying memory" >&2
# exit 1
#fi
#
#mem=`${SBINDIR}/ihkconfig 0 query mem`
#if [ "${mem}" != "" ]; then
# if ! ${SBINDIR}/ihkconfig 0 release mem $mem > /dev/null; then
# echo "error: releasing memory" >&2
# exit 1
# fi
#fi
mem=`${SBINDIR}/ihkconfig 0 query mem`
if [ "${mem}" != "" ]; then
if ! ${SBINDIR}/ihkconfig 0 release mem $mem > /dev/null; then
echo "error: releasing memory" >&2
exit 1
fi
# Release all memory
if ! ${SBINDIR}/ihkconfig 0 release mem "all" > /dev/null; then
echo "error: releasing memory" >&2
exit 1
fi
# Remove delegator if loaded
@ -87,9 +103,9 @@ fi
. ${SBINDIR}/mcoverlay-destroy.sh
# Remove SMP module
if grep ihk_smp_x86 /proc/modules &>/dev/null; then
if ! rmmod ihk_smp_x86 2>/dev/null; then
echo "error: removing ihk_smp_x86" >&2
if grep ihk_smp_@ARCH@ /proc/modules &>/dev/null; then
if ! rmmod ihk_smp_@ARCH@ 2>/dev/null; then
echo "error: removing ihk_smp_@ARCH@" >&2
exit 1
fi
fi
@ -110,9 +126,10 @@ fi
# Start irqbalance with the original settings
if [ "${irqbalance_used}" != "" ]; then
if ! etcdir=@ETCDIR@ perl -e '$etcdir=$ENV{'etcdir'}; @files = grep { -f } glob "$etcdir/proc/irq/*/smp_affinity"; foreach $file (@files) { $dest = substr($file, length($etcdir)); if(0) {print "cp $file $dest\n";} system("cp $file $dest 2>/dev/null"); }'; then
if ! perl -e '$tmpdir="/tmp/mcreboot"; @files = grep { -f } glob "$tmpdir/proc/irq/*/smp_affinity"; foreach $file (@files) { $dest = substr($file, length($tmpdir)); if (0) {print "cp $file $dest\n";} system("cp $file $dest 2>/dev/null"); }'; then
echo "warning: failed to restore /proc/irq/*/smp_affinity" >&2
fi
if [ -e /tmp/mcreboot ]; then rm -rf /tmp/mcreboot; fi
if ! systemctl start irqbalance.service; then
echo "warning: failed to start irqbalance" >&2;
fi

View File

@ -0,0 +1,60 @@
.\" Man page for mpimcexec
.\"
.TH MPIMCEXEC 1 "@MCKERNEL_RELEASE_DATE@" "Version @MCKERNEL_VERSION@" "MCKERNEL @MCKERNEL_VERSION@"
.SH NAME
mpimcexec \- run an MPI application on McKernel
.\"
.\" ---------------------------- SYNOPSIS ----------------------------
.SH SYNOPSIS
.B mpimcexec \fR [\fIoptions\fR] \fI<command>\fR
.\" ---------------------------- DESCRIPTION ----------------------------
.SH DESCRIPTION
mpimcexec is a wrapper script for running MPI applications on McKernel.
It internally calls mpiexec to spawn mcexec on compute nodes, which in
turn runs \fI<command>\fR on McKernel. mpimcexec specifies a number of
mcexec arguments that enable high performance execution.
.\" ---------------------------- OPTIONS ----------------------------
.SH OPTIONS
.TP
.B -ppn N, --ppn N, --ranks-per-node N
Specify the number of MPI ranks per node. This argument is required unless PJM_PROC_BY_NODE or MPI_LOCALNRANKS is set in the environment.
.TP
.B -n N, --n N, --ranks N
Specify the number of total MPI ranks.
e.g.,
$ mpimcexec -n 32 -ppn 4 ./a.out
.br
In the above example, 32 MPI processes are invoked
on eight compute nodes each of which has four processes.
.TP
.B --nodes N
Specify the number of compute nodes.
By default, all nodes specified by the "PJM --mpi proc" option are used.
.TP
.B --env, -env
Pass an additional environment variable.
.TP
.B -m N, --numa N
Specify preferred NUMA node.
.TP
.B -h <file name>, --hostfile <file name>
Specify a host file for MPI.
.TP
.B --help
Show help message.
.PP
.\" ---------------------------- SEE ALSO ----------------------------
.SH SEE ALSO
\fBmcexec\fR (1), \fBmpiexec\fR (1)
.\" ---------------------------- AUTHORS ----------------------------
.SH AUTHORS
Copyright (C) 2018 McKernel Development Team, RIKEN, Japan

147
arch/x86_64/tools/mpimcexec.in Executable file
View File

@ -0,0 +1,147 @@
#!/bin/bash
#
# OFP McKernel MPI wrapper script
# author: Balazs Gerofi <bgerofi@riken.jp>
# Copyright (C) 2018 RIKEN R-CCS
#

# Installation prefix (substituted at configure time) and the directory
# from which mcexec is launched.
prefix="@prefix@"
BINDIR="${prefix}/bin"

# Refuse to run on bash older than 4.0.
[ "${BASH_VERSINFO[0]}" -lt 4 ] && {
	echo "You need at least bash-4.0 to run this script." >&2
	exit 1
}

# Option state; everything starts out empty.
RANKS=""; NODES=""; PPN=""
MPI_ENV=""; COMMAND=""
NUMA=""; HOSTFILE=""

# Default ranks-per-node: PJM_PROC_BY_NODE when non-empty, otherwise
# MPI_LOCALNRANKS (which may itself be empty).
PPN="${PJM_PROC_BY_NODE:-${MPI_LOCALNRANKS}}"
# Print the usage text on stdout and terminate the script with status 1.
help_exit() {
	cat <<EOF

Spawn an McKernel MPI job on Oakforest-PACS.
usage: `basename $0` -ppn ranks_per_node [--nodes nodes] [-n ranks] [--env additional_environment]... command

 -ppn | --ppn | --ranks-per-node Number of MPI ranks per node (required)
 -n | --n | --ranks Total number of MPI ranks in the job
 --nodes Number of nodes to be used
 --env | -env Pass an additional environment variable
 -m | --numa Preferred NUMA node(s)
 -h | --hostfile Host file for MPI
 --help Show help message
EOF
	exit 1
}
# Abort with the usage text unless the option currently being parsed is
# followed by a value.
#   $1 - number of positional parameters not yet consumed (pass $#)
#   $2 - description of the missing value, appended to "error: needs "
require_value() {
	if [ "$1" -lt 2 ]; then
		echo "error: needs $2" >&2
		help_exit
	fi
}

# Parse options
# Options are consumed from the front of the argument list. The first word
# that is not a recognized option (including an empty $1 when nothing is
# left) ends parsing; it and everything after it become COMMAND.
while true; do
	case $1 in
		-ppn | --ppn | --ranks-per-node )
			require_value $# "an integer value for -ppn, --ppn, or --ranks-per-node option"
			PPN=$2
			shift 2
			;;
		-n | --n | --ranks )
			require_value $# "an integer value for -n, --n, or --ranks option"
			RANKS=$2
			shift 2
			;;
		-m | --numa )
			require_value $# "an integer value for -m or --numa option"
			# Stored in the form it is later handed to mcexec.
			NUMA="-m $2"
			shift 2
			;;
		--nodes )
			require_value $# "an integer value for --nodes option"
			NODES=$2
			shift 2
			;;
		--env | -env )
			require_value $# "an environment variable name for -env or --env option"
			# Values mentioning I_MPI_PIN are silently dropped; all
			# others are accumulated as "-env <value>". Piping through
			# xargs collapses the surrounding whitespace.
			if [ -z "`echo $2 | grep I_MPI_PIN`" ]; then
				MPI_ENV=`echo "${MPI_ENV} -env $2" | xargs`
			fi
			shift 2
			;;
		-h | --hostfile )
			require_value $# "a file name for -h or --hostfile option"
			# Stored in the form it is later handed to mpirun.
			HOSTFILE="-hostfile $2"
			shift 2
			;;
		--help )
			help_exit
			;;
		* )
			COMMAND=$@
			break
			;;
	esac
done
# Validate the parsed options and derive the values that were not given.
# All expansions inside [ ] are quoted so that empty values and values
# containing whitespace are tested as a single word.

# Ranks per node must be known, either from the command line or from the
# environment defaults.
if [ -z "${PPN}" ]; then
	echo "error: please specify the number of ranks per node" >&2
	help_exit
fi

# Unless explicitly specified, use Fujitsu inherited value
if [ -z "${NODES}" ]; then
	NODES=${PJM_VNODES}
fi

# The job size must be derivable from either the rank count or the node count.
if [ -z "${RANKS}" ] && [ -z "${NODES}" ]; then
	echo "error: please specify the total number of ranks or the number of nodes"  >&2
	help_exit
fi

if [ -z "${COMMAND}" ]; then
	echo "error: please specify command" >&2
	help_exit
fi

# Calculate total job size if not specified
if [ -z "${RANKS}" ]; then
	RANKS=$(( ${PPN} * ${NODES} ))
fi

# Support direct SSH when not executed from Fujitsu job system
if [ -z "${PJM_VNODES}" ]; then
	HOSTFILE="-launcher-exec ssh ${HOSTFILE}"
fi
# MPI / PSM2 / HFI settings exported to the launched job.
export I_MPI_PIN=off \
	PSM2_RCVTHREAD=0 \
	HFI_NO_CPUAFFINITY=1 \
	I_MPI_COLL_INTRANODE_SHM_THRESHOLD=4194304 \
	PSM2_MQ_RNDV_HFI_WINDOW=4194304 \
	PSM2_MQ_EAGER_SDMA_SZ=65536 \
	PSM2_MQ_RNDV_HFI_THRESH=200000

# Launch mcexec through mpirun. HOSTFILE, MPI_ENV, NUMA and COMMAND are
# intentionally left unquoted: each holds several words that must be passed
# as separate arguments.
mpirun ${HOSTFILE} -n ${RANKS} -ppn ${PPN} ${MPI_ENV} \
	${BINDIR}/mcexec -n ${PPN} ${NUMA} \
	--enable-hfi1 \
	--mpol-threshold=1M \
	--stack-premap=4M,4G \
	--extend-heap-by=8M \
	--disable-sched-yield \
	--mpol-shm-premap \
	${COMMAND}

View File

@ -3,13 +3,19 @@
/* Path of install directory for binary */
#undef BINDIR
/* IHK build-id to confirm IHK and McKernel built at the same time are used */
#undef BUILDID
/* whether mcoverlayfs is enabled */
#undef ENABLE_MCOVERLAYFS
/* whether memdump feature is enabled */
#undef ENABLE_MEMDUMP
/* whether mcoverlayfs is enabled */
/* whether perf is enabled */
#undef ENABLE_PERF
/* whether qlmpi is enabled */
#undef ENABLE_QLMPI
/* whether rusage is enabled */

114
configure vendored
View File

@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.69 for mckernel 0.9.0.
# Generated by GNU Autoconf 2.69 for mckernel 1.5.1-knl+hfi.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@ -577,8 +577,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='mckernel'
PACKAGE_TARNAME='mckernel'
PACKAGE_VERSION='0.9.0'
PACKAGE_STRING='mckernel 0.9.0'
PACKAGE_VERSION='1.5.1-knl+hfi'
PACKAGE_STRING='mckernel 1.5.1-knl+hfi'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@ -645,6 +645,7 @@ TARGET
UNAME_R
KDIR
ARCH
BUILDID
XCC
FGREP
EGREP
@ -708,6 +709,7 @@ enable_dcfa
enable_memdump
enable_mcoverlayfs
enable_rusage
enable_perf
enable_qlmpi
with_uname_r
'
@ -1260,7 +1262,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures mckernel 0.9.0 to adapt to many kinds of systems.
\`configure' configures mckernel 1.5.1-knl+hfi to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@ -1321,7 +1323,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of mckernel 0.9.0:";;
short | recursive ) echo "Configuration of mckernel 1.5.1-knl+hfi:";;
esac
cat <<\_ACEOF
@ -1333,6 +1335,7 @@ Optional Features:
--enable-memdump enable dumping memory and analyzing a dump
--enable-mcoverlayfs enable mcoverlayfs implementation
--enable-rusage enable rusage implementation
--enable-perf enable perf_event implementation
--enable-qlmpi enable qlmpi implementation
Optional Packages:
@ -1428,7 +1431,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
mckernel configure 0.9.0
mckernel configure 1.5.1-knl+hfi
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@ -1726,7 +1729,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by mckernel $as_me 0.9.0, which was
It was created by mckernel $as_me 1.5.1-knl+hfi, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@ -2079,12 +2082,12 @@ ac_compiler_gnu=$ac_cv_c_compiler_gnu
IHK_VERSION=0.9.0
MCKERNEL_VERSION=0.9.0
DCFA_VERSION=0.9.0
IHK_RELEASE_DATE=2013-11-18
MCKERNEL_RELEASE_DATE=2013-11-18
DCFA_RELEASE_DATE=2013-11-18
IHK_VERSION=1.5.1-knl+hfi
MCKERNEL_VERSION=1.5.1-knl+hfi
DCFA_VERSION=DCFA_VERSION_m4
IHK_RELEASE_DATE=2019-05-14
MCKERNEL_RELEASE_DATE=2019-05-14
DCFA_RELEASE_DATE=DCFA_RELEASE_DATE_m4
@ -3568,6 +3571,14 @@ else
fi
# Check whether --enable-perf was given.
if test "${enable_perf+set}" = set; then :
enableval=$enable_perf; ENABLE_PERF=$enableval
else
ENABLE_PERF=yes
fi
# Check whether --enable-qlmpi was given.
if test "${enable_qlmpi+set}" = set; then :
enableval=$enable_qlmpi; ENABLE_QLMPI=$enableval
@ -4275,7 +4286,7 @@ case $WITH_TARGET in
KMODDIR="$prefix/kmod"
fi
if test "X$MANDIR" = X; then
MANDIR="$prefix/man"
MANDIR="$prefix/share/man"
fi
;;
builtin-mic)
@ -4292,7 +4303,7 @@ case $WITH_TARGET in
KMODDIR="$prefix/attached/kmod"
fi
if test "X$MANDIR" = X; then
MANDIR="$prefix/attached/man"
MANDIR="$prefix/share/man"
fi
;;
builtin-x86)
@ -4309,7 +4320,7 @@ case $WITH_TARGET in
KMODDIR="$prefix/kmod"
fi
if test "X$MANDIR" = X; then
MANDIR="$prefix/attached/man"
MANDIR="$prefix/share/man"
fi
;;
smp-x86)
@ -4341,7 +4352,7 @@ case $WITH_TARGET in
KMODDIR="$prefix/kmod"
fi
if test "X$MANDIR" = X; then
MANDIR="$prefix/smp-x86/man"
MANDIR="$prefix/share/man"
fi
;;
smp-arm64)
@ -4366,11 +4377,14 @@ case $WITH_TARGET in
if test "X$ETCDIR" = X; then
ETCDIR="$prefix/etc"
fi
if test "X$INCLUDEDIR" = X; then
INCLUDEDIR="$prefix/include"
fi
if test "X$KMODDIR" = X; then
KMODDIR="$prefix/kmod"
fi
if test "X$MANDIR" = X; then
MANDIR="$prefix/smp-arm64/man"
MANDIR="$prefix/share/man"
fi
;;
*)
@ -4961,6 +4975,17 @@ else
$as_echo "$as_me: rusage is disabled" >&6;}
fi
if test "x$ENABLE_PERF" = "xyes" ; then
$as_echo "#define ENABLE_PERF 1" >>confdefs.h
{ $as_echo "$as_me:${as_lineno-$LINENO}: perf is enabled" >&5
$as_echo "$as_me: perf is enabled" >&6;}
else
{ $as_echo "$as_me:${as_lineno-$LINENO}: perf is disabled" >&5
$as_echo "$as_me: perf is disabled" >&6;}
fi
if test "x$MCKERNEL_INCDIR" != "x" ; then
cat >>confdefs.h <<_ACEOF
@ -4988,6 +5013,20 @@ cat >>confdefs.h <<_ACEOF
_ACEOF
ABS_SRCDIR=$( cd $( dirname $0 ); pwd )
IHK_ABS_SRCDIR=${ABS_SRCDIR}/../ihk
BUILDID=$( cd $IHK_ABS_SRCDIR; if [ ! -d .git ]; then echo $IHK_VERSION; else bash -c 'git rev-list -1 HEAD | cut -c1-8'; fi )
{ $as_echo "$as_me:${as_lineno-$LINENO}: BUILDID=$BUILDID" >&5
$as_echo "$as_me: BUILDID=$BUILDID" >&6;}
if test "x$BUILDID" != "x" ; then
cat >>confdefs.h <<_ACEOF
#define BUILDID "$BUILDID"
_ACEOF
fi
@ -5021,9 +5060,14 @@ ac_config_headers="$ac_config_headers config.h"
# POSTK_DEBUG_ARCH_DEP_37
# AC_CONFIG_FILES arch dependfiles separate
ac_config_files="$ac_config_files Makefile executer/user/Makefile executer/user/mcexec.1:executer/user/mcexec.1in executer/user/vmcore2mckdump executer/user/arch/$ARCH/Makefile executer/user/arch/x86_64/Makefile executer/kernel/mcctrl/Makefile executer/kernel/mcctrl/arch/$ARCH/Makefile executer/kernel/mcoverlayfs/Makefile executer/kernel/mcoverlayfs/linux-3.10.0-327.36.1.el7/Makefile executer/kernel/mcoverlayfs/linux-4.0.9/Makefile executer/kernel/mcoverlayfs/linux-4.6.7/Makefile executer/include/qlmpilib.h kernel/Makefile kernel/Makefile.build kernel/include/swapfmt.h arch/x86/tools/mcreboot-attached-mic.sh arch/x86/tools/mcshutdown-attached-mic.sh arch/x86/tools/mcreboot-builtin-x86.sh arch/x86/tools/mcreboot-smp-x86.sh arch/x86/tools/mcstop+release-smp-x86.sh arch/x86/tools/mcoverlay-destroy-smp-x86.sh arch/x86/tools/mcoverlay-create-smp-x86.sh arch/x86/tools/eclair-dump-backtrace.exp arch/x86/tools/mcshutdown-builtin-x86.sh arch/x86/tools/mcreboot.1:arch/x86/tools/mcreboot.1in arch/x86/tools/irqbalance_mck.service arch/x86/tools/irqbalance_mck.in"
ac_config_files="$ac_config_files Makefile executer/user/Makefile executer/user/mcexec.1:executer/user/mcexec.1in executer/user/vmcore2mckdump executer/user/arch/$ARCH/Makefile executer/user/arch/x86_64/Makefile executer/kernel/mcctrl/Makefile executer/kernel/mcctrl/arch/$ARCH/Makefile executer/kernel/mcoverlayfs/Makefile executer/kernel/mcoverlayfs/linux-3.10.0-327.36.1.el7/Makefile executer/kernel/mcoverlayfs/linux-4.0.9/Makefile executer/kernel/mcoverlayfs/linux-4.6.7/Makefile executer/include/qlmpilib.h kernel/Makefile kernel/Makefile.build kernel/include/swapfmt.h arch/x86_64/tools/mcreboot-attached-mic.sh arch/x86_64/tools/mcshutdown-attached-mic.sh arch/x86_64/tools/mcreboot-builtin-x86.sh arch/x86_64/tools/mcreboot-smp-x86.sh arch/x86_64/tools/mcstop+release-smp-x86.sh arch/x86_64/tools/mcoverlay-destroy-smp-x86.sh arch/x86_64/tools/mcoverlay-create-smp-x86.sh arch/x86_64/tools/eclair-dump-backtrace.exp arch/x86_64/tools/mcshutdown-builtin-x86.sh arch/x86_64/tools/mcreboot.1:arch/x86_64/tools/mcreboot.1in arch/x86_64/tools/mpimcexec arch/x86_64/tools/mpimcexec.1:arch/x86_64/tools/mpimcexec.1in arch/x86_64/tools/irqbalance_mck.service arch/x86_64/tools/irqbalance_mck.in tools/mcstat/Makefile"
if test "$TARGET" = "smp-x86"; then
ac_config_files="$ac_config_files arch/x86_64/kernel/Makefile.arch"
fi
if test "$TARGET" = "smp-arm64"; then
ac_config_files="$ac_config_files kernel/config/config.smp-arm64 arch/arm64/kernel/vdso/Makefile arch/arm64/kernel/Makefile.arch"
@ -5541,7 +5585,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by mckernel $as_me 0.9.0, which was
This file was extended by mckernel $as_me 1.5.1-knl+hfi, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@ -5603,7 +5647,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
mckernel config.status 0.9.0
mckernel config.status 1.5.1-knl+hfi
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"
@ -5741,18 +5785,22 @@ do
"kernel/Makefile") CONFIG_FILES="$CONFIG_FILES kernel/Makefile" ;;
"kernel/Makefile.build") CONFIG_FILES="$CONFIG_FILES kernel/Makefile.build" ;;
"kernel/include/swapfmt.h") CONFIG_FILES="$CONFIG_FILES kernel/include/swapfmt.h" ;;
"arch/x86/tools/mcreboot-attached-mic.sh") CONFIG_FILES="$CONFIG_FILES arch/x86/tools/mcreboot-attached-mic.sh" ;;
"arch/x86/tools/mcshutdown-attached-mic.sh") CONFIG_FILES="$CONFIG_FILES arch/x86/tools/mcshutdown-attached-mic.sh" ;;
"arch/x86/tools/mcreboot-builtin-x86.sh") CONFIG_FILES="$CONFIG_FILES arch/x86/tools/mcreboot-builtin-x86.sh" ;;
"arch/x86/tools/mcreboot-smp-x86.sh") CONFIG_FILES="$CONFIG_FILES arch/x86/tools/mcreboot-smp-x86.sh" ;;
"arch/x86/tools/mcstop+release-smp-x86.sh") CONFIG_FILES="$CONFIG_FILES arch/x86/tools/mcstop+release-smp-x86.sh" ;;
"arch/x86/tools/mcoverlay-destroy-smp-x86.sh") CONFIG_FILES="$CONFIG_FILES arch/x86/tools/mcoverlay-destroy-smp-x86.sh" ;;
"arch/x86/tools/mcoverlay-create-smp-x86.sh") CONFIG_FILES="$CONFIG_FILES arch/x86/tools/mcoverlay-create-smp-x86.sh" ;;
"arch/x86/tools/eclair-dump-backtrace.exp") CONFIG_FILES="$CONFIG_FILES arch/x86/tools/eclair-dump-backtrace.exp" ;;
"arch/x86/tools/mcshutdown-builtin-x86.sh") CONFIG_FILES="$CONFIG_FILES arch/x86/tools/mcshutdown-builtin-x86.sh" ;;
"arch/x86/tools/mcreboot.1") CONFIG_FILES="$CONFIG_FILES arch/x86/tools/mcreboot.1:arch/x86/tools/mcreboot.1in" ;;
"arch/x86/tools/irqbalance_mck.service") CONFIG_FILES="$CONFIG_FILES arch/x86/tools/irqbalance_mck.service" ;;
"arch/x86/tools/irqbalance_mck.in") CONFIG_FILES="$CONFIG_FILES arch/x86/tools/irqbalance_mck.in" ;;
"arch/x86_64/tools/mcreboot-attached-mic.sh") CONFIG_FILES="$CONFIG_FILES arch/x86_64/tools/mcreboot-attached-mic.sh" ;;
"arch/x86_64/tools/mcshutdown-attached-mic.sh") CONFIG_FILES="$CONFIG_FILES arch/x86_64/tools/mcshutdown-attached-mic.sh" ;;
"arch/x86_64/tools/mcreboot-builtin-x86.sh") CONFIG_FILES="$CONFIG_FILES arch/x86_64/tools/mcreboot-builtin-x86.sh" ;;
"arch/x86_64/tools/mcreboot-smp-x86.sh") CONFIG_FILES="$CONFIG_FILES arch/x86_64/tools/mcreboot-smp-x86.sh" ;;
"arch/x86_64/tools/mcstop+release-smp-x86.sh") CONFIG_FILES="$CONFIG_FILES arch/x86_64/tools/mcstop+release-smp-x86.sh" ;;
"arch/x86_64/tools/mcoverlay-destroy-smp-x86.sh") CONFIG_FILES="$CONFIG_FILES arch/x86_64/tools/mcoverlay-destroy-smp-x86.sh" ;;
"arch/x86_64/tools/mcoverlay-create-smp-x86.sh") CONFIG_FILES="$CONFIG_FILES arch/x86_64/tools/mcoverlay-create-smp-x86.sh" ;;
"arch/x86_64/tools/eclair-dump-backtrace.exp") CONFIG_FILES="$CONFIG_FILES arch/x86_64/tools/eclair-dump-backtrace.exp" ;;
"arch/x86_64/tools/mcshutdown-builtin-x86.sh") CONFIG_FILES="$CONFIG_FILES arch/x86_64/tools/mcshutdown-builtin-x86.sh" ;;
"arch/x86_64/tools/mcreboot.1") CONFIG_FILES="$CONFIG_FILES arch/x86_64/tools/mcreboot.1:arch/x86_64/tools/mcreboot.1in" ;;
"arch/x86_64/tools/mpimcexec") CONFIG_FILES="$CONFIG_FILES arch/x86_64/tools/mpimcexec" ;;
"arch/x86_64/tools/mpimcexec.1") CONFIG_FILES="$CONFIG_FILES arch/x86_64/tools/mpimcexec.1:arch/x86_64/tools/mpimcexec.1in" ;;
"arch/x86_64/tools/irqbalance_mck.service") CONFIG_FILES="$CONFIG_FILES arch/x86_64/tools/irqbalance_mck.service" ;;
"arch/x86_64/tools/irqbalance_mck.in") CONFIG_FILES="$CONFIG_FILES arch/x86_64/tools/irqbalance_mck.in" ;;
"tools/mcstat/Makefile") CONFIG_FILES="$CONFIG_FILES tools/mcstat/Makefile" ;;
"arch/x86_64/kernel/Makefile.arch") CONFIG_FILES="$CONFIG_FILES arch/x86_64/kernel/Makefile.arch" ;;
"kernel/config/config.smp-arm64") CONFIG_FILES="$CONFIG_FILES kernel/config/config.smp-arm64" ;;
"arch/arm64/kernel/vdso/Makefile") CONFIG_FILES="$CONFIG_FILES arch/arm64/kernel/vdso/Makefile" ;;
"arch/arm64/kernel/Makefile.arch") CONFIG_FILES="$CONFIG_FILES arch/arm64/kernel/Makefile.arch" ;;

View File

@ -1,11 +1,9 @@
# configure.ac COPYRIGHT FUJITSU LIMITED 2015-2016
AC_PREREQ(2.63)
m4_define([IHK_VERSION_m4],[0.9.0])dnl
m4_define([MCKERNEL_VERSION_m4],[0.9.0])dnl
m4_define([DCFA_VERSION_m4],[0.9.0])dnl
m4_define([IHK_RELEASE_DATE_m4],[2013-11-18])dnl
m4_define([MCKERNEL_RELEASE_DATE_m4],[2013-11-18])dnl
m4_define([DCFA_RELEASE_DATE_m4],[2013-11-18])dnl
m4_define([IHK_VERSION_m4],[1.5.1-knl+hfi])dnl
m4_define([MCKERNEL_VERSION_m4],[1.5.1-knl+hfi])dnl
m4_define([IHK_RELEASE_DATE_m4],[2019-05-14])dnl
m4_define([MCKERNEL_RELEASE_DATE_m4],[2019-05-14])dnl
AC_INIT([mckernel], MCKERNEL_VERSION_m4)
@ -134,6 +132,12 @@ AC_ARG_ENABLE([rusage],
[ENABLE_RUSAGE=$enableval],
[ENABLE_RUSAGE=yes])
AC_ARG_ENABLE([perf],
AC_HELP_STRING([--enable-perf],
[enable perf_event implementation]),
[ENABLE_PERF=$enableval],
[ENABLE_PERF=yes])
AC_ARG_ENABLE([qlmpi],
AC_HELP_STRING([--enable-qlmpi],
[enable qlmpi implementation]),
@ -225,7 +229,7 @@ case $WITH_TARGET in
KMODDIR="$prefix/kmod"
fi
if test "X$MANDIR" = X; then
MANDIR="$prefix/man"
MANDIR="$prefix/share/man"
fi
;;
builtin-mic)
@ -242,7 +246,7 @@ case $WITH_TARGET in
KMODDIR="$prefix/attached/kmod"
fi
if test "X$MANDIR" = X; then
MANDIR="$prefix/attached/man"
MANDIR="$prefix/share/man"
fi
;;
builtin-x86)
@ -259,7 +263,7 @@ case $WITH_TARGET in
KMODDIR="$prefix/kmod"
fi
if test "X$MANDIR" = X; then
MANDIR="$prefix/attached/man"
MANDIR="$prefix/share/man"
fi
;;
smp-x86)
@ -291,7 +295,7 @@ case $WITH_TARGET in
KMODDIR="$prefix/kmod"
fi
if test "X$MANDIR" = X; then
MANDIR="$prefix/smp-x86/man"
MANDIR="$prefix/share/man"
fi
;;
smp-arm64)
@ -316,11 +320,14 @@ case $WITH_TARGET in
if test "X$ETCDIR" = X; then
ETCDIR="$prefix/etc"
fi
if test "X$INCLUDEDIR" = X; then
INCLUDEDIR="$prefix/include"
fi
if test "X$KMODDIR" = X; then
KMODDIR="$prefix/kmod"
fi
if test "X$MANDIR" = X; then
MANDIR="$prefix/smp-arm64/man"
MANDIR="$prefix/share/man"
fi
;;
*)
@ -451,7 +458,7 @@ else
fi
if test "x$ENABLE_QLMPI" = "xyes" ; then
AC_DEFINE([ENABLE_QLMPI],[1],[whether mcoverlayfs is enabled])
AC_DEFINE([ENABLE_QLMPI],[1],[whether qlmpi is enabled])
AC_MSG_NOTICE([qlmpi is enabled])
else
AC_MSG_NOTICE([qlmpi is disabled])
@ -475,6 +482,13 @@ else
AC_MSG_NOTICE([rusage is disabled])
fi
if test "x$ENABLE_PERF" = "xyes" ; then
AC_DEFINE([ENABLE_PERF],[1],[whether perf is enabled])
AC_MSG_NOTICE([perf is enabled])
else
AC_MSG_NOTICE([perf is disabled])
fi
if test "x$MCKERNEL_INCDIR" != "x" ; then
AC_DEFINE_UNQUOTED(MCKERNEL_INCDIR,"$MCKERNEL_INCDIR",[McKernel specific headers])
fi
@ -486,6 +500,15 @@ fi
AC_DEFINE_UNQUOTED(BINDIR,"$BINDIR",[Path of install directory for binary])
AC_DEFINE_UNQUOTED(SBINDIR,"$SBINDIR",[Path of install directory for system binary])
ABS_SRCDIR=$( cd $( dirname $0 ); pwd )
IHK_ABS_SRCDIR=${ABS_SRCDIR}/../ihk
BUILDID=$( cd $IHK_ABS_SRCDIR; if @<:@ ! -d .git @:>@; then echo $IHK_VERSION; else bash -c 'git rev-list -1 HEAD | cut -c1-8'; fi )
AC_MSG_NOTICE([BUILDID=$BUILDID])
if test "x$BUILDID" != "x" ; then
AC_DEFINE_UNQUOTED(BUILDID,"$BUILDID",[IHK build-id to confirm IHK and McKernel built at the same time are used])
fi
AC_SUBST(BUILDID)
AC_SUBST(CC)
AC_SUBST(XCC)
AC_SUBST(ARCH)
@ -535,20 +558,29 @@ AC_CONFIG_FILES([
kernel/Makefile
kernel/Makefile.build
kernel/include/swapfmt.h
arch/x86/tools/mcreboot-attached-mic.sh
arch/x86/tools/mcshutdown-attached-mic.sh
arch/x86/tools/mcreboot-builtin-x86.sh
arch/x86/tools/mcreboot-smp-x86.sh
arch/x86/tools/mcstop+release-smp-x86.sh
arch/x86/tools/mcoverlay-destroy-smp-x86.sh
arch/x86/tools/mcoverlay-create-smp-x86.sh
arch/x86/tools/eclair-dump-backtrace.exp
arch/x86/tools/mcshutdown-builtin-x86.sh
arch/x86/tools/mcreboot.1:arch/x86/tools/mcreboot.1in
arch/x86/tools/irqbalance_mck.service
arch/x86/tools/irqbalance_mck.in
arch/x86_64/tools/mcreboot-attached-mic.sh
arch/x86_64/tools/mcshutdown-attached-mic.sh
arch/x86_64/tools/mcreboot-builtin-x86.sh
arch/x86_64/tools/mcreboot-smp-x86.sh
arch/x86_64/tools/mcstop+release-smp-x86.sh
arch/x86_64/tools/mcoverlay-destroy-smp-x86.sh
arch/x86_64/tools/mcoverlay-create-smp-x86.sh
arch/x86_64/tools/eclair-dump-backtrace.exp
arch/x86_64/tools/mcshutdown-builtin-x86.sh
arch/x86_64/tools/mcreboot.1:arch/x86_64/tools/mcreboot.1in
arch/x86_64/tools/mpimcexec
arch/x86_64/tools/mpimcexec.1:arch/x86_64/tools/mpimcexec.1in
arch/x86_64/tools/irqbalance_mck.service
arch/x86_64/tools/irqbalance_mck.in
tools/mcstat/Makefile
])
if test "$TARGET" = "smp-x86"; then
AC_CONFIG_FILES([
arch/x86_64/kernel/Makefile.arch
])
fi
if test "$TARGET" = "smp-arm64"; then
AC_CONFIG_FILES([
kernel/config/config.smp-arm64

View File

@ -5,6 +5,10 @@
#define IHK_MAX_NUM_NUMA_NODES 1024
#define IHK_MAX_NUM_CPUS 1024
#define IHK_OS_PGSIZE_4KB 0
#define IHK_OS_PGSIZE_2MB 1
#define IHK_OS_PGSIZE_1GB 2
struct mckernel_rusage {
unsigned long memory_stat_rss[IHK_MAX_NUM_PGSIZES];
unsigned long memory_stat_mapped_file[IHK_MAX_NUM_PGSIZES];

View File

@ -91,6 +91,7 @@ struct program_image_section {
struct get_cpu_set_arg {
int nr_processes;
int *process_rank;
void *cpu_set;
size_t cpu_set_size; // Size in bytes
int *target_core;
@ -109,6 +110,8 @@ typedef unsigned long __cpu_set_unit;
#define MPOL_NO_BSS 0x04
#define MPOL_SHM_PREMAP 0x08
#define MCEXEC_HFI1 0x01
struct program_load_desc {
int num_sections;
int status;
@ -137,10 +140,14 @@ struct program_load_desc {
unsigned long envs_len;
struct rlimit rlimit[MCK_RLIM_MAX];
unsigned long interp_align;
unsigned long mcexec_flags;
unsigned long mpol_flags;
unsigned long mpol_threshold;
unsigned long heap_extension;
long stack_premap;
unsigned long mpol_bind_mask;
int nr_processes;
int process_rank;
char shell_path[SHELL_PATH_MAX_LEN];
__cpu_set_unit cpu_set[PLD_CPU_SET_SIZE];
int profile;
@ -187,6 +194,7 @@ struct syscall_response {
long ret;
unsigned long fault_address;
unsigned long fault_reason;
void *private_data;
};
struct syscall_ret_desc {

View File

@ -9,13 +9,22 @@ IHK_BASE=$(src)/../../../../ihk
obj-m += mcctrl.o
# POSTK_DEBUG_ARCH_DEP_1, arch depend "-mcmodel"
# POSTK_DEBUG_ARCH_DEP_83, arch depend translate_rva_to_rpa() move
ifeq ($(ARCH), arm64)
ccflags-y := -I$(IHK_BASE)/linux/include -I$(IHK_BASE)/linux/include/ihk/arch/$(ARCH) -I$(IHK_BASE)/ikc/include -I$(IHK_BASE)/ikc/include/ikc/arch/$(ARCH) -I$(IHK_BASE)/include -I$(IHK_BASE)/include/arch/$(ARCH) -I$(src)/../../include -I$(src)/arch/$(ARCH)/include -DMCEXEC_PATH=\"$(BINDIR)/mcexec\" -I@abs_builddir@
else
ccflags-y := -I$(IHK_BASE)/linux/include -I$(IHK_BASE)/linux/include/ihk/arch/$(ARCH) -I$(IHK_BASE)/ikc/include -I$(IHK_BASE)/ikc/include/ikc/arch/$(ARCH) -I$(IHK_BASE)/include -I$(IHK_BASE)/include/arch/$(ARCH) -I$(src)/../../../kernel/include -I$(src)/../../include -mcmodel=kernel -mno-red-zone -DMCEXEC_PATH=\"$(BINDIR)/mcexec\" -I@abs_builddir@ -I@abs_builddir@/../../../
endif
ccflags-y := -I$(IHK_BASE)/linux/include \
-I$(IHK_BASE)/linux/include/ihk/arch/$(ARCH) \
-I$(IHK_BASE)/ikc/include \
-I$(IHK_BASE)/ikc/include/ikc/arch/$(ARCH) \
-I$(IHK_BASE)/include \
-I$(IHK_BASE)/include/arch/$(ARCH) \
-I$(src)/../../include \
-I$(src)/arch/$(ARCH)/include \
-I@abs_builddir@ \
-I@abs_builddir@/../../../ \
-I$(src)/../../../kernel/include \
-DMCEXEC_PATH=\"$(BINDIR)/mcexec\"
# depending arch
include @abs_builddir@/arch/$(ARCH)/Makefile
mcctrl-y := driver.o control.o ikc.o syscall.o procfs.o binfmt_mcexec.o
mcctrl-y += sysfs.o sysfs_files.o arch/$(ARCH)/archdeps.o

View File

@ -1 +1 @@
# dummy file
ccflags-y += -mno-red-zone -mcmodel=kernel

View File

@ -327,6 +327,14 @@ int translate_rva_to_rpa(ihk_os_t os, unsigned long rpt, unsigned long rva,
pgsize = 1UL << offsh;
rpa = pt[ix] & ((1UL << 52) - 1) & ~(pgsize - 1);
rpa |= rva & (pgsize - 1);
/* For GB pages, just report regular 2MB page */
if (offsh == 30) {
pgsize = 1UL << 21;
dprintk("%s: GB page translated 0x%lx -> 0x%lx, pgsize: %lu\n",
__FUNCTION__, rva, rpa, pgsize);
}
ihk_device_unmap_virtual(ihk_os_to_dev(os), pt, PAGE_SIZE);
ihk_device_unmap_memory(ihk_os_to_dev(os), phys, PAGE_SIZE);
error = 0;

View File

@ -190,7 +190,11 @@ static long mcexec_prepare_image(ihk_os_t os,
pdesc->status = 0;
mb();
mcctrl_ikc_send(os, pdesc->cpu, &isp);
ret = mcctrl_ikc_send(os, pdesc->cpu, &isp);
if(ret < 0) {
printk("%s: ERROR mcctrl_ikc_send: %d\n", __FUNCTION__, ret);
goto put_and_free_out;
}
ret = wait_event_interruptible(ppd->wq_prepare, pdesc->status);
if (ret < 0) {
@ -363,7 +367,7 @@ static long mcexec_debug_log(ihk_os_t os, unsigned long arg)
}
int mcexec_close_exec(ihk_os_t os);
int mcexec_destroy_per_process_data(ihk_os_t os);
int mcexec_destroy_per_process_data(ihk_os_t os, int pid);
static void release_handler(ihk_os_t os, void *param)
{
@ -383,7 +387,7 @@ static void release_handler(ihk_os_t os, void *param)
mcexec_close_exec(os);
mcexec_destroy_per_process_data(os);
mcexec_destroy_per_process_data(os, info->pid);
memset(&isp, '\0', sizeof isp);
isp.msg = SCD_MSG_CLEANUP_PROCESS;
@ -431,6 +435,7 @@ static long mcexec_start_image(ihk_os_t os,
struct mcctrl_channel *c;
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
struct mcos_handler_info *info;
int ret = 0;
desc = kmalloc(sizeof(*desc), GFP_KERNEL);
if (!desc) {
@ -441,17 +446,18 @@ static long mcexec_start_image(ihk_os_t os,
if (copy_from_user(desc, udesc,
sizeof(struct program_load_desc))) {
kfree(desc);
return -EFAULT;
ret = -EFAULT;
goto out;
}
info = new_mcos_handler_info(os, file);
#ifdef POSTK_DEBUG_TEMP_FIX_64 /* host process is SIGKILLed fix. */
if (info == NULL) {
kfree(desc);
return -ENOMEM;
ret = -ENOMEM;
goto out;
}
#endif /* POSTK_DEBUG_TEMP_FIX_64 */
info->pid = desc->pid;
info->cpu = desc->cpu;
ihk_os_register_release_handler(file, release_handler, info);
@ -467,10 +473,14 @@ static long mcexec_start_image(ihk_os_t os,
isp.ref = desc->cpu;
isp.arg = desc->rprocess;
mcctrl_ikc_send(os, desc->cpu, &isp);
ret = mcctrl_ikc_send(os, desc->cpu, &isp);
if (ret < 0) {
printk("%s: error: sending IKC msg\n", __FUNCTION__);
}
out:
kfree(desc);
return 0;
return ret;
}
static DECLARE_WAIT_QUEUE_HEAD(signalq);
@ -628,6 +638,7 @@ static long mcexec_get_cpuset(ihk_os_t os, unsigned long arg)
pli->task = current;
pli->ready = 0;
pli->timeout = 0;
init_waitqueue_head(&pli->pli_wq);
pli_next = NULL;
@ -681,6 +692,7 @@ static long mcexec_get_cpuset(ihk_os_t os, unsigned long arg)
wake_up_interruptible(&pli_next->pli_wq);
/* Reset process counter */
pe->nr_processes_left = pe->nr_processes;
pe->process_rank = 0;
}
/* Wait for the rest if not the last or if the last but
@ -689,11 +701,50 @@ static long mcexec_get_cpuset(ihk_os_t os, unsigned long arg)
dprintk("%s: pid: %d, waiting in list\n",
__FUNCTION__, task_tgid_vnr(current));
mutex_unlock(&pe->lock);
ret = wait_event_interruptible(pli->pli_wq, pli->ready);
/* Timeout period: 10 secs + (#procs * 0.1sec) */
ret = wait_event_interruptible_timeout(pli->pli_wq,
pli->ready,
msecs_to_jiffies(10000 + req.nr_processes * 100));
mutex_lock(&pe->lock);
if (ret != 0) {
/* First timeout task? Wake up everyone else,
* but tell them we timed out */
if (ret == 0) {
printk("%s: error: pid: %d, timed out, waking everyone\n",
__FUNCTION__, task_tgid_vnr(current));
while (!list_empty(&pe->pli_list)) {
pli_next = list_first_entry(&pe->pli_list,
struct process_list_item, list);
list_del(&pli_next->list);
pli_next->ready = 1;
pli_next->timeout = 1;
wake_up_interruptible(&pli_next->pli_wq);
}
/* Reset process counter to start state */
pe->nr_processes = -1;
ret = -ETIMEDOUT;
goto put_and_unlock_out;
}
/* Interrupted or woken up by someone else due to time out? */
if (ret < 0 || pli->timeout) {
if (ret > 0) {
printk("%s: error: pid: %d, job startup timed out\n",
__FUNCTION__, task_tgid_vnr(current));
ret = -ETIMEDOUT;
}
goto put_and_unlock_out;
}
/* Incorrect wakeup state? */
if (!pli->ready) {
printk("%s: error: pid: %d, not ready but woken?\n",
__FUNCTION__, task_tgid_vnr(current));
ret = -EINVAL;
goto put_and_unlock_out;
}
dprintk("%s: pid: %d, woken up\n",
__FUNCTION__, task_tgid_vnr(current));
}
@ -873,6 +924,15 @@ next_cpu:
goto put_and_unlock_out;
}
/* Copy rank */
if (copy_to_user(req.process_rank, &pe->process_rank,
sizeof(int))) {
printk("%s: error copying process rank to user\n",
__FUNCTION__);
ret = -EINVAL;
goto put_and_unlock_out;
}
/* mcexec NUMA to bind to */
mcexec_linux_numa = cpu_to_node(mckernel_cpu_2_linux_cpu(udp, cpu));
if (copy_to_user(req.mcexec_linux_numa, &mcexec_linux_numa,
@ -920,6 +980,7 @@ next_cpu:
}
/* Otherwise wake up next process in list */
else {
++pe->process_rank;
pli_next = list_first_entry(&pe->pli_list,
struct process_list_item, list);
list_del(&pli_next->list);
@ -1012,7 +1073,6 @@ out:
return ret;
}
/* NOTE: per-process data is refcounted.
* For every get call the user should call put. */
struct mcctrl_per_proc_data *mcctrl_get_per_proc_data(
@ -1142,7 +1202,7 @@ int mcexec_syscall(struct mcctrl_usrdata *ud, struct ikc_scd_packet *packet)
return -1;
}
dprintk("%s: (packet_handler) rtid: %d, ttid: %d, sys nr: %d\n",
dprintk("%s: (packet_handler) rtid: %d, ttid: %d, sys nr: %lu\n",
__FUNCTION__,
packet->req.rtid,
packet->req.ttid,
@ -1197,8 +1257,8 @@ retry_alloc:
wqhln->packet = packet;
wqhln->req = 1;
ihk_ikc_spinlock_unlock(&ppd->wq_list_lock, flags);
wake_up(&wqhln->wq_syscall);
ihk_ikc_spinlock_unlock(&ppd->wq_list_lock, flags);
mcctrl_put_per_proc_data(ppd);
@ -1312,7 +1372,7 @@ retry_alloc:
}
packet->req.valid = 0; /* ack */
dprintk("%s: system call: %d, args[0]: %lu, args[1]: %lu, args[2]: %lu, "
dprintk("%s: system call: %lu, args[0]: %lu, args[1]: %lu, args[2]: %lu, "
"args[3]: %lu, args[4]: %lu, args[5]: %lu\n",
__FUNCTION__,
packet->req.number,
@ -1340,7 +1400,6 @@ retry_alloc:
goto put_ppd_out;
}
#ifdef POSTK_DEBUG_ARCH_DEP_46 /* user area direct access fix. */
if (copy_to_user(&req->cpu, &packet->ref, sizeof(req->cpu))) {
if (mcctrl_delete_per_thread_data(ppd, current) < 0) {
kprintf("%s: error deleting per-thread data\n", __FUNCTION__);
@ -1348,9 +1407,6 @@ retry_alloc:
ret = -EINVAL;
goto put_ppd_out;
}
#else /* POSTK_DEBUG_ARCH_DEP_46 */
req->cpu = packet->ref;
#endif /* POSTK_DEBUG_ARCH_DEP_46 */
ret = 0;
goto put_ppd_out;
@ -1441,7 +1497,7 @@ long mcexec_load_syscall(ihk_os_t os, struct syscall_load_desc *__user arg)
rpm = ihk_device_map_virtual(ihk_os_to_dev(os), phys, desc.size, NULL, 0);
#endif
dprintk("mcexec_load_syscall: %s (desc.size: %d)\n", rpm, desc.size);
dprintk("mcexec_load_syscall: %p (desc.size: %lu)\n", rpm, desc.size);
if (copy_to_user((void *__user)desc.dest, rpm, desc.size)) {
return -EFAULT;
@ -1676,12 +1732,12 @@ int mcexec_create_per_process_data(ihk_os_t os)
return 0;
}
int mcexec_destroy_per_process_data(ihk_os_t os)
int mcexec_destroy_per_process_data(ihk_os_t os, int pid)
{
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
struct mcctrl_per_proc_data *ppd = NULL;
ppd = mcctrl_get_per_proc_data(usrdata, task_tgid_vnr(current));
ppd = mcctrl_get_per_proc_data(usrdata, pid);
if (ppd) {
/* One for the reference and one for deallocation.
@ -2427,7 +2483,9 @@ mcexec_terminate_thread(ihk_os_t os, unsigned long *param, struct file *file)
mcctrl_delete_per_thread_data(ppd, tsk);
__return_syscall(usrdata->os, packet, param[2], tid);
ihk_ikc_release_packet((struct ihk_ikc_free_packet *)packet,
(usrdata->channels + packet->ref)->c);
(usrdata->ikc2linux[smp_processor_id()] ?
usrdata->ikc2linux[smp_processor_id()] :
usrdata->ikc2linux[0]));
err:
if(ppd)
mcctrl_put_per_proc_data(ppd);

View File

@ -27,6 +27,7 @@
#include <linux/miscdevice.h>
#include <linux/slab.h>
#include <linux/device.h>
#include <linux/delay.h>
#include "mcctrl.h"
#include <ihk/ihk_host_user.h>
@ -169,6 +170,14 @@ error_cleanup_channels:
int mcctrl_os_shutdown_notifier(int os_index)
{
if (os[os_index]) {
/* Wait for os running */
if (ihk_os_wait_for_status(os[os_index], IHK_OS_STATUS_RUNNING, 0, 200) != 0) {
printk("IHK: OS does not become RUNNING in shutdown. Force shutdown.\n");
/* send nmi to force shutdown */
ihk_os_send_nmi(os[os_index], 3);
mdelay(200);
}
sysfsm_cleanup(os[os_index]);
free_topology_info(os[os_index]);
ihk_os_unregister_user_call_handlers(os[os_index], mcctrl_uc + os_index);

View File

@ -304,6 +304,7 @@ struct node_topology {
struct process_list_item {
int ready;
int timeout;
struct task_struct *task;
struct list_head list;
wait_queue_head_t pli_wq;
@ -313,6 +314,7 @@ struct mcctrl_part_exec {
struct mutex lock;
int nr_processes;
int nr_processes_left;
int process_rank;
cpumask_t cpus_used;
struct list_head pli_list;
};

View File

@ -1019,7 +1019,8 @@ static const struct procfs_entry tid_entry_stuff[] = {
static const struct procfs_entry pid_entry_stuff[] = {
PROC_REG("auxv", S_IRUSR, NULL),
PROC_REG("cgroup", S_IXUSR, NULL),
/* Support the case where McKernel process retrieves its job-id under the Fujitsu TCS suite. */
// PROC_REG("cgroup", S_IXUSR, NULL),
// PROC_REG("clear_refs", S_IWUSR, NULL),
PROC_REG("cmdline", S_IRUGO, NULL),
// PROC_REG("comm", S_IRUGO|S_IWUSR, NULL),

View File

@ -222,6 +222,14 @@ int translate_rva_to_rpa(ihk_os_t os, unsigned long rpt, unsigned long rva,
pgsize = 1UL << offsh;
rpa = pt[ix] & ((1UL << 52) - 1) & ~(pgsize - 1);
rpa |= rva & (pgsize - 1);
/* For GB pages, just report regular 2MB page */
if (offsh == 30) {
pgsize = 1UL << 21;
dprintk("%s: GB page translated 0x%lx -> 0x%lx, pgsize: %lu\n",
__FUNCTION__, rva, rpa, pgsize);
}
ihk_device_unmap_virtual(ihk_os_to_dev(os), pt, PAGE_SIZE);
ihk_device_unmap_memory(ihk_os_to_dev(os), phys, PAGE_SIZE);
error = 0;
@ -799,7 +807,7 @@ static int rus_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
packet = (struct ikc_scd_packet *)mcctrl_get_per_thread_data(ppd, current);
if (!packet) {
error = -ENOENT;
ret = VM_FAULT_SIGBUS;
printk("%s: no packet registered for TID %d\n",
__FUNCTION__, task_pid_vnr(current));
goto put_and_out;
@ -1174,6 +1182,7 @@ struct pager_create_result {
int maxprot;
uint32_t flags;
size_t size;
char path[PATH_MAX];
};
enum {
@ -1192,6 +1201,33 @@ enum {
MF_END
};
/*
 * pager_get_path() - copy the full path name of @file into @path.
 *
 * @file: open file whose path (file->f_path) is resolved with d_path().
 * @path: destination buffer; the callers in this file pass the
 *        char path[PATH_MAX] member of their response structure.
 *
 * On return @path always holds a NUL-terminated string: the resolved
 * path on success, or an empty string if d_path() failed or the
 * temporary buffer could not be allocated.
 *
 * Returns 0 on success (including the case where d_path() failed and
 * an empty path is reported) or -ENOMEM if the temporary buffer could
 * not be allocated.
 */
static int pager_get_path(struct file *file, char *path) {
	int error = 0;
	char *pathbuf, *fullpath;
	size_t len;

	/*
	 * Start from an empty string so that every exit path, including
	 * the allocation failure below, leaves a terminated buffer.
	 */
	path[0] = '\0';

	pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY);
	if (!pathbuf) {
		printk("%s: ERROR: allocating path\n", __FUNCTION__);
		error = -ENOMEM;
		goto out;
	}

	fullpath = d_path(&file->f_path, pathbuf, PATH_MAX);
	if (!IS_ERR(fullpath)) {
		/*
		 * The previous memcpy(path, fullpath, strlen(fullpath))
		 * dropped the terminating NUL, so the reader of @path could
		 * run past the end of the string. Copy the characters and
		 * terminate explicitly; the clamp is purely defensive since
		 * fullpath points into the PATH_MAX sized pathbuf.
		 */
		len = strlen(fullpath);
		if (len >= PATH_MAX) {
			len = PATH_MAX - 1;
		}
		memcpy(path, fullpath, len);
		path[len] = '\0';
	}

out:
	/* kfree(NULL) is a no-op, so no NULL check is needed here. */
	kfree(pathbuf);
	return error;
}
static int pager_req_create(ihk_os_t os, int fd, uintptr_t result_pa)
{
ihk_device_t dev = ihk_os_to_dev(os);
@ -1286,7 +1322,10 @@ static int pager_req_create(ihk_os_t os, int fd, uintptr_t result_pa)
dprintk("%s: filename: %s, premap & zerofill\n",
__FUNCTION__, fullpath);
}
else if (strstr(fullpath, "libmpi") != NULL) {
else if (strstr(fullpath, "libmpi") ||
strstr(fullpath, "libiomp") ||
strstr(fullpath, "libpthread") ||
strstr(fullpath, "libc.so")) {
mf_flags = MF_PREFETCH;
dprintk("%s: filename: %s, prefetch\n",
__FUNCTION__, fullpath);
@ -1325,6 +1364,7 @@ found:
phys = ihk_device_map_memory(dev, result_pa, sizeof(*resp));
resp = ihk_device_map_virtual(dev, phys, sizeof(*resp), NULL, 0);
if (!resp) {
ihk_device_unmap_memory(dev, phys, sizeof(*resp));
printk("%s: ERROR: invalid response structure address\n",
__FUNCTION__);
error = -EINVAL;
@ -1335,10 +1375,18 @@ found:
resp->maxprot = maxprot;
resp->flags = mf_flags;
resp->size = st.size;
error = pager_get_path(file, resp->path);
if (error) {
goto out_unmap;
}
error = 0;
out_unmap:
ihk_device_unmap_virtual(dev, resp, sizeof(*resp));
ihk_device_unmap_memory(dev, phys, sizeof(*resp));
error = 0;
out:
if (newpager) {
kfree(newpager);
@ -1570,6 +1618,7 @@ struct pager_map_result {
uintptr_t handle;
int maxprot;
int8_t padding[4];
char path[PATH_MAX];
};
static int pager_req_map(ihk_os_t os, int fd, size_t len, off_t off,
@ -1624,20 +1673,22 @@ static int pager_req_map(ihk_os_t os, int fd, size_t len, off_t off,
maxprot |= PROT_EXEC;
}
down_write(&current->mm->mmap_sem);
prot_and_flags = MAP_SHARED |
(prot_and_flags & (MAP_POPULATE | MAP_LOCKED));
#define ANY_WHERE 0
if (prot_and_flags & MAP_LOCKED) prot_and_flags |= MAP_POPULATE;
#if LINUX_VERSION_CODE < KERNEL_VERSION(3,5,0)
down_write(&current->mm->mmap_sem);
va = do_mmap_pgoff(file, ANY_WHERE, len, maxprot,
MAP_SHARED | (prot_and_flags & (MAP_POPULATE | MAP_LOCKED)), pgoff);
#endif
prot_and_flags, pgoff);
up_write(&current->mm->mmap_sem);
#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,5,0)
va = vm_mmap(file, ANY_WHERE, len, maxprot, MAP_SHARED |
(prot_and_flags & (MAP_POPULATE | MAP_LOCKED)), pgoff << PAGE_SHIFT);
#else
va = vm_mmap(file, ANY_WHERE, len, maxprot,
prot_and_flags, pgoff << PAGE_SHIFT);
#endif
if (IS_ERR_VALUE(va)) {
@ -1657,6 +1708,7 @@ static int pager_req_map(ihk_os_t os, int fd, size_t len, off_t off,
phys = ihk_device_map_memory(dev, result_rpa, sizeof(*resp));
resp = ihk_device_map_virtual(dev, phys, sizeof(*resp), NULL, 0);
if (!resp) {
ihk_device_unmap_memory(dev, phys, sizeof(*resp));
printk("%s: ERROR: invalid response structure address\n",
__FUNCTION__);
error = -EINVAL;
@ -1665,13 +1717,16 @@ static int pager_req_map(ihk_os_t os, int fd, size_t len, off_t off,
resp->handle = (uintptr_t)pager;
resp->maxprot = maxprot;
ihk_device_unmap_virtual(dev, resp, sizeof(*resp));
ihk_device_unmap_memory(dev, phys, sizeof(*resp));
error = pager_get_path(file, resp->path);
if (error) {
goto out_unmap;
}
error = down_interruptible(&ppd->devobj_pager_lock);
if (error) {
error = -EINTR;
goto out;
goto out_unmap;
}
list_add_tail(&pager->list, &ppd->devobj_pager_list);
@ -1680,6 +1735,10 @@ static int pager_req_map(ihk_os_t os, int fd, size_t len, off_t off,
pager = 0;
error = 0;
out_unmap:
ihk_device_unmap_virtual(dev, resp, sizeof(*resp));
ihk_device_unmap_memory(dev, phys, sizeof(*resp));
out:
if (file) {
fput(file);
@ -1854,6 +1913,7 @@ static int pager_req_unmap(ihk_os_t os, uintptr_t handle)
kfree(pager);
out:
mcctrl_put_per_proc_data(ppd);
return error;
}
@ -2005,6 +2065,17 @@ void __return_syscall(ihk_os_t os, struct ikc_scd_packet *packet,
/* Map response structure and notify offloading thread */
res->ret = ret;
res->stid = stid;
res->private_data = 0;
/* Special case for open() to return private_data */
if (packet->req.number == __NR_open && ret > 0) {
struct fd f;
f = fdget(ret);
if (f.file) {
res->private_data = f.file->private_data;
fdput(f);
}
}
if (__notify_syscall_requester(os, packet, res) < 0) {
printk("%s: WARNING: failed to notify PID %d\n",

View File

@ -1207,7 +1207,7 @@ sysfsm_unlink(struct sysfsm_data *sdp, const char *path0, int flags)
goto out;
}
if (!flags & SYSFS_UNLINK_KEEP_ANCESTOR) {
if (!(flags & SYSFS_UNLINK_KEEP_ANCESTOR)) {
cleanup_ancestor(dirp);
}

View File

@ -16,7 +16,7 @@ ifeq ($(BUILD_MODULE),none)
BUILD_MODULE=$(shell if [ ${LINUX_VERSION_CODE} -ge 262144 -a ${LINUX_VERSION_CODE} -lt 262400 ]; then echo "linux-4.0.9"; else echo "none"; fi)
endif
ifeq ($(BUILD_MODULE),none)
BUILD_MODULE=$(shell if [ ${LINUX_VERSION_CODE} -ge 243680 -a ${LINUX_VERSION_CODE} -lt 263936 ]; then echo "linux-4.6.7"; else echo "none"; fi)
BUILD_MODULE=$(shell if [ ${LINUX_VERSION_CODE} -ge 263680 -a ${LINUX_VERSION_CODE} -lt 263936 ]; then echo "linux-4.6.7"; else echo "none"; fi)
endif
endif
ifeq ($(BUILD_MODULE_TMP),rhel)
@ -33,7 +33,7 @@ endif
modules:
ifneq ($(BUILD_MODULE),none)
@(cd $(BUILD_MODULE); make modules)
+@(cd $(BUILD_MODULE); make modules)
endif
clean:

View File

@ -10,16 +10,16 @@ MANDIR=@MANDIR@
MCKERNEL_INCDIR=@MCKERNEL_INCDIR@
MCKERNEL_LIBDIR=@MCKERNEL_LIBDIR@
KDIR ?= @KDIR@
CFLAGS=-Wall -O -I. -I$(VPATH)/arch/${ARCH}
ARCH=@ARCH@
CFLAGS=-Wall -O -I. -I$(VPATH)/arch/${ARCH} -I${IHKDIR} -I@abs_builddir@/../../../ihk/linux/include
LDFLAGS=@LDFLAGS@
RPATH=$(shell echo $(LDFLAGS)|awk '{for(i=1;i<=NF;i++){if($$i~/^-L/){w=$$i;sub(/^-L/,"-Wl,-rpath,",w);print w}}}')
VPATH=@abs_srcdir@
TARGET=mcexec libsched_yield ldump2mcdump.so
@uncomment_if_ENABLE_MEMDUMP@TARGET+=eclair
LIBS=@LIBS@
ARCH=@ARCH@
IHKDIR ?= $(VPATH)/../../../ihk/linux/include/
MCEXEC_LIBS=-lmcexec -lrt -lnuma -pthread
MCEXEC_LIBS=-lmcexec -lrt -lnuma -pthread -L@abs_builddir@/../../../ihk/linux/user -lihk -Wl,-rpath,$(MCKERNEL_LIBDIR)
ENABLE_QLMPI=@ENABLE_QLMPI@
ifeq ($(ENABLE_QLMPI),yes)
@ -40,10 +40,10 @@ mcexec: mcexec.c libmcexec.a
# POSTK_DEBUG_ARCH_DEP_34, eclair arch depend separate.
ifeq ($(ARCH), arm64)
eclair: eclair.c arch/$(ARCH)/arch-eclair.c
$(CC) -I.. -I. -I./arch/$(ARCH)/include -I$(VPATH)/.. -I$(VPATH) -I$(VPATH)/arch/$(ARCH)/include -I${IHKDIR} $(CFLAGS) -o $@ $^ $(LIBS)
$(CC) -I.. -I. -I./arch/$(ARCH)/include -I$(VPATH)/.. -I$(VPATH) -I$(VPATH)/arch/$(ARCH)/include $(CFLAGS) -o $@ $^ $(LIBS)
else
eclair: eclair.c
$(CC) $(CFLAGS) -I${IHKDIR} -o $@ $^ $(LIBS)
eclair: eclair.c arch/$(ARCH)/arch-eclair.c
$(CC) -I.. -I$(VPATH) -I$(VPATH)/arch/$(ARCH)/include $(CFLAGS) -o $@ $^ $(LIBS)
endif
ldump2mcdump.so: ldump2mcdump.c
@ -53,7 +53,7 @@ libsched_yield: libsched_yield.c
$(CC) -shared -fPIC -Wl,-soname,sched_yield.so.1 -o libsched_yield.so.1.0.0 $^ -lc -ldl
libmcexec.a::
(cd arch/${ARCH}; make)
+(cd arch/${ARCH}; $(MAKE))
libqlmpi.so: qlmpilib.c
$(MCC) $(CFLAGS) $(LDFLAGS) -shared -fPIC -o $@ $<
@ -77,18 +77,19 @@ ql_talker: ql_talker.o
$(CC) $^ $(CFLAGS) -o $@
clean::
(cd arch/${ARCH}; make clean)
(cd arch/${ARCH}; $(MAKE) clean)
$(RM) $(TARGET) *.o
.PHONY: all clean install
install::
(cd arch/${ARCH}; make install)
(cd arch/${ARCH}; $(MAKE) install)
mkdir -p -m 755 $(BINDIR)
install -m 755 mcexec $(BINDIR)
mkdir -p -m 755 $(MCKERNEL_LIBDIR)
install -m 755 ldump2mcdump.so $(MCKERNEL_LIBDIR)
install -m 755 libsched_yield.so.1.0.0 $(MCKERNEL_LIBDIR)
mkdir -p -m 755 $(MANDIR)/man1
install -m 644 mcexec.1 $(MANDIR)/man1/mcexec.1
ifeq ($(ENABLE_QLMPI),yes)
install -m 644 ../include/qlmpilib.h $(MCKERNEL_INCDIR)

View File

@ -9,12 +9,15 @@ LIBS=@LIBS@
all: $(TARGET)
../../libmcexec.a: archdep.o
$(AR) cr ../../libmcexec.a archdep.o
../../libmcexec.a: archdep.o arch_syscall.o
$(AR) cr ../../libmcexec.a archdep.o arch_syscall.o
# Build archdep.o from the assembly source archdep.S (compile only, -c).
# ${KDIR} is added to the include search path and EXTRA_CFLAGS is
# appended after CFLAGS; $< expands to the first prerequisite (archdep.S).
archdep.o: archdep.S
$(CC) -c -I${KDIR} $(CFLAGS) $(EXTRA_CFLAGS) -fPIE -pie -pthread $<
# Build arch_syscall.o from arch_syscall.c (compile only, -c), using the
# same include path and flags as the archdep.o rule; $< expands to the
# first prerequisite (arch_syscall.c).
arch_syscall.o: arch_syscall.c
$(CC) -c -I${KDIR} $(CFLAGS) $(EXTRA_CFLAGS) -fPIE -pie -pthread $<
clean:
$(RM) $(TARGET) *.o

Some files were not shown because too many files have changed in this diff Show More