Compare commits

...

22 Commits
1.7.4 ... 1.7.9

Author SHA1 Message Date
90c1ceef45 release: 1.7.9: fix smp_ihk_os_shutdown()-related double free
Change-Id: I408dc69b41d9643548226c15c67fcbd8197acb92
2021-03-17 18:21:25 +09:00
4f1b505550 docs: migrate to github.com/ihkmckernel
Change-Id: Idd8fed88545231b4aca290e1b54cbc2d2dff2e9e
2021-03-17 08:43:02 +00:00
051c0dcdd8 overlay_path: Fix resolution of symbolic link under /sys/
Change-Id: I650e72fb335aa72256d3b129a65c09bbd7cf26d3
Refs: #1463
2021-03-17 08:18:46 +00:00
09173d353c mcctrl_wakeup_desc: refcount and fix timeouts
Change-Id: I14b34f031ffb10bfac6cef07d81f53a8dece767b
2021-03-17 03:36:35 +00:00
d5c5023bf8 epoll/epoll_wait/ppoll: special handling in syscall offload
Change-Id: I792eb91c349d0ce942179996328c6f89f186ba31
2021-03-17 03:36:35 +00:00
e3493bd0be docs: lift limitations and fix ppn example
Change-Id: Id78e7db09767d5dd8a3dc5b9f911b9026608b021
2021-03-17 03:31:12 +00:00
44261678f7 cmake: fix condition to turn on/off ENABLE_KRM_WORKAROUND
Change-Id: I1a8efe88ffb1283d0343571f340a3b5715318e7d
2021-03-17 02:57:19 +00:00
6e4a29a422 docs: spec: fix description of IHK_RESERVE_MEM_MAX_SIZE_RATIO_ALL
Change-Id: I7af95524d87721fa1ce34bc560eddc947117f5f8
2021-03-15 15:32:08 +09:00
2039139380 release: 1.7.8: fix ihklib/ihk_reserve_cpu when using krm
Change-Id: I57235d51f51ae7327cb08a9e3ae56be995157100
2021-03-12 12:54:56 +09:00
c80b112ce7 release: 1.7.7: fix fput and mckernel.spec
Change-Id: I74f7530b067d44790e3f014479f580867387584a
2021-03-11 08:09:07 +00:00
4a05024656 spec: cmake-config cmake paramters
Change-Id: Ic0e7f62d9172f31afe90297bdd22b8e50cc6fc9e
2021-03-11 07:19:04 +00:00
7a04c6eb5c ihkmond: redirect kmsg to /dev/kmsg line by line
Change-Id: Iafc9d0eb47696073434dcc869a29336a51b8c50e
2021-03-11 16:11:17 +09:00
3e00189de0 kprintf: fix checking if interrupt is disabled
Change-Id: I2ee1a1e2438ae761c4136593953ede2738bc6f74
2021-03-11 07:03:04 +00:00
c94cf8e6f0 mcexec: fput executable just after its contents is transferred
Change-Id: I3fae841bd7341bca030fd6b7eceffa068c9e0f4e
2021-03-11 07:03:04 +00:00
ee974b200d mcexec_open_exec: fix missing fput on error
Change-Id: I3ac94e336dc54ec313e69c0fa85c17086dc256fd
2021-03-11 07:03:04 +00:00
546cafe6bc release: 1.7.6: fix ihk_reserve_mem_conf
Change-Id: I767f8eac655af9200f733c21353b1e141007df17
2021-03-11 15:22:36 +09:00
9dd4d99a1a docs: spec: ihk_reserve_mem_conf*: apply change only to the next reservation
Change-Id: Iaafd2ca4d96f227d03e9910a36b27801fb1e3da4
2021-03-11 15:17:44 +09:00
3a6273777a test: uti/tofu, issues/1507+1519: fix README
Change-Id: I3060e1273c8ef6a1b392a2c678da3bc02a25a4f8
2021-03-11 03:59:57 +00:00
daed585347 release: 1.7.5: fix ihk_*str() functions
Change-Id: Ic412029f856f34a10724f03e36f211f6026acd8e
2021-03-11 12:18:43 +09:00
11d7229525 docs: spec: ihk_reserve_mem_conf_str: use defaults for those not specified
Change-Id: I7cfddd3203b952cabb919ea6401e226e151e696a
2021-03-11 02:26:18 +00:00
e43d52df20 Revert "mcexec_open_exec: make fput and add to mckernel_exec_files atomic"
This reverts commit c80ea0ed23.

Change-Id: I0541e8af5157c7128f8774f6581cc207d13b649a
2021-03-10 14:21:57 +09:00
1c0da3c5b9 Revert "mcexec_open_exec: guard fput and add to mckernel_exec_files with spin_lock_irqsave"
This reverts commit cba263ff12.

Change-Id: Ifcd03a2048a3f9d6c155dd8ecd522081b5dde276
2021-03-10 14:21:49 +09:00
31 changed files with 842 additions and 250 deletions

View File

@ -7,7 +7,7 @@ endif (NOT CMAKE_BUILD_TYPE)
enable_language(C ASM)
project(mckernel C ASM)
set(MCKERNEL_VERSION "1.7.4")
set(MCKERNEL_VERSION "1.7.9")
# See "Fedora Packaging Guidelines -- Versioning"
set(MCKERNEL_RELEASE "")
@ -105,10 +105,17 @@ execute_process(COMMAND bash -c "rpm -qi FJSVpxkrm-plugin-mckernel | awk '$1 ==
OUTPUT_VARIABLE KRM_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE)
message("KRM_VERSION: ${KRM_VERSION}")
if(NOT "${KRM_VERSION}" STREQUAL "" AND "${KRM_VERSION}" VERSION_LESS_EQUAL 4.0.1)
option(ENABLE_KRM_WORKAROUND "krm workaround" ON)
else()
execute_process(COMMAND bash -c "rpm -qi FJSVpxkrm-plugin-mckernel | awk '$1 == \"Release\" && $2 == \":\" { print $3 }'"
OUTPUT_VARIABLE KRM_RELEASE OUTPUT_STRIP_TRAILING_WHITESPACE)
message("KRM_RELEASE: ${KRM_RELEASE}")
if("${KRM_VERSION}" STREQUAL "")
option(ENABLE_KRM_WORKAROUND "krm workaround" OFF)
elseif("${KRM_VERSION}" VERSION_GREATER_EQUAL 4.0.2 OR
("${KRM_VERSION}" VERSION_EQUAL 4.0.1 AND "${KRM_RELEASE}" VERSION_GREATER_EQUAL 25.13.1.0))
option(ENABLE_KRM_WORKAROUND "krm workaround" OFF)
else()
option(ENABLE_KRM_WORKAROUND "krm workaround" ON)
endif()
if(ENABLE_KRM_WORKAROUND)
@ -124,6 +131,14 @@ if(ENABLE_FUGAKU_DEBUG)
set(KBUILD_C_FLAGS "${KBUILD_C_FLAGS} -DENABLE_FUGAKU_DEBUG")
endif()
# Optional feature: redirect kernel messages to Linux's /dev/kmsg.
# Off by default; turn on with -DENABLE_KMSG_REDIRECT=ON.
option(ENABLE_KMSG_REDIRECT "Redirect kernel message to Linux's /dev/kmsg" OFF)
if(ENABLE_KMSG_REDIRECT)
	# Define the macro in two places: appended to KBUILD_C_FLAGS
	# (presumably consumed by the kernel-module build -- confirm) and
	# added as a directory-scope definition for CMake-built sources.
	set(KBUILD_C_FLAGS "${KBUILD_C_FLAGS} -DENABLE_KMSG_REDIRECT")
	add_definitions(-DENABLE_KMSG_REDIRECT)
endif()
option(PROFILE_ENABLE "System call profile" ON)
if(PROFILE_ENABLE)
add_definitions(-DPROFILE_ENABLE)
@ -340,4 +355,5 @@ message("ENABLE_WERROR: ${ENABLE_WERROR}")
message("ENABLE_UBSAN: ${ENABLE_UBSAN}")
message("ENABLE_LINUX_WORK_IRQ_FOR_IKC: ${ENABLE_LINUX_WORK_IRQ_FOR_IKC}")
message("ENABLE_PER_CPU_ALLOC_CACHE: ${ENABLE_PER_CPU_ALLOC_CACHE}")
message("ENABLE_KMSG_REDIRECT: ${ENABLE_KMSG_REDIRECT}")
message("-------------------------------")

View File

@ -912,7 +912,6 @@ unsigned long cpu_enable_interrupt_save(void)
return flags;
}
#ifdef ENABLE_FUGAKU_HACKS
int cpu_interrupt_disabled(void)
{
unsigned long flags;
@ -925,7 +924,6 @@ int cpu_interrupt_disabled(void)
: "memory");
return (flags == masked);
}
#endif
#else /* defined(CONFIG_HAS_NMI) */
@ -989,6 +987,18 @@ unsigned long cpu_enable_interrupt_save(void)
: "memory");
return flags;
}
/*
 * cpu_interrupt_disabled() - report whether IRQs are masked on this CPU.
 *
 * Reads the DAIF system register and tests its I (IRQ mask) bit.
 *
 * Return: 1 if the I bit is set (IRQs masked), 0 otherwise.
 */
int cpu_interrupt_disabled(void)
{
	unsigned long flags;

	asm volatile(
		"mrs %0, daif // arch_local_irq_save\n"
		: "=r" (flags)
		:
		: "memory");

	/*
	 * An MRS read of DAIF returns D, A, I, F in bits [9:6], so the IRQ
	 * mask is bit 7 (0x80).  The value 0x2 is only the position of I
	 * in the 4-bit immediate of "msr daifset/daifclr, #imm"; bit 1 of
	 * the register itself is reserved and reads as zero, which made
	 * the previous test always report "interrupts enabled".
	 */
	return !!(flags & 0x80);
}
#endif /* defined(CONFIG_HAS_NMI) */
/* we not have "pause" instruction, instead "yield" instruction */

View File

@ -1273,6 +1273,15 @@ unsigned long cpu_enable_interrupt_save(void)
return flags;
}
/*
 * cpu_interrupt_disabled() - tell whether maskable interrupts are off.
 *
 * Copies the flags register into a local through the stack and inspects
 * bit 9 (mask 0x200), the interrupt-enable flag (IF).
 *
 * Return: 1 when IF is clear (interrupts disabled), 0 when it is set.
 */
int cpu_interrupt_disabled(void)
{
	unsigned long rflags;

	asm volatile("pushf; pop %0" : "=r"(rflags) : : "memory", "cc");

	if (rflags & 0x200)
		return 0;
	return 1;
}
/*@
@ behavior valid_vector:
@ assumes 32 <= vector <= 255;

View File

@ -1,3 +1,275 @@
=============================================
Version 1.7.9 (Mar 17, 2021)
=============================================
----------------------
IHK major updates
----------------------
N/A
------------------------
IHK major bug fixes
------------------------
#. ihklib: ihk_reserve_mem_conf*: fix default values
#. smp_ihk_os_shutdown: fix memory leak
#. smp_ihk_os_shutdown: prevent double free
#. __ihk_os_shutdown: fix smp_ihk_os_shutdown()-related double free
#. smp_ihk_os_panic_notifier: exclude memory from Linux dump with default setting
#. smp_ihk_os_panic_notifier: exclude memory from Linux dump while booting, on timeout
----------------------
McKernel major updates
----------------------
N/A
------------------------
McKernel major bug fixes
------------------------
#. mcctrl_wakeup_desc: refcount and fix timeouts
=============================================
Version 1.7.8 (Mar 12, 2021)
=============================================
----------------------
IHK major updates
----------------------
N/A
------------------------
IHK major bug fixes
------------------------
#. ihklib: ihk_reserve_cpu: fix job cpu check when using krm
----------------------
McKernel major updates
----------------------
N/A
------------------------
McKernel major bug fixes
------------------------
N/A
=============================================
Version 1.7.7 (Mar 11, 2021)
=============================================
----------------------
IHK major updates
----------------------
N/A
------------------------
IHK major bug fixes
------------------------
N/A
----------------------
McKernel major updates
----------------------
N/A
------------------------
McKernel major bug fixes
------------------------
#. mcexec: fput executable just after its contents is transferred
#. spec: cmake-config cmake parameters
=============================================
Version 1.7.6 (Mar 11, 2021)
=============================================
----------------------
IHK major updates
----------------------
N/A
------------------------
IHK major bug fixes
------------------------
#. ihklib: ihk_reserve_mem_conf*: apply change only to the next reservation
----------------------
McKernel major updates
----------------------
N/A
------------------------
McKernel major bug fixes
------------------------
N/A
=============================================
Version 1.7.5 (Mar 11, 2021)
=============================================
----------------------
IHK major updates
----------------------
N/A
------------------------
IHK major bug fixes
------------------------
#. ihklib: fix cgroup cpuset.cpus/mems check when using krm
#. ihklib: ihk_reserve_mem_conf_str: set default values to those not specified
----------------------
McKernel major updates
----------------------
N/A
------------------------
McKernel major bug fixes
------------------------
N/A
=============================================
Version 1.7.4 (Mar 7, 2021)
=============================================
----------------------
IHK major updates
----------------------
N/A
------------------------
IHK major bug fixes
------------------------
N/A
----------------------
McKernel major updates
----------------------
N/A
------------------------
McKernel major bug fixes
------------------------
N/A
=============================================
Version 1.7.3 (Mar 5, 2021)
=============================================
----------------------
IHK major updates
----------------------
N/A
------------------------
IHK major bug fixes
------------------------
N/A
----------------------
McKernel major updates
----------------------
N/A
------------------------
McKernel major bug fixes
------------------------
N/A
=============================================
Version 1.7.2 (Mar 5, 2021)
=============================================
----------------------
IHK major updates
----------------------
#. ihklib: add *_str() functions for reserve, assign, IKC-map, kargs
#. smp: make smp_call_func() arch independent
------------------------
IHK major bug fixes
------------------------
#. ihklib: ihk_reserve_mem: fix capped best-effort
#. TO RESET: fake missing NUMA node pieces, 90% memory limit
#. ihklib: ihk_reserve_mem_conf: range-check for IHK_RESERVE_MEM_MAX_SIZE_RATIO_ALL
#. ihklib: ihk_os_kargs: check if "hidos" is included
#. SMP: omit slab/slub shrink, use 95% limit by default
#. check cpu / numa cgroup set by krm
#. SMP: __ihk_smp_reserve_mem: add __GFP_COMP to __GFP_ATOMIC allocation
#. ihk_register_device: record minor to IHK device object
----------------------
McKernel major updates
----------------------
#. mcexec: memory policy control by environmental variable
#. mempolicy: Support MPOL_INTERLEAVE
#. uti: futex call function in mcctrl
#. uti: integrate libuti and redirect to mck/libuti.so
#. uti: integrate syscall_intercept
#. shmobj: support large page
#. xpmem: support large page
#. MM: handle zero_at_free in page faults
------------------------
McKernel major bug fixes
------------------------
#. TO RESET: stack changes
#. Tofu: keep track of stags per memory range
#. Tofu: match page sizes to MBPT and fault PTEs if not present
#. Tofu: fix phys addr calculation for contiguous pages in MBPT/BCH update
#. rus_vm_fault: vmf_insert_pfn: treat VM_FAULT_NOPAGE as success
#. Tofu: mcctrl side MMU notifier and CQ/BCH cleanup
#. copy_user_ranges: copy straight_start of struct vm_range
#. mcctrl: abort on invalid addr in mcexec_transfer_image()
#. mcctrl: fix access to uninitialized usrdata->cpu_topology_list
#. mcexec: propagate error in __NR_gettid handler
#. mcexec_transfer_image(): map exact size of remote memory (instead of forcing PAGE_SIZE)
#. xpmem: fault stack area of remote process if VM range doesn't yet exist
#. Tofu: fault stack area if VM range doesn't exist in STAG registration
#. __mcctrl_os_read_write_cpu_register: fix timeout
#. mbind: Use range_policy's numamask as priority on MPOL_BIND
#. migrate: Don't migrate on in-kernel interrupt
#. Send a signal to mcexec after switching to that process.
#. uti: fix syscall response is mis-consumed by __do_in_kernel_irq_syscall
#. uti: fix handling UTI_CPU_SET env
#. do_execveat: kill instead of panic when init_process_stack fails
#. remote_page_fault is handled by the offloaded thread.
#. coredump: fix behavior when gencore fail
#. xpmem: truncates the size of xpmem_attach at the page boundary (workaround for fjmpi)
#. __mcctrl_os_read_write_cpu_register: spin timeout in mcctrl_ikc_send_wait()
=============================================
Version 1.7.1 (Dec 23, 2020)
=============================================
----------------------
IHK major updates
----------------------
#. d5d5c23 Tofu: support for barrier gate
#. Tofu: proper cleanup of premapped DMA regions
#. Tofu: initial version
#. SMP: try with GFP_ATOMIC as well in mem reserve
------------------------
IHK major bug fixes
------------------------
#. ihklib: ihk(_os)_query_{cpu,mem}: allow to pass empty array
#. SMP: non compound page free and GFP_ATOMIC
#. ihk_get_num_os_instances: don't open /dev/mcdN
#. ihklib: ihk_create_os_str: fix variable prefix
----------------------
McKernel major updates
----------------------
#. straight map: creates a straight map covering the whole physical memory, and assigns virtual address ranges from it to mappings whose physical pages are allocated at map time
#. free-time, lazy, potentially Linux-side page-zeroing
#. Tofu built-in driver: supports memory registration and barrier gate setup
#. kmalloc cache
------------------------
McKernel major bug fixes
------------------------
#. mmap: return -EINVAL for non-anonymous, MAP_HUGETLB map
#. kernel: increase stack size
#. Tofu: proper cleanup of device files when mcexec gets killed
=============================================
Version 1.7.0 (Nov 25, 2020)
=============================================

View File

@ -1,11 +1,5 @@
Contact
=======
Please give your feedback to us via one of the following mailing lists.
Subscription via
`www.pccluster.org <http://www.pccluster.org/mailman/listinfo/mckernel-users>`__
is needed.
- English: mckernel-users@pccluster.org
- Japanese: mckernel-users-jp@pccluster.org
Please give your feedback to us via the following mailing list: ihkmckernel@googlegroups.com

View File

@ -97,7 +97,7 @@ Clone the source code:
mkdir -p ~/src/ihk+mckernel/
cd ~/src/ihk+mckernel/
git clone --recursive -b development https://github.com/RIKEN-SysSoft/mckernel.git
git clone --recursive -b development https://github.com/ihkmckernel/mckernel.git
(Optional) Checkout to the specific branch or version:
@ -166,22 +166,6 @@ Create the tarball and the spec file:
make dist
cp mckernel-<version>.tar.gz <rpmbuild>/SOURCES
(optional) Edit the following line in ``scripts/mckernel.spec`` to change
cmake options. For example:
::
%cmake -DCMAKE_BUILD_TYPE=Release \
-DUNAME_R=%{kernel_version} \
-DKERNEL_DIR=%{kernel_dir} \
%{?cmake_libdir:-DCMAKE_INSTALL_LIBDIR=%{cmake_libdir}} \
%{?build_target:-DBUILD_TARGET=%{build_target}} \
%{?toolchain_file:-DCMAKE_TOOLCHAIN_FILE=%{toolchain_file}} \
-DENABLE_TOFU=ON -DENABLE_FUGAKU_HACKS=ON \
-DENABLE_KRM_WORKAROUND=OFF -DWITH_KRM=ON \
-DENABLE_FUGAKU_DEBUG=OFF -DENABLE_UTI=ON \
.
Create the rpm package:
When not cross-compiling:

Binary file not shown.

View File

@ -649,7 +649,9 @@ IHKはLinuxに以下の機能を提供する。
\begin{tabular}[t]{@{}l@{}}
{\quad} \texttt{int ihk\_reserve\_mem\_conf(int index, int key, void *value)}\\
\end{tabular}
\subsubsection*{説明}{\quad} \texttt{index}で指定されたIHKデバイスに対する\texttt{ihk\_reserve\_mem()}の動作を\texttt{key}\texttt{value}のペアで指定したものに変更する。\texttt{value}は値へのポインタで指定する。\texttt{key}\texttt{value}のペアの意味は以下のように定義される。
\subsubsection*{説明}{\quad} \texttt{index}で指定されたIHKデバイスに対する\texttt{ihk\_reserve\_mem()}の動作を\texttt{key}\texttt{value}のペアで指定したものに変更する。なお、設定は次の1回の予約に限り有効で、予約後にはデフォルト設定に戻る。
\texttt{value}は値へのポインタで指定する。\texttt{key}\texttt{value}のペアの意味は以下のように定義される。
\subsubsection*{\texttt{IHK\_RESERVE\_MEM\_BALANCED\_\{ENABLE,BEST\_EFFORT,VARIANCE\_LIMIT\}}}
\verb|IHK_RESERVE_MEM_BALANCED_ENABLE|(型は\verb|int|、デフォルトは0)が非ゼロの場合は、NUMAノードごとの予約サイズがNUMAノード間でなるべく均等になるように予約する。目的は、NUMAノードごとのメモリ空き容量にNUMAノード間でばらつきがあり、またそれらの空き容量が事前にわからないようなシステムで、合計予約サイズをより大きくすることである。ステップは以下の通り。
@ -664,7 +666,7 @@ IHKはLinuxに以下の機能を提供する。
このパラメタの目的は、Linuxによる空き領域の分断化が激しい状況においてメモリ予約処理時間を抑えることである。上記の状況で予約処理時間が長くなるのは、小さいサイズでの物理連続領域が大量に存在するので、小さいサイズでの要求回数が非常に大きくなるためである。
\subsubsection*{\texttt{IHK\_RESERVE\_MEM\_MAX\_SIZE\_RATIO\_ALL}}
\verb|ihk_reserve_mem()|でサイズに-1を指定した場合と\verb|IHK_RESERVE_MEM_BALANCED_ENABLE|に非ゼロを指定した場合に用いられる予約サイズを、予約時点で測定した空き容量に指定した値を乗じたものにする。なお、ゼロ以下の値または98より大きい値を設定しようとすると\verb:-EINVAL:を返す。また、デフォルト設定は98\%である。
\verb|ihk_reserve_mem()|でサイズに-1を指定した場合と\verb|IHK_RESERVE_MEM_BALANCED_ENABLE|に非ゼロを指定した場合に用いられる予約サイズを、予約時点で測定した空き容量に指定した値を乗じたものにする。なお、ゼロ以下の値を設定しようとしたり、また富岳では95、その他のシステムでは98より大きい値を設定しようとしたりすると\verb:-EINVAL:を返す。また、デフォルト設定は富岳では95\%、その他のシステムでは98\%である。
目的は、Linuxによる空き領域の分断化が激しい状況においてメモリ予約処理時間を抑えること、また予約時にLinuxのプロセスのメモリ要求が満たされない状況にならないようにすることである。
@ -686,7 +688,7 @@ IHKはLinuxに以下の機能を提供する。
\subsubsection{設定リストによるメモリ予約動作設定}
\subsubsection*{書式}{\quad} \verb:int ihk_reserve_mem_conf_str(int dev_index, const char *envp, int num_env);:
\subsubsection*{説明}{\quad} \verb:dev_index:で指定されたIHKデバイスに対し、\verb:envp:と\verb:num_env:で指定された文字列形式の設定リストに従ってメモリ予約の動作設定を行う。本関数は特権ユーザのみが呼び出せる。
\subsubsection*{説明}{\quad} \verb:dev_index:で指定されたIHKデバイスに対し、\verb:envp:と\verb:num_env:で指定された文字列形式の設定リストに従ってメモリ予約の動作設定を行う。なお、設定は次の1回の予約に限り有効で、予約後にはデフォルト設定に戻る。本関数は特権ユーザのみが呼び出せる。
\verb:envp:は\verb:NULL:文字で結合された\verb:num_env:個の設定文字列からなる。各設定文字列は\verb:"KEY=VAL":の形式を持つ。設定可能な項目は以下の通り。
\begin{table}[!h]
@ -702,7 +704,7 @@ IHKはLinuxに以下の機能を提供する。
\end{tabular}
\vspace{-0em}
\end{table}
\\なお、これ以外の設定は無視される。
\\また、これ以外の設定項目は無視される。
\FloatBarrier
\subsubsection*{戻り値}

View File

@ -87,14 +87,14 @@ executable:
``<processes-per-node>`` is the number of the processes per node and
calculated by (number of MPI processes) / (number of nodes).
For example, ``<processes-per-node>`` equals to 4 (=32/8) when
For example, ``<processes-per-node>`` equals to 4 (=8/2) when
specifying the number of processes and nodes as follows with
Fujitsu Technical Computing Suite.
MPICH.
.. code-block:: none
#PJM --mpi "proc=32"
#PJM -L "node=8"
mpirun -n 8 -hosts host1,host2 ./cpi
(Advanced) When using Utility Thread offloading Interface (UTI)
---------------------------------------------------------------
@ -112,11 +112,11 @@ Add ``--enable-uti`` option to ``mcexec``:
Limitations
===========
1. Pseudo devices such as /dev/mem and /dev/zero are not mmap()ed
#. Pseudo devices such as /dev/mem and /dev/zero are not mmap()ed
correctly even if the mmap() returns a success. An access of their
mapping receives the SIGSEGV signal.
2. clone() supports only the following flags. All the other flags cause
#. clone() supports only the following flags. All the other flags cause
clone() to return error or are simply ignored.
- CLONE_CHILD_CLEARTID
@ -126,32 +126,32 @@ Limitations
- CLONE_SIGHAND
- CLONE_VM
3. PAPI has the following restriction.
#. PAPI has the following restriction.
- Number of counters a user can use at the same time is up to the
number of the physical counters in the processor.
4. msync writes back only the modified pages mapped by the calling
#. msync writes back only the modified pages mapped by the calling
process.
5. The following syscalls always return the ENOSYS error.
#. The following syscalls always return the ENOSYS error.
- migrate_pages()
- move_pages()
- set_robust_list()
6. The following syscalls always return the EOPNOTSUPP error.
#. The following syscalls always return the EOPNOTSUPP error.
- arch_prctl(ARCH_SET_GS)
- signalfd()
7. signalfd4() returns a fd, but signal is not notified through the fd.
#. signalfd4() returns a fd, but signal is not notified through the fd.
8. set_rlimit sets the limit values but they are not enforced.
#. set_rlimit sets the limit values but they are not enforced.
9. Address randomization is not supported.
#. Address randomization is not supported.
10. brk() extends the heap more than requested when -h (extend-heap-by=)
#. brk() extends the heap more than requested when -h (extend-heap-by=)
option of mcexec is used with the value larger than 4 KiB.
syscall_pwrite02 of LTP would fail for this reason. This is because
the test expects that the end of the heap is set to the same address
@ -161,91 +161,84 @@ Limitations
than the requested. Therefore, the expected segmentation violation
doesn't occur.
11. setpriority()/getpriority() won't work. They might set/get the
priority of a random mcexec thread. This is because there's no fixed
correspondence between a McKernel thread which issues the system
call and a mcexec thread which handles the offload request.
#. setpriority()/getpriority() won't work. They might set/get the
priority of a random mcexec thread. This is because there's no fixed
correspondence between a McKernel thread which issues the system
call and a mcexec thread which handles the offload request.
12. mbind() can set the policy but it is not used when allocating
physical pages.
#. mbind() can set the policy but it is not used when allocating
physical pages.
13. MPOL_F_RELATIVE_NODES and MPOL_INTERLEAVE flags for
set_mempolicy()/mbind() are not supported.
#. MPOL_F_RELATIVE_NODES and MPOL_INTERLEAVE flags for
set_mempolicy()/mbind() are not supported.
14. The MPOL_BIND policy for set_mempolicy()/mbind() works the same
as the MPOL_PREFERRED policy. That is, the physical page allocator
doesn't give up the allocation when the specified nodes are running
out of pages but continues to search pages in the other nodes.
#. The MPOL_BIND policy for set_mempolicy()/mbind() works the same
as the MPOL_PREFERRED policy. That is, the physical page allocator
doesn't give up the allocation when the specified nodes are running
out of pages but continues to search pages in the other nodes.
15. Kernel dump on Linux panic requires Linux kernel CentOS-7.4 and
later. In addition, crash_kexec_post_notifiers kernel argument must
be given to Linux kernel.
#. Kernel dump on Linux panic requires Linux kernel CentOS-7.4 and
later. In addition, crash_kexec_post_notifiers kernel argument must
be given to Linux kernel.
16. setfsuid()/setfsgid() cannot change the id of the calling thread.
Instead, it changes that of the mcexec worker thread which takes the
system-call offload request.
#. setfsuid()/setfsgid() cannot change the id of the calling thread.
Instead, it changes that of the mcexec worker thread which takes the
system-call offload request.
17. mmap (hugeTLBfs): The physical pages corresponding to a map are
released when no McKernel process exist. The next map gets fresh
physical pages.
#. mmap (hugeTLBfs): The physical pages corresponding to a map are
released when no McKernel process exist. The next map gets fresh
physical pages.
18. Sticky bit on executable file has no effect.
#. Sticky bit on executable file has no effect.
19. Linux (RHEL-7 for x86_64) could hang when offlining CPUs in the
process of booting McKernel due to the Linux bug, found in
Linux-3.10 and fixed in the later version. One way to circumvent
this is to always assign the same CPU set to McKernel.
#. Linux (RHEL-7 for x86_64) could hang when offlining CPUs in the
process of booting McKernel due to the Linux bug, found in
Linux-3.10 and fixed in the later version. One way to circumvent
this is to always assign the same CPU set to McKernel.
20. madvise:
#. madvise:
- MADV_HWPOISON and MADV_SOFT_OFFLINE always returns -EPERM.
- MADV_MERGEABLE and MADV_UNMERGEABLE always returns -EINVAL.
- MADV_HUGEPAGE and MADV_NOHUGEPAGE on file map returns -EINVAL
except on RHEL-8 for aarch64.
21. brk() and mmap() don't report out-of-memory through their return
value. Instead, the page fault reports the error.
#. brk() and mmap() don't report out-of-memory through their return
value. Instead, the page fault reports the error.
22. Anonymous mmap pre-maps requested number of pages when contiguous
pages are available. Demand paging is used when not available.
#. Anonymous mmap pre-maps requested number of pages when contiguous
pages are available. Demand paging is used when not available.
23. Mixing page sizes in anonymous shared mapping is not allowed. mmap
creates vm_range with one page size. And munmap or mremap that needs
the reduced page size changes the sizes of all the pages of the
vm_range.
#. ihk_os_getperfevent() could time-out when invoked from Fujitsu TCS
(job-scheduler).
24. ihk_os_getperfevent() could time-out when invoked from Fujitsu TCS
(job-scheduler).
#. The behaviors of madvise and mbind are changed to do nothing and
report success as a workaround for Fugaku.
25. The behaviors of madvise and mbind are changed to do nothing and
report success as a workaround for Fugaku.
#. mmap() allows unlimited overcommit. Note that it corresponds to
setting sysctl ``vm.overcommit_memory`` to 1.
26. mmap() allows unlimited overcommit. Note that it corresponds to
setting sysctl ``vm.overcommit_memory`` to 1.
#. mlockall() is not supported and returns -EPERM.
27. mlockall() is not supported and returns -EPERM.
#. munlockall() is not supported and returns zero.
28. munlockall() is not supported and returns zero.
#. (Fujitsu TCS-only) A job following the one in which __mcctrl_os_read_write_cpu_register() returns ``-ETIME`` fails because xos_hwb related CPU state isn't finalized. You can tell if the function returned ``-ETIME`` by checking if the following line appeared in the Linux kernel message:
29. scheduling behavior is not Linux compatible. For example, sometimes one of the two processes on the same CPU continues to run after yielding.
::
30. (Fujitsu TCS-only) A job following the one in which __mcctrl_os_read_write_cpu_register() returns ``-ETIME`` fails because xos_hwb related CPU state isn't finalized. You can tell if the function returned ``-ETIME`` by checking if the following line appeared in the Linux kernel message:
__mcctrl_os_read_write_cpu_register: ERROR sending IKC msg: -62
::
You can re-initialize xos_hwb related CPU state by the following command:
__mcctrl_os_read_write_cpu_register: ERROR sending IKC msg: -62
::
You can re-initialize xos_hwb related CPU state by the following command:
sudo systemctl restart xos_hwb
::
#. System calls can write the mcexec VMAs with PROT_WRITE flag not
set. This is because we never turn off PROT_WRITE of the mcexec
VMAs to circumvent the issue "set_host_vma(): do NOT read protect
Linux VMA".
sudo systemctl restart xos_hwb
31. System calls can write the mcexec VMAs with PROT_WRITE flag not
set. This is because we never turn off PROT_WRITE of the mcexec
VMAs to circumvent the issue "set_host_vma(): do NOT read protect
Linux VMA".
32. procfs entry creation done by Linux work queue could starve when
Linux CPUs are flooded with system call offloads. LTP-2019
sendmsg02 causes this issue.
#. procfs entry creation done by Linux work queue could starve when
Linux CPUs are flooded with system call offloads. LTP-2019
sendmsg02 causes this issue.

View File

@ -916,7 +916,7 @@ int __mcctrl_tof_utofu_release_handler(struct inode *inode, struct file *filp,
isp.arg = f2pfd->fd;
ret = mcctrl_ikc_send_wait(f2pfd->os, ppd->ikc_target_cpu,
&isp, -20, NULL, NULL, 0);
&isp, -1000, NULL, NULL, 0);
if (ret != 0) {
pr_err("%s: WARNING: IKC req for PID: %d, fd: %d failed\n",
__func__, f2pfd->pid, f2pfd->fd);

View File

@ -414,7 +414,7 @@ static void release_handler(ihk_os_t os, void *param)
dprintk("%s: SCD_MSG_CLEANUP_PROCESS, info: %p, cpu: %d\n",
__FUNCTION__, info, info->cpu);
ret = mcctrl_ikc_send_wait(os, info->cpu,
&isp, -20, NULL, NULL, 0);
&isp, -5000, NULL, NULL, 0);
if (ret != 0) {
printk("%s: WARNING: failed to send IKC msg: %d\n",
__func__, ret);
@ -513,8 +513,6 @@ static DECLARE_WAIT_QUEUE_HEAD(signalq);
struct mcctrl_signal_desc {
struct mcctrl_signal msig;
struct mcctrl_wakeup_desc wakeup;
void *addrs[1];
};
static long mcexec_send_signal(ihk_os_t os, struct signal_desc *sigparam)
@ -554,7 +552,7 @@ static long mcexec_send_signal(ihk_os_t os, struct signal_desc *sigparam)
isp.pid = sig.pid;
isp.arg = virt_to_phys(msigp);
rc = mcctrl_ikc_send_wait(os, sig.cpu, &isp, 0, &desc->wakeup,
rc = mcctrl_ikc_send_wait(os, sig.cpu, &isp, -1000, NULL,
&do_free, 1, desc);
if (rc < 0) {
printk("mcexec_send_signal: mcctrl_ikc_send ret=%d\n", rc);
@ -1799,7 +1797,8 @@ out:
}
LIST_HEAD(mckernel_exec_files);
static DEFINE_SPINLOCK(mckernel_exec_file_lock);
DEFINE_SEMAPHORE(mckernel_exec_file_lock);
struct mckernel_exec_file {
ihk_os_t os;
@ -1976,7 +1975,6 @@ int mcexec_open_exec(ihk_os_t os, char * __user filename)
char *fullpath = NULL;
char *kfilename = NULL;
int len;
unsigned long flags;
if (os_ind < 0) {
return -EINVAL;
@ -1991,42 +1989,36 @@ int mcexec_open_exec(ihk_os_t os, char * __user filename)
kfilename = kmalloc(PATH_MAX, GFP_KERNEL);
if (!kfilename) {
retval = -ENOMEM;
kfree(pathbuf);
goto out;
}
len = strncpy_from_user(kfilename, filename, PATH_MAX);
if (unlikely(len < 0)) {
retval = -EINVAL;
goto out;
goto out_free;
}
/* fget and list_add should not be interrupted by hardware interrupt */
spin_lock_irqsave(&mckernel_exec_file_lock, flags);
file = open_exec(kfilename);
retval = PTR_ERR(file);
if (IS_ERR(file)) {
spin_unlock_irqrestore(&mckernel_exec_file_lock, flags);
goto out;
goto out_free;
}
fullpath = d_path(&file->f_path, pathbuf, PATH_MAX);
if (IS_ERR(fullpath)) {
fput(file);
spin_unlock_irqrestore(&mckernel_exec_file_lock, flags);
retval = PTR_ERR(fullpath);
goto out;
goto out_put_file;
}
mcef = kmalloc(sizeof(*mcef), GFP_KERNEL);
if (!mcef) {
fput(file);
spin_unlock_irqrestore(&mckernel_exec_file_lock, flags);
retval = -ENOMEM;
goto out;
goto out_put_file;
}
memset(mcef, 0, sizeof(struct mckernel_exec_file)); /* debug */
down(&mckernel_exec_file_lock);
/* Find previous file (if exists) and drop it */
list_for_each_entry(mcef_iter, &mckernel_exec_files, list) {
if (mcef_iter->os == os && mcef_iter->pid == task_tgid_vnr(current)) {
@ -2047,15 +2039,22 @@ int mcexec_open_exec(ihk_os_t os, char * __user filename)
/* Create /proc/self/exe entry */
add_pid_entry(os_ind, task_tgid_vnr(current));
proc_exe_link(os_ind, task_tgid_vnr(current), fullpath);
spin_unlock_irqrestore(&mckernel_exec_file_lock, flags);
up(&mckernel_exec_file_lock);
dprintk("%d open_exec and holding file: %s\n", (int)task_tgid_vnr(current),
kfilename);
retval = 0;
out:
kfree(kfilename);
kfree(pathbuf);
return 0;
out_put_file:
fput(file);
out_free:
kfree(pathbuf);
kfree(kfilename);
out:
return retval;
}
@ -2063,14 +2062,13 @@ int mcexec_close_exec(ihk_os_t os, int pid)
{
struct mckernel_exec_file *mcef = NULL;
int found = 0;
int os_ind = ihk_host_os_get_index(os);
unsigned long flags;
int os_ind = ihk_host_os_get_index(os);
if (os_ind < 0) {
return EINVAL;
}
spin_lock_irqsave(&mckernel_exec_file_lock, flags);
down(&mckernel_exec_file_lock);
list_for_each_entry(mcef, &mckernel_exec_files, list) {
if (mcef->os == os && mcef->pid == pid) {
allow_write_access(mcef->fp);
@ -2083,7 +2081,7 @@ int mcexec_close_exec(ihk_os_t os, int pid)
}
}
spin_unlock_irqrestore(&mckernel_exec_file_lock, flags);
up(&mckernel_exec_file_lock);
return (found ? 0 : EINVAL);
}
@ -2243,8 +2241,6 @@ long mcctrl_perf_num(ihk_os_t os, unsigned long arg)
struct mcctrl_perf_ctrl_desc {
struct perf_ctrl_desc desc;
struct mcctrl_wakeup_desc wakeup;
void *addrs[1];
};
#define wakeup_desc_of_perf_desc(_desc) \
(&container_of((_desc), struct mcctrl_perf_ctrl_desc, desc)->wakeup)
@ -2310,9 +2306,7 @@ long mcctrl_perf_set(ihk_os_t os, struct ihk_perf_event_attr *__user arg)
isp.arg = virt_to_phys(perf_desc);
for (j = 0; j < info->n_cpus; j++) {
ret = mcctrl_ikc_send_wait(os, j, &isp,
msecs_to_jiffies(10000),
wakeup_desc_of_perf_desc(perf_desc),
ret = mcctrl_ikc_send_wait(os, j, &isp, 10000, NULL,
&need_free, 1, perf_desc);
if (ret < 0) {
pr_warn("%s: mcctrl_ikc_send_wait ret=%d\n",
@ -2382,9 +2376,7 @@ long mcctrl_perf_get(ihk_os_t os, unsigned long *__user arg)
isp.arg = virt_to_phys(perf_desc);
for (j = 0; j < info->n_cpus; j++) {
ret = mcctrl_ikc_send_wait(os, j, &isp,
msecs_to_jiffies(10000),
wakeup_desc_of_perf_desc(perf_desc),
ret = mcctrl_ikc_send_wait(os, j, &isp, 10000, NULL,
&need_free, 1, perf_desc);
if (ret < 0) {
pr_warn("%s: mcctrl_ikc_send_wait ret=%d\n",
@ -2454,9 +2446,8 @@ long mcctrl_perf_enable(ihk_os_t os)
return -EINVAL;
}
for (j = 0; j < info->n_cpus; j++) {
ret = mcctrl_ikc_send_wait(os, j, &isp, 0,
wakeup_desc_of_perf_desc(perf_desc),
&need_free, 1, perf_desc);
ret = mcctrl_ikc_send_wait(os, j, &isp, 0, NULL,
&need_free, 1, perf_desc);
if (ret < 0) {
pr_warn("%s: mcctrl_ikc_send_wait ret=%d\n",
@ -2522,8 +2513,7 @@ long mcctrl_perf_disable(ihk_os_t os)
return -EINVAL;
}
for (j = 0; j < info->n_cpus; j++) {
ret = mcctrl_ikc_send_wait(os, j, &isp, 0,
wakeup_desc_of_perf_desc(perf_desc),
ret = mcctrl_ikc_send_wait(os, j, &isp, 0, NULL,
&need_free, 1, perf_desc);
if (ret < 0) {
pr_warn("%s: mcctrl_ikc_send_wait ret=%d\n",

View File

@ -182,8 +182,6 @@ static int uti_remote_page_fault(struct mcctrl_usrdata *usrdata,
struct mcctrl_per_proc_data *ppd, int tid, int cpu)
{
int error;
struct mcctrl_wakeup_desc *desc;
int do_frees = 1;
struct ikc_scd_packet packet;
/* Request page fault */
@ -192,20 +190,9 @@ static int uti_remote_page_fault(struct mcctrl_usrdata *usrdata,
packet.fault_reason = reason;
packet.fault_tid = tid;
/* we need to alloc desc ourselves because GFP_ATOMIC */
retry_alloc:
desc = kmalloc(sizeof(*desc), GFP_ATOMIC);
if (!desc) {
pr_warn("WARNING: coudln't alloc remote page fault wait desc, retrying..\n");
goto retry_alloc;
}
/* packet->target_cpu was set in rus_vm_fault if a thread was found */
error = mcctrl_ikc_send_wait(usrdata->os, cpu, &packet,
0, desc, &do_frees, 0);
if (do_frees) {
kfree(desc);
}
0, NULL, NULL, 0);
if (error < 0) {
pr_warn("%s: WARNING: failed to request uti remote page fault :%d\n",
__func__, error);

View File

@ -58,23 +58,41 @@ void mcctrl_os_read_write_cpu_response(ihk_os_t os,
struct ikc_scd_packet *pisp);
void mcctrl_eventfd(ihk_os_t os, struct ikc_scd_packet *pisp);
/* Assumes usrdata->wakeup_descs_lock taken */
static void mcctrl_wakeup_desc_cleanup(ihk_os_t os,
struct mcctrl_wakeup_desc *desc)
static void mcctrl_wakeup_desc_put(struct mcctrl_wakeup_desc *desc,
struct mcctrl_usrdata *usrdata, int free_addrs)
{
unsigned long irqflags;
int i;
list_del(&desc->chain);
for (i = 0; i < desc->free_addrs_count; i++) {
kfree(desc->free_addrs[i]);
if (!refcount_dec_and_test(&desc->count)) {
return;
}
spin_lock_irqsave(&usrdata->wakeup_descs_lock, irqflags);
list_del(&desc->chain);
spin_unlock_irqrestore(&usrdata->wakeup_descs_lock, irqflags);
if (free_addrs) {
for (i = 0; i < desc->free_addrs_count; i++) {
kfree(desc->free_addrs[i]);
}
}
if (desc->free_at_put)
kfree(desc);
}
static void mcctrl_wakeup_cb(ihk_os_t os, struct ikc_scd_packet *packet)
{
struct mcctrl_wakeup_desc *desc = packet->reply;
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
/* destroy_ikc_channels must have cleaned up descs */
if (!usrdata) {
pr_err("%s: error: mcctrl_usrdata not found\n",
__func__);
return;
}
WRITE_ONCE(desc->err, packet->err);
@ -85,29 +103,25 @@ static void mcctrl_wakeup_cb(ihk_os_t os, struct ikc_scd_packet *packet)
* wake up opportunistically between this set and the wake_up call.
*
* If the other side is no longer waiting, free the memory that was
* left for us.
* left for us. The caller has been notified not to free.
*/
if (cmpxchg(&desc->status, 0, 1)) {
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
unsigned long flags;
/* destroy_ikc_channels must have cleaned up descs */
if (!usrdata) {
pr_err("%s: error: mcctrl_usrdata not found\n",
__func__);
return;
}
spin_lock_irqsave(&usrdata->wakeup_descs_lock, flags);
mcctrl_wakeup_desc_cleanup(os, desc);
spin_unlock_irqrestore(&usrdata->wakeup_descs_lock, flags);
mcctrl_wakeup_desc_put(desc, usrdata, 1);
return;
}
/*
* Notify waiter before dropping reference to make sure
* wait queue is still valid.
*/
wake_up_interruptible(&desc->wq);
mcctrl_wakeup_desc_put(desc, usrdata, 0);
}
/* do_frees: 1 when caller should free free_addrs[], 0 otherwise */
/*
* do_frees: 1 when caller should free free_addrs[], 0 otherwise
* timeout: timeout in milliseconds
*/
int mcctrl_ikc_send_wait(ihk_os_t os, int cpu, struct ikc_scd_packet *pisp,
long int timeout, struct mcctrl_wakeup_desc *desc,
int *do_frees, int free_addrs_count, ...)
@ -115,35 +129,60 @@ int mcctrl_ikc_send_wait(ihk_os_t os, int cpu, struct ikc_scd_packet *pisp,
int ret, i;
int alloc_desc = (desc == NULL);
va_list ap;
unsigned long flags;
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
if (!usrdata) {
pr_err("%s: error: mcctrl_usrdata not found\n",
__func__);
return -EINVAL;
}
if (free_addrs_count)
*do_frees = 1;
if (alloc_desc)
desc = kmalloc(sizeof(struct mcctrl_wakeup_desc) +
(free_addrs_count + 1) * sizeof(void *),
GFP_KERNEL);
GFP_ATOMIC);
if (!desc) {
pr_warn("%s: Could not allocate wakeup descriptor", __func__);
return -ENOMEM;
}
pisp->reply = desc;
va_start(ap, free_addrs_count);
for (i = 0; i < free_addrs_count; i++) {
desc->free_addrs[i] = va_arg(ap, void*);
}
va_end(ap);
if (alloc_desc)
desc->free_addrs[free_addrs_count++] = desc;
desc->free_addrs_count = free_addrs_count;
/* Only free at put time if allocated internally */
desc->free_at_put = 0;
if (alloc_desc)
desc->free_at_put = 1;
init_waitqueue_head(&desc->wq);
/* One for the caller and one for the call-back */
refcount_set(&desc->count, 2);
/* XXX: make this a hash-table? */
spin_lock_irqsave(&usrdata->wakeup_descs_lock, flags);
list_add(&desc->chain, &usrdata->wakeup_descs_list);
spin_unlock_irqrestore(&usrdata->wakeup_descs_lock, flags);
WRITE_ONCE(desc->err, 0);
WRITE_ONCE(desc->status, 0);
ret = mcctrl_ikc_send(os, cpu, pisp);
if (ret < 0) {
pr_warn("%s: mcctrl_ikc_send failed: %d\n", __func__, ret);
if (alloc_desc)
kfree(desc);
/* Failed to send msg, put twice */
mcctrl_wakeup_desc_put(desc, usrdata, 0);
mcctrl_wakeup_desc_put(desc, usrdata, 0);
return ret;
}
@ -180,28 +219,16 @@ int mcctrl_ikc_send_wait(ihk_os_t os, int cpu, struct ikc_scd_packet *pisp,
* the callback it will need to free things for us
*/
if (!cmpxchg(&desc->status, 0, 1)) {
struct mcctrl_usrdata *usrdata = ihk_host_os_get_usrdata(os);
unsigned long flags;
mcctrl_wakeup_desc_put(desc, usrdata, 0);
if (!usrdata) {
pr_err("%s: error: mcctrl_usrdata not found\n",
__func__);
ret = ret < 0 ? ret : -EINVAL;
goto out;
}
spin_lock_irqsave(&usrdata->wakeup_descs_lock, flags);
list_add(&desc->chain, &usrdata->wakeup_descs_list);
spin_unlock_irqrestore(&usrdata->wakeup_descs_lock, flags);
if (do_frees)
*do_frees = 0;
return ret < 0 ? ret : -ETIME;
}
ret = READ_ONCE(desc->err);
out:
if (alloc_desc)
kfree(desc);
mcctrl_wakeup_desc_put(desc, usrdata, 0);
return ret;
}
@ -605,10 +632,15 @@ void destroy_ikc_channels(ihk_os_t os)
ihk_ikc_destroy_channel(usrdata->ikc2linux[i]);
}
}
spin_lock_irqsave(&usrdata->wakeup_descs_lock, flags);
list_for_each_entry_safe(mwd_entry, mwd_next,
&usrdata->wakeup_descs_list, chain) {
mcctrl_wakeup_desc_cleanup(os, mwd_entry);
&usrdata->wakeup_descs_list, chain) {
list_del(&mwd_entry->chain);
for (i = 0; i < mwd_entry->free_addrs_count; i++) {
kfree(mwd_entry->free_addrs[i]);
}
}
spin_unlock_irqrestore(&usrdata->wakeup_descs_lock, flags);

View File

@ -0,0 +1,105 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_REFCOUNT_H
#define _LINUX_REFCOUNT_H
#include <linux/atomic.h>
#include <linux/compiler.h>
#include <linux/spinlock_types.h>
struct mutex;
/**
* struct refcount_t - variant of atomic_t specialized for reference counts
* @refs: atomic_t counter field
*
* The counter saturates at UINT_MAX and will not move once
* there. This avoids wrapping the counter and causing 'spurious'
* use-after-free bugs.
*/
typedef struct refcount_struct {
atomic_t refs;
} refcount_t;
#define REFCOUNT_INIT(n) { .refs = ATOMIC_INIT(n), }
/**
* refcount_set - set a refcount's value
* @r: the refcount
* @n: value to which the refcount will be set
*/
static inline void refcount_set(refcount_t *r, unsigned int n)
{
atomic_set(&r->refs, n);
}
/**
* refcount_read - get a refcount's value
* @r: the refcount
*
* Return: the refcount's value
*/
static inline unsigned int refcount_read(const refcount_t *r)
{
return atomic_read(&r->refs);
}
/*
 * Three possible providers of the refcount modification API:
 *  - CONFIG_REFCOUNT_FULL:      out-of-line implementations declared extern here;
 *  - CONFIG_ARCH_HAS_REFCOUNT:  whatever <asm/refcount.h> supplies;
 *  - otherwise:                 the thin atomic_t wrappers defined inline below.
 */
#ifdef CONFIG_REFCOUNT_FULL
extern __must_check bool refcount_add_not_zero(unsigned int i, refcount_t *r);
extern void refcount_add(unsigned int i, refcount_t *r);
extern __must_check bool refcount_inc_not_zero(refcount_t *r);
extern void refcount_inc(refcount_t *r);
extern __must_check bool refcount_sub_and_test(unsigned int i, refcount_t *r);
extern __must_check bool refcount_dec_and_test(refcount_t *r);
extern void refcount_dec(refcount_t *r);
#else
# ifdef CONFIG_ARCH_HAS_REFCOUNT
# include <asm/refcount.h>
# else
/*
 * Fallback implementations: each one forwards directly to the matching
 * atomic_t primitive on r->refs.
 *
 * NOTE(review): nothing in these wrappers checks for overflow or saturates
 * the counter, so the "saturates at UINT_MAX" statement in the refcount_t
 * description does not appear to hold on this path -- confirm whether that
 * is acceptable for the callers.
 */

/**
 * refcount_add_not_zero - add a value to a refcount unless it is 0
 * @i: the value to add
 * @r: the refcount
 *
 * Return: the result of atomic_add_unless(&r->refs, i, 0), i.e. false when
 * the counter was 0 and was therefore left untouched, true otherwise.
 */
static inline __must_check bool refcount_add_not_zero(unsigned int i, refcount_t *r)
{
return atomic_add_unless(&r->refs, i, 0);
}
/**
 * refcount_add - add a value to a refcount
 * @i: the value to add
 * @r: the refcount
 *
 * Unconditional atomic_add(); unlike refcount_add_not_zero() it does not
 * look at the current value first.
 */
static inline void refcount_add(unsigned int i, refcount_t *r)
{
atomic_add(i, &r->refs);
}
/**
 * refcount_inc_not_zero - increment a refcount unless it is 0
 * @r: the refcount
 *
 * Return: the result of atomic_add_unless(&r->refs, 1, 0), i.e. false when
 * the counter was 0 and was therefore left untouched, true otherwise.
 */
static inline __must_check bool refcount_inc_not_zero(refcount_t *r)
{
return atomic_add_unless(&r->refs, 1, 0);
}
/**
 * refcount_inc - increment a refcount
 * @r: the refcount
 *
 * Unconditional atomic_inc().
 */
static inline void refcount_inc(refcount_t *r)
{
atomic_inc(&r->refs);
}
/**
 * refcount_sub_and_test - subtract from a refcount and test if it is 0
 * @i: the value to subtract
 * @r: the refcount
 *
 * Return: the result of atomic_sub_and_test(), i.e. true when the counter
 * reached 0 as a result of this subtraction.
 */
static inline __must_check bool refcount_sub_and_test(unsigned int i, refcount_t *r)
{
return atomic_sub_and_test(i, &r->refs);
}
/**
 * refcount_dec_and_test - decrement a refcount and test if it is 0
 * @r: the refcount
 *
 * Return: the result of atomic_dec_and_test(), i.e. true when this call
 * dropped the last reference and the counter is now 0.
 */
static inline __must_check bool refcount_dec_and_test(refcount_t *r)
{
return atomic_dec_and_test(&r->refs);
}
/**
 * refcount_dec - decrement a refcount
 * @r: the refcount
 *
 * Unconditional atomic_dec(); the caller gets no indication of whether the
 * counter reached 0.
 */
static inline void refcount_dec(refcount_t *r)
{
atomic_dec(&r->refs);
}
# endif /* !CONFIG_ARCH_HAS_REFCOUNT */
#endif /* CONFIG_REFCOUNT_FULL */
extern __must_check bool refcount_dec_if_one(refcount_t *r);
extern __must_check bool refcount_dec_not_one(refcount_t *r);
extern __must_check bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock);
extern __must_check bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock);
extern __must_check bool refcount_dec_and_lock_irqsave(refcount_t *r,
spinlock_t *lock,
unsigned long *flags);
#endif /* _LINUX_REFCOUNT_H */

View File

@ -44,6 +44,10 @@
#include <linux/semaphore.h>
#include <linux/rwlock.h>
#include <linux/threads.h>
#include <linux/version.h>
#if KERNEL_VERSION(4, 11, 0) > LINUX_VERSION_CODE
#include <refcount.h>
#endif
#include "sysfs.h"
#define SCD_MSG_PREPARE_PROCESS 0x1
@ -401,6 +405,8 @@ int mcctrl_ikc_is_valid_thread(ihk_os_t os, int cpu);
struct mcctrl_wakeup_desc {
int status;
int err;
refcount_t count;
int free_at_put;
wait_queue_head_t wq;
struct list_head chain;
int free_addrs_count;

View File

@ -611,7 +611,7 @@ static ssize_t __mckernel_procfs_read_write(
ret = mcctrl_ikc_send_wait(osnum_to_os(e->osnum),
(pid > 0) ? ppd->ikc_target_cpu : 0,
&isp, HZ, NULL, &do_free, 1, r);
&isp, 5000, NULL, &do_free, 1, r);
if (!do_free && ret >= 0) {
ret = -EIO;
@ -879,7 +879,7 @@ static int mckernel_procfs_buff_release(struct inode *inode, struct file *file)
rc = -EIO;
ret = mcctrl_ikc_send_wait(info->os, 0,
&isp, 5 * HZ, NULL, &do_free, 1, r);
&isp, 5000, NULL, &do_free, 1, r);
if (!do_free && ret >= 0) {
ret = -EIO;
@ -977,7 +977,7 @@ static ssize_t mckernel_procfs_buff_read(struct file *file, char __user *ubuf,
done = 1;
ret = mcctrl_ikc_send_wait(os,
(pid > 0) ? ppd->ikc_target_cpu : 0,
&isp, 5 * HZ, NULL, &do_free, 1, r);
&isp, 5000, NULL, &do_free, 1, r);
if (!do_free && ret >= 0) {
ret = -EIO;

View File

@ -495,8 +495,6 @@ int remote_page_fault(struct mcctrl_usrdata *usrdata, void *fault_addr,
struct ikc_scd_packet *packet)
{
int error;
struct mcctrl_wakeup_desc *desc;
int do_frees = 1;
dprintk("%s: tid: %d, fault_addr: %p, reason: %lu\n",
__FUNCTION__, task_pid_vnr(current), fault_addr, (unsigned long)reason);
@ -506,19 +504,9 @@ int remote_page_fault(struct mcctrl_usrdata *usrdata, void *fault_addr,
packet->fault_address = (unsigned long)fault_addr;
packet->fault_reason = reason;
/* we need to alloc desc ourselves because GFP_ATOMIC */
retry_alloc:
desc = kmalloc(sizeof(*desc), GFP_ATOMIC);
if (!desc) {
pr_warn("WARNING: coudln't alloc remote page fault wait desc, retrying..\n");
goto retry_alloc;
}
/* packet->target_cpu was set in rus_vm_fault if a thread was found */
error = mcctrl_ikc_send_wait(usrdata->os, packet->target_cpu, packet,
0, desc, &do_frees, 0);
if (do_frees)
kfree(desc);
0, NULL, NULL, 0);
if (error < 0) {
pr_warn("%s: WARNING: failed to request remote page fault PID %d: %d\n",
__func__, packet->pid, error);

View File

@ -2419,6 +2419,7 @@ int main(int argc, char **argv)
}
#endif // MCEXEC_BIND_MOUNT
/* fget executable as well */
if ((ret = load_elf_desc_shebang(argv[optind], &desc,
&shebang_argv, 1 /* execvp */))) {
fprintf(stderr, "%s: could not load program: %s\n",
@ -2860,6 +2861,14 @@ int main(int argc, char **argv)
fprintf(stderr, "error: transferring image\n");
return -1;
}
/* fput executable */
if ((ret = ioctl(fd, MCEXEC_UP_CLOSE_EXEC)) != 0) {
fprintf(stderr, "error: MCEXEC_UP_CLOSE_EXEC failed with %d\n",
ret);
return 1;
}
fflush(stdout);
fflush(stderr);
@ -3489,7 +3498,7 @@ checkexist_resolvelinks:
if (buf[0] == '/') {
/* cannot snprintf from same source and dest */
n = snprintf(tmpbuf2, PATH_MAX, "%s/%s", buf,
linkpath);
linkpath + 1);
if (n >= PATH_MAX)
return in;
strcpy(tmpbuf, tmpbuf2);
@ -4111,11 +4120,6 @@ int main_loop(struct thread_data_s *my_thread)
It is done by not calling do_syscall_return(fd, cpu, 0, 0, 0, 0, 0);
here and making McKernel side wait until release_handler() is called. */
/* Drop executable file */
if ((ret = ioctl(fd, MCEXEC_UP_CLOSE_EXEC)) != 0) {
fprintf(stderr, "WARNING: close_exec() couldn't find exec file?\n");
}
__dprintf("__NR_exit/__NR_exit_group: %ld (cpu_id: %d)\n",
w.sr.args[0], cpu);
if(w.sr.number == __NR_exit_group){
@ -4308,15 +4312,6 @@ gettid_out:
__dprintf("pid(%d): signals and syscall threads OK\n",
getpid());
/* Hold executable also in the child process */
if ((ret = ioctl(fd, MCEXEC_UP_OPEN_EXEC, exec_path))
!= 0) {
fprintf(stderr, "Error: open_exec() fails for %s: %d (fd: %d)\n",
exec_path, ret, fd);
fs->status = -errno;
goto fork_child_sync_pipe;
}
/* Check if we need to limit number of threads in the pool */
if ((ret = ioctl(fd, MCEXEC_UP_GET_NUM_POOL_THREADS)) < 0) {
fprintf(stderr, "Error: obtaining thread pool count\n");
@ -4472,6 +4467,7 @@ fork_err:
}
filename = pathbuf;
/* fget executable as well */
if ((ret = load_elf_desc_shebang(filename, &desc,
&shebang_argv, 0)) != 0) {
goto return_execve1;
@ -4569,6 +4565,13 @@ return_execve1:
}
__dprintf("%s", "execve(): image transferred\n");
/* fput executable */
if ((ret = ioctl(fd, MCEXEC_UP_CLOSE_EXEC)) != 0) {
fprintf(stderr, "error: MCEXEC_UP_CLOSE_EXEC failed with %d\n",
ret);
return 1;
}
if (close_cloexec_fds(fd) < 0) {
ret = EINVAL;
goto return_execve2;

2
ihk

Submodule ihk updated: 17cd4c9656...8b92b9d7f4

View File

@ -87,7 +87,7 @@ void kputs(char *buf)
debug_spin_unlock_irqrestore(&kmsg_buf->lock, flags_inner);
kprintf_unlock(flags_outer);
if (irqflags_can_interrupt(flags_outer) &&
if (!cpu_interrupt_disabled() &&
DEBUG_KMSG_USED > IHK_KMSG_HIGH_WATER_MARK) {
eventfd(IHK_OS_EVENTFD_TYPE_KMSG);
ihk_mc_delay_us(IHK_KMSG_NOTIFY_DELAY);
@ -128,7 +128,7 @@ int __kprintf(const char *format, ...)
}
debug_spin_unlock_irqrestore(&kmsg_buf->lock, flags_inner);
if (irqflags_can_interrupt(flags_inner) &&
if (!cpu_interrupt_disabled() &&
DEBUG_KMSG_USED > IHK_KMSG_HIGH_WATER_MARK) {
eventfd(IHK_OS_EVENTFD_TYPE_KMSG);
ihk_mc_delay_us(IHK_KMSG_NOTIFY_DELAY);
@ -171,7 +171,7 @@ int kprintf(const char *format, ...)
debug_spin_unlock_irqrestore(&kmsg_buf->lock, flags_inner);
kprintf_unlock(flags_outer);
if (irqflags_can_interrupt(flags_outer) &&
if (!cpu_interrupt_disabled() &&
DEBUG_KMSG_USED > IHK_KMSG_HIGH_WATER_MARK) {
eventfd(IHK_OS_EVENTFD_TYPE_KMSG);
ihk_mc_delay_us(IHK_KMSG_NOTIFY_DELAY);

View File

@ -246,6 +246,12 @@ long do_syscall(struct syscall_request *req, int cpu)
unsigned long flags;
DECLARE_WAITQ_ENTRY(scd_wq_entry, cpu_local_var(current));
#ifdef ENABLE_FUGAKU_HACKS
if (req->number == __NR_epoll_wait ||
req->number == __NR_epoll_pwait)
goto schedule;
#endif
if (thread->rpf_backlog) {
void (*func)(void *) = thread->rpf_backlog;
void *arg = thread->rpf_arg;
@ -287,6 +293,9 @@ long do_syscall(struct syscall_request *req, int cpu)
continue;
}
#ifdef ENABLE_FUGAKU_HACKS
schedule:
#endif
flags = cpu_disable_interrupt_save();
/* Try to sleep until notified */
@ -11149,7 +11158,16 @@ long syscall(int num, ihk_mc_user_context_t *ctx)
}
#endif // PROFILE_ENABLE
if (smp_load_acquire(&v->flags) & CPU_FLAG_NEED_RESCHED) {
#ifdef ENABLE_FUGAKU_HACKS
/* Do not deschedule when returning from an event (e.g., MPI) */
if (!(num == __NR_epoll_wait ||
num == __NR_epoll_pwait ||
num == __NR_ppoll) &&
smp_load_acquire(&v->flags) & CPU_FLAG_NEED_RESCHED)
#else
if (smp_load_acquire(&v->flags) & CPU_FLAG_NEED_RESCHED)
#endif
{
check_need_resched();
}

View File

@ -23,9 +23,7 @@ extern int num_processors;
void cpu_enable_interrupt(void);
void cpu_disable_interrupt(void);
#ifdef ENABLE_FUGAKU_HACKS
int cpu_interrupt_disabled(void);
#endif
void cpu_halt(void);
#ifdef ENABLE_FUGAKU_HACKS
void cpu_halt_panic(void);

View File

@ -15,6 +15,11 @@
%{!?kernel_dir: %global kernel_dir /usr/src/kernels/%{kernel_version}}
%define krequires %(echo %{kernel_version} | sed "s/.%{_target_cpu}$//")
%define ktag %(echo %{krequires} | tr '-' '_' | sed -e 's/\.el[0-9_]*$//' | sed -e 's/\.\([a-zA-Z]\)/_\1/')
%if "@ENABLE_UTI@" == "ON"
%define enable_uti 1
%else
%define enable_uti 0
%endif
Name: mckernel
Version: @MCKERNEL_VERSION@
@ -77,9 +82,10 @@ pushd build
%{?cmake_libdir:-DCMAKE_INSTALL_LIBDIR=%{cmake_libdir}} \
%{?build_target:-DBUILD_TARGET=%{build_target}} \
%{?toolchain_file:-DCMAKE_TOOLCHAIN_FILE=%{toolchain_file}} \
-DENABLE_TOFU=ON -DENABLE_FUGAKU_HACKS=ON \
-DENABLE_KRM_WORKAROUND=OFF -DWITH_KRM=ON \
-DENABLE_FUGAKU_DEBUG=OFF -DENABLE_UTI=ON \
-DENABLE_TOFU=@ENABLE_TOFU@ -DENABLE_FUGAKU_HACKS=@ENABLE_FUGAKU_HACKS@ \
-DENABLE_KRM_WORKAROUND=@ENABLE_KRM_WORKAROUND@ -DWITH_KRM=@WITH_KRM@ \
-DENABLE_FUGAKU_DEBUG=@ENABLE_FUGAKU_DEBUG@ -DENABLE_UTI=@ENABLE_UTI@ \
-DENABLE_FJMPI_WORKAROUND=@ENABLE_FJMPI_WORKAROUND@ \
..
%make_build
popd
@ -112,6 +118,7 @@ popd
%{_libdir}/libsched_yield.so.1.0.0
%{_libdir}/libsched_yield.so
%{_libdir}/libldump2mcdump.so
%if 0%{?enable_uti}
%{_libdir}/libmck_syscall_intercept.so
%{_libdir}/libsyscall_intercept.so.0.1.0
%{_libdir}/libsyscall_intercept.so.0
@ -119,6 +126,7 @@ popd
%{_libdir}/mck/libuti.so.1.0.0
%{_libdir}/mck/libuti.so.1
%{_libdir}/mck/libuti.so
%endif
%{_sysconfdir}/irqbalance_mck.in
%{_mandir}/man1/mcreboot.1.gz
%{_mandir}/man1/ihkconfig.1.gz

58
test/issues/1463/C1463.sh Executable file
View File

@ -0,0 +1,58 @@
#!/bin/sh
#
# Regression test for issue #1463: resolution of symbolic links by mcexec's
# overlay path handling.
#
# T01/T02 rely on test/issues/1463/tmp_overlay_path.patch being applied so
# that paths under /tmp go through the same link-resolution code as /sys.
# T03 checks plain /sys redirection.
#
# The script is run with "sh" (see the Makefile), so only POSIX shell
# constructs are used here.

USELTP=0
USEOSTEST=0

# Shared test setup (project helper; its behaviour is defined outside this file).
. ../../common.sh

issue="1463"
tid=01

TEST_DIR="/tmp/test"
ABS_PATH="${TEST_DIR}"
REL_PATH="./test"
ABS_LN="${TEST_DIR}_1463_abs_ln"
REL_LN="${TEST_DIR}_1463_rel_ln"

# Host-side placeholder; under mcexec the lookup is expected to be redirected
# to the McKernel-provided L.dir, which is a symlink to a.dir (see README).
mkdir -p ${TEST_DIR}
touch ${TEST_DIR}/L.dir

# --- T01: absolute symlink in the middle of the resolved path ---------------
tname=`printf "C${issue}T%02d" ${tid}`
echo "*** ${tname} start *******************************"
ln -fns ${ABS_PATH} ${ABS_LN}
mcexec readlink ${ABS_LN}/L.dir | tee ./${tname}.txt
cnt=`grep "a.dir" ./${tname}.txt | wc -l`
if [ ${cnt} -eq 1 ]; then
	echo "*** ${tname} PASSED ******************************"
else
	echo "*** ${tname} FAILED ******************************"
fi
# POSIX arithmetic instead of the bash-only "let tid++"
tid=$((tid + 1))
echo ""

# --- T02: relative symlink in the middle of the resolved path ---------------
tname=`printf "C${issue}T%02d" ${tid}`
echo "*** ${tname} start *******************************"
ln -fns ${REL_PATH} ${REL_LN}
mcexec readlink ${REL_LN}/L.dir | tee ./${tname}.txt
cnt=`grep "a.dir" ./${tname}.txt | wc -l`
if [ ${cnt} -eq 1 ]; then
	echo "*** ${tname} PASSED ******************************"
else
	echo "*** ${tname} FAILED ******************************"
fi
tid=$((tid + 1))
echo ""

# --- T03: /sys access is redirected; no offline CPUs are expected -----------
tname=`printf "C${issue}T%02d" ${tid}`
echo "*** ${tname} start *******************************"
mcexec cat /sys/devices/system/cpu/offline | tee ./${tname}.txt
echo "** (expected blank output)"
lines=`grep -e "[0-9]" ./${tname}.txt | wc -l`
if [ ${lines} -eq 0 ]; then
	echo "*** ${tname} PASSED ******************************"
else
	echo "*** ${tname} FAILED ******************************"
fi
tid=$((tid + 1))
echo ""

12
test/issues/1463/Makefile Normal file
View File

@ -0,0 +1,12 @@
# Build/run wrapper for the issue #1463 regression test.
# There is nothing to compile (TARGET is empty); "make test" just runs the script.
CFLAGS=
LDFLAGS=
TARGET=

# None of these targets produce a file of the same name; declare them phony so
# a stray file called "test" or "clean" cannot make the rule look up to date.
.PHONY: all test clean

all: $(TARGET)

test: all
	sh ./C1463.sh

clean:
	rm -f $(TARGET) *.o *.txt

49
test/issues/1463/README Normal file
View File

@ -0,0 +1,49 @@
【Issue#1463 動作確認】
□ テスト内容
Issue#1463の修正は、mcexec.c: mcoverlay_path() 内で行われる/sys/ 配下への
リンク解決処理に関する修正である。
上記のリンク解決処理を/tmp/ 配下にも行うようにするテストパッチを適用した上で
/tmp/ 配下へのmcoverlay_path()を実行して動作を確認する。
なお、リンク解決処理の動作確認には、McKernelによって作成される、
/sys/devices/virtual/mcos/mcos0/sys/test/L.dir を利用する。
このL.dirは、同ディレクトリのa.dir へのシンボリックリンクとなっている。
C1463T01:
以下の流れで、リンク解決処理対象パスの途中に絶対パスのシンボリックリンクが
存在している場合にも、/sys/devices/virtual/mcos/mcos0/sys/ 下に
誘導されることを確認する。
a. /tmp/test/L.dir に空のファイルを作成
b. /tmp/test への絶対パスのシンボリックリンクとして、/tmp/test_1463_abs_ln を作成
c. mcexec readlink /tmp/test_1463_abs_ln/L.dir を実行し、a.dir が出力されることを確認
C1463T02:
以下の流れで、リンク解決処理対象パスの途中に相対パスのシンボリックリンクが
存在している場合にも、/sys/devices/virtual/mcos/mcos0/sys/ 下に
誘導されることを確認する。
a. /tmp/test/L.dir に空のファイルを作成
b. /tmp/test への相対パスのシンボリックリンクとして、/tmp/test_1463_rel_ln を作成
c. mcexec readlink /tmp/test_1463_rel_ln/L.dir を実行し、a.dir が出力されることを確認
C1463T03:
以下の流れで、/sys/ 配下へのアクセスが/sys/devices/virtual/mcos/mcos0/sys/ 下に
誘導されることを確認する。
a. mcexecで確認した場合の /sys/devices/system/cpu/offline が空であることを確認
※通常、mckernelではofflineのCPUが存在しないため
□ 実行手順
・下記の手順でテストを実行する
$ cd <mckernel>
$ patch -p0 < test/issues/1463/tmp_overlay_path.patch
(build mckernel)
$ cd test/issues/1463
$ make test
McKernelのインストール先や、OSTEST, LTPの配置場所は、
$HOME/.mck_test_config を参照している
.mck_test_config は、McKernelをビルドした際に生成されるmck_test_config.sample ファイルを
$HOMEにコピーし、適宜編集する
□ 実行結果
x86_64_result.log aarch64_result.log 参照。
すべての項目をPASSしていることを確認。

View File

@ -0,0 +1,15 @@
sh ./C1463.sh
mcstop+release.sh ... done
mcreboot.sh -c 37-43,49-55 -m 2G@2,2G@3 -r 37-43:36+49-55:48 -O ... done
*** C1463T01 start *******************************
a.dir
*** C1463T01 PASSED ******************************
*** C1463T02 start *******************************
a.dir
*** C1463T02 PASSED ******************************
*** C1463T03 start *******************************
** (expected blank output)
*** C1463T03 PASSED ******************************

View File

@ -0,0 +1,17 @@
diff --git executer/user/mcexec.c executer/user/mcexec.c
index acae1f8..d220dd9 100644
--- executer/user/mcexec.c
+++ executer/user/mcexec.c
@@ -3458,6 +3458,12 @@ overlay_path(int dirfd, const char *in, char *buf, int *resolvelinks)
goto checkexist_resolvelinks;
}
+ /* for #1463's test */
+ if (!strncmp(path, "/tmp", 4) &&
+ (path[4] == '/' || path[4] == '\0')) {
+ goto checkexist_resolvelinks;
+ }
+
return in;
checkexist_resolvelinks:

View File

@ -0,0 +1,15 @@
sh ./C1463.sh
mcstop+release.sh ... done
mcreboot.sh -c 1-7,9-15,17-23,25-31 -m 10G@0,10G@1 -r 1-7:0+9-15:8+17-23:16+25-31:24 -O ... done
*** C1463T01 start *******************************
a.dir
*** C1463T01 PASSED ******************************
*** C1463T02 start *******************************
a.dir
*** C1463T02 PASSED ******************************
*** C1463T03 start *******************************
** (expected blank output)
*** C1463T03 PASSED ******************************

View File

@ -7,7 +7,7 @@ xpmemやshmobjを利用するライブラリの初期化および基本動作を
□実行手順
1.
SSMで以下のとおりサンプルプログラムをビルドする。
ログインノードで以下のとおりサンプルプログラムをビルドする。
必要に応じて PATHを設定すること。
tradモード:
@ -18,7 +18,7 @@ $ mpifccpx -DMPI -DOPENMP -Kopenmp mpi+affinity.c -o mpi+llvmopenmp-affinity
2.
インタラクティブモードでジョブ実行を開始する。
SSMで以下のように pjsubコマンドを発行する。
ログインノードで以下のように pjsubコマンドを発行する。
rscunitやrscgrp、jobenvは環境に応じて指定すること。
$ pjsub --interact -L "rscunit=xxx,rscgrp=xxx,jobenv=xxx,node=1" --mpi "proc=2" --sparam wait-time=1000

View File

@ -54,7 +54,8 @@ sudo ./ctrl 1 1 1 0 0 0 1 1 1
(2) uti_perf
progress threadを用いたtofu get通信速度を計測する。
progress threadを用いたtofu get通信速度を計測する。
測定結果は 10 ns 単位で出力される。
オプションは以下のとおり。
-a <x>,<y>,<z>,<a>,<b>,<c>
@ -80,7 +81,7 @@ progress threadを用いたtofu get通信速度を計測する。
-l <length>
tofu get通信するデータサイズ(byte)を指定する。
(64 byte ～ 16*1024*1024 - 256 byte、既定値は 16*1024*1024 - 256 byte)
(16 Kbyte ～ 16 Mbyte - 256 byte、既定値は 16 Mbyte - 256 byte)
-v
デバッグ出力を有効にする。
@ -110,3 +111,13 @@ progress threadを用いたtofu get通信速度を計測する。
--recvusleep=<us>
progress threadの受信完了を確認する間隔 (usec) を指定する。 (既定値は0)
例えば、tofu座標が 0,0,0,0,0,0 (受信側) と 0,0,0,1,0,0 (送信側) の間で
24プロセス生成して 16 Kbyte のデータを送信する測定を 100回繰り返す場合は
次のように指定する。
(受信側)
./uti_perf -a 0,0,0,1,0,0 -n 100 -f 24 -l 16384 -r
(送信側)
./uti_perf -a 0,0,0,0,0,0 -n 100 -f 24 -l 16384